SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
MLDataHDF5File.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Copyright (C) 2013 Zhengyang Liu (zhengyangl)
00008  */
00009 
00010 #include <shogun/lib/config.h>
00011 
00012 #if defined(HAVE_HDF5) && defined( HAVE_CURL)
00013 
00014 #include <stdio.h>
00015 #include <stdlib.h>
00016 #include <string.h>
00017 #include <hdf5.h>
00018 #include <curl/curl.h>
00019 #include <shogun/lib/memory.h>
00020 #include <shogun/io/MLDataHDF5File.h>
00021 
00022 #include <shogun/features/StringFeatures.h>
00023 #include <shogun/features/SparseFeatures.h>
00024 
00025 using namespace shogun;
00026 
00027 CMLDataHDF5File::CMLDataHDF5File()
00028 {
00029     SG_UNSTABLE("CMLDataHDF5File::CMLDataHDF5File()", "\n")
00030 
00031     get_boolean_type();
00032     h5file = -1;
00033 }
00034 
/** Write callback handed to libcurl: stores one received chunk in a stdio stream.
 *
 * @param ptr    start of the chunk
 * @param size   size of a single element in bytes
 * @param nmemb  number of elements in the chunk
 * @param stream stream the chunk is written to
 * @return number of elements fwrite() actually wrote
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    return fwrite(ptr, size, nmemb, stream);
}
00039 
00040 CMLDataHDF5File::CMLDataHDF5File(char* data_name,
00041                                  const char* name,
00042                                  const char* url_prefix) : CFile()
00043 {
00044     get_boolean_type();
00045     H5Eset_auto2(H5E_DEFAULT, NULL, NULL);
00046 
00047     if (name)
00048         set_variable_name(name);
00049 
00050     CURL *curl;
00051     FILE *fp=NULL;
00052 
00053     mldata_url = SG_CALLOC(char, strlen(url_prefix)+strlen(data_name)+1);
00054     strcat(mldata_url, url_prefix);
00055     strcat(mldata_url, data_name);
00056 
00057     fname = SG_CALLOC(char, strlen((char*)"/tmp/")+strlen(data_name)+strlen((char*)".h5")+1);
00058     strcat(fname, (char*) "/tmp/");
00059     strcat(fname, data_name);
00060     strcat(fname, (char*) ".h5");
00061 
00062     curl = curl_easy_init();
00063     fp = fopen(fname,"wb");
00064 
00065     if (!fp)
00066     {
00067         SG_ERROR("Could not open file '%s'\n", fname)
00068         return;
00069     }
00070 
00071     if (curl) {
00072         curl_easy_setopt(curl, CURLOPT_URL, mldata_url);
00073         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data);
00074         curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
00075         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
00076         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
00077         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
00078         curl_easy_perform(curl);
00079         curl_easy_cleanup(curl);
00080     }
00081 
00082     if(fp)
00083         fclose(fp);
00084 
00085     h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
00086 
00087     if (h5file<0)
00088         SG_ERROR("Could not open data repository '%s'\n", data_name)
00089 }
00090 
00091 CMLDataHDF5File::~CMLDataHDF5File()
00092 {
00093     H5Fclose(h5file);
00094     remove(fname);
00095     SG_FREE(fname);
00096     SG_FREE(mldata_url);
00097 }
00098 
00099 #define GET_VECTOR(fname, sg_type, datatype)                                        \
00100 void CMLDataHDF5File::fname(sg_type*& vec, int32_t& len)                            \
00101 {                                                                                   \
00102     if (!h5file)                                                                    \
00103         SG_ERROR("File invalid.\n")                                             \
00104                                                                                     \
00105     int32_t* dims;                                                                  \
00106     int32_t ndims;                                                                  \
00107     int64_t nelements;                                                              \
00108     hid_t dataset=H5Dopen2(h5file, variable_name, H5P_DEFAULT);                 \
00109     if (dataset<0)                                                                  \
00110         SG_ERROR("Error opening data set\n")                                        \
00111     hid_t dtype=H5Dget_type(dataset);                                               \
00112     H5T_class_t t_class=H5Tget_class(dtype);                                        \
00113     TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t);         \
00114     if (h5_type==-1)                                                                \
00115     {                                                                               \
00116         H5Dclose(dataset);                                                          \
00117         SG_INFO("No compatible datatype found\n")                                   \
00118     }                                                                               \
00119     get_dims(dataset, dims, ndims, nelements);                                      \
00120     if (!((ndims==2 && dims[0]==nelements && dims[1]==1) ||                         \
00121             (ndims==2 && dims[0]==1 && dims[1]==nelements) ||                       \
00122             (ndims==1 && dims[0]==nelements)))                                      \
00123         SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, dims[0])   \
00124     vec=SG_MALLOC(sg_type, nelements);                                                      \
00125     len=nelements;                                                                  \
00126     herr_t status = H5Dread(dataset, h5_type, H5S_ALL,                              \
00127             H5S_ALL, H5P_DEFAULT, vec);                                             \
00128     H5Dclose(dataset);                                                              \
00129     H5Tclose(dtype);                                                                \
00130     SG_FREE(dims);                                                                  \
00131     if (status<0)                                                                   \
00132     {                                                                               \
00133         SG_FREE(vec);                                                               \
00134         SG_ERROR("Error reading dataset\n")                                     \
00135     }                                                                               \
00136 }
00137 
00138 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL))
00139 GET_VECTOR(get_vector, int8_t, (CT_VECTOR, ST_NONE, PT_INT8))
00140 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
00141 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
00142 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
00143 GET_VECTOR(get_vector, uint32_t, (CT_VECTOR, ST_NONE, PT_UINT32))
00144 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
00145 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
00146 GET_VECTOR(get_vector, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX))
00147 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00148 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00149 GET_VECTOR(get_vector, int64_t, (CT_VECTOR, ST_NONE, PT_INT64))
00150 GET_VECTOR(get_vector, uint64_t, (CT_VECTOR, ST_NONE, PT_UINT64))
00151 #undef GET_VECTOR
00152 
00153 #define GET_MATRIX(fname, sg_type, datatype)                                        \
00154 void CMLDataHDF5File::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec)  \
00155 {                                                                                   \
00156     if (!h5file)                                                                    \
00157         SG_ERROR("File invalid.\n")                                             \
00158                                                                                     \
00159     int32_t* dims;                                                                  \
00160     int32_t ndims;                                                                  \
00161     int64_t nelements;                                                              \
00162     hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT);                   \
00163     if (dataset<0)                                                                  \
00164         SG_ERROR("Error opening data set\n")                                        \
00165     hid_t dtype = H5Dget_type(dataset);                                             \
00166     H5T_class_t t_class=H5Tget_class(dtype);                                        \
00167     TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t);         \
00168     if (h5_type==-1)                                                                \
00169     {                                                                               \
00170         H5Dclose(dataset);                                                          \
00171         SG_INFO("No compatible datatype found\n")                                   \
00172     }                                                                               \
00173     get_dims(dataset, dims, ndims, nelements);                                      \
00174     if (ndims!=2)                                                                   \
00175         SG_ERROR("Error not a 2-dimensional matrix\n")                              \
00176     matrix=SG_MALLOC(sg_type, nelements);                                                   \
00177     num_feat=dims[0];                                                               \
00178     num_vec=dims[1];                                                                \
00179     herr_t status = H5Dread(dataset, h5_type, H5S_ALL,                              \
00180             H5S_ALL, H5P_DEFAULT, matrix);                                          \
00181     H5Dclose(dataset);                                                              \
00182     H5Tclose(dtype);                                                                \
00183     SG_FREE(dims);                                                                  \
00184     if (status<0)                                                                   \
00185     {                                                                               \
00186         SG_FREE(matrix);                                                            \
00187         SG_ERROR("Error reading dataset\n")                                     \
00188     }                                                                               \
00189 }
00190 
00191 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
00192 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
00193 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
00194 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00195 GET_MATRIX(get_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00196 GET_MATRIX(get_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00197 GET_MATRIX(get_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00198 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00199 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00200 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
00201 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
00202 GET_MATRIX(get_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
00203 #undef GET_MATRIX
00204 
00205 void CMLDataHDF5File::get_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
00206 {
00207 }
00208 
00209 void CMLDataHDF5File::get_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
00210 {
00211 }
00212 
00213 void CMLDataHDF5File::get_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
00214 {
00215 }
00216 
00217 void CMLDataHDF5File::get_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
00218 {
00219 }
00220 
00221 void CMLDataHDF5File::get_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
00222 {
00223 }
00224 
00225 void CMLDataHDF5File::get_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
00226 {
00227 }
00228 
00229 void CMLDataHDF5File::get_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
00230 {
00231 }
00232 
00233 #define GET_SPARSEMATRIX(fname, sg_type, datatype)                                      \
00234 void CMLDataHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec)  \
00235 {                                                                                       \
00236     if (!(file))                                                                        \
00237         SG_ERROR("File invalid.\n")                                                 \
00238 }
00239 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL)
00240 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR)
00241 GET_SPARSEMATRIX(get_sparse_matrix, int8_t, DT_SPARSE_INT8)
00242 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
00243 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT)
00244 GET_SPARSEMATRIX(get_sparse_matrix, uint32_t, DT_SPARSE_UINT)
00245 GET_SPARSEMATRIX(get_sparse_matrix, int64_t, DT_SPARSE_LONG)
00246 GET_SPARSEMATRIX(get_sparse_matrix, uint64_t, DT_SPARSE_ULONG)
00247 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT)
00248 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD)
00249 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
00250 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL)
00251 GET_SPARSEMATRIX(get_sparse_matrix, floatmax_t, DT_SPARSE_LONGREAL)
00252 #undef GET_SPARSEMATRIX
00253 
00254 
00255 #define GET_STRING_LIST(fname, sg_type, datatype)                                               \
00256 void CMLDataHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
00257 {                                                                                               \
00258 }
00259 
00260 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL)
00261 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR)
00262 GET_STRING_LIST(get_string_list, int8_t, DT_STRING_INT8)
00263 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE)
00264 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT)
00265 GET_STRING_LIST(get_string_list, uint32_t, DT_STRING_UINT)
00266 GET_STRING_LIST(get_string_list, int64_t, DT_STRING_LONG)
00267 GET_STRING_LIST(get_string_list, uint64_t, DT_STRING_ULONG)
00268 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT)
00269 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD)
00270 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL)
00271 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL)
00272 GET_STRING_LIST(get_string_list, floatmax_t, DT_STRING_LONGREAL)
00273 #undef GET_STRING_LIST
00274 
00275 void CMLDataHDF5File::get_boolean_type()
00276 {
00277     boolean_type=H5T_NATIVE_UCHAR;
00278     switch (sizeof(bool))
00279     {
00280         case 1:
00281             boolean_type = H5T_NATIVE_UCHAR;
00282             break;
00283         case 2:
00284             boolean_type = H5T_NATIVE_UINT16;
00285             break;
00286         case 4:
00287             boolean_type = H5T_NATIVE_UINT32;
00288             break;
00289         case 8:
00290             boolean_type = H5T_NATIVE_UINT64;
00291             break;
00292         default:
00293             SG_ERROR("Boolean type not supported on this platform\n")
00294     }
00295 }
00296 
00297 hid_t CMLDataHDF5File::get_compatible_type(H5T_class_t t_class,
00298                                      const TSGDataType* datatype)
00299 {
00300     switch (t_class)
00301     {
00302         case H5T_FLOAT:
00303         case H5T_INTEGER:
00304             switch (datatype->m_ptype)
00305             {
00306             case PT_BOOL: return boolean_type;
00307             case PT_CHAR: return H5T_NATIVE_CHAR;
00308             case PT_INT8: return H5T_NATIVE_INT8;
00309             case PT_UINT8: return H5T_NATIVE_UINT8;
00310             case PT_INT16: return H5T_NATIVE_INT16;
00311             case PT_UINT16: return H5T_NATIVE_UINT16;
00312             case PT_INT32: return H5T_NATIVE_INT32;
00313             case PT_UINT32: return H5T_NATIVE_UINT32;
00314             case PT_INT64: return H5T_NATIVE_INT64;
00315             case PT_UINT64: return H5T_NATIVE_UINT64;
00316             case PT_FLOAT32: return H5T_NATIVE_FLOAT;
00317             case PT_FLOAT64: return H5T_NATIVE_DOUBLE;
00318             case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE;
00319             case PT_COMPLEX128:
00320                 SG_ERROR("complex128_t not compatible with HDF5File!");
00321                 return -1;
00322             case PT_UNDEFINED:
00323             case PT_SGOBJECT:
00324                 SG_ERROR("Implementation error during writing "
00325                          "HDF5File!");
00326                 return -1;
00327             }
00328         case H5T_STRING:
00329             SG_ERROR("Strings not supported")
00330             return -1;
00331         case H5T_VLEN:
00332             SG_ERROR("Variable length containers currently not supported")
00333             return -1;
00334         case H5T_ARRAY:
00335             SG_ERROR("Array containers currently not supported")
00336             return -1;
00337         default:
00338             SG_ERROR("Datatype mismatchn")
00339             return -1;
00340     }
00341 }
00342 
00343 void CMLDataHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements)
00344 {
00345     hid_t dataspace = H5Dget_space(dataset);
00346     if (dataspace<0)
00347         SG_ERROR("Error obtaining hdf5 dataspace\n")
00348 
00349     ndims = H5Sget_simple_extent_ndims(dataspace);
00350     total_elements=H5Sget_simple_extent_npoints(dataspace);
00351     hsize_t* dims_out=SG_MALLOC(hsize_t, ndims);
00352     dims=SG_MALLOC(int32_t, ndims);
00353     H5Sget_simple_extent_dims(dataspace, dims_out, NULL);
00354     for (int32_t i=0; i<ndims; i++)
00355         dims[i]=dims_out[i];
00356     SG_FREE(dims_out);
00357     H5Sclose(dataspace);
00358 }
00359 
00360 void CMLDataHDF5File::create_group_hierarchy()
00361 {
00362     char* vname=get_strdup(variable_name);
00363     int32_t vlen=strlen(vname);
00364     for (int32_t i=0; i<vlen; i++)
00365     {
00366         if (i!=0 && vname[i]=='/')
00367         {
00368             vname[i]='\0';
00369             hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT);
00370             if (g<0)
00371             {
00372                 g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT,
00373                         H5P_DEFAULT);
00374                 if (g<0)
00375                     SG_ERROR("Error creating group '%s'\n", vname)
00376                 vname[i]='/';
00377             }
00378             H5Gclose(g);
00379         }
00380     }
00381     SG_FREE(vname);
00382 }
00383 #endif //  HAVE_CURL && HAVE_HDF5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation