SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Copyright (C) 2013 Zhengyang Liu (zhengyangl) 00008 */ 00009 00010 #include <shogun/lib/config.h> 00011 00012 #if defined(HAVE_HDF5) && defined( HAVE_CURL) 00013 00014 #include <stdio.h> 00015 #include <stdlib.h> 00016 #include <string.h> 00017 #include <hdf5.h> 00018 #include <curl/curl.h> 00019 #include <shogun/lib/memory.h> 00020 #include <shogun/io/MLDataHDF5File.h> 00021 00022 #include <shogun/features/StringFeatures.h> 00023 #include <shogun/features/SparseFeatures.h> 00024 00025 using namespace shogun; 00026 00027 CMLDataHDF5File::CMLDataHDF5File() 00028 { 00029 SG_UNSTABLE("CMLDataHDF5File::CMLDataHDF5File()", "\n") 00030 00031 get_boolean_type(); 00032 h5file = -1; 00033 } 00034 00035 size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) { 00036 size_t written = fwrite(ptr, size, nmemb, stream); 00037 return written; 00038 } 00039 00040 CMLDataHDF5File::CMLDataHDF5File(char* data_name, 00041 const char* name, 00042 const char* url_prefix) : CFile() 00043 { 00044 get_boolean_type(); 00045 H5Eset_auto2(H5E_DEFAULT, NULL, NULL); 00046 00047 if (name) 00048 set_variable_name(name); 00049 00050 CURL *curl; 00051 FILE *fp=NULL; 00052 00053 mldata_url = SG_CALLOC(char, strlen(url_prefix)+strlen(data_name)+1); 00054 strcat(mldata_url, url_prefix); 00055 strcat(mldata_url, data_name); 00056 00057 fname = SG_CALLOC(char, strlen((char*)"/tmp/")+strlen(data_name)+strlen((char*)".h5")+1); 00058 strcat(fname, (char*) "/tmp/"); 00059 strcat(fname, data_name); 00060 strcat(fname, (char*) ".h5"); 00061 00062 curl = curl_easy_init(); 00063 fp = fopen(fname,"wb"); 00064 00065 if (!fp) 00066 { 00067 SG_ERROR("Could not open file '%s'\n", fname) 00068 return; 00069 } 00070 00071 if (curl) { 00072 curl_easy_setopt(curl, CURLOPT_URL, mldata_url); 00073 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data); 00074 curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); 00075 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L); 00076 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); 00077 curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); 00078 curl_easy_perform(curl); 00079 curl_easy_cleanup(curl); 00080 } 00081 00082 if(fp) 00083 fclose(fp); 00084 00085 h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT); 00086 00087 if (h5file<0) 00088 SG_ERROR("Could not open data repository '%s'\n", data_name) 00089 } 00090 00091 CMLDataHDF5File::~CMLDataHDF5File() 00092 { 00093 H5Fclose(h5file); 00094 remove(fname); 00095 SG_FREE(fname); 00096 SG_FREE(mldata_url); 00097 } 00098 00099 #define GET_VECTOR(fname, sg_type, datatype) \ 00100 void CMLDataHDF5File::fname(sg_type*& vec, int32_t& len) \ 00101 { \ 00102 if (!h5file) \ 00103 SG_ERROR("File invalid.\n") \ 00104 \ 00105 int32_t* dims; \ 00106 int32_t ndims; \ 00107 int64_t nelements; \ 00108 hid_t dataset=H5Dopen2(h5file, variable_name, H5P_DEFAULT); \ 00109 if (dataset<0) \ 00110 SG_ERROR("Error opening data set\n") \ 00111 hid_t dtype=H5Dget_type(dataset); \ 00112 H5T_class_t t_class=H5Tget_class(dtype); \ 00113 TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \ 00114 if (h5_type==-1) \ 00115 { \ 00116 H5Dclose(dataset); \ 00117 SG_INFO("No compatible datatype found\n") \ 00118 } \ 00119 get_dims(dataset, dims, ndims, nelements); \ 00120 if (!((ndims==2 && dims[0]==nelements && dims[1]==1) || \ 00121 (ndims==2 && dims[0]==1 && dims[1]==nelements) || \ 00122 (ndims==1 && dims[0]==nelements))) \ 00123 SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, dims[0]) \ 00124 vec=SG_MALLOC(sg_type, nelements); \ 00125 len=nelements; \ 00126 herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \ 00127 H5S_ALL, H5P_DEFAULT, vec); \ 00128 H5Dclose(dataset); \ 00129 H5Tclose(dtype); \ 00130 SG_FREE(dims); \ 00131 if (status<0) \ 00132 { \ 00133 SG_FREE(vec); \ 00134 SG_ERROR("Error reading dataset\n") \ 00135 } \ 00136 } 00137 00138 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL)) 00139 GET_VECTOR(get_vector, int8_t, (CT_VECTOR, ST_NONE, PT_INT8)) 00140 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8)) 00141 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR)) 00142 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32)) 00143 GET_VECTOR(get_vector, uint32_t, (CT_VECTOR, ST_NONE, PT_UINT32)) 00144 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32)) 00145 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64)) 00146 GET_VECTOR(get_vector, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX)) 00147 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16)) 00148 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16)) 00149 GET_VECTOR(get_vector, int64_t, (CT_VECTOR, ST_NONE, PT_INT64)) 00150 GET_VECTOR(get_vector, uint64_t, (CT_VECTOR, ST_NONE, PT_UINT64)) 00151 #undef GET_VECTOR 00152 00153 #define GET_MATRIX(fname, sg_type, datatype) \ 00154 void CMLDataHDF5File::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00155 { \ 00156 if (!h5file) \ 00157 SG_ERROR("File invalid.\n") \ 00158 \ 00159 int32_t* dims; \ 00160 int32_t ndims; \ 00161 int64_t nelements; \ 00162 hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \ 00163 if (dataset<0) \ 00164 SG_ERROR("Error opening data set\n") \ 00165 hid_t dtype = H5Dget_type(dataset); \ 00166 H5T_class_t t_class=H5Tget_class(dtype); \ 00167 TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \ 00168 if (h5_type==-1) \ 00169 { \ 00170 H5Dclose(dataset); \ 00171 SG_INFO("No compatible datatype found\n") \ 00172 } \ 00173 get_dims(dataset, dims, ndims, nelements); \ 00174 if (ndims!=2) \ 00175 SG_ERROR("Error not a 2-dimensional matrix\n") \ 00176 matrix=SG_MALLOC(sg_type, nelements); \ 00177 num_feat=dims[0]; \ 00178 num_vec=dims[1]; \ 00179 herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \ 00180 H5S_ALL, H5P_DEFAULT, matrix); \ 00181 H5Dclose(dataset); \ 00182 H5Tclose(dtype); \ 00183 SG_FREE(dims); \ 00184 if (status<0) \ 00185 { \ 00186 SG_FREE(matrix); \ 00187 SG_ERROR("Error reading dataset\n") \ 00188 } \ 00189 } 00190 00191 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL)) 00192 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR)) 00193 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8)) 00194 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32)) 00195 GET_MATRIX(get_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32)) 00196 GET_MATRIX(get_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64)) 00197 GET_MATRIX(get_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64)) 00198 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16)) 00199 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16)) 00200 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32)) 00201 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64)) 00202 GET_MATRIX(get_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX)) 00203 #undef GET_MATRIX 00204 00205 void CMLDataHDF5File::get_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims) 00206 { 00207 } 00208 00209 void CMLDataHDF5File::get_ndarray(char*& array, int32_t*& dims, int32_t& num_dims) 00210 { 00211 } 00212 00213 void CMLDataHDF5File::get_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims) 00214 { 00215 } 00216 00217 void CMLDataHDF5File::get_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims) 00218 { 00219 } 00220 00221 void CMLDataHDF5File::get_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims) 00222 { 00223 } 00224 00225 void CMLDataHDF5File::get_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims) 00226 { 00227 } 00228 00229 void CMLDataHDF5File::get_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims) 00230 { 00231 } 00232 00233 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \ 00234 void CMLDataHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00235 { \ 00236 if (!(file)) \ 00237 SG_ERROR("File invalid.\n") \ 00238 } 00239 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL) 00240 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR) 00241 GET_SPARSEMATRIX(get_sparse_matrix, int8_t, DT_SPARSE_INT8) 00242 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE) 00243 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT) 00244 GET_SPARSEMATRIX(get_sparse_matrix, uint32_t, DT_SPARSE_UINT) 00245 GET_SPARSEMATRIX(get_sparse_matrix, int64_t, DT_SPARSE_LONG) 00246 GET_SPARSEMATRIX(get_sparse_matrix, uint64_t, DT_SPARSE_ULONG) 00247 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT) 00248 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD) 00249 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL) 00250 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL) 00251 GET_SPARSEMATRIX(get_sparse_matrix, floatmax_t, DT_SPARSE_LONGREAL) 00252 #undef GET_SPARSEMATRIX 00253 00254 00255 #define GET_STRING_LIST(fname, sg_type, datatype) \ 00256 void CMLDataHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \ 00257 { \ 00258 } 00259 00260 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL) 00261 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR) 00262 GET_STRING_LIST(get_string_list, int8_t, DT_STRING_INT8) 00263 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE) 00264 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT) 00265 GET_STRING_LIST(get_string_list, uint32_t, DT_STRING_UINT) 00266 GET_STRING_LIST(get_string_list, int64_t, DT_STRING_LONG) 00267 GET_STRING_LIST(get_string_list, uint64_t, DT_STRING_ULONG) 00268 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT) 00269 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD) 00270 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL) 00271 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL) 00272 GET_STRING_LIST(get_string_list, floatmax_t, DT_STRING_LONGREAL) 00273 #undef GET_STRING_LIST 00274 00275 void CMLDataHDF5File::get_boolean_type() 00276 { 00277 boolean_type=H5T_NATIVE_UCHAR; 00278 switch (sizeof(bool)) 00279 { 00280 case 1: 00281 boolean_type = H5T_NATIVE_UCHAR; 00282 break; 00283 case 2: 00284 boolean_type = H5T_NATIVE_UINT16; 00285 break; 00286 case 4: 00287 boolean_type = H5T_NATIVE_UINT32; 00288 break; 00289 case 8: 00290 boolean_type = H5T_NATIVE_UINT64; 00291 break; 00292 default: 00293 SG_ERROR("Boolean type not supported on this platform\n") 00294 } 00295 } 00296 00297 hid_t CMLDataHDF5File::get_compatible_type(H5T_class_t t_class, 00298 const TSGDataType* datatype) 00299 { 00300 switch (t_class) 00301 { 00302 case H5T_FLOAT: 00303 case H5T_INTEGER: 00304 switch (datatype->m_ptype) 00305 { 00306 case PT_BOOL: return boolean_type; 00307 case PT_CHAR: return H5T_NATIVE_CHAR; 00308 case PT_INT8: return H5T_NATIVE_INT8; 00309 case PT_UINT8: return H5T_NATIVE_UINT8; 00310 case PT_INT16: return H5T_NATIVE_INT16; 00311 case PT_UINT16: return H5T_NATIVE_UINT16; 00312 case PT_INT32: return H5T_NATIVE_INT32; 00313 case PT_UINT32: return H5T_NATIVE_UINT32; 00314 case PT_INT64: return H5T_NATIVE_INT64; 00315 case PT_UINT64: return H5T_NATIVE_UINT64; 00316 case PT_FLOAT32: return H5T_NATIVE_FLOAT; 00317 case PT_FLOAT64: return H5T_NATIVE_DOUBLE; 00318 case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE; 00319 case PT_COMPLEX128: 00320 SG_ERROR("complex128_t not compatible with HDF5File!"); 00321 return -1; 00322 case PT_UNDEFINED: 00323 case PT_SGOBJECT: 00324 SG_ERROR("Implementation error during writing " 00325 "HDF5File!"); 00326 return -1; 00327 } 00328 case H5T_STRING: 00329 SG_ERROR("Strings not supported") 00330 return -1; 00331 case H5T_VLEN: 00332 SG_ERROR("Variable length containers currently not supported") 00333 return -1; 00334 case H5T_ARRAY: 00335 SG_ERROR("Array containers currently not supported") 00336 return -1; 00337 default: 00338 SG_ERROR("Datatype mismatchn") 00339 return -1; 00340 } 00341 } 00342 00343 void CMLDataHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements) 00344 { 00345 hid_t dataspace = H5Dget_space(dataset); 00346 if (dataspace<0) 00347 SG_ERROR("Error obtaining hdf5 dataspace\n") 00348 00349 ndims = H5Sget_simple_extent_ndims(dataspace); 00350 total_elements=H5Sget_simple_extent_npoints(dataspace); 00351 hsize_t* dims_out=SG_MALLOC(hsize_t, ndims); 00352 dims=SG_MALLOC(int32_t, ndims); 00353 H5Sget_simple_extent_dims(dataspace, dims_out, NULL); 00354 for (int32_t i=0; i<ndims; i++) 00355 dims[i]=dims_out[i]; 00356 SG_FREE(dims_out); 00357 H5Sclose(dataspace); 00358 } 00359 00360 void CMLDataHDF5File::create_group_hierarchy() 00361 { 00362 char* vname=get_strdup(variable_name); 00363 int32_t vlen=strlen(vname); 00364 for (int32_t i=0; i<vlen; i++) 00365 { 00366 if (i!=0 && vname[i]=='/') 00367 { 00368 vname[i]='\0'; 00369 hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT); 00370 if (g<0) 00371 { 00372 g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT, 00373 H5P_DEFAULT); 00374 if (g<0) 00375 SG_ERROR("Error creating group '%s'\n", vname) 00376 vname[i]='/'; 00377 } 00378 H5Gclose(g); 00379 } 00380 } 00381 SG_FREE(vname); 00382 } 00383 #endif // HAVE_CURL && HAVE_HDF5