SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evgeniy Andreev (gsomix) 00008 */ 00009 00010 #include <shogun/io/CSVFile.h> 00011 00012 #include <shogun/lib/SGVector.h> 00013 #include <shogun/lib/SGMatrix.h> 00014 00015 using namespace shogun; 00016 00017 CCSVFile::CCSVFile() 00018 { 00019 init(); 00020 } 00021 00022 CCSVFile::CCSVFile(FILE* f, const char* name) : 00023 CFile(f, name) 00024 { 00025 init(); 00026 init_with_defaults(); 00027 } 00028 00029 CCSVFile::CCSVFile(int fd, const char* mode, const char* name) : 00030 CFile(fd, mode, name) 00031 { 00032 init(); 00033 init_with_defaults(); 00034 } 00035 00036 CCSVFile::CCSVFile(const char* fname, char rw, const char* name) : 00037 CFile(fname, rw, name) 00038 { 00039 init(); 00040 init_with_defaults(); 00041 } 00042 00043 CCSVFile::~CCSVFile() 00044 { 00045 SG_UNREF(m_tokenizer); 00046 SG_UNREF(m_line_tokenizer); 00047 SG_UNREF(m_parser); 00048 SG_UNREF(m_line_reader); 00049 } 00050 00051 void CCSVFile::set_transpose(bool value) 00052 { 00053 is_data_transposed=value; 00054 } 00055 00056 void CCSVFile::set_delimiter(char delimiter) 00057 { 00058 m_tokenizer->delimiters[m_delimiter]=0; 00059 00060 m_delimiter=delimiter; 00061 m_tokenizer->delimiters[m_delimiter]=1; 00062 00063 m_tokenizer->delimiters[' ']=1; 00064 } 00065 00066 void CCSVFile::set_lines_to_skip(int32_t num_lines) 00067 { 00068 m_num_to_skip=num_lines; 00069 } 00070 00071 int32_t CCSVFile::get_stats(int32_t& num_tokens) 00072 { 00073 int32_t num_lines=0; 00074 num_tokens=-1; 00075 00076 while (m_line_reader->has_next()) 00077 { 00078 if (num_tokens==-1) 00079 { 00080 SGVector<char> line=m_line_reader->read_line(); 00081 m_tokenizer->set_text(line); 00082 00083 num_tokens=0; 00084 while (m_tokenizer->has_next()) 00085 { 00086 index_t temp_start=0; 00087 m_tokenizer->next_token_idx(temp_start); 00088 num_tokens++; 00089 } 00090 } 00091 else 00092 m_line_reader->skip_line(); 00093 num_lines++; 00094 } 00095 m_line_reader->reset(); 00096 00097 return num_lines; 00098 } 00099 00100 void CCSVFile::init() 00101 { 00102 is_data_transposed=false; 00103 m_delimiter=0; 00104 m_num_to_skip=0; 00105 00106 m_tokenizer=NULL; 00107 m_line_tokenizer=NULL; 00108 m_parser=NULL; 00109 m_line_reader=NULL; 00110 } 00111 00112 void CCSVFile::init_with_defaults() 00113 { 00114 is_data_transposed=false; 00115 m_delimiter=','; 00116 00117 m_tokenizer=new CDelimiterTokenizer(true); 00118 m_tokenizer->delimiters[m_delimiter]=1; 00119 m_tokenizer->delimiters[' ']=1; 00120 SG_REF(m_tokenizer); 00121 00122 m_line_tokenizer=new CDelimiterTokenizer(true); 00123 m_line_tokenizer->delimiters['\n']=1; 00124 SG_REF(m_line_tokenizer); 00125 00126 m_parser=new CParser(); 00127 m_parser->set_tokenizer(m_tokenizer); 00128 00129 m_line_reader=new CLineReader(file, m_line_tokenizer); 00130 } 00131 00132 void CCSVFile::skip_lines(int32_t num_lines) 00133 { 00134 for (int32_t i=0; i<num_lines; i++) 00135 m_line_reader->skip_line(); 00136 } 00137 00138 #define GET_VECTOR(read_func, sg_type) \ 00139 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \ 00140 { \ 00141 if (!m_line_reader->has_next()) \ 00142 return; \ 00143 \ 00144 int32_t num_feat=0; \ 00145 int32_t num_vec=0; \ 00146 get_matrix(vector, num_feat, num_vec); \ 00147 \ 00148 if (num_feat==1) \ 00149 { \ 00150 len=num_vec; \ 00151 return; \ 00152 } \ 00153 \ 00154 if (num_vec==1) \ 00155 { \ 00156 len=num_feat; \ 00157 return; \ 00158 } \ 00159 \ 00160 len=0; \ 00161 } 00162 00163 GET_VECTOR(read_char, int8_t) 00164 GET_VECTOR(read_byte, uint8_t) 00165 GET_VECTOR(read_char, char) 00166 GET_VECTOR(read_int, int32_t) 00167 GET_VECTOR(read_uint, uint32_t) 00168 GET_VECTOR(read_short_real, float32_t) 00169 GET_VECTOR(read_real, float64_t) 00170 GET_VECTOR(read_long_real, floatmax_t) 00171 GET_VECTOR(read_short, int16_t) 00172 GET_VECTOR(read_word, uint16_t) 00173 GET_VECTOR(read_long, int64_t) 00174 GET_VECTOR(read_ulong, uint64_t) 00175 #undef GET_VECTOR 00176 00177 #define GET_MATRIX(read_func, sg_type) \ 00178 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00179 { \ 00180 int32_t num_lines=0; \ 00181 int32_t num_tokens=-1; \ 00182 int32_t current_line_idx=0; \ 00183 SGVector<char> line; \ 00184 \ 00185 skip_lines(m_num_to_skip); \ 00186 num_lines=get_stats(num_tokens); \ 00187 \ 00188 SG_SET_LOCALE_C; \ 00189 \ 00190 matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \ 00191 skip_lines(m_num_to_skip); \ 00192 while (m_line_reader->has_next()) \ 00193 { \ 00194 line=m_line_reader->read_line(); \ 00195 m_parser->set_text(line); \ 00196 \ 00197 for (int32_t i=0; i<num_tokens; i++) \ 00198 { \ 00199 if (!m_parser->has_next()) \ 00200 return; \ 00201 \ 00202 if (!is_data_transposed) \ 00203 matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \ 00204 else \ 00205 matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \ 00206 } \ 00207 current_line_idx++; \ 00208 } \ 00209 \ 00210 SG_RESET_LOCALE; \ 00211 \ 00212 if (!is_data_transposed) \ 00213 { \ 00214 num_feat=num_tokens; \ 00215 num_vec=num_lines; \ 00216 } \ 00217 else \ 00218 { \ 00219 num_feat=num_lines; \ 00220 num_vec=num_tokens; \ 00221 } \ 00222 } 00223 00224 GET_MATRIX(read_char, int8_t) 00225 GET_MATRIX(read_byte, uint8_t) 00226 GET_MATRIX(read_char, char) 00227 GET_MATRIX(read_int, int32_t) 00228 GET_MATRIX(read_uint, uint32_t) 00229 GET_MATRIX(read_short_real, float32_t) 00230 GET_MATRIX(read_real, float64_t) 00231 GET_MATRIX(read_long_real, floatmax_t) 00232 GET_MATRIX(read_short, int16_t) 00233 GET_MATRIX(read_word, uint16_t) 00234 GET_MATRIX(read_long, int64_t) 00235 GET_MATRIX(read_ulong, uint64_t) 00236 #undef GET_MATRIX 00237 00238 #define GET_NDARRAY(read_func, sg_type) \ 00239 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \ 00240 { \ 00241 SG_NOTIMPLEMENTED \ 00242 } 00243 00244 GET_NDARRAY(read_byte, uint8_t) 00245 GET_NDARRAY(read_char, char) 00246 GET_NDARRAY(read_int, int32_t) 00247 GET_NDARRAY(read_short_real, float32_t) 00248 GET_NDARRAY(read_real, float64_t) 00249 GET_NDARRAY(read_short, int16_t) 00250 GET_NDARRAY(read_word, uint16_t) 00251 #undef GET_NDARRAY 00252 00253 #define GET_SPARSE_MATRIX(read_func, sg_type) \ 00254 void CCSVFile::get_sparse_matrix( \ 00255 SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00256 { \ 00257 SG_NOTIMPLEMENTED \ 00258 } 00259 00260 GET_SPARSE_MATRIX(read_char, bool) 00261 GET_SPARSE_MATRIX(read_char, int8_t) 00262 GET_SPARSE_MATRIX(read_byte, uint8_t) 00263 GET_SPARSE_MATRIX(read_char, char) 00264 GET_SPARSE_MATRIX(read_int, int32_t) 00265 GET_SPARSE_MATRIX(read_uint, uint32_t) 00266 GET_SPARSE_MATRIX(read_short_real, float32_t) 00267 GET_SPARSE_MATRIX(read_real, float64_t) 00268 GET_SPARSE_MATRIX(read_long_real, floatmax_t) 00269 GET_SPARSE_MATRIX(read_short, int16_t) 00270 GET_SPARSE_MATRIX(read_word, uint16_t) 00271 GET_SPARSE_MATRIX(read_long, int64_t) 00272 GET_SPARSE_MATRIX(read_ulong, uint64_t) 00273 #undef GET_SPARSE_MATRIX 00274 00275 #define SET_VECTOR(format, sg_type) \ 00276 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \ 00277 { \ 00278 SG_SET_LOCALE_C; \ 00279 \ 00280 if (!is_data_transposed) \ 00281 { \ 00282 for (int32_t i=0; i<len; i++) \ 00283 fprintf(file, "%" format "\n", vector[i]); \ 00284 } \ 00285 else \ 00286 { \ 00287 int32_t i; \ 00288 for (i=0; i<len-1; i++) \ 00289 fprintf(file, "%" format "%c", vector[i], m_delimiter); \ 00290 fprintf(file, "%" format "\n", vector[i]); \ 00291 } \ 00292 \ 00293 SG_RESET_LOCALE; \ 00294 } 00295 00296 SET_VECTOR(SCNi8, int8_t) 00297 SET_VECTOR(SCNu8, uint8_t) 00298 SET_VECTOR(SCNu8, char) 00299 SET_VECTOR(SCNi32, int32_t) 00300 SET_VECTOR(SCNu32, uint32_t) 00301 SET_VECTOR(SCNi64, int64_t) 00302 SET_VECTOR(SCNu64, uint64_t) 00303 SET_VECTOR(".16g", float32_t) 00304 SET_VECTOR(".16g", float64_t) 00305 SET_VECTOR(".16Lg", floatmax_t) 00306 SET_VECTOR(SCNi16, int16_t) 00307 SET_VECTOR(SCNu16, uint16_t) 00308 #undef SET_VECTOR 00309 00310 #define SET_MATRIX(format, sg_type) \ 00311 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \ 00312 { \ 00313 SG_SET_LOCALE_C; \ 00314 \ 00315 if (!is_data_transposed) \ 00316 { \ 00317 for (int32_t i=0; i<num_vec; i++) \ 00318 { \ 00319 int32_t j; \ 00320 for (j=0; j<num_feat-1; j++) \ 00321 fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \ 00322 fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \ 00323 } \ 00324 } \ 00325 else \ 00326 { \ 00327 for (int32_t i=0; i<num_feat; i++) \ 00328 { \ 00329 int32_t j; \ 00330 for (j=0; j<num_vec-1; j++) \ 00331 fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \ 00332 fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \ 00333 } \ 00334 } \ 00335 \ 00336 SG_RESET_LOCALE; \ 00337 } 00338 00339 SET_MATRIX(SCNi8, int8_t) 00340 SET_MATRIX(SCNu8, uint8_t) 00341 SET_MATRIX(SCNu8, char) 00342 SET_MATRIX(SCNi32, int32_t) 00343 SET_MATRIX(SCNu32, uint32_t) 00344 SET_MATRIX(SCNi64, int64_t) 00345 SET_MATRIX(SCNu64, uint64_t) 00346 SET_MATRIX(".16g", float32_t) 00347 SET_MATRIX(".16g", float64_t) 00348 SET_MATRIX(".16Lg", floatmax_t) 00349 SET_MATRIX(SCNi16, int16_t) 00350 SET_MATRIX(SCNu16, uint16_t) 00351 #undef SET_MATRIX 00352 00353 #define SET_SPARSE_MATRIX(format, sg_type) \ 00354 void CCSVFile::set_sparse_matrix( \ 00355 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \ 00356 { \ 00357 SG_NOTIMPLEMENTED \ 00358 } 00359 00360 SET_SPARSE_MATRIX(SCNi8, bool) 00361 SET_SPARSE_MATRIX(SCNi8, int8_t) 00362 SET_SPARSE_MATRIX(SCNu8, uint8_t) 00363 SET_SPARSE_MATRIX(SCNu8, char) 00364 SET_SPARSE_MATRIX(SCNi32, int32_t) 00365 SET_SPARSE_MATRIX(SCNu32, uint32_t) 00366 SET_SPARSE_MATRIX(SCNi64, int64_t) 00367 SET_SPARSE_MATRIX(SCNu64, uint64_t) 00368 SET_SPARSE_MATRIX(".16g", float32_t) 00369 SET_SPARSE_MATRIX(".16g", float64_t) 00370 SET_SPARSE_MATRIX(".16Lg", floatmax_t) 00371 SET_SPARSE_MATRIX(SCNi16, int16_t) 00372 SET_SPARSE_MATRIX(SCNu16, uint16_t) 00373 #undef SET_SPARSE_MATRIX 00374 00375 void CCSVFile::get_string_list( 00376 SGString<char>*& strings, int32_t& num_str, 00377 int32_t& max_string_len) 00378 { 00379 SGVector<char> line; 00380 int32_t current_line_idx=0; 00381 int32_t num_tokens=0; 00382 00383 max_string_len=0; 00384 num_str=get_stats(num_tokens); 00385 strings=SG_MALLOC(SGString<char>, num_str); 00386 00387 skip_lines(m_num_to_skip); 00388 while (m_line_reader->has_next()) 00389 { 00390 line=m_line_reader->read_line(); 00391 strings[current_line_idx].slen=line.vlen; 00392 strings[current_line_idx].string=SG_MALLOC(char, line.vlen); 00393 for (int32_t i=0; i<line.vlen; i++) 00394 strings[current_line_idx].string[i]=line[i]; 00395 00396 if (line.vlen>max_string_len) 00397 max_string_len=line.vlen; 00398 00399 current_line_idx++; 00400 } 00401 00402 num_str=current_line_idx; 00403 } 00404 00405 #define GET_STRING_LIST(sg_type) \ 00406 void CCSVFile::get_string_list( \ 00407 SGString<sg_type>*& strings, int32_t& num_str, \ 00408 int32_t& max_string_len) \ 00409 { \ 00410 SG_NOTIMPLEMENTED \ 00411 } 00412 00413 GET_STRING_LIST(int8_t) 00414 GET_STRING_LIST(uint8_t) 00415 GET_STRING_LIST(int32_t) 00416 GET_STRING_LIST(uint32_t) 00417 GET_STRING_LIST(int64_t) 00418 GET_STRING_LIST(uint64_t) 00419 GET_STRING_LIST(float32_t) 00420 GET_STRING_LIST(float64_t) 00421 GET_STRING_LIST(floatmax_t) 00422 GET_STRING_LIST(int16_t) 00423 GET_STRING_LIST(uint16_t) 00424 #undef GET_STRING_LIST 00425 00426 void CCSVFile::set_string_list( 00427 const SGString<char>* strings, int32_t num_str) 00428 { 00429 for (int32_t i=0; i<num_str; i++) 00430 { 00431 for (int32_t j=0; j<strings[i].slen; j++) 00432 fprintf(file, "%c", strings[i].string[j]); 00433 fprintf(file, "\n"); 00434 } 00435 } 00436 00437 #define SET_STRING_LIST(sg_type) \ 00438 void CCSVFile::set_string_list( \ 00439 const SGString<sg_type>* strings, int32_t num_str) \ 00440 { \ 00441 SG_NOTIMPLEMENTED \ 00442 } 00443 00444 SET_STRING_LIST(int8_t) 00445 SET_STRING_LIST(uint8_t) 00446 SET_STRING_LIST(int32_t) 00447 SET_STRING_LIST(uint32_t) 00448 SET_STRING_LIST(int64_t) 00449 SET_STRING_LIST(uint64_t) 00450 SET_STRING_LIST(float32_t) 00451 SET_STRING_LIST(float64_t) 00452 SET_STRING_LIST(floatmax_t) 00453 SET_STRING_LIST(int16_t) 00454 SET_STRING_LIST(uint16_t) 00455 #undef SET_STRING_LIST 00456 00457 void CCSVFile::tokenize(char delim, substring s, v_array<substring>& ret) 00458 { 00459 ret.erase(); 00460 char *last = s.start; 00461 for (; s.start != s.end; s.start++) 00462 { 00463 if (*s.start == delim) 00464 { 00465 if (s.start != last) 00466 { 00467 substring temp = {last,s.start}; 00468 ret.push(temp); 00469 } 00470 last = s.start+1; 00471 } 00472 } 00473 if (s.start != last) 00474 { 00475 substring final = {last, s.start}; 00476 ret.push(final); 00477 } 00478 }