SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
CSVFile.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evgeniy Andreev (gsomix)
00008  */
00009 
00010 #include <shogun/io/CSVFile.h>
00011 
00012 #include <shogun/lib/SGVector.h>
00013 #include <shogun/lib/SGMatrix.h>
00014 
00015 using namespace shogun;
00016 
00017 CCSVFile::CCSVFile()
00018 {
00019     init();
00020 }
00021 
00022 CCSVFile::CCSVFile(FILE* f, const char* name) :
00023     CFile(f, name)
00024 {
00025     init();
00026     init_with_defaults();
00027 }
00028 
00029 CCSVFile::CCSVFile(int fd, const char* mode, const char* name) :
00030     CFile(fd, mode, name)
00031 {
00032     init();
00033     init_with_defaults();
00034 }
00035 
00036 CCSVFile::CCSVFile(const char* fname, char rw, const char* name) :
00037     CFile(fname, rw, name)
00038 {
00039     init();
00040     init_with_defaults();
00041 }
00042 
00043 CCSVFile::~CCSVFile()
00044 {
00045     SG_UNREF(m_tokenizer);
00046     SG_UNREF(m_line_tokenizer);
00047     SG_UNREF(m_parser);
00048     SG_UNREF(m_line_reader);
00049 }
00050 
00051 void CCSVFile::set_transpose(bool value)
00052 {
00053     is_data_transposed=value;
00054 }
00055 
00056 void CCSVFile::set_delimiter(char delimiter)
00057 {
00058     m_tokenizer->delimiters[m_delimiter]=0;
00059 
00060     m_delimiter=delimiter;
00061     m_tokenizer->delimiters[m_delimiter]=1;
00062 
00063     m_tokenizer->delimiters[' ']=1;
00064 }
00065 
00066 void CCSVFile::set_lines_to_skip(int32_t num_lines)
00067 {
00068     m_num_to_skip=num_lines;
00069 }
00070 
00071 int32_t CCSVFile::get_stats(int32_t& num_tokens)
00072 {
00073     int32_t num_lines=0;
00074     num_tokens=-1;
00075 
00076     while (m_line_reader->has_next())
00077     {
00078         if (num_tokens==-1)
00079         {
00080             SGVector<char> line=m_line_reader->read_line();
00081             m_tokenizer->set_text(line);
00082 
00083             num_tokens=0;
00084             while (m_tokenizer->has_next())
00085             {
00086                 index_t temp_start=0;
00087                 m_tokenizer->next_token_idx(temp_start);
00088                 num_tokens++;
00089             }
00090         }
00091         else
00092             m_line_reader->skip_line();
00093         num_lines++;
00094     }
00095     m_line_reader->reset();
00096 
00097     return num_lines;
00098 }
00099 
00100 void CCSVFile::init()
00101 {
00102     is_data_transposed=false;
00103     m_delimiter=0;
00104     m_num_to_skip=0;
00105 
00106     m_tokenizer=NULL;
00107     m_line_tokenizer=NULL;
00108     m_parser=NULL;
00109     m_line_reader=NULL;
00110 }
00111 
00112 void CCSVFile::init_with_defaults()
00113 {
00114     is_data_transposed=false;
00115     m_delimiter=',';
00116 
00117     m_tokenizer=new CDelimiterTokenizer(true);
00118     m_tokenizer->delimiters[m_delimiter]=1;
00119     m_tokenizer->delimiters[' ']=1;
00120     SG_REF(m_tokenizer);
00121 
00122     m_line_tokenizer=new CDelimiterTokenizer(true);
00123     m_line_tokenizer->delimiters['\n']=1;
00124     SG_REF(m_line_tokenizer);
00125 
00126     m_parser=new CParser();
00127     m_parser->set_tokenizer(m_tokenizer);
00128 
00129     m_line_reader=new CLineReader(file, m_line_tokenizer);
00130 }
00131 
00132 void CCSVFile::skip_lines(int32_t num_lines)
00133 {
00134     for (int32_t i=0; i<num_lines; i++)
00135         m_line_reader->skip_line();
00136 }
00137 
00138 #define GET_VECTOR(read_func, sg_type) \
00139 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \
00140 { \
00141     if (!m_line_reader->has_next()) \
00142         return; \
00143     \
00144     int32_t num_feat=0; \
00145     int32_t num_vec=0; \
00146     get_matrix(vector, num_feat, num_vec); \
00147     \
00148     if (num_feat==1) \
00149     { \
00150         len=num_vec; \
00151         return; \
00152     } \
00153     \
00154     if (num_vec==1) \
00155     { \
00156         len=num_feat; \
00157         return; \
00158     } \
00159     \
00160     len=0; \
00161 }
00162 
00163 GET_VECTOR(read_char, int8_t)
00164 GET_VECTOR(read_byte, uint8_t)
00165 GET_VECTOR(read_char, char)
00166 GET_VECTOR(read_int, int32_t)
00167 GET_VECTOR(read_uint, uint32_t)
00168 GET_VECTOR(read_short_real, float32_t)
00169 GET_VECTOR(read_real, float64_t)
00170 GET_VECTOR(read_long_real, floatmax_t)
00171 GET_VECTOR(read_short, int16_t)
00172 GET_VECTOR(read_word, uint16_t)
00173 GET_VECTOR(read_long, int64_t)
00174 GET_VECTOR(read_ulong, uint64_t)
00175 #undef GET_VECTOR
00176 
00177 #define GET_MATRIX(read_func, sg_type) \
00178 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
00179 { \
00180     int32_t num_lines=0; \
00181     int32_t num_tokens=-1; \
00182     int32_t current_line_idx=0; \
00183     SGVector<char> line; \
00184     \
00185     skip_lines(m_num_to_skip); \
00186     num_lines=get_stats(num_tokens); \
00187     \
00188     SG_SET_LOCALE_C; \
00189     \
00190     matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \
00191     skip_lines(m_num_to_skip); \
00192     while (m_line_reader->has_next()) \
00193     { \
00194         line=m_line_reader->read_line(); \
00195         m_parser->set_text(line); \
00196         \
00197         for (int32_t i=0; i<num_tokens; i++) \
00198         { \
00199             if (!m_parser->has_next()) \
00200                 return; \
00201             \
00202             if (!is_data_transposed) \
00203                 matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \
00204             else \
00205                 matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \
00206         } \
00207         current_line_idx++; \
00208     } \
00209     \
00210     SG_RESET_LOCALE; \
00211     \
00212     if (!is_data_transposed) \
00213     { \
00214         num_feat=num_tokens; \
00215         num_vec=num_lines; \
00216     } \
00217     else \
00218     { \
00219         num_feat=num_lines; \
00220         num_vec=num_tokens; \
00221     } \
00222 }
00223 
00224 GET_MATRIX(read_char, int8_t)
00225 GET_MATRIX(read_byte, uint8_t)
00226 GET_MATRIX(read_char, char)
00227 GET_MATRIX(read_int, int32_t)
00228 GET_MATRIX(read_uint, uint32_t)
00229 GET_MATRIX(read_short_real, float32_t)
00230 GET_MATRIX(read_real, float64_t)
00231 GET_MATRIX(read_long_real, floatmax_t)
00232 GET_MATRIX(read_short, int16_t)
00233 GET_MATRIX(read_word, uint16_t)
00234 GET_MATRIX(read_long, int64_t)
00235 GET_MATRIX(read_ulong, uint64_t)
00236 #undef GET_MATRIX
00237 
00238 #define GET_NDARRAY(read_func, sg_type) \
00239 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \
00240 { \
00241     SG_NOTIMPLEMENTED \
00242 }
00243 
00244 GET_NDARRAY(read_byte, uint8_t)
00245 GET_NDARRAY(read_char, char)
00246 GET_NDARRAY(read_int, int32_t)
00247 GET_NDARRAY(read_short_real, float32_t)
00248 GET_NDARRAY(read_real, float64_t)
00249 GET_NDARRAY(read_short, int16_t)
00250 GET_NDARRAY(read_word, uint16_t)
00251 #undef GET_NDARRAY
00252 
00253 #define GET_SPARSE_MATRIX(read_func, sg_type) \
00254 void CCSVFile::get_sparse_matrix( \
00255             SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00256 { \
00257     SG_NOTIMPLEMENTED \
00258 }
00259 
00260 GET_SPARSE_MATRIX(read_char, bool)
00261 GET_SPARSE_MATRIX(read_char, int8_t)
00262 GET_SPARSE_MATRIX(read_byte, uint8_t)
00263 GET_SPARSE_MATRIX(read_char, char)
00264 GET_SPARSE_MATRIX(read_int, int32_t)
00265 GET_SPARSE_MATRIX(read_uint, uint32_t)
00266 GET_SPARSE_MATRIX(read_short_real, float32_t)
00267 GET_SPARSE_MATRIX(read_real, float64_t)
00268 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
00269 GET_SPARSE_MATRIX(read_short, int16_t)
00270 GET_SPARSE_MATRIX(read_word, uint16_t)
00271 GET_SPARSE_MATRIX(read_long, int64_t)
00272 GET_SPARSE_MATRIX(read_ulong, uint64_t)
00273 #undef GET_SPARSE_MATRIX
00274 
00275 #define SET_VECTOR(format, sg_type) \
00276 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \
00277 { \
00278     SG_SET_LOCALE_C; \
00279     \
00280     if (!is_data_transposed) \
00281     { \
00282         for (int32_t i=0; i<len; i++) \
00283             fprintf(file, "%" format "\n", vector[i]); \
00284     } \
00285     else \
00286     { \
00287         int32_t i; \
00288         for (i=0; i<len-1; i++) \
00289             fprintf(file, "%" format "%c", vector[i], m_delimiter); \
00290         fprintf(file, "%" format "\n", vector[i]); \
00291     } \
00292     \
00293     SG_RESET_LOCALE; \
00294 }
00295 
00296 SET_VECTOR(SCNi8, int8_t)
00297 SET_VECTOR(SCNu8, uint8_t)
00298 SET_VECTOR(SCNu8, char)
00299 SET_VECTOR(SCNi32, int32_t)
00300 SET_VECTOR(SCNu32, uint32_t)
00301 SET_VECTOR(SCNi64, int64_t)
00302 SET_VECTOR(SCNu64, uint64_t)
00303 SET_VECTOR(".16g", float32_t)
00304 SET_VECTOR(".16g", float64_t)
00305 SET_VECTOR(".16Lg", floatmax_t)
00306 SET_VECTOR(SCNi16, int16_t)
00307 SET_VECTOR(SCNu16, uint16_t)
00308 #undef SET_VECTOR
00309 
00310 #define SET_MATRIX(format, sg_type) \
00311 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
00312 { \
00313     SG_SET_LOCALE_C; \
00314     \
00315     if (!is_data_transposed) \
00316     { \
00317         for (int32_t i=0; i<num_vec; i++) \
00318         { \
00319             int32_t j; \
00320             for (j=0; j<num_feat-1; j++) \
00321                 fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \
00322             fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \
00323         } \
00324     } \
00325     else \
00326     { \
00327         for (int32_t i=0; i<num_feat; i++) \
00328         { \
00329             int32_t j; \
00330             for (j=0; j<num_vec-1; j++) \
00331                 fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \
00332             fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \
00333         } \
00334     } \
00335     \
00336     SG_RESET_LOCALE; \
00337 }
00338 
00339 SET_MATRIX(SCNi8, int8_t)
00340 SET_MATRIX(SCNu8, uint8_t)
00341 SET_MATRIX(SCNu8, char)
00342 SET_MATRIX(SCNi32, int32_t)
00343 SET_MATRIX(SCNu32, uint32_t)
00344 SET_MATRIX(SCNi64, int64_t)
00345 SET_MATRIX(SCNu64, uint64_t)
00346 SET_MATRIX(".16g", float32_t)
00347 SET_MATRIX(".16g", float64_t)
00348 SET_MATRIX(".16Lg", floatmax_t)
00349 SET_MATRIX(SCNi16, int16_t)
00350 SET_MATRIX(SCNu16, uint16_t)
00351 #undef SET_MATRIX
00352 
00353 #define SET_SPARSE_MATRIX(format, sg_type) \
00354 void CCSVFile::set_sparse_matrix( \
00355             const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
00356 { \
00357     SG_NOTIMPLEMENTED \
00358 }
00359 
00360 SET_SPARSE_MATRIX(SCNi8, bool)
00361 SET_SPARSE_MATRIX(SCNi8, int8_t)
00362 SET_SPARSE_MATRIX(SCNu8, uint8_t)
00363 SET_SPARSE_MATRIX(SCNu8, char)
00364 SET_SPARSE_MATRIX(SCNi32, int32_t)
00365 SET_SPARSE_MATRIX(SCNu32, uint32_t)
00366 SET_SPARSE_MATRIX(SCNi64, int64_t)
00367 SET_SPARSE_MATRIX(SCNu64, uint64_t)
00368 SET_SPARSE_MATRIX(".16g", float32_t)
00369 SET_SPARSE_MATRIX(".16g", float64_t)
00370 SET_SPARSE_MATRIX(".16Lg", floatmax_t)
00371 SET_SPARSE_MATRIX(SCNi16, int16_t)
00372 SET_SPARSE_MATRIX(SCNu16, uint16_t)
00373 #undef SET_SPARSE_MATRIX
00374 
00375 void CCSVFile::get_string_list(
00376             SGString<char>*& strings, int32_t& num_str,
00377             int32_t& max_string_len)
00378 {
00379     SGVector<char> line;
00380     int32_t current_line_idx=0;
00381     int32_t num_tokens=0;
00382 
00383     max_string_len=0;
00384     num_str=get_stats(num_tokens);
00385     strings=SG_MALLOC(SGString<char>, num_str);
00386 
00387     skip_lines(m_num_to_skip);
00388     while (m_line_reader->has_next())
00389     {
00390         line=m_line_reader->read_line();
00391         strings[current_line_idx].slen=line.vlen;
00392         strings[current_line_idx].string=SG_MALLOC(char, line.vlen);
00393         for (int32_t i=0; i<line.vlen; i++)
00394             strings[current_line_idx].string[i]=line[i];
00395 
00396         if (line.vlen>max_string_len)
00397             max_string_len=line.vlen;
00398 
00399         current_line_idx++;
00400     }
00401 
00402     num_str=current_line_idx;
00403 }
00404 
00405 #define GET_STRING_LIST(sg_type) \
00406 void CCSVFile::get_string_list( \
00407             SGString<sg_type>*& strings, int32_t& num_str, \
00408             int32_t& max_string_len) \
00409 { \
00410     SG_NOTIMPLEMENTED \
00411 }
00412 
00413 GET_STRING_LIST(int8_t)
00414 GET_STRING_LIST(uint8_t)
00415 GET_STRING_LIST(int32_t)
00416 GET_STRING_LIST(uint32_t)
00417 GET_STRING_LIST(int64_t)
00418 GET_STRING_LIST(uint64_t)
00419 GET_STRING_LIST(float32_t)
00420 GET_STRING_LIST(float64_t)
00421 GET_STRING_LIST(floatmax_t)
00422 GET_STRING_LIST(int16_t)
00423 GET_STRING_LIST(uint16_t)
00424 #undef GET_STRING_LIST
00425 
00426 void CCSVFile::set_string_list(
00427             const SGString<char>* strings, int32_t num_str)
00428 {
00429     for (int32_t i=0; i<num_str; i++)
00430     {
00431         for (int32_t j=0; j<strings[i].slen; j++)
00432             fprintf(file, "%c", strings[i].string[j]);
00433         fprintf(file, "\n");
00434     }
00435 }
00436 
00437 #define SET_STRING_LIST(sg_type) \
00438 void CCSVFile::set_string_list( \
00439             const SGString<sg_type>* strings, int32_t num_str) \
00440 { \
00441     SG_NOTIMPLEMENTED \
00442 }
00443 
00444 SET_STRING_LIST(int8_t)
00445 SET_STRING_LIST(uint8_t)
00446 SET_STRING_LIST(int32_t)
00447 SET_STRING_LIST(uint32_t)
00448 SET_STRING_LIST(int64_t)
00449 SET_STRING_LIST(uint64_t)
00450 SET_STRING_LIST(float32_t)
00451 SET_STRING_LIST(float64_t)
00452 SET_STRING_LIST(floatmax_t)
00453 SET_STRING_LIST(int16_t)
00454 SET_STRING_LIST(uint16_t)
00455 #undef SET_STRING_LIST
00456 
00457 void CCSVFile::tokenize(char delim, substring s, v_array<substring>& ret)
00458 {
00459     ret.erase();
00460     char *last = s.start;
00461     for (; s.start != s.end; s.start++)
00462     {
00463         if (*s.start == delim)
00464         {
00465             if (s.start != last)
00466             {
00467                 substring temp = {last,s.start};
00468                 ret.push(temp);
00469             }
00470             last = s.start+1;
00471         }
00472     }
00473     if (s.start != last)
00474     {
00475         substring final = {last, s.start};
00476         ret.push(final);
00477     }
00478 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation