SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evgeniy Andreev (gsomix) 00008 */ 00009 00010 #ifndef __CSVFILE_H__ 00011 #define __CSVFILE_H__ 00012 00013 #include <shogun/io/File.h> 00014 00015 #include <shogun/io/LineReader.h> 00016 #include <shogun/io/Parser.h> 00017 #include <shogun/lib/DelimiterTokenizer.h> 00018 #include <shogun/lib/v_array.h> 00019 00020 namespace shogun 00021 { 00022 00026 class CCSVFile : public CFile 00027 { 00028 public: 00030 CCSVFile(); 00031 00037 CCSVFile(FILE* f, const char* name=NULL); 00038 00045 CCSVFile(int fd, const char* mode, const char* name=NULL); 00046 00053 CCSVFile(const char* fname, char rw='r', const char* name=NULL); 00054 00056 virtual ~CCSVFile(); 00057 00062 void set_transpose(bool value); 00063 00068 void set_delimiter(char delimiter); 00069 00074 void set_lines_to_skip(int32_t num_lines); 00075 00081 int32_t get_stats(int32_t& num_tokens); 00082 00090 virtual void get_vector(int8_t*& vector, int32_t& len); 00091 virtual void get_vector(uint8_t*& vector, int32_t& len); 00092 virtual void get_vector(char*& vector, int32_t& len); 00093 virtual void get_vector(int32_t*& vector, int32_t& len); 00094 virtual void get_vector(uint32_t*& vector, int32_t& len); 00095 virtual void get_vector(float64_t*& vector, int32_t& len); 00096 virtual void get_vector(float32_t*& vector, int32_t& len); 00097 virtual void get_vector(floatmax_t*& vector, int32_t& len); 00098 virtual void get_vector(int16_t*& vector, int32_t& len); 00099 virtual void get_vector(uint16_t*& vector, int32_t& len); 00100 virtual void get_vector(int64_t*& vector, int32_t& len); 00101 virtual void get_vector(uint64_t*& vector, int32_t& len); 00103 00112 virtual void get_matrix( 00113 uint8_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00114 virtual void get_matrix( 00115 int8_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00116 virtual void get_matrix( 00117 char*& matrix, int32_t& num_feat, int32_t& num_vec); 00118 virtual void get_matrix( 00119 int32_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00120 virtual void get_matrix( 00121 uint32_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00122 virtual void get_matrix( 00123 int64_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00124 virtual void get_matrix( 00125 uint64_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00126 virtual void get_matrix( 00127 float32_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00128 virtual void get_matrix( 00129 float64_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00130 virtual void get_matrix( 00131 floatmax_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00132 virtual void get_matrix( 00133 int16_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00134 virtual void get_matrix( 00135 uint16_t*& matrix, int32_t& num_feat, int32_t& num_vec); 00137 00146 virtual void get_ndarray( 00147 uint8_t*& array, int32_t*& dims, int32_t& num_dims); 00148 virtual void get_ndarray( 00149 char*& array, int32_t*& dims, int32_t& num_dims); 00150 virtual void get_ndarray( 00151 int32_t*& array, int32_t*& dims, int32_t& num_dims); 00152 virtual void get_ndarray( 00153 float32_t*& array, int32_t*& dims, int32_t& num_dims); 00154 virtual void get_ndarray( 00155 float64_t*& array, int32_t*& dims, int32_t& num_dims); 00156 virtual void get_ndarray( 00157 int16_t*& array, int32_t*& dims, int32_t& num_dims); 00158 virtual void get_ndarray( 00159 uint16_t*& array, int32_t*& dims, int32_t& num_dims); 00161 00170 virtual void get_sparse_matrix( 00171 SGSparseVector<bool>*& matrix, int32_t& num_feat, int32_t& num_vec); 00172 virtual void get_sparse_matrix( 00173 SGSparseVector<uint8_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00174 virtual void get_sparse_matrix( 00175 SGSparseVector<int8_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00176 virtual void get_sparse_matrix( 00177 SGSparseVector<char>*& matrix, int32_t& num_feat, int32_t& num_vec); 00178 virtual void get_sparse_matrix( 00179 SGSparseVector<int32_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00180 virtual void get_sparse_matrix( 00181 SGSparseVector<uint32_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00182 virtual void get_sparse_matrix( 00183 SGSparseVector<int64_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00184 virtual void get_sparse_matrix( 00185 SGSparseVector<uint64_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00186 virtual void get_sparse_matrix( 00187 SGSparseVector<int16_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00188 virtual void get_sparse_matrix( 00189 SGSparseVector<uint16_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00190 virtual void get_sparse_matrix( 00191 SGSparseVector<float32_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00192 virtual void get_sparse_matrix( 00193 SGSparseVector<float64_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00194 virtual void get_sparse_matrix( 00195 SGSparseVector<floatmax_t>*& matrix, int32_t& num_feat, int32_t& num_vec); 00197 00206 virtual void get_string_list( 00207 SGString<uint8_t>*& strings, int32_t& num_str, 00208 int32_t& max_string_len); 00209 virtual void get_string_list( 00210 SGString<int8_t>*& strings, int32_t& num_str, 00211 int32_t& max_string_len); 00212 virtual void get_string_list( 00213 SGString<char>*& strings, int32_t& num_str, 00214 int32_t& max_string_len); 00215 virtual void get_string_list( 00216 SGString<int32_t>*& strings, int32_t& num_str, 00217 int32_t& max_string_len); 00218 virtual void get_string_list( 00219 SGString<uint32_t>*& strings, int32_t& num_str, 00220 int32_t& max_string_len); 00221 virtual void get_string_list( 00222 SGString<int16_t>*& strings, int32_t& num_str, 00223 int32_t& max_string_len); 00224 virtual void get_string_list( 00225 SGString<uint16_t>*& strings, int32_t& num_str, 00226 int32_t& max_string_len); 00227 virtual void get_string_list( 00228 SGString<int64_t>*& strings, int32_t& num_str, 00229 int32_t& max_string_len); 00230 virtual void get_string_list( 00231 SGString<uint64_t>*& strings, int32_t& num_str, 00232 int32_t& max_string_len); 00233 virtual void get_string_list( 00234 SGString<float32_t>*& strings, int32_t& num_str, 00235 int32_t& max_string_len); 00236 virtual void get_string_list( 00237 SGString<float64_t>*& strings, int32_t& num_str, 00238 int32_t& max_string_len); 00239 virtual void get_string_list( 00240 SGString<floatmax_t>*& strings, int32_t& num_str, 00241 int32_t& max_string_len); 00243 00245 /*virtual void get_vector(void*& vector, int32_t& len, DataType& dtype);*/ 00246 00254 virtual void set_vector(const int8_t* vector, int32_t len); 00255 virtual void set_vector(const uint8_t* vector, int32_t len); 00256 virtual void set_vector(const char* vector, int32_t len); 00257 virtual void set_vector(const int32_t* vector, int32_t len); 00258 virtual void set_vector(const uint32_t* vector, int32_t len); 00259 virtual void set_vector(const float32_t* vector, int32_t len); 00260 virtual void set_vector(const float64_t* vector, int32_t len); 00261 virtual void set_vector(const floatmax_t* vector, int32_t len); 00262 virtual void set_vector(const int16_t* vector, int32_t len); 00263 virtual void set_vector(const uint16_t* vector, int32_t len); 00264 virtual void set_vector(const int64_t* vector, int32_t len); 00265 virtual void set_vector(const uint64_t* vector, int32_t len); 00267 00275 virtual void set_matrix( 00276 const uint8_t* matrix, int32_t num_feat, int32_t num_vec); 00277 virtual void set_matrix( 00278 const int8_t* matrix, int32_t num_feat, int32_t num_vec); 00279 virtual void set_matrix( 00280 const char* matrix, int32_t num_feat, int32_t num_vec); 00281 virtual void set_matrix( 00282 const int32_t* matrix, int32_t num_feat, int32_t num_vec); 00283 virtual void set_matrix( 00284 const uint32_t* matrix, int32_t num_feat, int32_t num_vec); 00285 virtual void set_matrix( 00286 const int64_t* matrix, int32_t num_feat, int32_t num_vec); 00287 virtual void set_matrix( 00288 const uint64_t* matrix, int32_t num_feat, int32_t num_vec); 00289 virtual void set_matrix( 00290 const float32_t* matrix, int32_t num_feat, int32_t num_vec); 00291 virtual void set_matrix( 00292 const float64_t* matrix, int32_t num_feat, int32_t num_vec); 00293 virtual void set_matrix( 00294 const floatmax_t* matrix, int32_t num_feat, int32_t num_vec); 00295 virtual void set_matrix( 00296 const int16_t* matrix, int32_t num_feat, int32_t num_vec); 00297 virtual void set_matrix( 00298 const uint16_t* matrix, int32_t num_feat, int32_t num_vec); 00300 00308 virtual void set_sparse_matrix( 00309 const SGSparseVector<bool>* matrix, int32_t num_feat, int32_t num_vec); 00310 virtual void set_sparse_matrix( 00311 const SGSparseVector<uint8_t>* matrix, int32_t num_feat, int32_t num_vec); 00312 virtual void set_sparse_matrix( 00313 const SGSparseVector<int8_t>* matrix, int32_t num_feat, int32_t num_vec); 00314 virtual void set_sparse_matrix( 00315 const SGSparseVector<char>* matrix, int32_t num_feat, int32_t num_vec); 00316 virtual void set_sparse_matrix( 00317 const SGSparseVector<int32_t>* matrix, int32_t num_feat, int32_t num_vec); 00318 virtual void set_sparse_matrix( 00319 const SGSparseVector<uint32_t>* matrix, int32_t num_feat, int32_t num_vec); 00320 virtual void set_sparse_matrix( 00321 const SGSparseVector<int64_t>* matrix, int32_t num_feat, int32_t num_vec); 00322 virtual void set_sparse_matrix( 00323 const SGSparseVector<uint64_t>* matrix, int32_t num_feat, int32_t num_vec); 00324 virtual void set_sparse_matrix( 00325 const SGSparseVector<int16_t>* matrix, int32_t num_feat, int32_t num_vec); 00326 virtual void set_sparse_matrix( 00327 const SGSparseVector<uint16_t>* matrix, int32_t num_feat, int32_t num_vec); 00328 virtual void set_sparse_matrix( 00329 const SGSparseVector<float32_t>* matrix, int32_t num_feat, int32_t num_vec); 00330 virtual void set_sparse_matrix( 00331 const SGSparseVector<float64_t>* matrix, int32_t num_feat, int32_t num_vec); 00332 virtual void set_sparse_matrix( 00333 const SGSparseVector<floatmax_t>* matrix, int32_t num_feat, int32_t num_vec); 00335 00344 virtual void set_string_list( 00345 const SGString<uint8_t>* strings, int32_t num_str); 00346 virtual void set_string_list( 00347 const SGString<int8_t>* strings, int32_t num_str); 00348 virtual void set_string_list( 00349 const SGString<char>* strings, int32_t num_str); 00350 virtual void set_string_list( 00351 const SGString<int32_t>* strings, int32_t num_str); 00352 virtual void set_string_list( 00353 const SGString<uint32_t>* strings, int32_t num_str); 00354 virtual void set_string_list( 00355 const SGString<int16_t>* strings, int32_t num_str); 00356 virtual void set_string_list( 00357 const SGString<uint16_t>* strings, int32_t num_str); 00358 virtual void set_string_list( 00359 const SGString<int64_t>* strings, int32_t num_str); 00360 virtual void set_string_list( 00361 const SGString<uint64_t>* strings, int32_t num_str); 00362 virtual void set_string_list( 00363 const SGString<float32_t>* strings, int32_t num_str); 00364 virtual void set_string_list( 00365 const SGString<float64_t>* strings, int32_t num_str); 00366 virtual void set_string_list( 00367 const SGString<floatmax_t>* strings, int32_t num_str); 00369 00378 static void tokenize(char delim, substring s, v_array<substring> &ret); 00379 00380 virtual const char* get_name() const { return "CSVFile"; } 00381 00382 private: 00384 void init(); 00385 00387 void init_with_defaults(); 00388 00390 void skip_lines(int32_t num_lines); 00391 00392 private: 00394 CLineReader* m_line_reader; 00395 00397 CParser* m_parser; 00398 00400 CDelimiterTokenizer* m_line_tokenizer; 00401 00403 CDelimiterTokenizer* m_tokenizer; 00404 00406 bool is_data_transposed; 00407 00409 char m_delimiter; 00410 00412 int32_t m_num_to_skip; 00413 }; 00414 00415 } 00416 00417 #endif