SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
LibSVMFile.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evgeniy Andreev (gsomix)
00008  */
00009 
00010 #include <shogun/io/LibSVMFile.h>
00011 
00012 #include <shogun/lib/SGVector.h>
00013 #include <shogun/lib/SGSparseVector.h>
00014 #include <shogun/base/DynArray.h>
00015 
00016 using namespace shogun;
00017 
00018 CLibSVMFile::CLibSVMFile()
00019 {
00020     init();
00021 }
00022 
00023 CLibSVMFile::CLibSVMFile(FILE* f, const char* name) :
00024     CFile(f, name)
00025 {
00026     init();
00027     init_with_defaults();
00028 }
00029 
00030 CLibSVMFile::CLibSVMFile(const char* fname, char rw, const char* name) :
00031     CFile(fname, rw, name)
00032 {
00033     init();
00034     init_with_defaults();
00035 }
00036 
00037 CLibSVMFile::~CLibSVMFile()
00038 {
00039     SG_UNREF(m_whitespace_tokenizer);
00040     SG_UNREF(m_delimiter_tokenizer);
00041     SG_UNREF(m_line_tokenizer);
00042     SG_UNREF(m_parser);
00043     SG_UNREF(m_line_reader);
00044 }
00045 
00046 void CLibSVMFile::init()
00047 {
00048     m_delimiter=0;
00049 
00050     m_whitespace_tokenizer=NULL;
00051     m_delimiter_tokenizer=NULL;
00052     m_line_tokenizer=NULL;
00053     m_parser=NULL;
00054     m_line_reader=NULL;
00055 }
00056 
00057 void CLibSVMFile::init_with_defaults()
00058 {
00059     m_delimiter=':';
00060 
00061     m_whitespace_tokenizer=new CDelimiterTokenizer(true);
00062     m_whitespace_tokenizer->delimiters[' ']=1;
00063     SG_REF(m_whitespace_tokenizer);
00064 
00065     m_delimiter_tokenizer=new CDelimiterTokenizer(true);
00066     m_delimiter_tokenizer->delimiters[m_delimiter]=1;
00067     SG_REF(m_delimiter_tokenizer);
00068 
00069     m_line_tokenizer=new CDelimiterTokenizer(true);
00070     m_line_tokenizer->delimiters['\n']=1;
00071     SG_REF(m_line_tokenizer);
00072 
00073     m_parser=new CParser();
00074     m_line_reader=new CLineReader(file, m_line_tokenizer);
00075 }
00076 
00077 #define GET_SPARSE_MATRIX(read_func, sg_type) \
00078 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00079 { \
00080     float64_t* labels=NULL; \
00081     get_sparse_matrix(matrix, num_feat, num_vec, labels, false); \
00082 }
00083 
00084 GET_SPARSE_MATRIX(read_bool, bool)
00085 GET_SPARSE_MATRIX(read_char, int8_t)
00086 GET_SPARSE_MATRIX(read_byte, uint8_t)
00087 GET_SPARSE_MATRIX(read_char, char)
00088 GET_SPARSE_MATRIX(read_int, int32_t)
00089 GET_SPARSE_MATRIX(read_uint, uint32_t)
00090 GET_SPARSE_MATRIX(read_short_real, float32_t)
00091 GET_SPARSE_MATRIX(read_real, float64_t)
00092 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
00093 GET_SPARSE_MATRIX(read_short, int16_t)
00094 GET_SPARSE_MATRIX(read_word, uint16_t)
00095 GET_SPARSE_MATRIX(read_long, int64_t)
00096 GET_SPARSE_MATRIX(read_ulong, uint64_t)
00097 #undef GET_SPARSE_MATRIX
00098 
00099 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \
00100 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec, \
00101                     float64_t*& labels, bool load_labels) \
00102 { \
00103     num_feat=0; \
00104     \
00105     SG_INFO("counting line numbers in file %s\n", filename) \
00106     num_vec=get_num_lines(); \
00107     \
00108     int32_t current_line_ind=0; \
00109     SGVector<char> line; \
00110     \
00111     int32_t num_entries=0; \
00112     DynArray<SGVector<char> > entries; \
00113     \
00114     matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
00115     if (load_labels) \
00116         labels=SG_MALLOC(float64_t, num_vec); \
00117     \
00118     SG_SET_LOCALE_C; \
00119     \
00120     while (m_line_reader->has_next()) \
00121     { \
00122         num_entries=0; \
00123         entries.reset(SGVector<char>(false)); \
00124         line=m_line_reader->read_line(); \
00125         \
00126         m_parser->set_tokenizer(m_whitespace_tokenizer); \
00127         m_parser->set_text(line); \
00128         \
00129         if (load_labels && m_parser->has_next()) \
00130             labels[current_line_ind]=m_parser->read_real(); \
00131         \
00132         while (m_parser->has_next()) \
00133         { \
00134             entries.push_back(m_parser->read_string()); \
00135             num_entries++; \
00136         } \
00137         \
00138         matrix[current_line_ind]=SGSparseVector<sg_type>(num_entries); \
00139         for (int32_t i=0; i<num_entries; i++) \
00140         { \
00141             m_parser->set_tokenizer(m_delimiter_tokenizer); \
00142             m_parser->set_text(entries[i]); \
00143             \
00144             int32_t feat_index=0; \
00145             if (m_parser->has_next()) \
00146                 feat_index=m_parser->read_int(); \
00147             \
00148             sg_type entry=0; \
00149             if (m_parser->has_next()) \
00150                 entry=m_parser->read_func(); \
00151             \
00152             if (feat_index>num_feat) \
00153                 num_feat=feat_index; \
00154             \
00155             matrix[current_line_ind].features[i].feat_index=feat_index-1; \
00156             matrix[current_line_ind].features[i].entry=entry; \
00157         } \
00158         \
00159         current_line_ind++; \
00160         SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \
00161     } \
00162     \
00163     SG_RESET_LOCALE; \
00164     \
00165     SG_INFO("file successfully read\n") \
00166 }
00167 
00168 GET_LABELED_SPARSE_MATRIX(read_bool, bool)
00169 GET_LABELED_SPARSE_MATRIX(read_char, int8_t)
00170 GET_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
00171 GET_LABELED_SPARSE_MATRIX(read_char, char)
00172 GET_LABELED_SPARSE_MATRIX(read_int, int32_t)
00173 GET_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
00174 GET_LABELED_SPARSE_MATRIX(read_short_real, float32_t)
00175 GET_LABELED_SPARSE_MATRIX(read_real, float64_t)
00176 GET_LABELED_SPARSE_MATRIX(read_long_real, floatmax_t)
00177 GET_LABELED_SPARSE_MATRIX(read_short, int16_t)
00178 GET_LABELED_SPARSE_MATRIX(read_word, uint16_t)
00179 GET_LABELED_SPARSE_MATRIX(read_long, int64_t)
00180 GET_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
00181 #undef GET_LABELED_SPARSE_MATRIX
00182 
00183 #define SET_SPARSE_MATRIX(format, sg_type) \
00184 void CLibSVMFile::set_sparse_matrix( \
00185             const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
00186 { \
00187     set_sparse_matrix(matrix, num_feat, num_vec, NULL); \
00188 }
00189 
00190 SET_SPARSE_MATRIX(SCNi32, bool)
00191 SET_SPARSE_MATRIX(SCNi8, int8_t)
00192 SET_SPARSE_MATRIX(SCNu8, uint8_t)
00193 SET_SPARSE_MATRIX(SCNu8, char)
00194 SET_SPARSE_MATRIX(SCNi32, int32_t)
00195 SET_SPARSE_MATRIX(SCNu32, uint32_t)
00196 SET_SPARSE_MATRIX(SCNi64, int64_t)
00197 SET_SPARSE_MATRIX(SCNu64, uint64_t)
00198 SET_SPARSE_MATRIX(".16g", float32_t)
00199 SET_SPARSE_MATRIX(".16lg", float64_t)
00200 SET_SPARSE_MATRIX(".16Lg", floatmax_t)
00201 SET_SPARSE_MATRIX(SCNi16, int16_t)
00202 SET_SPARSE_MATRIX(SCNu16, uint16_t)
00203 #undef SET_SPARSE_MATRIX
00204 
00205 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \
00206 void CLibSVMFile::set_sparse_matrix( \
00207             const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
00208             const float64_t* labels) \
00209 { \
00210     SG_SET_LOCALE_C; \
00211     \
00212     for (int32_t i=0; i<num_vec; i++) \
00213     { \
00214         if (labels!=NULL) \
00215             fprintf(file, "%lg ", labels[i]); \
00216         \
00217         for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \
00218         { \
00219             fprintf(file, "%d%c%" format " ", \
00220                 matrix[i].features[j].feat_index+1, \
00221                 m_delimiter, \
00222                 matrix[i].features[j].entry); \
00223         } \
00224         fprintf(file, "\n"); \
00225     } \
00226     \
00227     SG_RESET_LOCALE; \
00228 }
00229 
00230 SET_LABELED_SPARSE_MATRIX(SCNi32, bool)
00231 SET_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
00232 SET_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
00233 SET_LABELED_SPARSE_MATRIX(SCNu8, char)
00234 SET_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
00235 SET_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
00236 SET_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
00237 SET_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
00238 SET_LABELED_SPARSE_MATRIX(".16g", float32_t)
00239 SET_LABELED_SPARSE_MATRIX(".16lg", float64_t)
00240 SET_LABELED_SPARSE_MATRIX(".16Lg", floatmax_t)
00241 SET_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
00242 SET_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
00243 #undef SET_LABELED_SPARSE_MATRIX
00244 
00245 int32_t CLibSVMFile::get_num_lines()
00246 {
00247     int32_t num_lines=0;
00248     while (m_line_reader->has_next())
00249     {
00250         m_line_reader->skip_line();
00251         num_lines++;
00252     }
00253     m_line_reader->reset();
00254 
00255     return num_lines;
00256 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation