SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evgeniy Andreev (gsomix) 00008 */ 00009 00010 #include <shogun/io/LibSVMFile.h> 00011 00012 #include <shogun/lib/SGVector.h> 00013 #include <shogun/lib/SGSparseVector.h> 00014 #include <shogun/base/DynArray.h> 00015 00016 using namespace shogun; 00017 00018 CLibSVMFile::CLibSVMFile() 00019 { 00020 init(); 00021 } 00022 00023 CLibSVMFile::CLibSVMFile(FILE* f, const char* name) : 00024 CFile(f, name) 00025 { 00026 init(); 00027 init_with_defaults(); 00028 } 00029 00030 CLibSVMFile::CLibSVMFile(const char* fname, char rw, const char* name) : 00031 CFile(fname, rw, name) 00032 { 00033 init(); 00034 init_with_defaults(); 00035 } 00036 00037 CLibSVMFile::~CLibSVMFile() 00038 { 00039 SG_UNREF(m_whitespace_tokenizer); 00040 SG_UNREF(m_delimiter_tokenizer); 00041 SG_UNREF(m_line_tokenizer); 00042 SG_UNREF(m_parser); 00043 SG_UNREF(m_line_reader); 00044 } 00045 00046 void CLibSVMFile::init() 00047 { 00048 m_delimiter=0; 00049 00050 m_whitespace_tokenizer=NULL; 00051 m_delimiter_tokenizer=NULL; 00052 m_line_tokenizer=NULL; 00053 m_parser=NULL; 00054 m_line_reader=NULL; 00055 } 00056 00057 void CLibSVMFile::init_with_defaults() 00058 { 00059 m_delimiter=':'; 00060 00061 m_whitespace_tokenizer=new CDelimiterTokenizer(true); 00062 m_whitespace_tokenizer->delimiters[' ']=1; 00063 SG_REF(m_whitespace_tokenizer); 00064 00065 m_delimiter_tokenizer=new CDelimiterTokenizer(true); 00066 m_delimiter_tokenizer->delimiters[m_delimiter]=1; 00067 SG_REF(m_delimiter_tokenizer); 00068 00069 m_line_tokenizer=new CDelimiterTokenizer(true); 00070 m_line_tokenizer->delimiters['\n']=1; 00071 SG_REF(m_line_tokenizer); 00072 00073 m_parser=new CParser(); 00074 m_line_reader=new CLineReader(file, m_line_tokenizer); 00075 } 00076 00077 #define GET_SPARSE_MATRIX(read_func, sg_type) \ 00078 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00079 { \ 00080 float64_t* labels=NULL; \ 00081 get_sparse_matrix(matrix, num_feat, num_vec, labels, false); \ 00082 } 00083 00084 GET_SPARSE_MATRIX(read_bool, bool) 00085 GET_SPARSE_MATRIX(read_char, int8_t) 00086 GET_SPARSE_MATRIX(read_byte, uint8_t) 00087 GET_SPARSE_MATRIX(read_char, char) 00088 GET_SPARSE_MATRIX(read_int, int32_t) 00089 GET_SPARSE_MATRIX(read_uint, uint32_t) 00090 GET_SPARSE_MATRIX(read_short_real, float32_t) 00091 GET_SPARSE_MATRIX(read_real, float64_t) 00092 GET_SPARSE_MATRIX(read_long_real, floatmax_t) 00093 GET_SPARSE_MATRIX(read_short, int16_t) 00094 GET_SPARSE_MATRIX(read_word, uint16_t) 00095 GET_SPARSE_MATRIX(read_long, int64_t) 00096 GET_SPARSE_MATRIX(read_ulong, uint64_t) 00097 #undef GET_SPARSE_MATRIX 00098 00099 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \ 00100 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec, \ 00101 float64_t*& labels, bool load_labels) \ 00102 { \ 00103 num_feat=0; \ 00104 \ 00105 SG_INFO("counting line numbers in file %s\n", filename) \ 00106 num_vec=get_num_lines(); \ 00107 \ 00108 int32_t current_line_ind=0; \ 00109 SGVector<char> line; \ 00110 \ 00111 int32_t num_entries=0; \ 00112 DynArray<SGVector<char> > entries; \ 00113 \ 00114 matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \ 00115 if (load_labels) \ 00116 labels=SG_MALLOC(float64_t, num_vec); \ 00117 \ 00118 SG_SET_LOCALE_C; \ 00119 \ 00120 while (m_line_reader->has_next()) \ 00121 { \ 00122 num_entries=0; \ 00123 entries.reset(SGVector<char>(false)); \ 00124 line=m_line_reader->read_line(); \ 00125 \ 00126 m_parser->set_tokenizer(m_whitespace_tokenizer); \ 00127 m_parser->set_text(line); \ 00128 \ 00129 if (load_labels && m_parser->has_next()) \ 00130 labels[current_line_ind]=m_parser->read_real(); \ 00131 \ 00132 while (m_parser->has_next()) \ 00133 { \ 00134 entries.push_back(m_parser->read_string()); \ 00135 num_entries++; \ 00136 } \ 00137 \ 00138 matrix[current_line_ind]=SGSparseVector<sg_type>(num_entries); \ 00139 for (int32_t i=0; i<num_entries; i++) \ 00140 { \ 00141 m_parser->set_tokenizer(m_delimiter_tokenizer); \ 00142 m_parser->set_text(entries[i]); \ 00143 \ 00144 int32_t feat_index=0; \ 00145 if (m_parser->has_next()) \ 00146 feat_index=m_parser->read_int(); \ 00147 \ 00148 sg_type entry=0; \ 00149 if (m_parser->has_next()) \ 00150 entry=m_parser->read_func(); \ 00151 \ 00152 if (feat_index>num_feat) \ 00153 num_feat=feat_index; \ 00154 \ 00155 matrix[current_line_ind].features[i].feat_index=feat_index-1; \ 00156 matrix[current_line_ind].features[i].entry=entry; \ 00157 } \ 00158 \ 00159 current_line_ind++; \ 00160 SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \ 00161 } \ 00162 \ 00163 SG_RESET_LOCALE; \ 00164 \ 00165 SG_INFO("file successfully read\n") \ 00166 } 00167 00168 GET_LABELED_SPARSE_MATRIX(read_bool, bool) 00169 GET_LABELED_SPARSE_MATRIX(read_char, int8_t) 00170 GET_LABELED_SPARSE_MATRIX(read_byte, uint8_t) 00171 GET_LABELED_SPARSE_MATRIX(read_char, char) 00172 GET_LABELED_SPARSE_MATRIX(read_int, int32_t) 00173 GET_LABELED_SPARSE_MATRIX(read_uint, uint32_t) 00174 GET_LABELED_SPARSE_MATRIX(read_short_real, float32_t) 00175 GET_LABELED_SPARSE_MATRIX(read_real, float64_t) 00176 GET_LABELED_SPARSE_MATRIX(read_long_real, floatmax_t) 00177 GET_LABELED_SPARSE_MATRIX(read_short, int16_t) 00178 GET_LABELED_SPARSE_MATRIX(read_word, uint16_t) 00179 GET_LABELED_SPARSE_MATRIX(read_long, int64_t) 00180 GET_LABELED_SPARSE_MATRIX(read_ulong, uint64_t) 00181 #undef GET_LABELED_SPARSE_MATRIX 00182 00183 #define SET_SPARSE_MATRIX(format, sg_type) \ 00184 void CLibSVMFile::set_sparse_matrix( \ 00185 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \ 00186 { \ 00187 set_sparse_matrix(matrix, num_feat, num_vec, NULL); \ 00188 } 00189 00190 SET_SPARSE_MATRIX(SCNi32, bool) 00191 SET_SPARSE_MATRIX(SCNi8, int8_t) 00192 SET_SPARSE_MATRIX(SCNu8, uint8_t) 00193 SET_SPARSE_MATRIX(SCNu8, char) 00194 SET_SPARSE_MATRIX(SCNi32, int32_t) 00195 SET_SPARSE_MATRIX(SCNu32, uint32_t) 00196 SET_SPARSE_MATRIX(SCNi64, int64_t) 00197 SET_SPARSE_MATRIX(SCNu64, uint64_t) 00198 SET_SPARSE_MATRIX(".16g", float32_t) 00199 SET_SPARSE_MATRIX(".16lg", float64_t) 00200 SET_SPARSE_MATRIX(".16Lg", floatmax_t) 00201 SET_SPARSE_MATRIX(SCNi16, int16_t) 00202 SET_SPARSE_MATRIX(SCNu16, uint16_t) 00203 #undef SET_SPARSE_MATRIX 00204 00205 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \ 00206 void CLibSVMFile::set_sparse_matrix( \ 00207 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \ 00208 const float64_t* labels) \ 00209 { \ 00210 SG_SET_LOCALE_C; \ 00211 \ 00212 for (int32_t i=0; i<num_vec; i++) \ 00213 { \ 00214 if (labels!=NULL) \ 00215 fprintf(file, "%lg ", labels[i]); \ 00216 \ 00217 for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \ 00218 { \ 00219 fprintf(file, "%d%c%" format " ", \ 00220 matrix[i].features[j].feat_index+1, \ 00221 m_delimiter, \ 00222 matrix[i].features[j].entry); \ 00223 } \ 00224 fprintf(file, "\n"); \ 00225 } \ 00226 \ 00227 SG_RESET_LOCALE; \ 00228 } 00229 00230 SET_LABELED_SPARSE_MATRIX(SCNi32, bool) 00231 SET_LABELED_SPARSE_MATRIX(SCNi8, int8_t) 00232 SET_LABELED_SPARSE_MATRIX(SCNu8, uint8_t) 00233 SET_LABELED_SPARSE_MATRIX(SCNu8, char) 00234 SET_LABELED_SPARSE_MATRIX(SCNi32, int32_t) 00235 SET_LABELED_SPARSE_MATRIX(SCNu32, uint32_t) 00236 SET_LABELED_SPARSE_MATRIX(SCNi64, int64_t) 00237 SET_LABELED_SPARSE_MATRIX(SCNu64, uint64_t) 00238 SET_LABELED_SPARSE_MATRIX(".16g", float32_t) 00239 SET_LABELED_SPARSE_MATRIX(".16lg", float64_t) 00240 SET_LABELED_SPARSE_MATRIX(".16Lg", floatmax_t) 00241 SET_LABELED_SPARSE_MATRIX(SCNi16, int16_t) 00242 SET_LABELED_SPARSE_MATRIX(SCNu16, uint16_t) 00243 #undef SET_LABELED_SPARSE_MATRIX 00244 00245 int32_t CLibSVMFile::get_num_lines() 00246 { 00247 int32_t num_lines=0; 00248 while (m_line_reader->has_next()) 00249 { 00250 m_line_reader->skip_line(); 00251 num_lines++; 00252 } 00253 m_line_reader->reset(); 00254 00255 return num_lines; 00256 }