SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/SNPFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 #include <shogun/features/Alphabet.h> 00014 #include <shogun/lib/memory.h> 00015 00016 using namespace shogun; 00017 00018 CSNPFeatures::CSNPFeatures() 00019 { 00020 SG_UNSTABLE("CSNPFeatures::CSNPFeatures()", "\n") 00021 00022 strings = NULL; 00023 00024 string_length = 0; 00025 num_strings = 0; 00026 w_dim = 0; 00027 00028 normalization_const = 0.0; 00029 00030 m_str_min = NULL; 00031 m_str_maj = NULL; 00032 } 00033 00034 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(), 00035 m_str_min(NULL), m_str_maj(NULL) 00036 { 00037 ASSERT(str) 00038 ASSERT(str->have_same_length()) 00039 SG_REF(str); 00040 00041 strings=str; 00042 string_length=str->get_max_vector_length(); 00043 ASSERT((string_length & 1) == 0) // length divisible by 2 00044 w_dim=3*string_length/2; 00045 num_strings=str->get_num_vectors(); 00046 CAlphabet* alpha=str->get_alphabet(); 00047 ASSERT(alpha->get_alphabet()==SNP) 00048 SG_UNREF(alpha); 00049 00050 obtain_base_strings(); 00051 set_normalization_const(); 00052 00053 } 00054 00055 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig) 00056 : CDotFeatures(orig), strings(orig.strings), 00057 normalization_const(orig.normalization_const), 00058 m_str_min(NULL), m_str_maj(NULL) 00059 { 00060 SG_REF(strings); 00061 00062 if (strings) 00063 { 00064 string_length=strings->get_max_vector_length(); 00065 ASSERT((string_length & 1) == 0) // length divisible by 2 00066 w_dim=3*string_length; 00067 num_strings=strings->get_num_vectors(); 00068 } 00069 else 00070 { 00071 string_length = 0; 00072 w_dim = 0; 00073 num_strings = 0; 00074 } 00075 00076 obtain_base_strings(); 00077 } 00078 00079 CSNPFeatures::~CSNPFeatures() 00080 { 00081 SG_UNREF(strings); 00082 } 00083 00084 int32_t CSNPFeatures::get_dim_feature_space() const 00085 { 00086 return w_dim; 00087 } 00088 00089 int32_t CSNPFeatures::get_nnz_features_for_vector(int32_t num) 00090 { 00091 return w_dim/3; 00092 } 00093 00094 EFeatureType CSNPFeatures::get_feature_type() const 00095 { 00096 return F_UNKNOWN; 00097 } 00098 00099 EFeatureClass CSNPFeatures::get_feature_class() const 00100 { 00101 return C_WD; 00102 } 00103 00104 int32_t CSNPFeatures::get_num_vectors() const 00105 { 00106 return num_strings; 00107 } 00108 00109 float64_t CSNPFeatures::get_normalization_const() 00110 { 00111 return normalization_const; 00112 } 00113 00114 void CSNPFeatures::set_minor_base_string(const char* str) 00115 { 00116 m_str_min=(uint8_t*) get_strdup(str); 00117 } 00118 00119 void CSNPFeatures::set_major_base_string(const char* str) 00120 { 00121 m_str_maj=(uint8_t*) get_strdup(str); 00122 } 00123 00124 char* CSNPFeatures::get_minor_base_string() 00125 { 00126 return (char*) m_str_min; 00127 } 00128 00129 char* CSNPFeatures::get_major_base_string() 00130 { 00131 return (char*) m_str_maj; 00132 } 00133 00134 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b) 00135 { 00136 ASSERT(df) 00137 ASSERT(df->get_feature_type() == get_feature_type()) 00138 ASSERT(df->get_feature_class() == get_feature_class()) 00139 CSNPFeatures* sf=(CSNPFeatures*) df; 00140 00141 int32_t alen, blen; 00142 bool free_avec, free_bvec; 00143 00144 uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec); 00145 uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec); 00146 00147 ASSERT(alen==blen) 00148 if (alen!=string_length) 00149 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length) 00150 ASSERT(m_str_min) 00151 ASSERT(m_str_maj) 00152 00153 float64_t total=0; 00154 00155 for (int32_t i = 0; i<alen-1; i+=2) 00156 { 00157 int32_t sumaa=0; 00158 int32_t sumbb=0; 00159 int32_t sumab=0; 00160 00161 uint8_t a1=avec[i]; 00162 uint8_t a2=avec[i+1]; 00163 uint8_t b1=bvec[i]; 00164 uint8_t b2=bvec[i+1]; 00165 00166 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00167 sumab++; 00168 else if (a1==a2 && b1==b2) 00169 { 00170 if (a1!=b1) 00171 continue; 00172 00173 if (a1==m_str_min[i]) 00174 sumaa++; 00175 else if (a1==m_str_maj[i]) 00176 sumbb++; 00177 else 00178 { 00179 SG_ERROR("The impossible happened i=%d a1=%c " 00180 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]); 00181 } 00182 00183 } 00184 total+=sumaa+sumbb+sumab; 00185 } 00186 00187 strings->free_feature_vector(avec, idx_a, free_avec); 00188 sf->strings->free_feature_vector(bvec, idx_b, free_bvec); 00189 return total; 00190 } 00191 00192 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00193 { 00194 if (vec2_len != w_dim) 00195 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00196 00197 float64_t sum=0; 00198 int32_t len; 00199 bool free_vec1; 00200 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00201 int32_t offs=0; 00202 00203 for (int32_t i=0; i<len; i+=2) 00204 { 00205 int32_t dim=0; 00206 00207 char a1=vec[i]; 00208 char a2=vec[i+1]; 00209 00210 if (a1==a2 && a1!='0' && a2!='0') 00211 { 00212 if (a1==m_str_min[i]) 00213 dim=1; 00214 else if (a1==m_str_maj[i]) 00215 dim=2; 00216 else 00217 { 00218 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00219 i, a1,a2, m_str_min[i], m_str_maj[i]); 00220 } 00221 } 00222 00223 sum+=vec2[offs+dim]; 00224 offs+=3; 00225 } 00226 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00227 00228 return sum/normalization_const; 00229 } 00230 00231 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00232 { 00233 if (vec2_len != w_dim) 00234 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00235 00236 int32_t len; 00237 bool free_vec1; 00238 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00239 int32_t offs=0; 00240 00241 if (abs_val) 00242 alpha=CMath::abs(alpha); 00243 00244 for (int32_t i=0; i<len; i+=2) 00245 { 00246 int32_t dim=0; 00247 00248 char a1=vec[i]; 00249 char a2=vec[i+1]; 00250 00251 if (a1==a2 && a1!='0' && a2!='0') 00252 { 00253 if (a1==m_str_min[i]) 00254 dim=1; 00255 else if (a1==m_str_maj[i]) 00256 dim=2; 00257 else 00258 { 00259 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00260 i, a1,a2, m_str_min[i], m_str_maj[i]); 00261 } 00262 } 00263 00264 vec2[offs+dim]+=alpha; 00265 offs+=3; 00266 } 00267 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00268 } 00269 00270 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major) 00271 { 00272 for (int32_t i=0; i<num_strings; i++) 00273 { 00274 int32_t len; 00275 bool free_vec; 00276 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec); 00277 ASSERT(string_length==len) 00278 00279 for (int32_t j=0; j<len; j++) 00280 { 00281 // skip sequencing errors 00282 if (vec[j]=='0') 00283 continue; 00284 00285 if (minor[j]==0) 00286 minor[j]=vec[j]; 00287 else if (major[j]==0 && vec[j]!=minor[j]) 00288 major[j]=vec[j]; 00289 } 00290 00291 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec); 00292 } 00293 } 00294 00295 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp) 00296 { 00297 SG_FREE(m_str_min); 00298 SG_FREE(m_str_maj); 00299 size_t tlen=(string_length+1)*sizeof(uint8_t); 00300 00301 m_str_min=SG_CALLOC(uint8_t, tlen); 00302 m_str_maj=SG_CALLOC(uint8_t, tlen); 00303 00304 find_minor_major_strings(m_str_min, m_str_maj); 00305 00306 if (snp) 00307 snp->find_minor_major_strings(m_str_min, m_str_maj); 00308 00309 for (int32_t j=0; j<string_length; j++) 00310 { 00311 // if only one symbol occurs use 0 00312 if (m_str_min[j]==0) 00313 m_str_min[j]='0'; 00314 if (m_str_maj[j]==0) 00315 m_str_maj[j]='0'; 00316 00317 if (m_str_min[j]>m_str_maj[j]) 00318 CMath::swap(m_str_min[j], m_str_maj[j]); 00319 } 00320 } 00321 00322 void CSNPFeatures::set_normalization_const(float64_t n) 00323 { 00324 if (n==0) 00325 { 00326 normalization_const=string_length; 00327 normalization_const=CMath::sqrt(normalization_const); 00328 } 00329 else 00330 normalization_const=n; 00331 00332 SG_DEBUG("normalization_const:%f\n", normalization_const) 00333 } 00334 00335 void* CSNPFeatures::get_feature_iterator(int32_t vector_index) 00336 { 00337 return NULL; 00338 } 00339 00340 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00341 { 00342 return false; 00343 } 00344 00345 void CSNPFeatures::free_feature_iterator(void* iterator) 00346 { 00347 } 00348 00349 CFeatures* CSNPFeatures::duplicate() const 00350 { 00351 return new CSNPFeatures(*this); 00352 } 00353 00354 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize) 00355 { 00356 int32_t nsym=3; 00357 float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2); 00358 00359 float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2); 00360 memset(h_normalizer, 0, string_length/2*sizeof(float64_t)); 00361 int32_t num_str=get_num_vectors(); 00362 for (int32_t i=0; i<num_str; i++) 00363 { 00364 int32_t len; 00365 bool free_vec; 00366 uint8_t* vec = strings->get_feature_vector(i, len, free_vec); 00367 00368 for (int32_t j=0; j<len; j+=2) 00369 { 00370 int32_t dim=0; 00371 00372 char a1=vec[j]; 00373 char a2=vec[j+1]; 00374 00375 if (a1==a2 && a1!='0' && a2!='0') 00376 { 00377 if (a1==m_str_min[j]) 00378 dim=1; 00379 else if (a1==m_str_maj[j]) 00380 dim=2; 00381 else 00382 { 00383 SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n", 00384 j, a1,a2, m_str_min[j], m_str_maj[j]); 00385 } 00386 } 00387 00388 h[int64_t(j/2)*nsym+dim]++; 00389 h_normalizer[j/2]++; 00390 } 00391 00392 strings->free_feature_vector(vec, i, free_vec); 00393 } 00394 00395 if (normalize) 00396 { 00397 for (int32_t i=0; i<string_length/2; i++) 00398 { 00399 for (int32_t j=0; j<nsym; j++) 00400 { 00401 if (h_normalizer && h_normalizer[i]) 00402 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 00403 } 00404 } 00405 } 00406 SG_FREE(h_normalizer); 00407 00408 return SGMatrix<float64_t>(h, nsym, string_length/2); 00409 } 00410 00411 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg) 00412 { 00413 00414 ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length()) 00415 int32_t len=pos->strings->get_max_vector_length(); 00416 00417 float64_t* table=SG_MALLOC(float64_t, 3*2*len/2); 00418 00419 SGMatrix<float64_t> p_hist=pos->get_histogram(false); 00420 SGMatrix<float64_t> n_hist=neg->get_histogram(false); 00421 00422 00423 for (int32_t i=0; i<3*len/2; i++) 00424 { 00425 table[2*i]=p_hist.matrix[i]; 00426 table[2*i+1]=n_hist.matrix[i]; 00427 } 00428 return SGMatrix<float64_t>(table, 2,3*len/2); 00429 }