SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/DotFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 #include <shogun/lib/Signal.h> 00014 #include <shogun/lib/Time.h> 00015 #include <shogun/mathematics/Math.h> 00016 #include <shogun/base/Parallel.h> 00017 #include <shogun/base/Parameter.h> 00018 00019 #ifdef HAVE_PTHREAD 00020 #include <pthread.h> 00021 #endif 00022 00023 using namespace shogun; 00024 00025 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00026 struct DF_THREAD_PARAM 00027 { 00028 CDotFeatures* df; 00029 int32_t* sub_index; 00030 float64_t* output; 00031 int32_t start; 00032 int32_t stop; 00033 float64_t* alphas; 00034 float64_t* vec; 00035 int32_t dim; 00036 float64_t bias; 00037 bool progress; 00038 }; 00039 #endif // DOXYGEN_SHOULD_SKIP_THIS 00040 00041 00042 CDotFeatures::CDotFeatures(int32_t size) 00043 :CFeatures(size), combined_weight(1.0) 00044 { 00045 init(); 00046 } 00047 00048 00049 CDotFeatures::CDotFeatures(const CDotFeatures & orig) 00050 :CFeatures(orig), combined_weight(orig.combined_weight) 00051 { 00052 init(); 00053 } 00054 00055 00056 CDotFeatures::CDotFeatures(CFile* loader) 00057 :CFeatures(loader) 00058 { 00059 init(); 00060 } 00061 00062 float64_t CDotFeatures::dense_dot_sgvec(int32_t vec_idx1, SGVector<float64_t> vec2) 00063 { 00064 return dense_dot(vec_idx1, vec2.vector, vec2.vlen); 00065 } 00066 00067 void CDotFeatures::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00068 { 00069 ASSERT(output) 00070 // write access is internally between output[start..stop] so the following 00071 // line is necessary to write to output[0...(stop-start-1)] 00072 output-=start; 00073 ASSERT(start>=0) 00074 ASSERT(start<stop) 00075 ASSERT(stop<=get_num_vectors()) 00076 00077 int32_t num_vectors=stop-start; 00078 ASSERT(num_vectors>0) 00079 00080 int32_t num_threads=parallel->get_num_threads(); 00081 ASSERT(num_threads>0) 00082 00083 CSignal::clear_cancel(); 00084 00085 #ifdef HAVE_PTHREAD 00086 if (num_threads < 2) 00087 { 00088 #endif 00089 DF_THREAD_PARAM params; 00090 params.df=this; 00091 params.sub_index=NULL; 00092 params.output=output; 00093 params.start=start; 00094 params.stop=stop; 00095 params.alphas=alphas; 00096 params.vec=vec; 00097 params.dim=dim; 00098 params.bias=b; 00099 params.progress=false; //true; 00100 dense_dot_range_helper((void*) ¶ms); 00101 #ifdef HAVE_PTHREAD 00102 } 00103 else 00104 { 00105 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00106 DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads); 00107 int32_t step= num_vectors/num_threads; 00108 00109 int32_t t; 00110 00111 for (t=0; t<num_threads-1; t++) 00112 { 00113 params[t].df = this; 00114 params[t].sub_index=NULL; 00115 params[t].output = output; 00116 params[t].start = start+t*step; 00117 params[t].stop = start+(t+1)*step; 00118 params[t].alphas=alphas; 00119 params[t].vec=vec; 00120 params[t].dim=dim; 00121 params[t].bias=b; 00122 params[t].progress = false; 00123 pthread_create(&threads[t], NULL, 00124 CDotFeatures::dense_dot_range_helper, (void*)¶ms[t]); 00125 } 00126 00127 params[t].df = this; 00128 params[t].output = output; 00129 params[t].sub_index=NULL; 00130 params[t].start = start+t*step; 00131 params[t].stop = stop; 00132 params[t].alphas=alphas; 00133 params[t].vec=vec; 00134 params[t].dim=dim; 00135 params[t].bias=b; 00136 params[t].progress = false; //true; 00137 dense_dot_range_helper((void*) ¶ms[t]); 00138 00139 for (t=0; t<num_threads-1; t++) 00140 pthread_join(threads[t], NULL); 00141 00142 SG_FREE(params); 00143 SG_FREE(threads); 00144 } 00145 #endif 00146 00147 #ifndef WIN32 00148 if ( CSignal::cancel_computations() ) 00149 SG_INFO("prematurely stopped. \n") 00150 #endif 00151 } 00152 00153 void CDotFeatures::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00154 { 00155 ASSERT(sub_index) 00156 ASSERT(output) 00157 00158 int32_t num_threads=parallel->get_num_threads(); 00159 ASSERT(num_threads>0) 00160 00161 CSignal::clear_cancel(); 00162 00163 #ifdef HAVE_PTHREAD 00164 if (num_threads < 2) 00165 { 00166 #endif 00167 DF_THREAD_PARAM params; 00168 params.df=this; 00169 params.sub_index=sub_index; 00170 params.output=output; 00171 params.start=0; 00172 params.stop=num; 00173 params.alphas=alphas; 00174 params.vec=vec; 00175 params.dim=dim; 00176 params.bias=b; 00177 params.progress=false; //true; 00178 dense_dot_range_helper((void*) ¶ms); 00179 #ifdef HAVE_PTHREAD 00180 } 00181 else 00182 { 00183 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00184 DF_THREAD_PARAM* params = SG_MALLOC(DF_THREAD_PARAM, num_threads); 00185 int32_t step= num/num_threads; 00186 00187 int32_t t; 00188 00189 for (t=0; t<num_threads-1; t++) 00190 { 00191 params[t].df = this; 00192 params[t].sub_index=sub_index; 00193 params[t].output = output; 00194 params[t].start = t*step; 00195 params[t].stop = (t+1)*step; 00196 params[t].alphas=alphas; 00197 params[t].vec=vec; 00198 params[t].dim=dim; 00199 params[t].bias=b; 00200 params[t].progress = false; 00201 pthread_create(&threads[t], NULL, 00202 CDotFeatures::dense_dot_range_helper, (void*)¶ms[t]); 00203 } 00204 00205 params[t].df = this; 00206 params[t].sub_index=sub_index; 00207 params[t].output = output; 00208 params[t].start = t*step; 00209 params[t].stop = num; 00210 params[t].alphas=alphas; 00211 params[t].vec=vec; 00212 params[t].dim=dim; 00213 params[t].bias=b; 00214 params[t].progress = false; //true; 00215 dense_dot_range_helper((void*) ¶ms[t]); 00216 00217 for (t=0; t<num_threads-1; t++) 00218 pthread_join(threads[t], NULL); 00219 00220 SG_FREE(params); 00221 SG_FREE(threads); 00222 } 00223 #endif 00224 00225 #ifndef WIN32 00226 if ( CSignal::cancel_computations() ) 00227 SG_INFO("prematurely stopped. \n") 00228 #endif 00229 } 00230 00231 void* CDotFeatures::dense_dot_range_helper(void* p) 00232 { 00233 DF_THREAD_PARAM* par=(DF_THREAD_PARAM*) p; 00234 CDotFeatures* df=par->df; 00235 int32_t* sub_index=par->sub_index; 00236 float64_t* output=par->output; 00237 int32_t start=par->start; 00238 int32_t stop=par->stop; 00239 float64_t* alphas=par->alphas; 00240 float64_t* vec=par->vec; 00241 int32_t dim=par->dim; 00242 float64_t bias=par->bias; 00243 bool progress=par->progress; 00244 00245 if (sub_index) 00246 { 00247 #ifdef WIN32 00248 for (int32_t i=start; i<stop i++) 00249 #else 00250 for (int32_t i=start; i<stop && 00251 !CSignal::cancel_computations(); i++) 00252 #endif 00253 { 00254 if (alphas) 00255 output[i]=alphas[sub_index[i]]*df->dense_dot(sub_index[i], vec, dim)+bias; 00256 else 00257 output[i]=df->dense_dot(sub_index[i], vec, dim)+bias; 00258 if (progress) 00259 df->display_progress(start, stop, i); 00260 } 00261 00262 } 00263 else 00264 { 00265 #ifdef WIN32 00266 for (int32_t i=start; i<stop i++) 00267 #else 00268 for (int32_t i=start; i<stop && 00269 !CSignal::cancel_computations(); i++) 00270 #endif 00271 { 00272 if (alphas) 00273 output[i]=alphas[i]*df->dense_dot(i, vec, dim)+bias; 00274 else 00275 output[i]=df->dense_dot(i, vec, dim)+bias; 00276 if (progress) 00277 df->display_progress(start, stop, i); 00278 } 00279 } 00280 00281 return NULL; 00282 } 00283 00284 SGMatrix<float64_t> CDotFeatures::get_computed_dot_feature_matrix() 00285 { 00286 00287 int64_t offs=0; 00288 int32_t num=get_num_vectors(); 00289 int32_t dim=get_dim_feature_space(); 00290 ASSERT(num>0) 00291 ASSERT(dim>0) 00292 00293 SGMatrix<float64_t> m(dim, num); 00294 m.zero(); 00295 00296 for (int32_t i=0; i<num; i++) 00297 { 00298 add_to_dense_vec(1.0, i, &(m.matrix[offs]), dim); 00299 offs+=dim; 00300 } 00301 00302 return m; 00303 } 00304 00305 SGVector<float64_t> CDotFeatures::get_computed_dot_feature_vector(int32_t num) 00306 { 00307 00308 int32_t dim=get_dim_feature_space(); 00309 ASSERT(num>=0 && num<=get_num_vectors()) 00310 ASSERT(dim>0) 00311 00312 SGVector<float64_t> v(dim); 00313 v.zero(); 00314 add_to_dense_vec(1.0, num, v.vector, dim); 00315 return v; 00316 } 00317 00318 void CDotFeatures::benchmark_add_to_dense_vector(int32_t repeats) 00319 { 00320 int32_t num=get_num_vectors(); 00321 int32_t d=get_dim_feature_space(); 00322 float64_t* w= SG_MALLOC(float64_t, d); 00323 SGVector<float64_t>::fill_vector(w, d, 0.0); 00324 00325 CTime t; 00326 float64_t start_cpu=t.get_runtime(); 00327 float64_t start_wall=t.get_curtime(); 00328 for (int32_t r=0; r<repeats; r++) 00329 { 00330 for (int32_t i=0; i<num; i++) 00331 add_to_dense_vec(1.172343*(r+1), i, w, d); 00332 } 00333 00334 SG_PRINT("Time to process %d x num=%d add_to_dense_vector ops: cputime %fs walltime %fs\n", 00335 repeats, num, (t.get_runtime()-start_cpu)/repeats, 00336 (t.get_curtime()-start_wall)/repeats); 00337 00338 SG_FREE(w); 00339 } 00340 00341 void CDotFeatures::benchmark_dense_dot_range(int32_t repeats) 00342 { 00343 int32_t num=get_num_vectors(); 00344 int32_t d=get_dim_feature_space(); 00345 float64_t* w= SG_MALLOC(float64_t, d); 00346 float64_t* out= SG_MALLOC(float64_t, num); 00347 float64_t* alphas= SG_MALLOC(float64_t, num); 00348 SGVector<float64_t>::range_fill_vector(w, d, 17.0); 00349 SGVector<float64_t>::range_fill_vector(alphas, num, 1.2345); 00350 //SGVector<float64_t>::fill_vector(w, d, 17.0); 00351 //SGVector<float64_t>::fill_vector(alphas, num, 1.2345); 00352 00353 CTime t; 00354 float64_t start_cpu=t.get_runtime(); 00355 float64_t start_wall=t.get_curtime(); 00356 00357 for (int32_t r=0; r<repeats; r++) 00358 dense_dot_range(out, 0, num, alphas, w, d, 23); 00359 00360 #ifdef DEBUG_DOTFEATURES 00361 CMath::display_vector(out, 40, "dense_dot_range"); 00362 float64_t* out2= SG_MALLOC(float64_t, num); 00363 00364 for (int32_t r=0; r<repeats; r++) 00365 { 00366 CMath::fill_vector(out2, num, 0.0); 00367 for (int32_t i=0; i<num; i++) 00368 out2[i]+=dense_dot(i, w, d)*alphas[i]+23; 00369 } 00370 CMath::display_vector(out2, 40, "dense_dot"); 00371 for (int32_t i=0; i<num; i++) 00372 out2[i]-=out[i]; 00373 CMath::display_vector(out2, 40, "diff"); 00374 #endif 00375 SG_PRINT("Time to process %d x num=%d dense_dot_range ops: cputime %fs walltime %fs\n", 00376 repeats, num, (t.get_runtime()-start_cpu)/repeats, 00377 (t.get_curtime()-start_wall)/repeats); 00378 00379 SG_FREE(alphas); 00380 SG_FREE(out); 00381 SG_FREE(w); 00382 } 00383 00384 SGVector<float64_t> CDotFeatures::get_mean() 00385 { 00386 int32_t num=get_num_vectors(); 00387 int32_t dim=get_dim_feature_space(); 00388 ASSERT(num>0) 00389 ASSERT(dim>0) 00390 00391 SGVector<float64_t> mean(dim); 00392 memset(mean.vector, 0, sizeof(float64_t)*dim); 00393 00394 for (int i = 0; i < num; i++) 00395 add_to_dense_vec(1, i, mean.vector, dim); 00396 for (int j = 0; j < dim; j++) 00397 mean.vector[j] /= num; 00398 00399 return mean; 00400 } 00401 00402 SGVector<float64_t> CDotFeatures::get_mean(CDotFeatures* lhs, CDotFeatures* rhs) 00403 { 00404 ASSERT(lhs && rhs) 00405 ASSERT(lhs->get_dim_feature_space() == rhs->get_dim_feature_space()) 00406 00407 int32_t num_lhs=lhs->get_num_vectors(); 00408 int32_t num_rhs=rhs->get_num_vectors(); 00409 int32_t dim=lhs->get_dim_feature_space(); 00410 ASSERT(num_lhs>0) 00411 ASSERT(num_rhs>0) 00412 ASSERT(dim>0) 00413 00414 SGVector<float64_t> mean(dim); 00415 memset(mean.vector, 0, sizeof(float64_t)*dim); 00416 00417 for (int i = 0; i < num_lhs; i++) 00418 lhs->add_to_dense_vec(1, i, mean.vector, dim); 00419 for (int i = 0; i < num_rhs; i++) 00420 rhs->add_to_dense_vec(1, i, mean.vector, dim); 00421 for (int j = 0; j < dim; j++) 00422 mean.vector[j] /= (num_lhs+num_rhs); 00423 00424 return mean; 00425 } 00426 00427 SGMatrix<float64_t> CDotFeatures::get_cov() 00428 { 00429 int32_t num=get_num_vectors(); 00430 int32_t dim=get_dim_feature_space(); 00431 ASSERT(num>0) 00432 ASSERT(dim>0) 00433 00434 SGMatrix<float64_t> cov(dim, dim); 00435 00436 memset(cov.matrix, 0, sizeof(float64_t)*dim*dim); 00437 00438 SGVector<float64_t> mean = get_mean(); 00439 00440 for (int i = 0; i < num; i++) 00441 { 00442 SGVector<float64_t> v = get_computed_dot_feature_vector(i); 00443 SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen); 00444 for (int m = 0; m < v.vlen; m++) 00445 { 00446 for (int n = 0; n <= m ; n++) 00447 { 00448 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n]; 00449 } 00450 } 00451 } 00452 for (int m = 0; m < dim; m++) 00453 { 00454 for (int n = 0; n <= m ; n++) 00455 { 00456 (cov.matrix)[m*dim+n] /= num; 00457 } 00458 } 00459 for (int m = 0; m < dim-1; m++) 00460 { 00461 for (int n = m+1; n < dim; n++) 00462 { 00463 (cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m]; 00464 } 00465 } 00466 return cov; 00467 } 00468 00469 SGMatrix<float64_t> CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* rhs) 00470 { 00471 CDotFeatures* feats[2]; 00472 feats[0]=lhs; 00473 feats[1]=rhs; 00474 00475 int32_t nums[2], dims[2], num=0; 00476 00477 for (int i = 0; i < 2; i++) 00478 { 00479 nums[i]=feats[i]->get_num_vectors(); 00480 dims[i]=feats[i]->get_dim_feature_space(); 00481 ASSERT(nums[i]>0) 00482 ASSERT(dims[i]>0) 00483 num += nums[i]; 00484 } 00485 00486 ASSERT(dims[0]==dims[1]) 00487 int32_t dim = dims[0]; 00488 00489 SGMatrix<float64_t> cov(dim, dim); 00490 00491 memset(cov.matrix, 0, sizeof(float64_t)*dim*dim); 00492 00493 SGVector<float64_t> mean=get_mean(lhs,rhs); 00494 00495 for (int i = 0; i < 2; i++) 00496 { 00497 for (int j = 0; j < nums[i]; j++) 00498 { 00499 SGVector<float64_t> v = feats[i]->get_computed_dot_feature_vector(j); 00500 SGVector<float64_t>::add(v.vector, 1, v.vector, -1, mean.vector, v.vlen); 00501 for (int m = 0; m < v.vlen; m++) 00502 { 00503 for (int n = 0; n <= m; n++) 00504 { 00505 (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n]; 00506 } 00507 } 00508 } 00509 } 00510 for (int m = 0; m < dim; m++) 00511 { 00512 for (int n = 0; n <= m; n++) 00513 { 00514 (cov.matrix)[m*dim+n] /= num; 00515 } 00516 } 00517 for (int m = 0; m < dim-1; m++) 00518 { 00519 for (int n = m+1; n < dim; n++) 00520 { 00521 (cov.matrix[m*dim+n]) = (cov.matrix)[n*dim+m]; 00522 } 00523 } 00524 00525 return cov; 00526 } 00527 00528 void CDotFeatures::display_progress(int32_t start, int32_t stop, int32_t v) 00529 { 00530 int32_t num_vectors=stop-start; 00531 int32_t i=v-start; 00532 00533 if ( (i% (num_vectors/100+1))== 0) 00534 SG_PROGRESS(v, 0.0, num_vectors-1) 00535 } 00536 00537 void CDotFeatures::init() 00538 { 00539 set_property(FP_DOT); 00540 m_parameters->add(&combined_weight, "combined_weight", 00541 "Feature weighting in combined dot features."); 00542 }