SHOGUN
v3.2.0
|
00001 /* 00002 * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights 00003 * embodied in the content of this file are licensed under the BSD 00004 * (revised) open source license. 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * Written (W) 2011 Shashwat Lal Das 00012 * Adaptation of Vowpal Wabbit v5.1. 00013 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society. 00014 */ 00015 00016 #include <shogun/classifier/vw/cache/VwNativeCacheWriter.h> 00017 00018 using namespace shogun; 00019 00020 CVwNativeCacheWriter::CVwNativeCacheWriter() 00021 : CVwCacheWriter() 00022 { 00023 init(); 00024 } 00025 00026 CVwNativeCacheWriter::CVwNativeCacheWriter(char * fname, CVwEnvironment* env_to_use) 00027 : CVwCacheWriter(fname, env_to_use) 00028 { 00029 init(); 00030 buf.use_file(fd); 00031 00032 write_header(); 00033 } 00034 00035 CVwNativeCacheWriter::~CVwNativeCacheWriter() 00036 { 00037 buf.flush(); 00038 buf.close_file(); 00039 } 00040 00041 void CVwNativeCacheWriter::set_file(int32_t f) 00042 { 00043 if (fd > 0) 00044 { 00045 buf.flush(); 00046 buf.close_file(); 00047 } 00048 00049 fd = f; 00050 buf.use_file(fd); 00051 00052 write_header(); 00053 } 00054 00055 void CVwNativeCacheWriter::init() 00056 { 00057 neg_1 = 1; 00058 general = 2; 00059 int_size = 6; 00060 } 00061 00062 void CVwNativeCacheWriter::write_header() 00063 { 00064 const char* vw_version = env->vw_version; 00065 vw_size_t numbits = env->num_bits; 00066 vw_size_t v_length = 4; 00067 00068 // Version and numbits info 00069 buf.write_file(&v_length, sizeof(vw_size_t)); 00070 buf.write_file(vw_version,v_length); 00071 buf.write_file(&numbits, sizeof(vw_size_t)); 00072 } 00073 00074 char* CVwNativeCacheWriter::run_len_encode(char *p, vw_size_t i) 00075 { 00076 while (i >= 128) 00077 { 00078 *(p++) = (i & 127) | 128; 00079 i = i >> 7; 00080 } 00081 *(p++) = (i & 127); 00082 00083 return p; 00084 } 00085 00086 char* CVwNativeCacheWriter::bufcache_label(VwLabel* ld, char* c) 00087 { 00088 *(float32_t*)c = ld->label; 00089 c += sizeof(ld->label); 00090 *(float32_t*)c = ld->weight; 00091 c += sizeof(ld->weight); 00092 *(float32_t*)c = ld->initial; 00093 c += sizeof(ld->initial); 00094 return c; 00095 } 00096 00097 void CVwNativeCacheWriter::cache_label(VwLabel* ld) 00098 { 00099 char *c; 00100 buf.buf_write(c, sizeof(ld->label)+sizeof(ld->weight)+sizeof(ld->initial)); 00101 c = bufcache_label(ld,c); 00102 } 00103 00104 void CVwNativeCacheWriter::cache_tag(v_array<char> tag) 00105 { 00106 // Store the size of the tag and the tag itself 00107 char *c; 00108 00109 buf.buf_write(c, sizeof(vw_size_t)+tag.index()); 00110 *(vw_size_t*)c = tag.index(); 00111 c += sizeof(vw_size_t); 00112 memcpy(c, tag.begin, tag.index()); 00113 c += tag.index(); 00114 00115 buf.set(c); 00116 } 00117 00118 void CVwNativeCacheWriter::output_byte(unsigned char s) 00119 { 00120 char *c; 00121 00122 buf.buf_write(c, 1); 00123 *(c++) = s; 00124 buf.set(c); 00125 } 00126 00127 void CVwNativeCacheWriter::output_features(unsigned char index, VwFeature* begin, VwFeature* end) 00128 { 00129 char* c; 00130 vw_size_t storage = (end-begin) * int_size; 00131 for (VwFeature* i = begin; i != end; i++) 00132 if (i->x != 1. && i->x != -1.) 00133 storage+=sizeof(float32_t); 00134 00135 buf.buf_write(c, sizeof(index) + storage + sizeof(vw_size_t)); 00136 *(unsigned char*)c = index; 00137 c += sizeof(index); 00138 00139 char *storage_size_loc = c; 00140 c += sizeof(vw_size_t); 00141 00142 vw_size_t last = 0; 00143 00144 // Store the differences in hashed feature indices 00145 for (VwFeature* i = begin; i != end; i++) 00146 { 00147 int32_t s_diff = (i->weight_index - last); 00148 vw_size_t diff = ZigZagEncode(s_diff) << 2; 00149 last = i->weight_index; 00150 00151 if (i->x == 1.) 00152 c = run_len_encode(c, diff); 00153 else if (i->x == -1.) 00154 c = run_len_encode(c, diff | neg_1); 00155 else 00156 { 00157 c = run_len_encode(c, diff | general); 00158 *(float32_t*)c = i->x; 00159 c += sizeof(float32_t); 00160 } 00161 } 00162 buf.set(c); 00163 *(vw_size_t*)storage_size_loc = c - storage_size_loc - sizeof(vw_size_t); 00164 } 00165 00166 void CVwNativeCacheWriter::cache_example(VwExample* &ex) 00167 { 00168 cache_label(ex->ld); 00169 cache_tag(ex->tag); 00170 output_byte(ex->indices.index()); 00171 for (vw_size_t* b = ex->indices.begin; b != ex->indices.end; b++) 00172 output_features(*b, ex->atomics[*b].begin,ex->atomics[*b].end); 00173 } 00174