SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
LineReader.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evgeniy Andreev (gsomix)
00008  */
00009 
00010 #include <shogun/io/LineReader.h>
00011 #include <cstdio>
00012 
00013 using namespace shogun;
00014 
00015 CLineReader::CLineReader()
00016 {
00017     init();
00018 
00019     m_buffer=new CCircularBuffer();
00020 }
00021 
00022 CLineReader::CLineReader(FILE* stream, CTokenizer* tokenizer)
00023 {
00024     init();
00025 
00026     m_stream=stream;
00027     m_max_token_length=10*1024*1024;
00028 
00029     SG_REF(tokenizer);
00030     m_tokenizer=tokenizer;
00031 
00032     m_buffer=new CCircularBuffer(m_max_token_length);
00033     m_buffer->set_tokenizer(m_tokenizer);
00034 }
00035 
00036 CLineReader::CLineReader(int32_t max_token_length, FILE* stream, CTokenizer* tokenizer)
00037 {
00038     init();
00039 
00040     m_stream=stream;
00041     m_max_token_length=max_token_length;
00042 
00043     SG_REF(tokenizer);
00044     m_tokenizer=tokenizer;
00045 
00046     m_buffer=new CCircularBuffer(m_max_token_length);
00047     m_buffer->set_tokenizer(m_tokenizer);
00048 }
00049 
00050 CLineReader::~CLineReader()
00051 {
00052     SG_UNREF(m_tokenizer);
00053     SG_UNREF(m_buffer);
00054 }
00055 
00056 bool CLineReader::has_next()
00057 {
00058     if (m_stream==NULL || m_max_token_length==0 || m_tokenizer==NULL)
00059     {
00060         SG_ERROR("CLineReader::has_next():: Class is not initialized\n");
00061         return false;
00062     }
00063 
00064     if (ferror(m_stream))
00065     {
00066         SG_ERROR("CLineReader::has_next():: Error reading file\n");
00067         return false;
00068     }
00069 
00070     if (feof(m_stream) && (m_buffer->num_bytes_contained()<=0 || !m_buffer->has_next()))
00071         return false; // nothing to read
00072 
00073     return true;
00074 }
00075 
00076 void CLineReader::skip_line()
00077 {
00078     int32_t bytes_to_skip=0;
00079     m_next_token_length=read(bytes_to_skip);
00080     if (m_next_token_length==-1)
00081         return;
00082     else
00083         m_buffer->skip_characters(bytes_to_skip);
00084 }
00085 
00086 SGVector<char> CLineReader::read_line()
00087 {
00088     SGVector<char> line;
00089 
00090     int32_t bytes_to_skip=0;
00091     m_next_token_length=read(bytes_to_skip);
00092     if (m_next_token_length==-1)
00093         line=SGVector<char>();
00094     else
00095     {
00096         m_buffer->skip_characters(bytes_to_skip);
00097         line=read_token(m_next_token_length-bytes_to_skip);
00098     }
00099 
00100     return line;
00101 }
00102 
00103 void CLineReader::reset()
00104 {
00105     rewind(m_stream);
00106     m_buffer->clear();
00107 }
00108 
00109 void CLineReader::set_tokenizer(CTokenizer* tokenizer)
00110 {
00111     SG_REF(tokenizer);
00112     SG_UNREF(m_tokenizer);
00113     m_tokenizer=tokenizer;
00114 
00115     m_buffer->set_tokenizer(tokenizer);
00116 }
00117 
00118 void CLineReader::init()
00119 {
00120     m_buffer=NULL;
00121     m_tokenizer=NULL;
00122     m_stream=NULL;
00123 
00124     m_max_token_length=0;
00125     m_next_token_length=-1;
00126 }
00127 
00128 int32_t CLineReader::read(int32_t& bytes_to_skip)
00129 {
00130     int32_t line_end=0;
00131     int32_t bytes_to_read=0;
00132     int32_t temp_bytes_to_skip=0;
00133 
00134     while (1)
00135     {
00136         if (bytes_to_skip==line_end)
00137             line_end=m_buffer->next_token_idx(bytes_to_skip);
00138         else
00139             line_end=m_buffer->next_token_idx(temp_bytes_to_skip);
00140 
00141         if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
00142             return line_end;
00143         else if (m_buffer->available()==0)
00144             return -1; // we need some limit in case file does not contain delimiter
00145 
00146         // if there is no delimiter in buffer
00147         // try get more data from stream
00148         // and write it into buffer
00149         if (m_buffer->available() < m_max_token_length)
00150             bytes_to_read=m_buffer->available();
00151         else
00152             bytes_to_read=m_max_token_length;
00153 
00154         if (feof(m_stream))
00155             return line_end;
00156         else
00157             m_buffer->push(m_stream, bytes_to_read);
00158 
00159         if (ferror(m_stream))
00160         {
00161             SG_ERROR("CLineReader::read(int32_t&):: Error reading file\n");
00162             return -1;
00163         }
00164     }
00165 }
00166 
00167 SGVector<char> CLineReader::read_token(int32_t line_len)
00168 {
00169     SGVector<char> line;
00170 
00171     if (line_len==0)
00172         line=SGVector<char>();
00173     else
00174         line=m_buffer->pop(line_len);
00175 
00176     return line;
00177 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation