Package Pyblio :: Package Parsers :: Package Syntax :: Module Tagged
[hide private]
[frames] | no frames]

Source Code for Module Pyblio.Parsers.Syntax.Tagged

  1  # This file is part of pybliographer 
  2  #  
  3  # Copyright (C) 1998-2006 Frederic GOBRY 
  4  # Email : gobry@pybliographer.org 
  5  #           
  6  # This program is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or 
  9  # (at your option) any later version. 
 10  #    
 11  # This program is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
 14  # General Public License for more details. 
 15  #  
 16  # You should have received a copy of the GNU General Public License 
 17  # along with this program; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 
 19  # 02111-1307, USA. 
 20   
 21  from gettext import gettext as _ 
 22   
 23  from Pyblio import Callback, Store, Attribute 
 24   
 25   
class Parser(object):
    """Generic event-driven parser for 'tagged' records.

    This class is meant to be derived by actual parsers. A subclass needs
    to at least override line_handler() so that it turns each input line
    into events (record_start(), field_start(), field_data(), ...). The
    base class does the state bookkeeping: it checks that events arrive in
    a consistent order and assembles them into complete records, which are
    handed out one at a time by next().
    """

    # Event codes queued by push() and consumed by next().
    (EV_RECORD_START, EV_RECORD_END, EV_FIELD_START,
     EV_FIELD_DATA, EV_FIELD_END,
     EV_FILE_END, EV_DONE,
     EV_METADATA) = range(8)

    # States
    ST_IN_RECORD, ST_IN_FIELD, ST_OUTSIDE = range(3)

    def __init__(self, fd, charset='UTF-8'):
        """Create a new parser for a file containing 'tagged' records.

        fd      -- object providing readline(); lines are read on demand.
        charset -- encoding used by field_handler() to decode byte values.

        file_start() is called at the end of the construction.
        """
        self._fd = fd
        self._ln = 0              # number of lines read from fd so far

        self._charset = charset
        self._stack = []          # lines put back with unread()
        self._evstack = []        # pending events, oldest first
        self._started = False

        self.state = self.ST_OUTSIDE

        self.file_start()

    def file_start(self):
        """Override me to have a function called before the first
        record is to be parsed."""
        pass

    def file_stop(self):
        """Override me to be called after the last record has been parsed."""
        pass

    def file_stopping(self):
        """Override me to be called just at the end of file."""
        pass

    def line_handler(self, line, number):
        """Override me to handle each line of input and generate events.

        line   -- the line as returned by the file's readline()
        number -- the line number associated with that line

        Will be called with an empty line when the end of file is reached.
        """
        return

    def field_handler(self, tag, value):
        """Transform a single field of a record.

        Returns the (tag, value) pair stored in the record. Byte strings
        are decoded with the parser's charset; values that are already
        text are returned unchanged instead of being decoded a second time.
        """
        if isinstance(value, bytes):
            value = value.decode(self._charset)
        return tag, value

    def push(self, *ev):
        """Emit a new event.

        Available events are listed below, with their additional
        parameters listed, when needed:

          - self.EV_RECORD_START
          - self.EV_RECORD_END
          - self.EV_FIELD_START, tag, line
          - self.EV_FIELD_DATA, data
          - self.EV_FIELD_END
          - self.EV_FILE_END
          - self.EV_METADATA, tag, value
        """
        self._evstack.append(ev)

    def metadata_add(self, tag, value):
        """Call me to notify the availability of a new meta data."""
        self.push(self.EV_METADATA, tag, value)

    def record_start(self):
        """Queue the start of a record."""
        self.push(self.EV_RECORD_START)

    def record_end(self):
        """Queue the end of the current record."""
        self.push(self.EV_RECORD_END)

    def field_start(self, tag, line):
        """Queue the start of a field named tag, found at the given line."""
        self.push(self.EV_FIELD_START, tag, line)

    def field_end(self):
        """Queue the end of the current field."""
        self.push(self.EV_FIELD_END)

    def field_data(self, data):
        """Queue a piece of content for the current field."""
        self.push(self.EV_FIELD_DATA, data)

    def unread(self, line, count):
        """Put back a line (and its line number) so that it will be
        returned by self._pop() when it is next invoked."""
        self._stack.append((line, count))

    def next(self):
        """Return the next item found in the file.

        The result is one of:

          ('D', [(line, tag, value), ...])   a complete record
          ('M', (tag, value))                a meta data

        or None when there are no more records.

        Raises SyntaxError when the events produced by line_handler() do
        not form a properly nested sequence.
        """
        record = []
        tag = start = data = None

        while True:
            event = self._ev_pop()
            kind, args = event[0], event[1:]

            if kind == self.EV_FIELD_DATA:
                if self.state != self.ST_IN_FIELD:
                    raise SyntaxError(
                        _('line %d: unexpected field content') % self._ln)

                # Keep the type of the incoming chunks (bytes or text)
                # instead of forcing a concatenation with a str literal.
                data = args[0] if data is None else data + args[0]

            elif kind == self.EV_FIELD_START:
                if self.state == self.ST_IN_FIELD:
                    raise SyntaxError(_('line %d: nested field') % self._ln)

                if self.state == self.ST_OUTSIDE:
                    raise SyntaxError(
                        _('line %d: field is not in a record') % self._ln)

                self.state = self.ST_IN_FIELD
                tag, start = args
                data = None

            elif kind == self.EV_FIELD_END:
                # Without this check a stray field end would use a tag and
                # content that were never set by a field start.
                if self.state != self.ST_IN_FIELD:
                    raise SyntaxError(
                        _('line %d: unexpected end of field') % self._ln)

                content = '' if data is None else data
                record.append((start,) + self.field_handler(tag, content))
                self.state = self.ST_IN_RECORD

            elif kind == self.EV_RECORD_START:
                if self.state == self.ST_IN_RECORD:
                    raise SyntaxError(_('line %d: nested record') % self._ln)

                self.state = self.ST_IN_RECORD
                record = []

            elif kind == self.EV_RECORD_END:
                if self.state != self.ST_IN_RECORD:
                    raise SyntaxError(
                        _('line %d: unexpected end of record') % self._ln)

                self.state = self.ST_OUTSIDE
                return ('D', record)

            elif kind == self.EV_FILE_END:
                # file_stopping() may still queue events; EV_DONE is
                # queued after the call so that it comes last.
                self.file_stopping()
                self.push(self.EV_DONE)

            elif kind == self.EV_DONE:
                if self.state != self.ST_OUTSIDE:
                    raise SyntaxError(
                        _('line %d: unexpected end of file') % self._ln)

                self.file_stop()
                return None

            elif kind == self.EV_METADATA:
                if self.state != self.ST_OUTSIDE:
                    raise SyntaxError(
                        _('line %d: metadata in the middle of a record')
                        % self._ln)

                return ('M', args)

    def _ev_pop(self):
        """Parse enough lines to get the next event."""
        while not self._evstack:
            line, count = self._pop()

            self.line_handler(line, count)

            # An empty line marks the end of file; testing for emptiness
            # works for both text ('') and binary (b'') streams.
            if not line:
                self.push(self.EV_FILE_END)

        return self._evstack.pop(0)

    def _pop(self):
        """Return a line from the file with its line number.

        Lines put back with unread() are returned first, most recent first.
        """
        if self._stack:
            return self._stack.pop()

        self._ln += 1
        return self._fd.readline(), self._ln
259 260 261
class Reader(Callback.Publisher):
    """Base class feeding a database from a file of 'tagged' records.

    parse() instantiates the class stored in the Parser attribute, reads
    its items one by one and dispatches them: each field of a record is
    passed to the method named do_<tag> (with '-' replaced by '_' in the
    tag), or to do_default when no such method exists.

    The following signals are emitted: 'file-start', 'file-stop',
    'warning' (with a message) and 'record-added' (with the value
    returned by db.add()).
    """

    # Parser class instantiated by parse(); None here, so a usable
    # reader has to provide one.
    Parser = None

    def parse(self, fd, db, charset='UTF-8'):
        """Parse the content of fd and add the resulting records to db.

        fd      -- file object handed over to the parser
        db      -- object whose add() method receives each record
        charset -- encoding handed over to the parser
        """
        self.parser = self.Parser(fd, charset)
        self.db = db

        self.emit('file-start')

        while True:
            item = self.parser.next()
            if item is None:
                break

            kind, payload = item

            if kind == 'D':
                self.record_parse(payload)
            elif kind == 'M':
                self.metadata_parse(payload)

        self.emit('file-stop')

    def metadata_parse(self, meta):
        """Override me to handle a meta data item (the second element
        of an 'M' item returned by the parser)."""
        pass

    def record_begin(self):
        """Override me to be called once self.record has been created,
        before any field is handled."""
        pass

    def record_end(self):
        """Override me to be called after the last field of a record has
        been handled. Setting self.record to None discards the record."""
        pass

    def record_parse(self, record):
        """Turn a parsed record into a Store.Record and add it to the db.

        record -- sequence of (line, tag, data) tuples

        A 'warning' signal is emitted for every tag that has neither a
        dedicated do_<tag> method nor a do_default fallback.
        """
        self.record = Store.Record()

        self.record_begin()

        for line, tag, data in record:
            handler = getattr(self, 'do_%s' % tag.replace('-', '_'), None)

            if handler is None:
                handler = getattr(self, 'do_default', None)

            if handler is None:
                # Translate the template first and interpolate afterwards,
                # otherwise the message is never found in the catalog.
                self.emit('warning',
                          _('line %d: unhandled tag %s') % (line, repr(tag)))
                continue

            handler(line, tag, data)

        self.record_end()

        # The record might have been discarded by self.record_end (),
        # so insert conditionally.
        if self.record is not None:
            k = self.db.add(self.record)
            self.emit('record-added', k)

        self.record = None

    def text_add(self, field, value):
        """Add value to field of the current record as Attribute.Text."""
        self.record.add(field, value, Attribute.Text)

    def id_add(self, field, value):
        """Add value to field of the current record as Attribute.ID."""
        self.record.add(field, value, Attribute.ID)

    def url_add(self, field, value):
        """Add value to field of the current record as Attribute.URL."""
        self.record.add(field, value, Attribute.URL)
346