1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """ Extension module for BibTeX files """
24
25
26 import re, os, string, logging
27
28 from Pyblio.Parsers.Syntax.BibTeX import Parser, Coding
29 from Pyblio.Parsers.Syntax.BibTeX import Environ as BaseEnviron
30
31 from Pyblio import Attribute, Store, Exceptions, Tools
32
33 from gettext import gettext as _
34
35
36
37
38
41 self.strings = {
42 'jan': Parser.Text('January'),
43 'feb': Parser.Text('February'),
44 'mar': Parser.Text('March'),
45 'apr': Parser.Text('April'),
46 'may': Parser.Text('May'),
47 'jun': Parser.Text('June'),
48 'jul': Parser.Text('July'),
49 'aug': Parser.Text('August'),
50 'sep': Parser.Text('September'),
51 'oct': Parser.Text('October'),
52 'nov': Parser.Text('November'),
53 'dec': Parser.Text('December'),
54 }
55
56
57
58
59
# Patterns over the "type tag" strings produced by _typetag() — one
# letter per word of a name: I = initial, L = lowercase particle,
# N = capitalized name.

# "Last First" shape: one or more names followed only by initials
# (e.g. "Smith J. R." tags as "NII").
_lf_re = re.compile('^N+I+$')

# "First Last" shape: initials (possibly mixed with names) followed
# by one or more names (e.g. "J. R. Smith" tags as "IIN").
_fl_re = re.compile('^[IN]*?I+N+$')

# Tokenizer for author fields: commas, dots, whitespace runs, and
# the BibTeX non-breaking tilde all separate words.
_split_re = re.compile(r'[,.]|\s+|\~')

# Matches a dot followed by whitespace and a dash, as in "J. -P.".
_dotdash_re = re.compile(r'\.\s+-')
66
69
71
72 log = logging.getLogger('pyblio.import.bibtex')
73
74 - def __init__(self, charset='ISO8859-1'):
85
def id_add(self, field, stream):
    """Store a BibTeX field on the current record as an URL attribute.

    Non-breaking spaces produced by the parser are mapped back to the
    literal '~' they stand for in BibTeX source.
    """
    flat = stream.flat()
    self.record.add(field, flat.replace(u'\xa0', u'~'), Attribute.URL)
94
97
def to_text(self, stream):
    """Evaluate a parsed BibTeX stream and wrap it as an Attribute.Text."""
    evaluated = stream.execute(self.env)
    return Attribute.Text(evaluated.flat())
100
def text_add(self, field, stream):
    """Store a BibTeX field as a single-valued text attribute."""
    value = self.to_text(stream)
    self.record[field] = [value]
104
106 ''' Parse a stream of tokens as a series of person names '''
107
108
109
110
111
112
113 stream = stream.join ()
114
115
116 stream, os = [], stream
117 for v in os:
118 if not isinstance (v, Parser.Text):
119 stream.append (v)
120 continue
121
122 i = 0
123 for m in _split_re.finditer (v):
124 s, e = m.start (0), m.end (0)
125 if i != s: stream.append (Parser.Text (v [i:s]))
126
127 sep = Parser.Text (v [s:e])
128 if sep [0] in ' \n\t~': sep = Parser.Text (' ')
129 stream.append (sep)
130
131 i = e
132
133 if i < len (v): stream.append (Parser.Text (v [i:]))
134
135
136 avail = []
137
138 while 1:
139 try:
140 i = stream.index ('and')
141 except ValueError:
142 break
143
144 avail.append (stream [0:i])
145 stream = stream [i+1:]
146
147 if stream:
148 avail.append (stream)
149
def _wordify(stream):
    """Flatten an executed token stream into a list of words.

    Consecutive tokens are glued onto the current word until a
    separator appears: whitespace ends the word, '.' is appended to
    the current word and ends it, and ',' becomes a word of its own.
    """
    tokens = stream.execute(self.env).subst()

    words = []
    between_words = True

    for tok in tokens:
        if tok == '.':
            # A dot is always attached to the word it abbreviates,
            # and terminates it.
            words[-1] += '.'
            between_words = True
        elif tok in (' ', '\n'):
            between_words = True
        elif between_words:
            # First token of a new word.
            words.append(tok)
            between_words = False
        elif tok == ',':
            # Commas are kept as standalone words for the caller.
            words.append(tok)
            between_words = True
        else:
            # Continuation of the current word.
            words[-1] += tok

    return words
190
def _typetag(stream):
    """Classify each word of a name, returning one letter per word:

    - 'I' for an initial (the word contains a dot),
    - 'L' for a lowercase word or a name particle (van/von/de,
      in any capitalization),
    - 'N' for any other (capitalized) name word.
    """
    def _tag(word):
        if '.' in word:
            return 'I'
        lowered = word.lower()
        if lowered == word:
            return 'L'
        if lowered in ('van', 'von', 'de'):
            # Capitalized particles ("Van", "De", ...) still count
            # as particles, not as names.
            return 'L'
        return 'N'

    return [_tag(word) for word in stream]
215
216 def _person_decode (stream):
# Decode one author-name token stream into an Attribute.Person, or
# None for an empty name.  The heuristics follow BibTeX conventions:
# a single braced block is taken verbatim as the last name; otherwise
# the words are tagged by _typetag() (I=initial, L=particle, N=name)
# and matched, in a deliberate order, against known name shapes.
217
# A lone {...} group protects the whole name: use it unsplit as the
# last name.
218 if len(stream) == 1 and isinstance(stream[0], Parser.Block):
219 return Attribute.Person(last=stream [0].flat())
220
221 stream = _wordify (Parser.Block ('', stream))
222
223
# Commas decide the overall format: none -> shape heuristics below,
# exactly one -> unambiguous "Last, First", more -> error.
224 comma = stream.count (',')
225
226 if comma == 0:
227
228 ls = len (stream)
229 if ls == 1:
# Single word: it is the last name.
230 return Attribute.Person (last = stream [0])
231 elif ls == 0:
# Empty name (e.g. stray "and"): the caller filters these out.
232 return None
233 else:
234 tt = ''.join(_typetag(stream))
235
# "Last First": names followed by initials, e.g. "Smith J. R.".
236 if _lf_re.match(tt):
237 idx = tt.index('I')
238 return Attribute.Person(first=_nodotdash(' '.join(stream[idx:])),
239 last=' '.join(stream[:idx]))
# Two plain names: read as "First Last".
240 if tt == 'NN':
241 return Attribute.Person(first=_nodotdash(stream[0]),
242 last=stream[1])
# "First Last" with initials: everything up to and including the
# last initial is the first name, e.g. "J. R. Smith".
243 if _fl_re.match (tt):
244 idx = tt.rindex ('I') + 1
245 return Attribute.Person (first=_nodotdash(' '.join (stream [:idx])),
246 last = ' '.join (stream [idx:]))
247
# A lowercase particle (von/van/de) starts the last name; all the
# words before it form the first name.
248 try:
249 von = tt.index ('L')
250
251 return Attribute.Person (first =_nodotdash(' '.join (stream [0:von])),
252 last = ' '.join (stream [von:]))
253
254 except ValueError:
255 pass
256
257
# Three plain names: take the final word as the last name.
258 if tt == 'NNN':
259 return Attribute.Person (first = ' '.join (stream [:-1]),
260 last = stream [-1])
261
262 elif tt == 'II':
263
264 first, last = stream
265
# NOTE(review): a trailing dot is stripped from the second word when
# it is longer than two characters — presumably an abbreviated last
# name rather than a true initial; confirm against test corpus.
266 if last[-1] == '.' and len(last) > 2:
267 last = last[:-1]
268 return Attribute.Person (first=_nodotdash(first), last=last)
269
270
# No shape matched: refuse rather than guess.
271 raise Exceptions.ParserError ("%s: unable to parse name properly: %s (typed as %s)" % (
272 unicode(self.key), repr(stream), repr(tt)))
273
# Exactly one comma: unambiguous "Last, First" form.
274 elif comma == 1:
275 i = stream.index (',')
276
277 return Attribute.Person \
278 (last = ' '.join (stream [:i]),
279 first = _nodotdash(' '.join (stream [i+1:])))
280
281
# More than one comma cannot be interpreted.
282 raise Exceptions.ParserError ("%s: unable to parse name %s properly: %d commas" % (
283 unicode(self.key), repr(stream), comma))
284
285 self.record [field] = filter(None, map(_person_decode, avail))
286 return
287
288
292
296
300
304
307
310
315
317
318 try:
319 m = getattr(self, 'do_' + k.lower())
320 return m(v)
321 except AttributeError:
322 pass
323
324
325 try:
326 attp = self.db.schema [k]
327 except KeyError:
328 return self.do_default(k, v)
329 return self._mapping[attp.type](k, v)
330
332
333 tp = record.type.lower ()
334
335 if tp == 'string':
336 return self.string_add (record)
337
338 elif tp == 'preamble':
339 return self.preamble_add (record)
340
341 self.tp, self.key, val = record.type, record.key, record.fields
342
343 self.record = Store.Record ()
344
345 self.record_begin ()
346
347 for k, v in val:
348 self.record_dispatch (k.lower (), v)
349
350
351
352 self.type_add (self.tp)
353
354 self.record_end ()
355 return
356
357
def parse(self, fd, db):
    """Parse a BibTeX file from *fd* and load its records into *db*.

    Returns a new result set holding the key of every record that
    was imported.
    """
    self.db = db

    # Map each document type's canonical ('C') name, lowercased, to
    # its txo item, for use while dispatching record fields.
    self.doctype = {}
    for item in db.schema.txo['doctype'].values():
        self.doctype[item.names['C'].lower()] = item

    result = db.rs.new()
    result.name = _('Imported from BibTeX')

    for data in Parser.read(fd, self.charset):
        # Comments are handled out of band and yield no record.
        if isinstance(data, Parser.Comment):
            self.comment_add(data)
        else:
            self.record = None
            self.record_parse(data)
            # record_parse leaves self.record unset for @string and
            # @preamble entries, which add nothing to the result set.
            if self.record:
                result.add(self.db.add(self.record))

    return result
385
386
387
388
389
391
392
# Logger for the BibTeX export path.
log = logging.getLogger('pyblio.export.bibtex')

# Collapses any run of whitespace (including newlines) into one unit.
_collapse = re.compile(r'[\s\n]+', re.MULTILINE)
396
408
410 if not text:
411 return ''
412 return Coding.encode(text)
413
415
416 r = []
417 for d in data:
418 v = self.db.schema.txo[d.group][d.id]
419
420
421
422 try: n = v.names.get ('C', None)
423 except KeyError: n = v.name
424
425 if n: r.append (n)
426
427 data = self._escape ('; '.join (r))
428
429 self.field [field] = '{%s}' % data
430 return
431
def text_add(self, field, data):
    """Emit a text attribute as one braced BibTeX value.

    The individual values are joined with spaces, escaped, and any
    whitespace runs are collapsed to a single space.
    """
    joined = self._escape(' '.join(data))
    collapsed = self._collapse.sub(' ', joined)
    self.field[field] = '{%s}' % collapsed
441
442 - def capitalized_text_add (self, field, data):
# Emit a text field while protecting its capitalization: letters
# whose case must survive BibTeX's style-driven downcasing are
# wrapped in {...} groups.  The serialized result is stored in
# self.field [field].
443
444
445 data = self._collapse.sub (' ', ' '.join (data))
446
447
448
449
450
# Output is assembled as a parser Block; 'res' collects alternating
# plain Text runs and protected {...} sub-blocks.
451 res = Parser.Block ('{', [])
452
# State machine flags:
#   beginning - at a sentence start (a capital here is ordinary),
#   in_upper  - currently accumulating a protected uppercase run,
#   block     - characters of the run being built,
#   braced    - inside a "..." quoted span, which is left untouched.
453 beginning = True
454 in_upper = False
455 block = []
456 braced = False
457
# Flush the pending characters as one protected {...} group.
458 def _close_upper ():
459 res.append (Parser.Block ('{', (Parser.Text (''.join (block)),)))
460 del block[:]
461
# Consume the string one character at a time.
462 while data:
463 c, data = data [0], data [1:]
464
# Sentence-ending punctuation: the next capital is ordinary again.
465 if c in '.!?':
466 if in_upper:
467 _close_upper ()
468 in_upper = False
469
470 beginning = True
471 block.append (c)
472 continue
473
# Non-letters pass straight through; '"' toggles the quoted region.
474 if not c.isalpha ():
475 if in_upper:
476 _close_upper ()
477 in_upper = False
478
479 block.append (c)
480
481 if c == '"': braced = not braced
482 continue
483
484 if not braced:
# A lowercase letter opening a sentence is significant: protect it
# so a style cannot capitalize it.
485 if beginning and c.lower () == c:
486 res.append (Parser.Text (''.join (block)))
487 res.append (Parser.Block ('{', (Parser.Text (c),)))
488
489 block = []
490 beginning = False
491 continue
492
# A capital inside a sentence — or a sentence-initial letter that is
# itself followed by a capital — starts (or extends) a protected run.
493 if (not beginning and c.lower () != c) \
494 or (beginning and data and data [0].lower () != data [0]):
495 if in_upper:
496 block.append (c)
497 else:
498 in_upper = True
499 res.append (Parser.Text (''.join (block)))
500
501 block = [c]
502 beginning = False
503 continue
504
# Ordinary letter: close any protected run and keep accumulating.
505 if in_upper:
506 _close_upper ()
507 in_upper = False
508
509 block.append (c)
510 beginning = False
511
512
# Flush whatever is still pending, then serialize to BibTeX syntax.
513 if in_upper: _close_upper ()
514 if block: res.append (Parser.Text (''.join (block)))
515
516 self.field [field] = res.tobib ()
517 return
518
def id_add(self, field, data):
    """Emit an identifier attribute, joining multiple values with '; '."""
    escaped = self._escape('; '.join(data))
    self.field[field] = '{%s}' % escaped
525
527
528 if person.first:
529 return self._escape('%s, %s' % (person.last, person.first))
530 else:
531 return '{' + self._escape(person.last) + '}'
532
534
535 v = ' and '.join (map (self._single_person, data))
536
537 self.field [field] = '{%s}' % v
538 return
539
541
542 v = ', '.join (data)
543
544 self.field [field] = '{%s}' % v
545 return
546
548
549 v = str (data [0].year)
550
551 self.field [field] = v
552 return
553
555 if 'id' in self.record:
556 self.key = str(self.record['id'][0])
557
558 tp = self.record ['doctype'] [0]
559 self.type = self.db.schema.txo[tp.group][tp.id].names ['C']
560
563
565
566 if key in ('id', 'doctype'): return
567
568 key = Coding.encode(key)
569
570 self._mapping[self.db.schema[key].type](key, self.record [key])
571 return
572
def write(self, fd, rs, db):
    """Write every record of result set *rs* to file object *fd*."""
    self.db = db
    self.rs = rs

    # Canonical doctype names, lowercased, for the record callbacks.
    self.doctype = {}
    for item in db.schema.txo['doctype'].values():
        self.doctype[item.names['C'].lower()] = item

    for record in rs.itervalues():
        self.record = record

        # Per-record state filled in by the *_add callbacks.
        self.field = {}
        self.type = None
        self.key = None
        self.to_delete = False

        self.record_begin()
        for attr, values in record.items():
            self.record_parse(attr, values)
        self.record_end()

        # A callback may veto the record entirely.
        if self.to_delete:
            continue

        if self.key is None:
            key = ''
        else:
            key = self.key + ','
        header = '@%s{%s\n' % (self.type, key)

        names = sorted(self.field.keys())

        # Align the '=' signs on the longest field name.
        maxlen = 0
        for name in names:
            if len(name) > maxlen:
                maxlen = len(name)

        attrs = []
        for name in names:
            left = ' %s%s = ' % (name, ' ' * (maxlen - len(name)))
            attrs.append(left + Tools.format(self.field[name], 75, 0, len(left)))

        fd.write(header + ',\n'.join(attrs) + '\n}\n')
632