1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """html2text: Turn HTML into equivalent Markdown-structured text.
19
There is some specific mtconverter code at the end to define the
html to text transformation.
22
23 Copyright (C) 2004-2008 Aaron Swartz. GNU GPL 3.
24 Copyright (C) 2008 Logilab S.A.
25 """
26
27 __version__ = "2.38"
28 __author__ = "Aaron Swartz (me@aaronsw.com)"
29 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
30 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
31
32
33
34
35 if not hasattr(__builtins__, 'True'): True, False = 1, 0
36 import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
37 import sgmllib
38 import urlparse
39 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
40
41 try: from textwrap import wrap
42 except: pass
43
44
# NOTE(review): no use of this flag is visible in this part of the file;
# confirm its meaning where character/entity references are converted.
UNICODE_SNOB = 0

# When true, o() writes out the pending link references at each paragraph
# break instead of only at the end of the document.
LINKS_EACH_PARAGRAPH = 0

# Width passed to textwrap.wrap() by optwrap(); a false value disables
# wrapping altogether.
BODY_WIDTH = 78

# When true, <a> tags whose href starts with '#' are not rendered as links
# (see handle_tag).
SKIP_INTERNAL_LINKS = False
56
57
58
60 if k == 'apos': return ord("'")
61 if hasattr(htmlentitydefs, "name2codepoint"):
62 return htmlentitydefs.name2codepoint[k]
63 else:
64 k = htmlentitydefs.entitydefs[k]
65 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1])
66 return ord(codecs.latin_1_decode(k)[0])
67
# Plain-ASCII replacement text for a number of named HTML entities,
# keyed by entity name.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same replacements, keyed by code point instead of entity name.
unifiable_n = {}

# Built once at import time; name2cp() turns each entity name into its
# code point.
for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]
81
92
100
102 s = s.group(1)
103 if s[0] == "#":
104 return charref(s[1:])
105 else: return entityref(s)
106
# Matches one "&...;" reference.  Group 1 is the text between "&" and ";":
# an optional "#", an optional "x"/"X", then either hex digits or a word of
# 1 to 8 characters.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
110
112
def fixattrs(attrs):
    """Return *attrs* with every attribute value passed through unescape().

    attrs -- sequence of (name, value) pairs, or a false value

    A false value (None, an empty list) is returned unchanged; otherwise
    a new list of (name, unescaped value) tuples is returned.
    """
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
118
119
120
def onlywhite(line):
    """Return true if the line consists only of space characters.

    An empty line gives a false result, as it always did.  Characters are
    compared by value; comparing them with ``is`` only works when the
    interpreter happens to share single-character string objects.
    """
    return line != '' and line.strip(' ') == ''
127
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    text -- the text to wrap, one paragraph per line

    Returns *text* unchanged when BODY_WIDTH is false.  Otherwise each
    non-empty line is treated as a paragraph:

    * a line starting with ' ', '-' or '*' is copied as is (and dropped
      when onlywhite() says it holds nothing but spaces);
    * any other line is re-flowed to BODY_WIDTH columns and followed by
      one empty line.

    Runs of empty lines are limited so that no more than two newlines
    follow each other.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []      # joined once at the end instead of repeated +=
    newlines = 0     # newlines already emitted in a row
    for para in text.split("\n"):
        if not para:
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
        elif para[0] not in (' ', '-', '*'):
            for line in wrap(para, BODY_WIDTH):
                pieces.append(line + "\n")
            pieces.append("\n")
            newlines = 2
        elif not onlywhite(para):
            pieces.append(para + "\n")
            newlines = 1
    return ''.join(pieces)
152
def hn(tag):
    """Return the heading level for the tags 'h1' .. 'h9', otherwise 0.

    tag -- tag name

    Every non-heading input (including 'h0', 'hr' and the empty string)
    gives 0, so the result can be used both as a truth value and as a
    repeat count.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        level = int(tag[1])
    except ValueError:
        return 0
    if 1 <= level <= 9:
        return level
    return 0
159
160 -class _html2text(sgmllib.SGMLParser):
161
162 - def __init__(self, out=None, baseurl='', encoding='utf8'):
163 sgmllib.SGMLParser.__init__(self)
164
165 if out is None: self.out = self.outtextf
166 else: self.out = out
167 self.outtext = []
168 self.quiet = 0
169 self.p_p = 0
170 self.outcount = 0
171 self.start = 1
172 self.space = 0
173 self.a = []
174 self.astack = []
175 self.acount = 0
176 self.list = []
177 self.blockquote = 0
178 self.pre = 0
179 self.startpre = 0
180 self.lastWasNL = 0
181 self.abbr_title = None
182 self.abbr_data = None
183 self.abbr_list = {}
184 self.baseurl = baseurl
185 self._encoding = encoding
186
187 - def outtextf(self, s):
188 if isinstance(s, str):
189 s = unicode(s, self._encoding)
190 self.outtext.append( s )
191
193 sgmllib.SGMLParser.close(self)
194
195 self.pbr()
196 self.o('', 0, 'end')
197
198 return ''.join(self.outtext)
199
200 - def handle_charref(self, c):
202
203 - def handle_entityref(self, c):
205
206 - def unknown_starttag(self, tag, attrs):
207 self.handle_tag(tag, attrs, 1)
208
209 - def unknown_endtag(self, tag):
210 self.handle_tag(tag, None, 0)
211
212 - def previousIndex(self, attrs):
213 """ returns the index of certain set of attributes (of a link) in the
214 self.a list
215
216 If the set of attributes is not found, returns None
217 """
218 if not attrs.has_key('href'): return None
219
220 i = -1
221 for a in self.a:
222 i += 1
223 match = 0
224
225 if a.has_key('href') and a['href'] == attrs['href']:
226 if a.has_key('title') or attrs.has_key('title'):
227 if (a.has_key('title') and attrs.has_key('title') and
228 a['title'] == attrs['title']):
229 match = True
230 else:
231 match = True
232
233 if match: return i
234
235 - def handle_tag(self, tag, attrs, start):
236 attrs = fixattrs(attrs)
237
238 if hn(tag):
239 self.p()
240 if start: self.o(hn(tag)*"#" + ' ')
241
242 if tag in ['p', 'div']: self.p()
243
244 if tag == "br" and start: self.o(" \n")
245
246 if tag == "hr" and start:
247 self.p()
248 self.o("* * *")
249 self.p()
250
251 if tag in ["head", "style", 'script']:
252 if start: self.quiet += 1
253 else: self.quiet -= 1
254
255 if tag in ["body"]:
256 self.quiet = 0
257
258 if tag == "blockquote":
259 if start:
260 self.p(); self.o('> ', 0, 1); self.start = 1
261 self.blockquote += 1
262 else:
263 self.blockquote -= 1
264 self.p()
265
266 if tag in ['em', 'i', 'u']: self.o("_")
267 if tag in ['strong', 'b']: self.o("**")
268 if tag == "code" and not self.pre: self.o('`')
269 if tag == "abbr":
270 if start:
271 attrsD = {}
272 for (x, y) in attrs: attrsD[x] = y
273 attrs = attrsD
274
275 self.abbr_title = None
276 self.abbr_data = ''
277 if attrs.has_key('title'):
278 self.abbr_title = attrs['title']
279 else:
280 if self.abbr_title != None:
281 self.abbr_list[self.abbr_data] = self.abbr_title
282 self.abbr_title = None
283 self.abbr_data = ''
284
285 if tag == "a":
286 if start:
287 attrsD = {}
288 for (x, y) in attrs: attrsD[x] = y
289 attrs = attrsD
290 if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
291 self.astack.append(attrs)
292 self.o("[")
293 else:
294 self.astack.append(None)
295 else:
296 if self.astack:
297 a = self.astack.pop()
298 if a:
299 i = self.previousIndex(a)
300 if i is not None:
301 a = self.a[i]
302 else:
303 self.acount += 1
304 a['count'] = self.acount
305 a['outcount'] = self.outcount
306 self.a.append(a)
307 self.o("][" + `a['count']` + "]")
308
309 if tag == "img" and start:
310 attrsD = {}
311 for (x, y) in attrs: attrsD[x] = y
312 attrs = attrsD
313 if attrs.has_key('src'):
314 attrs['href'] = attrs['src']
315 alt = attrs.get('alt', '')
316 i = self.previousIndex(attrs)
317 if i is not None:
318 attrs = self.a[i]
319 else:
320 self.acount += 1
321 attrs['count'] = self.acount
322 attrs['outcount'] = self.outcount
323 self.a.append(attrs)
324 self.o("![")
325 self.o(alt)
326 self.o("]["+`attrs['count']`+"]")
327
328 if tag == 'dl' and start: self.p()
329 if tag == 'dt' and not start: self.pbr()
330 if tag == 'dd' and start: self.o(' ')
331 if tag == 'dd' and not start: self.pbr()
332
333 if tag in ["ol", "ul"]:
334 if start:
335 self.list.append({'name':tag, 'num':0})
336 else:
337 if self.list: self.list.pop()
338
339 self.p()
340
341 if tag == 'li':
342 if start:
343 self.pbr()
344 if self.list: li = self.list[-1]
345 else: li = {'name':'ul', 'num':0}
346 self.o(" "*len(self.list))
347 if li['name'] == "ul": self.o("* ")
348 elif li['name'] == "ol":
349 li['num'] += 1
350 self.o(`li['num']`+". ")
351 self.start = 1
352 else:
353 self.pbr()
354
355 if tag in ["table", "tr"] and start: self.p()
356 if tag == 'td': self.pbr()
357
358 if tag == "pre":
359 if start:
360 self.startpre = 1
361 self.pre = 1
362 else:
363 self.pre = 0
364 self.p()
365
367 if self.p_p == 0: self.p_p = 1
368
369 - def p(self): self.p_p = 2
370
371 - def o(self, data, puredata=0, force=0):
372 if self.abbr_data is not None: self.abbr_data += data
373
374 if not self.quiet:
375 if puredata and not self.pre:
376 data = re.sub('\s+', ' ', data)
377 if data and data[0] == ' ':
378 self.space = 1
379 data = data[1:]
380 if not data and not force: return
381
382 if self.startpre:
383
384 self.startpre = 0
385
386 bq = (">" * self.blockquote)
387 if not (force and data and data[0] == ">") and self.blockquote: bq += " "
388
389 if self.pre:
390 bq += " "
391 data = data.replace("\n", "\n"+bq)
392
393 if self.start:
394 self.space = 0
395 self.p_p = 0
396 self.start = 0
397
398 if force == 'end':
399
400 self.p_p = 0
401 self.out("\n")
402 self.space = 0
403
404
405 if self.p_p:
406 self.out(('\n'+bq)*self.p_p)
407 self.space = 0
408
409 if self.space:
410 if not self.lastWasNL: self.out(' ')
411 self.space = 0
412
413 if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
414 if force == "end": self.out("\n")
415
416 newa = []
417 for link in self.a:
418 if self.outcount > link['outcount']:
419 self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
420 if link.has_key('title'): self.out(" ("+link['title']+")")
421 self.out("\n")
422 else:
423 newa.append(link)
424
425 if self.a != newa: self.out("\n")
426
427 self.a = newa
428
429 if self.abbr_list and force == "end":
430 for abbr, definition in self.abbr_list.items():
431 self.out(" *[" + abbr + "]: " + definition + "\n")
432
433 self.p_p = 0
434 self.out(data)
435 self.lastWasNL = data and data[-1] == '\n'
436 self.outcount += 1
437
438 - def handle_data(self, data):
439 if r'\/script>' in data: self.quiet -= 1
440 self.o(data, 1)
441
442 - def unknown_decl(self, data): pass
443
445
def html2text_file(html, out=wrapwrite, baseurl='', encoding='utf8'):
    """Convert *html* and send the resulting text to *out*.

    html -- the HTML source
    out -- callable that receives each piece of output; None makes the
           converter collect the text itself
    baseurl -- base URL that link targets are joined with
    encoding -- encoding handed to the converter

    Returns the converter's close() result: the collected text when
    *out* is None, an empty string otherwise.
    """
    converter = _html2text(out, baseurl, encoding=encoding)
    # The empty second chunk is fed exactly as before.
    for chunk in (html, ""):
        converter.feed(chunk)
    return converter.close()
451
def html2text(html, baseurl='', encoding='utf8'):
    """Return *html* converted to wrapped, Markdown-structured text.

    html -- the HTML source
    baseurl -- base URL that link targets are joined with
    encoding -- encoding handed to the converter

    Every '/>' in the input is replaced by '>' before parsing, and the
    converted text is passed through optwrap().
    """
    without_self_closing = html.replace('/>', '>')
    converted = html2text_file(without_self_closing, None,
                               baseurl, encoding=encoding)
    return optwrap(converted)
455
456
457
458
459 from logilab.mtconverter.transform import Transform
460
462 """transforms html to formatted plain text"""
463
464 name = "html_to_text"
465 inputs = ("text/html",)
466 output = "text/plain"
467
468
470 return html2text(trdata.data, encoding=trdata.encoding)
471