Package python-module-logilab-mtconverter-0 :: Package 8 :: Package 4 :: Package transforms :: Module html2text
[frames] | no frames]

Source Code for Module python-module-logilab-mtconverter-0.8.4.transforms.html2text

  1  # copyright 2006-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-mtconverter. 
  5  # 
  6  # logilab-mtconverter is free software: you can redistribute it and/or modify it 
  7  # under the terms of the GNU Lesser General Public License as published by the 
  8  # Free Software Foundation, either version 2.1 of the License, or (at your 
  9  # option) any later version. 
 10  # 
 11  # logilab-mtconverter is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
 13  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License 
 14  # for more details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>. 
 18  """html2text: Turn HTML into equivalent Markdown-structured text. 
 19   
  20  There is some specific mtconverter code at the end to define the 
 21  html to text transformation. 
 22   
 23  Copyright (C) 2004-2008 Aaron Swartz. GNU GPL 3. 
 24  Copyright (C) 2008 Logilab S.A. 
 25  """ 
 26   
 27  __version__ = "2.38" 
 28  __author__ = "Aaron Swartz (me@aaronsw.com)" 
 29  __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." 
 30  __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] 
 31   
 32  # TODO: 
 33  #   Support decoded entities with unifiable. 
 34   
 35  if not hasattr(__builtins__, 'True'): True, False = 1, 0 
 36  import re, sys, urllib, htmlentitydefs, codecs, StringIO, types 
 37  import sgmllib 
 38  import urlparse 
 39  sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') 
 40   
 41  try: from textwrap import wrap 
 42  except: pass 
 43   
  44  # Use Unicode characters instead of their ascii pseudo-replacements 
 45  UNICODE_SNOB = 0 
 46   
 47  # Put the links after each paragraph instead of at the end. 
 48  LINKS_EACH_PARAGRAPH = 0 
 49   
 50  # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) 
 51  BODY_WIDTH = 78 
 52   
 53  # Don't show internal links (href="#local-anchor") -- corresponding link targets 
 54  # won't be visible in the plain text file anyway. 
 55  SKIP_INTERNAL_LINKS = False 
 56   
 57  ### Entity Nonsense ### 
 58   
def name2cp(k):
    """Return the Unicode code point of the HTML entity named *k*.

    ``apos`` is answered directly, before any entity table is consulted.
    Any other name is looked up in ``htmlentitydefs``; a name missing from
    the table raises ``KeyError``.
    """
    if k == 'apos':
        return ord("'")

    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback for interpreters older than Python 2.3: entitydefs maps
        # a name either to a latin-1 character or to a "&#NNN;" reference.
        definition = htmlentitydefs.entitydefs[k]
        if definition.startswith("&#") and definition.endswith(";"):
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])

    return htmlentitydefs.name2codepoint[k]

# ASCII stand-ins, keyed by entity name, substituted for the real character
# by charref()/entityref() whenever UNICODE_SNOB is off.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-', 'oelig': 'oe', 'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
    'atilde': 'a', 'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o',
    'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}

# The same replacements keyed by code point, for numeric references.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]

def charref(name):
    """Turn the body of a numeric character reference into text.

    *name* is the reference without its ``&#`` prefix: a leading ``x`` or
    ``X`` marks a hexadecimal value, anything else is read as decimal.
    Unless UNICODE_SNOB is set, code points listed in ``unifiable_n`` yield
    their ASCII stand-in; every other code point goes through ``unichr``.

    Errors raised by ``int()`` or ``unichr()`` for unparsable or
    out-of-range values are not caught here.
    """
    is_hex = name[0] in ['x', 'X']
    if is_hex:
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)

    if UNICODE_SNOB or codepoint not in unifiable_n:
        return unichr(codepoint)
    return unifiable_n[codepoint]
92
def entityref(c):
    """Turn the entity name *c* (no ``&`` or ``;``) into text.

    Unless UNICODE_SNOB is set, names listed in ``unifiable`` yield their
    ASCII stand-in. Other known names yield the corresponding character.
    An unknown name comes back as ``"&" + c``.
    """
    if UNICODE_SNOB or c not in unifiable:
        try:
            codepoint = name2cp(c)
        except KeyError:
            # NOTE(review): the terminating ';' is not restored here.
            return "&" + c
        return unichr(codepoint)
    return unifiable[c]
100
def replaceEntities(s):
    """Substitution callback for ``r_unescape``: expand one matched reference.

    *s* is the match object; group 1 holds the reference without ``&`` and
    ``;``. Numeric references (leading ``#``) go through charref(), named
    ones through entityref().

    A reference that cannot be expanded is returned exactly as it appeared
    in the input instead of raising or being truncated:

    * numeric bodies that are not valid numbers or code points (the pattern
      also matches things like ``&#x;`` or ``&#zz;``), and
    * entity names entityref() does not know.
    """
    name = s.group(1)

    if name[0] == "#":
        try:
            return charref(name[1:])
        except (ValueError, OverflowError):
            # int() rejected the digits or unichr() rejected the value.
            return s.group(0)

    replacement = entityref(name)
    if replacement == "&" + name:
        # entityref() reports an unknown entity by echoing "&name" without
        # the ';'; keep the original text intact instead.
        return s.group(0)
    return replacement

# One character or entity reference: "&", an optional "#" and "x"/"X", then
# hex digits or a word of up to eight characters, closed by ";".
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")


def unescape(s):
    """Return *s* with every reference matched by ``r_unescape`` expanded.

    Each match is handed to replaceEntities() and replaced by its result.
    """
    expanded = r_unescape.sub(replaceEntities, s)
    return expanded
110
def fixattrs(attrs):
    """Return the ``(name, value)`` attribute pairs with values unescaped.

    An empty or missing attribute collection is handed back unchanged;
    otherwise a new list is built with unescape() applied to each value.
    """
    # Fix bug in sgmllib.py
    if attrs:
        return [(attr[0], unescape(attr[1])) for attr in attrs]
    return attrs
118 119 ### End Entity Nonsense ### 120
def onlywhite(line):
    """Return True if *line* is non-empty and made up of spaces only.

    Characters are compared by value. The previous identity test (``is``)
    depended on interpreter string caching and never matched when a unicode
    line was compared against a byte-string literal.

    An empty line yields False, matching the falsy result callers got
    before. The result is now always a bool.
    """
    return bool(line) and line.strip(' ') == ''
127
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* untouched when BODY_WIDTH is 0. Otherwise every line of
    *text* is handled as one paragraph:

    * lines starting with a space, ``-`` or ``*`` are kept as they are
      (dropped entirely when they hold nothing but spaces);
    * other non-empty lines are wrapped at BODY_WIDTH columns and followed
      by a blank line;
    * runs of empty lines are limited so that at most two consecutive line
      breaks are emitted.

    The first character is compared by value; the former identity test
    (``is not``) never matched for unicode text, so list items and indented
    lines were wrapped like ordinary paragraphs.
    """
    if not BODY_WIDTH:
        return text

    # Imported here so a missing textwrap fails with a clear ImportError
    # rather than a NameError on the module-level name.
    from textwrap import wrap

    pieces = []
    newlines = 0
    for para in text.split("\n"):
        if not para:
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
        elif para[0] not in (' ', '-', '*'):
            for line in wrap(para, BODY_WIDTH):
                pieces.append(line + "\n")
            pieces.append("\n")
            newlines = 2
        elif not onlywhite(para):
            pieces.append(para + "\n")
            newlines = 1
    return ''.join(pieces)
152
def hn(tag):
    """Return the heading level of *tag*, or 0 if it is not a heading.

    ``'h1'`` .. ``'h9'`` give 1 .. 9. Every other tag name, including the
    empty string, ``'h0'`` and ``'hr'``, gives 0, so the result is always
    an int that can be used both as a truth value and as a repeat count.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        level = int(tag[1])
    except ValueError:
        return 0
    if 1 <= level <= 9:
        return level
    return 0
159
class _html2text(sgmllib.SGMLParser):
    """SGML parser that renders the HTML it is fed as Markdown-style text.

    Text is pushed piece by piece to ``self.out``. Links and images are
    written in reference style (``[text][n]``) and the numbered targets are
    emitted later by o().

    NOTE(review): this class was recovered from a listing that collapses
    runs of whitespace. The ``<br>`` and ``<pre>`` literals are restored to
    what Markdown requires; the other leading-whitespace literals (``<dd>``
    indent, list indent, reference-line prefixes) are kept as shown --
    confirm them against the packaged module.
    """

    def __init__(self, out=None, baseurl='', encoding='utf8'):
        """Initialise the converter state.

        out -- callable receiving every piece of output; when None the
               pieces are collected and returned by close()
        baseurl -- base joined with each link target when the reference
                   lines are written
        encoding -- encoding used by outtextf() to decode byte strings
        """
        sgmllib.SGMLParser.__init__(self)

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = []
        self.quiet = 0  # non-zero mutes o()
        self.p_p = 0  # pending line breaks before the next output
        self.outcount = 0  # number of pieces written by o()
        self.start = 1
        self.space = 0
        self.a = []  # link attribute dicts waiting for a reference line
        self.astack = []  # one entry per open <a> (None: not rendered)
        self.acount = 0  # last reference number handed out
        self.list = []  # one {'name', 'num'} dict per open <ol>/<ul>
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.lastWasNL = 0
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl
        self._encoding = encoding

    def outtextf(self, s):
        """Default output sink: collect *s*, decoding byte strings first."""
        if isinstance(s, str):
            s = unicode(s, self._encoding)
        self.outtext.append(s)

    def close(self):
        """Finish parsing, flush pending output and return collected text.

        The returned string only holds what went through outtextf().
        """
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return ''.join(self.outtext)

    def handle_charref(self, c):
        """Output the text for the numeric character reference *c*.

        The relaxed ``sgmllib.charref`` pattern installed by this module
        also accepts bodies that are not valid numbers (hex digits without
        the ``x`` prefix, for instance). Those, and out-of-range values,
        are written back as ``"&#" + c`` instead of aborting the parse.
        """
        try:
            text = charref(c)
        except (ValueError, OverflowError):
            text = "&#" + c
        self.o(text)

    def handle_entityref(self, c):
        """Output the text for the named entity reference *c*."""
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        """Route an opening tag to handle_tag()."""
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        """Route a closing tag to handle_tag()."""
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
        self.a list

        If the set of attributes is not found, returns None
        """
        if 'href' not in attrs:
            return None

        for i, a in enumerate(self.a):
            if 'href' not in a or a['href'] != attrs['href']:
                continue
            if 'title' in a or 'title' in attrs:
                # When either side has a title, both must carry the same.
                if ('title' in a and 'title' in attrs and
                        a['title'] == attrs['title']):
                    return i
            else:
                return i
        return None

    def _register_link(self, attrs):
        """Return the self.a entry for *attrs*, numbering it if it is new."""
        i = self.previousIndex(attrs)
        if i is not None:
            return self.a[i]
        self.acount += 1
        attrs['count'] = self.acount
        attrs['outcount'] = self.outcount
        self.a.append(attrs)
        return attrs

    def handle_tag(self, tag, attrs, start):
        """Emit the Markdown markup for one tag.

        tag -- tag name
        attrs -- list of (name, value) pairs, or None for a closing tag
        start -- true for an opening tag, false for a closing tag
        """
        attrs = fixattrs(attrs)

        if hn(tag):
            self.p()
            if start:
                self.o(hn(tag) * "#" + ' ')

        if tag in ['p', 'div']:
            self.p()

        if tag == "br" and start:
            # Markdown hard line break: two trailing spaces, then newline.
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            else:
                self.quiet -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']:
            self.o("_")
        if tag in ['strong', 'b']:
            self.o("**")
        if tag == "code" and not self.pre:
            self.o('`')  # TODO: `` `this` ``
        if tag == "abbr":
            if start:
                attrs = dict(attrs)

                self.abbr_title = None
                self.abbr_data = ''
                if 'title' in attrs:
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a":
            if start:
                attrs = dict(attrs)
                if 'href' in attrs and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    self.astack.append(None)
            elif self.astack:
                a = self.astack.pop()
                if a:
                    a = self._register_link(a)
                    self.o("][" + str(a['count']) + "]")

        if tag == "img" and start:
            attrs = dict(attrs)
            if 'src' in attrs:
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                attrs = self._register_link(attrs)
                self.o("![")
                self.o(alt)
                self.o("][" + str(attrs['count']) + "]")

        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        if tag == 'dd' and start:
            self.o(' ')
        if tag == 'dd' and not start:
            self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name': tag, 'num': 0})
            else:
                if self.list:
                    self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    li = {'name': 'ul', 'num': 0}
                self.o(" " * len(self.list))  # TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start:
            self.p()
        if tag == 'td':
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break unless a break is already pending."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two line breaks)."""
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write *data*, preceded by any pending breaks, quoting and spacing.

        data -- text to output
        puredata -- true for document text: outside <pre>, whitespace runs
                    are collapsed to single spaces
        force -- true to run even for empty data; the string 'end' also
                 flushes the outstanding link references and the collected
                 abbreviations
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if self.quiet:
            return

        if puredata and not self.pre:
            data = re.sub(r'\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force:
            return

        if self.startpre:
            #self.out(" :") #TODO: not output when already one there
            self.startpre = 0

        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            # Markdown code blocks are indented by four spaces.
            bq += "    "
            data = data.replace("\n", "\n" + bq)

        if self.start:
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            self.out(('\n' + bq) * self.p_p)
            self.space = 0

        if self.space:
            if not self.lastWasNL:
                self.out(' ')
            self.space = 0

        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end":
                self.out("\n")

            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out(" [" + str(link['count']) + "]: " + urlparse.urljoin(self.baseurl, link['href']))
                    if 'title' in link:
                        self.out(" (" + link['title'] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa:
                self.out("\n")  # Don't need an extra line when nothing was done.

            self.a = newa

        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.lastWasNL = data and data[-1] == '\n'
        self.outcount += 1

    def handle_data(self, data):
        """Output document text, collapsing whitespace outside <pre>."""
        # NOTE(review): presumably compensates for an escaped closing
        # script tag showing up inside the data -- confirm.
        if r'\/script>' in data:
            self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data):
        """Ignore unknown declarations."""
        pass
443
def wrapwrite(text):
    """Write *text* to standard output, encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
445
def html2text_file(html, out=wrapwrite, baseurl='', encoding='utf8'):
    """Convert *html* by running it through a fresh ``_html2text`` parser.

    html -- markup to convert
    out -- output callable handed to the parser (None makes it collect the
           text itself)
    baseurl -- base URL handed to the parser
    encoding -- encoding handed to the parser

    Returns whatever the parser's close() returns.
    """
    parser = _html2text(out, baseurl, encoding=encoding)
    # The document is followed by an empty feed, as in the original code.
    for chunk in (html, ""):
        parser.feed(chunk)
    return parser.close()
451
def html2text(html, baseurl='', encoding='utf8'):
    """Return *html* converted to wrapped, Markdown-structured text.

    Every ``/>`` in the input is first rewritten to ``>``. The markup is
    then converted with html2text_file() (collecting the text instead of
    printing it) and the result passed through optwrap().
    """
    normalized = html.replace('/>', '>')
    converted = html2text_file(normalized, None, baseurl, encoding=encoding)
    return optwrap(converted)
455 456 457 ## mtconverter's specific code ################################################ 458 459 from logilab.mtconverter.transform import Transform 460
class html_to_formatted_text(Transform):
    """mtconverter transform turning HTML into formatted plain text."""

    name = "html_to_text"
    inputs = ("text/html",)
    output = "text/plain"

    def _convert(self, trdata):
        """Return the text rendering of ``trdata.data``.

        The data is converted with html2text(), using ``trdata.encoding``
        as the encoding.
        """
        markup = trdata.data
        source_encoding = trdata.encoding
        return html2text(markup, encoding=source_encoding)
471