Package python-module-logilab-mtconverter-0 :: Package 8 :: Package 4
[frames | no frames]

Source Code for Package python-module-logilab-mtconverter-0.8.4

  1  # copyright 2006-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-mtconverter. 
  5  # 
  6  # logilab-mtconverter is free software: you can redistribute it and/or modify it 
  7  # under the terms of the GNU Lesser General Public License as published by the 
  8  # Free Software Foundation, either version 2.1 of the License, or (at your 
  9  # option) any later version. 
 10  # 
 11  # logilab-mtconverter is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
 13  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License 
 14  # for more details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>. 
 18  """Mime type conversion package. 
 19   
 20    2006-2012 `LOGILAB S.A. <http://www.logilab.fr>`_ (Paris, FRANCE), 
 21    all rights reserved. 
 22   
 23    http://www.logilab.org/project/logilab-mtconverter -- 
 24    mailto:python-projects@logilab.org 
 25   
 26    `Lesser General Public License version 2` 
 27  """ 
 28  __docformat__ = "restructuredtext en" 
 29   
 30  from logilab.mtconverter.__pkginfo__ import version as __version__ 
 31   
 32  import locale 
 33  import mimetypes 
 34  import re 
 35  import string 
 36  import htmlentitydefs 
 37  import codecs 
 38  from StringIO import StringIO 
 39   
 40  try: 
 41      import chardet 
 42  except ImportError: 
 43      # chardet unavailable
 44      chardet = None 
 45   
 46  mimetypes.encodings_map['.bz2'] = 'bzip2' # register bzip2 encoding 
 47  try: 
 48      DEFAULT_ENCODING = locale.getpreferredencoding() 
 49  except locale.Error: 
 50      DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False) 
 51   
 52  BINARY_ENCODINGS = set(('gzip', 'bzip2', 'base64')) 
 53   
 54  TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml')) 
 55   
 56  UNICODE_POLICY = 'strict' 
 57   
 58  CHARSET_DECL_RGX = re.compile('(?:charset|(?:(?:en)?coding))[=:\s"\']*([^\s"\']*)', 
 59                                re.I | re.S | re.U) 
 60  CHARSET_DECL_SEARCH_SIZE = 512 
 61   
 62  CHARDET_MIN_SIZE = 20 
 63  CHARDET_CONFIDENCE_THRESHOLD = 0.75 
 64   
def need_guess(mimetype, encoding):
    """Tell whether the given mimetype / encoding information is incomplete.

    Guessing is needed when no MIME type is known, or when the MIME type is
    a textual one but no encoding has been given.
    """
    if mimetype:
        # the encoding only matters for textual content
        return bool(not encoding and is_text_mimetype(mimetype))
    return True
72
def is_text_mimetype(mimetype):
    """Return True if `mimetype` is a ``text/*`` type or is listed in
    TEXT_MIMETYPES.
    """
    if mimetype.startswith('text/'):
        return True
    return mimetype in TEXT_MIMETYPES
75
def _is_known_encoding(name):
    """Return True if `name` is a non-empty encoding name Python has a codec for."""
    if not name:
        return False
    try:
        codecs.lookup(name)
    except LookupError:
        return False
    return True


def guess_encoding(buffer, fallbackencoding=None):
    """Try to guess the encoding of a buffer.

    The following sources are tried in order:

    1. a character set declaration found in the first
       CHARSET_DECL_SEARCH_SIZE characters, if Python knows that encoding;
    2. the XML default (UTF-8) when the buffer starts with ``<?xml``;
    3. chardet's analysis, when chardet is importable, the buffer is longer
       than CHARDET_MIN_SIZE, the confidence reaches
       CHARDET_CONFIDENCE_THRESHOLD and Python knows the detected encoding;
    4. `fallbackencoding`, or DEFAULT_ENCODING when it is not given.

    :param buffer: the data, or an object exposing ``getvalue()`` (such as a
      StringIO) returning the data
    :param fallbackencoding: encoding returned when nothing could be guessed
    :return: the guessed encoding name
    """
    if hasattr(buffer, 'getvalue'):  # may be a StringIO
        buffer = buffer.getvalue()
    # try to get a character set declaration
    match = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
    if match is not None:
        declared = match.group(1)
        # a declared encoding is only trusted if python knows it
        if _is_known_encoding(declared):
            return declared
    if buffer.lstrip().startswith('<?xml'):
        # xml files with no encoding declaration default to UTF-8
        return 'UTF-8'
    # use text analysis if enough data
    if chardet is not None and len(buffer) > CHARDET_MIN_SIZE:
        detected = chardet.detect(buffer)
        # chardet may name an encoding python has no codec for; returning it
        # would only postpone the failure to the caller's decode() call, so
        # apply the same check as for declared encodings
        if (detected['confidence'] >= CHARDET_CONFIDENCE_THRESHOLD
                and _is_known_encoding(detected['encoding'])):
            return detected['encoding']
    return fallbackencoding or DEFAULT_ENCODING
99
100 -def guess_mimetype_and_encoding(format=None, encoding=None, data=None, 101 filename=None, fallbackencoding=None, 102 fallbackmimetype=u'application/octet-stream'):
103 if format and format.split('/')[-1] in BINARY_ENCODINGS: 104 format = None # try to do better 105 if filename and not format: 106 format, enc = mimetypes.guess_type(filename) 107 if format: 108 if not encoding: 109 encoding = enc 110 elif enc: 111 format = u'application/%s' % enc 112 else: 113 format = fallbackmimetype 114 if not encoding and data and format and is_text_mimetype(format): 115 encoding = guess_encoding(data, fallbackencoding) 116 return format, encoding
# The 32 ASCII control characters (code points 0 to 31).
CONTROL_CHARS = [chr(ci) for ci in range(32)]
# What each of them is translated to: a space by default...
TR_CONTROL_CHARS = len(CONTROL_CHARS) * [' ']
# ...except newline, carriage return and tab, which are kept as is...
for c in ('\n', '\r', '\t'):
    TR_CONTROL_CHARS[ord(c)] = c
# ...and form feed / vertical tab, which become newlines.
TR_CONTROL_CHARS[ord('\v')] = TR_CONTROL_CHARS[ord('\f')] = '\n'
# Translation table for byte strings, and its counterpart for unicode strings.
ESC_CAR_TABLE = string.maketrans(''.join(CONTROL_CHARS),
                                 ''.join(TR_CONTROL_CHARS))
ESC_UCAR_TABLE = ESC_CAR_TABLE.decode('latin1')

# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
def html_escape(data):
    """Escape `data` by delegating to `xml_escape` and return the result."""
    escaped = xml_escape(data)
    return escaped
133
def xml_escape(data):
    """Escape characters that are forbidden in XML attributes and PCDATA.

    The data is first run through the control character translation table
    matching its type (unicode or byte string), then ``&``, ``<``, ``>``,
    ``"`` and ``'`` are replaced by entities.
    """
    table = ESC_UCAR_TABLE if isinstance(data, unicode) else ESC_CAR_TABLE
    escaped = data.translate(table)
    # '&' must come first, so that the entities inserted afterwards are not
    # escaped a second time
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                         ('"', '&quot;'), ("'", '&#39;')):
        escaped = escaped.replace(char, entity)
    return escaped
142
# A named entity reference, or the one numeric reference this module emits.
_ENTITY_RGX = re.compile(r'&(#39|[A-Za-z][A-Za-z0-9]*);')


def html_unescape(data):
    """Unescape XML/HTML entities.

    Named entities listed in ``htmlentitydefs.name2codepoint`` and ``&#39;``
    are replaced by the character they stand for; anything else, including
    unknown entity names, is left untouched.

    All substitutions are done in one pass over the input. Doing one
    ``replace()`` per entity would unescape the output of an earlier
    replacement again, e.g. turning ``&amp;#39;`` into ``'`` instead of
    ``&#39;``.

    :param data: the string to unescape
    :return: the unescaped data, as a unicode string
    """
    name2codepoint = htmlentitydefs.name2codepoint

    def _substitute(match):
        name = match.group(1)
        if name == '#39':
            return u"'"
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # not an entity we know about: keep the text as it is
            return match.group(0)
        return unichr(codepoint)

    if not isinstance(data, unicode):
        # the result has always been a unicode string, even for byte input
        data = unicode(data)
    return _ENTITY_RGX.sub(_substitute, data)
148
class TransformData(object):
    """Wrapper around transformed data, adding extra information such as
    MIME type and encoding in case it applies.
    """

    def __init__(self, data, mimetype, encoding=None, **kwargs):
        """Store the data and its description.

        :param data: the wrapped data
        :param mimetype: MIME type of the data
        :param encoding: encoding of the data; when not given and the data
          is a non-binary byte string, it is guessed from the data
        :param kwargs: extra information, set as attributes on the instance
        """
        self.__dict__.update(kwargs)
        self.data = data
        self.mimetype = mimetype
        self.encoding = encoding
        if (not self.is_binary() and not encoding
                and not isinstance(self.data, unicode)):
            self.encoding = guess_encoding(data)

    def get(self, attr, default=None):
        """Return the optional data attribute `attr`, or `default` if unset."""
        return getattr(self, attr, default)

    def decode(self, force=False):
        """Return the data as a unicode string.

        :param force: when true, data wrapped in one of BINARY_ENCODINGS is
          unwrapped first (updating `data` and `encoding` in place) and
          decoding is attempted whatever the MIME type is; when false,
          binary data is refused
        :raise Exception: if the data is binary and `force` is false
        """
        if isinstance(self.data, unicode):
            return self.data
        if force:
            if self.encoding in BINARY_ENCODINGS:
                self.binary_decode()
        elif self.is_binary():
            raise Exception("can't decode binary stream (mime type: %s, encoding: %s)"
                            % (self.mimetype, self.encoding))
        if self.encoding:
            encoding = self.encoding
        else:
            encoding = guess_encoding(self.data)
        return self.data.decode(encoding, UNICODE_POLICY)

    def encode(self, encoding=None):
        """Return the data as an encoded string.

        The data is returned untouched when it is already a byte string and
        no other encoding than its own is asked for. Otherwise it is decoded
        and re-encoded using `encoding`, the data's own encoding or 'utf8',
        whichever is set first.
        """
        if ((encoding is None or self.encoding == encoding)
                and isinstance(self.data, str)):
            return self.data
        encoding = encoding or self.encoding or 'utf8'
        return self.decode().encode(encoding)

    def is_binary(self):
        """Return True if the MIME type is not a text one or if the encoding
        is one of BINARY_ENCODINGS.
        """
        return (not is_text_mimetype(self.mimetype)
                or self.encoding in BINARY_ENCODINGS)

    def check_encoding(self):
        """Raise TransformError if the MIME type is a text one while the data
        is binary, i.e. its encoding is one of BINARY_ENCODINGS.
        """
        if is_text_mimetype(self.mimetype) and self.is_binary():
            raise TransformError()

    def binary_decode(self):
        """Unwrap gzip, bzip2 or base64 encoded data in place.

        `data` is replaced by the unwrapped content and `encoding` by the
        encoding guessed from it. Nothing happens for any other encoding.
        """
        if self.encoding == 'gzip':
            import gzip
            stream = gzip.GzipFile(fileobj=StringIO(self.data))
            try:
                self.data = stream.read()
            finally:
                stream.close()
        elif self.encoding == 'bzip2':
            import bz2
            # bz2.decompress() takes the compressed string itself, not a
            # file-like object
            self.data = bz2.decompress(self.data)
        elif self.encoding == 'base64':
            import base64
            self.data = base64.decodestring(self.data)
        else:
            return
        self.encoding = guess_encoding(self.data)
211 212
class MtConverterError(Exception):
    """Root of the exception hierarchy raised by this package."""
215
class MissingBinary(MtConverterError):
    """Raised when a system binary on which a transform relies has not been
    found.
    """
class TransformError(MtConverterError):
    """Raised when something can't be transformed because the necessary
    transforms are missing.
    """
223 224
def register_pil_transforms(engine, verb=True):
    """Register the PIL based image transforms on `engine`.

    :param engine: object whose ``add_transform`` method is called with one
      instance of every class in ``piltransforms.transform_classes``
    :param verb: when true, print a notice if the transforms module can't be
      imported
    :return: True if the transforms were registered, False if the transforms
      module could not be imported
    """
    try:
        from logilab.mtconverter.transforms import piltransforms
    except ImportError:
        # pil not available, do nothing
        if verb:
            # parenthesized so that the line is valid both as a python 2
            # print statement and as a python 3 function call
            print("PIL isn't available, image transforms won't be available'")
        return False
    for trclass in piltransforms.transform_classes:
        engine.add_transform(trclass())
    return True
237 238
def register_pygments_transforms(engine, verb=True):
    """Register the pygments based transforms on `engine`.

    :param engine: object whose ``add_transform`` method is called with one
      instance of every class in ``pygmentstransforms.transform_classes``
    :param verb: when true, print a notice if the transforms module can't be
      imported
    :return: True if the transforms were registered, False if the transforms
      module could not be imported
    """
    try:
        from logilab.mtconverter.transforms import pygmentstransforms
    except ImportError:
        # pygments not available, do nothing
        if verb:
            # parenthesized so that the line is valid both as a python 2
            # print statement and as a python 3 function call
            print("PYGMENTS isn't available, transforms won't be available'")
        return False
    for trclass in pygmentstransforms.transform_classes:
        engine.add_transform(trclass())
    return True
251 252
def register_base_transforms(engine, verb=True):
    """Register the transforms shipped with the package on `engine`.

    The transforms implemented in python are always registered. Those from
    ``cmdtransforms`` are registered one by one, skipping every transform
    whose instantiation raises MissingBinary.

    :param engine: object whose ``add_transform`` method is called with each
      transform instance
    :param verb: when true, print the error of each skipped transform
    :return: always True
    """
    from logilab.mtconverter.transforms import (
        cmdtransforms, text_to_text, xml_to_text, text_to_html, xlog_to_html)
    from logilab.mtconverter.transforms.python import python_to_html
    from logilab.mtconverter.transforms.html2text import html_to_formatted_text
    from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
    from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text
    engine.add_transform(text_to_text())
    engine.add_transform(xml_to_text())
    engine.add_transform(text_to_html())
    engine.add_transform(xlog_to_html())
    engine.add_transform(python_to_html())
    engine.add_transform(html_to_formatted_text())
    engine.add_transform(odt_to_unformatted_text())
    engine.add_transform(pgpsignature_to_text())
    for trclass in cmdtransforms.transform_classes:
        try:
            engine.add_transform(trclass())
        except MissingBinary as ex:
            # a missing binary only disables the transform relying on it
            if verb:
                print(ex)
    return True
275