Home | Trees | Indices | Help |
|
---|
|
# copyright 2006-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-mtconverter.
#
# logilab-mtconverter is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 2.1 of the License, or (at your
# option) any later version.
#
# logilab-mtconverter is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
# for more details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
"""Mime type conversion package.

  2006-2012 `LOGILAB S.A. <http://www.logilab.fr>`_ (Paris, FRANCE),
  all rights reserved.

  http://www.logilab.org/project/logilab-mtconverter --
  mailto:python-projects@logilab.org

  `Lesser General Public License version 2`
"""
__docformat__ = "restructuredtext en"

from logilab.mtconverter.__pkginfo__ import version as __version__

import locale
import mimetypes
import re
import string
import htmlentitydefs
import codecs
from StringIO import StringIO

try:
    import chardet
except ImportError:
    # chardet unavailable
    chardet = None

mimetypes.encodings_map['.bz2'] = 'bzip2' # register bzip2 encoding
try:
    DEFAULT_ENCODING = locale.getpreferredencoding()
except locale.Error:
    DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)

BINARY_ENCODINGS = set(('gzip', 'bzip2', 'base64'))

TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml'))

UNICODE_POLICY = 'strict'

# raw string: yields exactly the same pattern text as the historical non-raw
# literal, without relying on unknown-escape passthrough for ``\s``
CHARSET_DECL_RGX = re.compile(r'''(?:charset|(?:(?:en)?coding))[=:\s"']*([^\s"']*)''',
                              re.I | re.S | re.U)
CHARSET_DECL_SEARCH_SIZE = 512

CHARDET_MIN_SIZE = 20
CHARDET_CONFIDENCE_THRESHOLD = 0.75


def need_guess(mimetype, encoding):
    """return True if we can complete given mimetype / encoding information

    This is the case when no mime type is given, or when the mime type is a
    text one and no encoding is given.
    """
    if not mimetype:
        return True
    if not encoding and is_text_mimetype(mimetype):
        return True
    return False


def is_text_mimetype(mimetype):
    """return True if `mimetype` starts with 'text/' or is listed in
    `TEXT_MIMETYPES`
    """
    return (mimetype.startswith('text/') or mimetype in TEXT_MIMETYPES)


def guess_encoding(buffer, fallbackencoding=None):
    """try to guess encoding from a buffer

    Strategies, in order:

    1. an explicit charset / coding declaration found in the first
       `CHARSET_DECL_SEARCH_SIZE` characters, if python knows that codec
    2. 'UTF-8' if the buffer looks like an xml document
    3. chardet analysis, when chardet is installed, the buffer is longer than
       `CHARDET_MIN_SIZE` and the detection confidence is high enough
    4. `fallbackencoding`, or `DEFAULT_ENCODING` if no fallback is given
    """
    if hasattr(buffer, 'getvalue'): # may be a StringIO
        buffer = buffer.getvalue()
    # try to get a character set declaration
    m = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
    if m is not None:
        guessed = m.group(1)
        try:
            # ensure encoding is known by python
            codecs.lookup(guessed)
            return guessed
        except LookupError:
            # unknown (or empty) declaration, try the next strategy
            pass
    if buffer.lstrip().startswith('<?xml'):
        # xml files with no encoding declaration default to UTF-8
        return 'UTF-8'
    # use text analysis if enough data
    if chardet is not None and len(buffer) > CHARDET_MIN_SIZE:
        detected = chardet.detect(buffer)
        if detected['confidence'] >= CHARDET_CONFIDENCE_THRESHOLD:
            return detected['encoding']
    return fallbackencoding or DEFAULT_ENCODING


def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
                                filename=None, fallbackencoding=None,
                                fallbackmimetype=u'application/octet-stream'):
    """complete the given mime type (`format`) and `encoding` as far as
    possible, using `filename` and `data` when given

    :return: a 2-uple (mime type, encoding)
    """
    if format and format.split('/')[-1] in BINARY_ENCODINGS:
        format = None # try to do better
    if filename and not format:
        format, enc = mimetypes.guess_type(filename)
        if format:
            if not encoding:
                encoding = enc
        elif enc:
            # only the encoding is known (e.g. unknown extension + '.gz')
            format = u'application/%s' % enc
        else:
            format = fallbackmimetype
    if not encoding and data and format and is_text_mimetype(format):
        encoding = guess_encoding(data, fallbackencoding)
    return format, encoding


# translation tables replacing control characters (code < 32) by a space,
# except \n, \r and \t which are kept, and \f and \v which become \n
CONTROL_CHARS = [chr(ci) for ci in range(32)]
TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
for c in ('\n', '\r', '\t'):
    TR_CONTROL_CHARS[ord(c)] = c
TR_CONTROL_CHARS[ord('\f')] = '\n'
TR_CONTROL_CHARS[ord('\v')] = '\n'
ESC_CAR_TABLE = string.maketrans(''.join(CONTROL_CHARS),
                                 ''.join(TR_CONTROL_CHARS))
ESC_UCAR_TABLE = unicode(ESC_CAR_TABLE, 'latin1')

# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
def html_escape(data):
    """alias kept for backward compatibility, use `xml_escape` instead"""
    return xml_escape(data)


def xml_escape(data):
    """escapes XML forbidden characters in attributes and PCDATA

    Control characters are first translated using the tables above, then
    &, <, >, " and ' are replaced by the matching entity. '&' has to be
    replaced first so entities produced by later replacements are left alone.
    """
    if isinstance(data, unicode):
        data = data.translate(ESC_UCAR_TABLE)
    else:
        data = data.translate(ESC_CAR_TABLE)
    return (data.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            .replace('"', '&quot;').replace("'", '&#39;'))


def html_unescape(data):
    """unescapes XML/HTML entities

    Every named entity known by `htmlentitydefs` is replaced by the matching
    unicode character, as well as the numeric entity produced by `xml_escape`
    for the single quote.
    """
    for entityname, codepoint in htmlentitydefs.name2codepoint.iteritems():
        data = data.replace('&%s;' % entityname, unichr(codepoint))
    return data.replace('&#39;', "'")


class TransformData(object):
    """wrapper around transformed data to add extra infos such as MIME
    type and encoding in case it applies
    """

    def __init__(self, data, mimetype, encoding=None, **kwargs):
        """store data, its mime type and encoding; extra keyword arguments
        are set as instance attributes

        The encoding is guessed when it is not given and data is a non
        binary byte string.
        """
        self.__dict__.update(kwargs)
        self.data = data
        self.mimetype = mimetype
        self.encoding = encoding
        if not self.is_binary() and not encoding and not isinstance(self.data, unicode):
            self.encoding = guess_encoding(data)

    # NOTE(review): the source listing this module was recovered from elided
    # one more method here -- restore it from the upstream package.

    def decode(self, force=False):
        """return the data as an unicode string

        :raise Exception: if data is binary and `force` is false
        """
        if isinstance(self.data, unicode):
            return self.data
        if force:
            # NOTE(review): nesting recovered from a flattened listing; the
            # `elif` below is assumed to belong to `if force` -- confirm
            if self.encoding in BINARY_ENCODINGS:
                self.binary_decode()
        elif self.is_binary():
            raise Exception("can't decode binary stream (mime type: %s, encoding: %s)"
                            % (self.mimetype, self.encoding))
        if self.encoding:
            encoding = self.encoding
        else:
            encoding = guess_encoding(self.data)
        return self.data.decode(encoding, UNICODE_POLICY)

    def encode(self, encoding=None):
        """return the data as an encoded string

        Data is returned untouched when it is already a byte string in the
        asked encoding (or when no encoding is asked). Else the target
        encoding is `encoding`, the data's own encoding, or 'utf8'.
        """
        if (encoding is None or self.encoding == encoding) and \
               isinstance(self.data, str):
            return self.data
        encoding = encoding or self.encoding or 'utf8'
        return self.decode().encode(encoding)

    def is_binary(self):
        # NOTE(review): this method was elided from the source listing; it is
        # reconstructed from its call sites (see `__init__` and `decode`) --
        # confirm against the upstream package.
        return (not is_text_mimetype(self.mimetype)
                or self.encoding in BINARY_ENCODINGS)

    # NOTE(review): the source listing elided one more method here -- restore
    # it from the upstream package.

    def binary_decode(self):
        """decompress / decode data according to its binary encoding (one of
        gzip, bzip2 or base64), then guess the encoding of the result

        Nothing is done for any other encoding.
        """
        if self.encoding == 'gzip':
            import gzip
            stream = gzip.GzipFile(fileobj=StringIO(self.data))
            self.data = stream.read()
        elif self.encoding == 'bzip2':
            import bz2
            # bz2.decompress expects the compressed string itself, not a
            # file-like object
            self.data = bz2.decompress(self.data)
        elif self.encoding == 'base64':
            import base64
            self.data = base64.decodestring(self.data)
        else:
            return
        self.encoding = guess_encoding(self.data)


# NOTE(review): the source listing elided the definitions living here. Only
# `MissingBinary`, which `register_base_transforms` needs, is reconstructed
# below; its base class and the other definitions have to be restored from
# the upstream package.
class MissingBinary(Exception):
    """raised when a transform can't be instantiated because the binary it
    relies on is missing (presumably -- confirm against upstream)
    """


def register_pil_transforms(engine, verb=True):
    """register transforms relying on PIL into `engine`

    :return: False if PIL transforms can't be imported, else True
    """
    try:
        from logilab.mtconverter.transforms import piltransforms
    except ImportError:
        # pil not available, do nothing
        if verb:
            print("PIL isn't available, image transforms won't be available'")
        return False
    else:
        for trclass in piltransforms.transform_classes:
            engine.add_transform(trclass())
        return True


def register_pygments_transforms(engine, verb=True):
    """register transforms relying on pygments into `engine`

    :return: False if pygments transforms can't be imported, else True
    """
    try:
        from logilab.mtconverter.transforms import pygmentstransforms
    except ImportError:
        # pygments not available, do nothing
        if verb:
            print("PYGMENTS isn't available, transforms won't be available'")
        return False
    else:
        for trclass in pygmentstransforms.transform_classes:
            engine.add_transform(trclass())
        return True


def register_base_transforms(engine, verb=True):
    """register the base set of transforms into `engine`

    Command based transforms whose instantiation raises `MissingBinary` are
    skipped (the error is printed if `verb` is true).

    :return: True
    """
    from logilab.mtconverter.transforms import cmdtransforms, text_to_text, \
         xml_to_text, text_to_html, xlog_to_html
    from logilab.mtconverter.transforms.python import python_to_html
    from logilab.mtconverter.transforms.html2text import html_to_formatted_text
    from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
    from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text
    engine.add_transform(text_to_text())
    engine.add_transform(xml_to_text())
    engine.add_transform(text_to_html())
    engine.add_transform(xlog_to_html())
    engine.add_transform(python_to_html())
    engine.add_transform(html_to_formatted_text())
    engine.add_transform(odt_to_unformatted_text())
    engine.add_transform(pgpsignature_to_text())
    for trclass in cmdtransforms.transform_classes:
        try:
            engine.add_transform(trclass())
        except MissingBinary as ex:
            if verb:
                print(ex)
    return True
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1 on Mon Mar 14 19:07:58 2016 | http://epydoc.sourceforge.net |