Home | Trees | Indices | Help |
|
---|
|
# copyright 2006-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-mtconverter.
#
# logilab-mtconverter is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 2.1 of the License, or (at your
# option) any later version.
#
# logilab-mtconverter is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
# for more details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
import os
from tempfile import mkstemp
import subprocess

from logilab.mtconverter import MissingBinary
from logilab.mtconverter.transform import Transform

# Existing directories listed in $PATH, computed once at import time.
# `.get` keeps the import from failing with KeyError when PATH is not set;
# the search path is then simply empty.
bin_search_path = [path for path in os.environ.get('PATH', '').split(os.pathsep)
                   if os.path.isdir(path)]


# NOTE(review): the `def` line was lost in the extracted listing; the name and
# the single `binary` parameter are reconstructed from the call sites and the
# body -- confirm against the upstream file.
def bin_search(binary):
    """Search `bin_search_path` for `binary` and return its full path.

    The first directory containing an entry of that name which is both
    readable and executable (per `os.access`) wins.

    :param binary: file name of the executable to look for
    :return: `binary` joined to the first matching directory
    :raises MissingBinary: if no directory of the search path matches
    """
    mode = os.R_OK | os.X_OK
    for path in bin_search_path:
        pathbin = os.path.join(path, binary)
        if os.access(pathbin, mode):
            return pathbin
    raise MissingBinary('Unable to find binary "%s" in %s' %
                        (binary, os.pathsep.join(bin_search_path)))


# NOTE(review): the `class` line was lost in the extracted listing; the class
# name is a reconstruction (the `Transform` base is the one imported above)
# -- confirm against the upstream file.
class POpenTransform(Transform):
    """abstract class for external command based transform

    The external command may read from stdin but must write to stdout
    If use_stdin is False, a temporary file will be used as input for
    the command
    """

    # name of the executable, resolved through `bin_search` in `__init__`
    cmdname = None
    # arguments appended to the binary; when `use_stdin` is False this must
    # contain a `%(infile)s` placeholder for the temporary input file
    cmdargs = ""
    # feed the input through stdin (True) or through a temporary file (False)
    use_stdin = True
    # encoding handed to `trdata.encode` before the data reaches the command
    input_encoding = None
    #output_encoding = 'utf-8'

    # NOTE(review): the signature was lost in the extracted listing; the four
    # named parameters come from the body, `**kwargs` is an assumption (the
    # header spanned two lines) -- confirm against the upstream file.
    def __init__(self, name=None, binary=None, cmdargs=None, use_stdin=None,
                 **kwargs):
        """Resolve the command binary and apply per-instance overrides.

        Every argument left to None keeps the class-level default.

        :param name: overrides `self.name`
        :param binary: executable to use instead of `self.cmdname`
        :param cmdargs: overrides `self.cmdargs`
        :param use_stdin: overrides `self.use_stdin`
        :raises MissingBinary: if the executable is not found by `bin_search`
        """
        if name is not None:
            self.name = name
        if binary is not None:
            self.binary = bin_search(binary)
        else:
            self.binary = bin_search(self.cmdname)
        if cmdargs is not None:
            self.cmdargs = cmdargs
        if use_stdin is not None:
            self.use_stdin = use_stdin

    def _command_line(self, trdata):
        """Return the shell command string: the binary followed by `cmdargs`.

        `trdata` is unused here; it is part of the signature so subclasses
        can build the command from it.
        """
        return "%s %s" % (self.binary, self.cmdargs)

    # NOTE(review): the `def` line was lost in the extracted listing; the
    # method name is a reconstruction, the `trdata` parameter comes from the
    # body -- confirm against the upstream file.
    def _convert(self, trdata):
        """Run the external command on `trdata` and return its stripped output.

        The input is `trdata.encode(self.input_encoding)`.  It is piped to the
        command's stdin when `use_stdin` is true; otherwise it is written to a
        temporary file whose name replaces `%(infile)s` in the command line.
        stderr is merged into stdout, so error messages printed by the command
        end up in the returned value.
        """
        command = self._command_line(trdata)
        # presumably bytes, since it is written to a binary file / pipe below
        # -- verify against the trdata implementation
        data = trdata.encode(self.input_encoding)
        tmpname = None
        try:
            if not self.use_stdin:
                tmpfile, tmpname = mkstemp(text=False)
                # A file object writes the whole buffer (a bare os.write may
                # write short) and closes the descriptor even on error, so
                # the other process can then read the file.
                with os.fdopen(tmpfile, 'wb') as stream:
                    stream.write(data)
                command = command % {'infile': tmpname}
                data = None
            # NOTE(review): the command goes through the shell and its parts
            # (binary, cmdargs, temp file name) are interpolated unquoted;
            # keep them free of untrusted input.
            cmd = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT, close_fds=True)
            out, _ = cmd.communicate(data)
        finally:
            # remove the temporary file even when the command could not run
            if tmpname is not None:
                os.unlink(tmpname)
        return out.strip()


# NOTE(review): the `class` line was lost in the extracted listing; the class
# name is taken from `transform_classes` below, the base class is the
# reconstructed abstract class above -- confirm against the upstream file.
class pdf_to_text(POpenTransform):
    """Transform PDF into UTF-8 plain text with the `pdftotext` command."""
    name = "pdf_to_text"
    inputs = ('application/pdf',)
    output = 'text/plain'
    output_encoding = 'utf-8'

    cmdname = "pdftotext"
    # reads the temporary input file, writes to stdout ("-")
    cmdargs = "%(infile)s -enc UTF-8 -"
    use_stdin = False


# NOTE(review): the `class` line was lost in the extracted listing; the class
# name is taken from the comment next to `transform_classes`, the base class
# is the reconstructed abstract class above -- confirm against upstream.
class lynx_dump(POpenTransform):
    """Transform HTML into plain text with the `lynx -dump` command."""
    name = "lynx_dump"
    inputs = ('text/html', 'text/xhtml')
    output = 'text/plain'

    cmdname = "lynx"
    cmdargs = "-dump -stdin"
    use_stdin = True

    def _command_line(self, trdata):
        """Return the lynx command line, adding the charset options.

        Both `-assume_charset` and `-display_charset` are set to
        `trdata.encoding`, with 'ascii' replaced by 'iso-8859-1'.
        """
        encoding = trdata.encoding
        if encoding == 'ascii':
            encoding = 'iso-8859-1' # lynx doesn't know ascii !
        return '%s %s -assume_charset=%s -display_charset=%s' % (
            self.binary, self.cmdargs, encoding, encoding)


# lynx_dump is defined above but was commented out of this list in the
# original source.
transform_classes = [pdf_to_text] # , lynx_dump]
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1 on Mon Mar 14 19:07:57 2016 | http://epydoc.sourceforge.net |