Package python-module-logilab-mtconverter-0 :: Package 8 :: Package 4 :: Package transforms :: Module cmdtransforms
[frames | no frames]

Source Code for Module python-module-logilab-mtconverter-0.8.4.transforms.cmdtransforms

  1  # copyright 2006-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-mtconverter. 
  5  # 
  6  # logilab-mtconverter is free software: you can redistribute it and/or modify it 
  7  # under the terms of the GNU Lesser General Public License as published by the 
  8  # Free Software Foundation, either version 2.1 of the License, or (at your 
  9  # option) any later version. 
 10  # 
 11  # logilab-mtconverter is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
 13  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License 
 14  # for more details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>. 
 18  import os 
 19  from tempfile import mkstemp 
 20  import subprocess 
 21   
 22  from logilab.mtconverter import MissingBinary 
 23  from logilab.mtconverter.transform import Transform 
 24   
 25  bin_search_path = [path for path in os.environ['PATH'].split(os.pathsep) 
 26                     if os.path.isdir(path)] 
 27   
 28   
def bin_search(binary):
    """Search ``bin_search_path`` for `binary` and return its full name.

    The directories of the module-level ``bin_search_path`` list are tried
    in order; the first candidate which is a regular file that is both
    readable and executable is returned.

    :param binary: file name of the executable to look for
    :return: the candidate path (directory joined with `binary`)
    :raises MissingBinary: if no directory holds a matching file
    """
    mode = os.R_OK | os.X_OK
    for path in bin_search_path:
        pathbin = os.path.join(path, binary)
        # os.access() alone also succeeds for a directory carrying the
        # binary's name, hence the additional regular-file check
        if os.path.isfile(pathbin) and os.access(pathbin, mode):
            return pathbin
    raise MissingBinary('Unable to find binary "%s" in %s' %
                        (binary, os.pathsep.join(bin_search_path)))
41 42
class POpenTransform(Transform):
    """abstract class for external command based transform

    The external command may read from stdin but must write to stdout
    If use_stdin is False, a temporary file will be used as input for
    the command
    """

    # name of the executable, resolved with bin_search() when no explicit
    # `binary` is given to the constructor
    cmdname = None
    # arguments appended to the binary; when use_stdin is False they are
    # %-formatted with a mapping providing the 'infile' key
    cmdargs = ""
    # feed the data on the command's stdin (True) or via a temporary file
    use_stdin = True
    # encoding handed to trdata.encode() before the data is sent
    input_encoding = None
    #output_encoding = 'utf-8'

    def __init__(self, name=None, binary=None, cmdargs=None, use_stdin=None,
                 **kwargs):
        """Resolve the command binary and apply per-instance overrides.

        :param name: when not None, overrides the ``name`` attribute
        :param binary: executable to search for; defaults to ``cmdname``
        :param cmdargs: when not None, overrides the ``cmdargs`` attribute
        :param use_stdin: when not None, overrides the ``use_stdin``
          attribute
        :param kwargs: accepted but not used by this method
        :raises MissingBinary: propagated from bin_search() when the
          executable cannot be found
        """
        if name is not None:
            self.name = name
        if binary is not None:
            self.binary = bin_search(binary)
        else:
            self.binary = bin_search(self.cmdname)
        if cmdargs is not None:
            self.cmdargs = cmdargs
        if use_stdin is not None:
            self.use_stdin = use_stdin

    def _command_line(self, trdata):
        """Return the shell command line: the binary followed by cmdargs."""
        return "%s %s" % (self.binary, self.cmdargs)

    def _convert(self, trdata):
        """Run the external command on `trdata` and return its output.

        The data obtained from ``trdata.encode(self.input_encoding)`` is
        either piped to the command's stdin or, when ``use_stdin`` is false,
        written to a temporary file whose name is substituted for
        ``%(infile)s`` in the command line.  The command is run through the
        shell with stderr merged into stdout.

        :param trdata: object providing ``encode()``
          (presumably a TransformData -- confirm against the caller)
        :return: what the command wrote, stripped of surrounding whitespace
        """
        command = self._command_line(trdata)
        data = trdata.encode(self.input_encoding)
        tmpname = None
        if not self.use_stdin:
            tmpfile, tmpname = mkstemp(text=False)
        try:
            if tmpname is not None:
                # the file object closes the descriptor even if the write
                # fails, and the file must be closed before the other
                # process reads it
                with os.fdopen(tmpfile, 'wb') as stream:
                    stream.write(data)
                command = command % {'infile': tmpname}
                data = None  # nothing left to send on stdin
            cmd = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT, close_fds=True)
            out, _ = cmd.communicate(data)
        finally:
            # remove the temporary file even when the command could not be
            # started or the communication failed
            if tmpname is not None:
                os.unlink(tmpname)
        return out.strip()
90 91
class pdf_to_text(POpenTransform):
    """application/pdf to text/plain transform based on the pdftotext command"""

    # external command: the data is handed over through a temporary file
    # whose name replaces %(infile)s in the arguments
    cmdname = 'pdftotext'
    cmdargs = '%(infile)s -enc UTF-8 -'
    use_stdin = False

    # transform identification and MIME types
    name = 'pdf_to_text'
    inputs = ('application/pdf',)
    output = 'text/plain'
    output_encoding = 'utf-8'
101 102
class lynx_dump(POpenTransform):
    """text/html and text/xhtml to text/plain transform based on the lynx
    command, which is fed on its standard input
    """

    # external command
    cmdname = 'lynx'
    cmdargs = '-dump -stdin'
    use_stdin = True

    # transform identification and MIME types
    name = 'lynx_dump'
    inputs = ('text/html', 'text/xhtml')
    output = 'text/plain'

    def _command_line(self, trdata):
        """Return the lynx command line, with both the assumed and the
        display charset set from the encoding of `trdata`.
        """
        charset = trdata.encoding
        # lynx doesn't know ascii, hand it iso-8859-1 instead
        charset = 'iso-8859-1' if charset == 'ascii' else charset
        template = '%s %s -assume_charset=%s -display_charset=%s'
        return template % (self.binary, self.cmdargs, charset, charset)
# transform classes listed by this module; lynx_dump is defined above but is
# currently left out of the list
transform_classes = [pdf_to_text]