| Home | Trees | Indices | Help |
|
|---|
|
|
1 # copyright 2006-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
3 #
4 # This file is part of logilab-mtconverter.
5 #
6 # logilab-mtconverter is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Lesser General Public License as published by the
8 # Free Software Foundation, either version 2.1 of the License, or (at your
9 # option) any later version.
10 #
11 # logilab-mtconverter is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
14 # for more details.
15 #
16 # You should have received a copy of the GNU Lesser General Public License along
17 # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
18 """Mime type conversion package.
19
20 2006-2012 `LOGILAB S.A. <http://www.logilab.fr>`_ (Paris, FRANCE),
21 all rights reserved.
22
23 http://www.logilab.org/project/logilab-mtconverter --
24 mailto:python-projects@logilab.org
25
26 `Lesser General Public License version 2`
27 """
28 __docformat__ = "restructuredtext en"
29
30 from logilab.mtconverter.__pkginfo__ import version as __version__
31
32 import locale
33 import mimetypes
34 import re
35 import string
36 import htmlentitydefs
37 import codecs
38 from StringIO import StringIO
39
40 try:
41 import chardet
42 except ImportError:
43 # chardet unavailable
44 chardet = None
45
mimetypes.encodings_map['.bz2'] = 'bzip2' # register bzip2 encoding

# encoding used as last resort when nothing better can be guessed
try:
    DEFAULT_ENCODING = locale.getpreferredencoding()
except locale.Error:
    # ask again, this time without letting the call set the locale
    DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)

# encodings denoting a binary wrapping of the data, not a character set
BINARY_ENCODINGS = set(('gzip', 'bzip2', 'base64'))

# mime types holding text although they do not start with 'text/'
TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml'))

# `errors` argument given to str.decode() when decoding data
UNICODE_POLICY = 'strict'

# matches a "charset=...", "encoding=..." or "coding: ..." declaration and
# captures the declared name; written as a raw string so that backslashes
# reach the regular expression engine untouched
CHARSET_DECL_RGX = re.compile(
    r'(?:charset|(?:(?:en)?coding))[=:\s"\']*([^\s"\']*)',
    re.I | re.S | re.U)
# only the first characters of a buffer are searched for a declaration
CHARSET_DECL_SEARCH_SIZE = 512

# chardet is used only on buffers longer than CHARDET_MIN_SIZE, and its
# answer is kept only if its confidence reaches the threshold
CHARDET_MIN_SIZE = 20
CHARDET_CONFIDENCE_THRESHOLD = 0.75
64
66 """return True if we can complete given mimetype / encoding information"""
67 if not mimetype:
68 return True
69 if not encoding and is_text_mimetype(mimetype):
70 return True
71 return False
72
def is_text_mimetype(mimetype):
    """return True if the given mime type denotes textual content, that is
    either a 'text/*' type or one of the types listed in TEXT_MIMETYPES
    """
    if mimetype in TEXT_MIMETYPES:
        return True
    return mimetype.startswith('text/')
75
def guess_encoding(buffer, fallbackencoding=None):
    """try to guess encoding from a buffer

    The following sources are tried in order:

    * a charset / encoding declaration found in the first
      CHARSET_DECL_SEARCH_SIZE characters, kept only if python knows a codec
      of that name
    * a leading '<?xml' prolog, in which case UTF-8 is returned
    * chardet's analysis, if the module is available, the buffer is longer
      than CHARDET_MIN_SIZE and the confidence reaches
      CHARDET_CONFIDENCE_THRESHOLD
    * `fallbackencoding` if given, else DEFAULT_ENCODING
    """
    if hasattr(buffer, 'getvalue'):
        # may be a StringIO: work on its content
        buffer = buffer.getvalue()
    # try to get a character set declaration
    declaration = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
    if declaration is not None:
        declared = declaration.group(1)
        try:
            # ensure encoding is known by python
            codecs.lookup(declared)
        except LookupError:
            # unknown codec name: go on with the other heuristics
            pass
        else:
            return declared
    if buffer.lstrip().startswith('<?xml'):
        # xml files with no encoding declaration default to UTF-8
        return 'UTF-8'
    # use text analysis if enough data
    if chardet is not None and len(buffer) > CHARDET_MIN_SIZE:
        analysis = chardet.detect(buffer)
        if analysis['confidence'] >= CHARDET_CONFIDENCE_THRESHOLD:
            return analysis['encoding']
    return fallbackencoding or DEFAULT_ENCODING
99
def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
                                filename=None, fallbackencoding=None,
                                fallbackmimetype=u'application/octet-stream'):
    """complete the given mime type / encoding information and return it as
    a (format, encoding) tuple

    * a `format` whose subtype is one of BINARY_ENCODINGS is discarded
    * when `format` is missing and `filename` is given, both values are
      guessed from the file name using `mimetypes.guess_type`; a given
      `encoding` is never overridden. If only an encoding can be guessed
      from the name, the format becomes 'application/<that encoding>'; if
      nothing can be guessed it becomes `fallbackmimetype`
    * when the encoding is still unknown, `data` is given and the format is
      a text one, the encoding is guessed from the data, `fallbackencoding`
      being given to `guess_encoding`
    """
    if format and format.split('/')[-1] in BINARY_ENCODINGS:
        # try to do better
        format = None
    if filename and not format:
        format, name_encoding = mimetypes.guess_type(filename)
        if format:
            encoding = encoding or name_encoding
        elif name_encoding:
            format = u'application/%s' % name_encoding
        else:
            format = fallbackmimetype
    if data and format and not encoding and is_text_mimetype(format):
        encoding = guess_encoding(data, fallbackencoding)
    return format, encoding
117
118
119 CONTROL_CHARS = [chr(ci) for ci in range(32)]
120 TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
121 for c in ('\n', '\r', '\t'):
122 TR_CONTROL_CHARS[ord(c)] = c
123 TR_CONTROL_CHARS[ord('\f')] = '\n'
124 TR_CONTROL_CHARS[ord('\v')] = '\n'
125 ESC_CAR_TABLE = string.maketrans(''.join(CONTROL_CHARS),
126 ''.join(TR_CONTROL_CHARS))
127 ESC_UCAR_TABLE = unicode(ESC_CAR_TABLE, 'latin1')
128
129 # XXX deprecate at some point (once less used :)
130 #@obsolete('use xml_escape')
132 return xml_escape(data)
133
def xml_escape(data):
    """escapes XML forbidden characters in attributes and PCDATA

    Control characters are first translated using ESC_UCAR_TABLE (unicode
    data) or ESC_CAR_TABLE (any other data), then the characters & < > " and
    ' are replaced by the matching entity references.
    """
    if isinstance(data, unicode):
        data = data.translate(ESC_UCAR_TABLE)
    else:
        data = data.translate(ESC_CAR_TABLE)
    # '&' has to be handled first, else the ampersand of the entities
    # inserted by the following replacements would be escaped as well
    return (data.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            .replace('"', '&quot;').replace("'", '&#39;'))
142
144 """unescapes XML/HTML entities"""
145 for entityname, codepoint in htmlentitydefs.name2codepoint.iteritems():
146 data = data.replace('&%s;' % entityname, unichr(codepoint))
147 return data.replace(''', "'")
148
150 """wrapper around transformed data to add extra info such as MIME
151 type and encoding in case it applies
152 """
154 self.__dict__.update(kwargs)
155 self.data = data
156 self.mimetype = mimetype
157 self.encoding = encoding
158 if not self.is_binary() and not encoding and not isinstance(self.data, unicode):
159 self.encoding = guess_encoding(data)
160
164
166 """return the data as an unicode string"""
167 if isinstance(self.data, unicode):
168 return self.data
169 if force:
170 if self.encoding in BINARY_ENCODINGS:
171 self.binary_decode()
172 elif self.is_binary():
173 raise Exception("can't decode binary stream (mime type: %s, encoding: %s)"
174 % (self.mimetype, self.encoding))
175 if self.encoding:
176 encoding = self.encoding
177 else:
178 encoding = guess_encoding(self.data)
179 return self.data.decode(encoding, UNICODE_POLICY)
180
182 """return the data as an encoded string"""
183 if (encoding is None or self.encoding == encoding) and \
184 isinstance(self.data, str):
185 return self.data
186 encoding = encoding or self.encoding or 'utf8'
187 return self.decode().encode(encoding)
188
192
196
def binary_decode(self):
    """decode in place data wrapped in a binary encoding

    When `self.encoding` is 'gzip', 'bzip2' or 'base64', `self.data` is
    replaced by the decoded data and `self.encoding` by the encoding guessed
    from it. Nothing is done for any other encoding.
    """
    if self.encoding == 'gzip':
        import gzip
        # GzipFile reads from a file-like object, hence the StringIO
        stream = gzip.GzipFile(fileobj=StringIO(self.data))
        self.data = stream.read()
        self.encoding = guess_encoding(self.data)
    elif self.encoding == 'bzip2':
        import bz2
        # bz2.decompress() expects the compressed string itself: giving it
        # a StringIO wrapper raises TypeError
        self.data = bz2.decompress(self.data)
        self.encoding = guess_encoding(self.data)
    elif self.encoding == 'base64':
        import base64
        self.data = base64.decodestring(self.data)
        self.encoding = guess_encoding(self.data)
211
212
215
223
224
226 try:
227 from logilab.mtconverter.transforms import piltransforms
228 except ImportError:
229 # pil not available, do nothing
230 if verb:
231 print "PIL isn't available, image transforms won't be available'"
232 return False
233 else:
234 for trclass in piltransforms.transform_classes:
235 engine.add_transform(trclass())
236 return True
237
238
240 try:
241 from logilab.mtconverter.transforms import pygmentstransforms
242 except ImportError:
243 # pygments not available, do nothing
244 if verb:
245 print "PYGMENTS isn't available, transforms won't be available'"
246 return False
247 else:
248 for trclass in pygmentstransforms.transform_classes:
249 engine.add_transform(trclass())
250 return True
251
252
254 from logilab.mtconverter.transforms import cmdtransforms, text_to_text, \
255 xml_to_text, text_to_html, xlog_to_html
256 from logilab.mtconverter.transforms.python import python_to_html
257 from logilab.mtconverter.transforms.html2text import html_to_formatted_text
258 from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
259 from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text
260 engine.add_transform(text_to_text())
261 engine.add_transform(xml_to_text())
262 engine.add_transform(text_to_html())
263 engine.add_transform(xlog_to_html())
264 engine.add_transform(python_to_html())
265 engine.add_transform(html_to_formatted_text())
266 engine.add_transform(odt_to_unformatted_text())
267 engine.add_transform(pgpsignature_to_text())
268 for trclass in cmdtransforms.transform_classes:
269 try:
270 engine.add_transform(trclass())
271 except MissingBinary, ex:
272 if verb:
273 print ex
274 return True
275
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Mar 14 19:07:58 2016 | http://epydoc.sourceforge.net |