|
1 ### -*- coding: utf-8 -*- #################################################### |
|
2 ############################################################################## |
|
3 # |
|
4 # Copyright (c) 2008-2010 Thierry Florac <tflorac AT ulthar.net> |
|
5 # All Rights Reserved. |
|
6 # |
|
7 # This software is subject to the provisions of the Zope Public License, |
|
8 # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. |
|
9 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED |
|
10 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
11 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
|
12 # FOR A PARTICULAR PURPOSE. |
|
13 # |
|
14 ############################################################################## |
|
15 |
|
16 __docformat__ = "restructuredtext" |
|
17 |
|
18 # import standard packages |
|
19 from sgmllib import SGMLParser |
|
20 |
|
21 # import Zope3 interfaces |
|
22 |
|
23 # import local interfaces |
|
24 |
|
25 # import Zope3 packages |
|
26 |
|
27 # import local packages |
|
28 |
|
29 |
|
class HTMLParser(SGMLParser):
    """SGMLParser subclass that accumulates the text of an HTML document.

    Collected text is stored in the ``data`` attribute. ``entitydefs`` maps
    named entities and ``charrefs`` maps decimal character references to
    their replacement text; several accented characters are deliberately
    replaced by an unaccented ASCII approximation.
    """

    # Text accumulated so far. Strings are immutable, so the first ``+=`` on
    # an instance rebinds ``data`` on that instance only.
    data = ''

    # Replacement text for named entities (``&name;``).
    entitydefs = {
        'amp': '&', 'lt': '<', 'gt': '>',
        'apos': "'", 'quot': '"',
        'Agrave': 'À', 'Aacute': 'A', 'Acirc': 'Â', 'Atilde': 'A', 'Auml': 'Ä', 'Aring': 'A',
        'AElig': 'AE',
        'Ccedil': 'Ç',
        # Egrave/Eacute were swapped; they now agree with charrefs 200/201.
        'Egrave': 'È', 'Eacute': 'É', 'Ecirc': 'Ê', 'Euml': 'Ë',
        # Icirc/Iuml now agree with charrefs 206/207.
        'Igrave': 'I', 'Iacute': 'I', 'Icirc': 'Î', 'Iuml': 'Ï',
        'Ntilde': 'N',
        'Ograve': 'O', 'Oacute': 'O', 'Ocirc': 'Ô', 'Otilde': 'O', 'Ouml': 'Ö', 'Oslash': 'O',
        'Ugrave': 'Ù', 'Uacute': 'U', 'Ucirc': 'Û', 'Uuml': 'Ü',
        'Yacute': 'Y',
        'THORN': 'T',
        'agrave': 'à', 'aacute': 'a', 'acirc': 'â', 'atilde': 'a', 'auml': 'ä', 'aring': 'a', 'aelig': 'ae',
        'ccedil': 'ç',
        'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê', 'euml': 'ë',
        'igrave': 'i', 'iacute': 'i', 'icirc': 'î', 'iuml': 'ï',
        'ntilde': 'n',
        'ograve': 'o', 'oacute': 'o', 'ocirc': 'ô', 'otilde': 'o', 'ouml': 'ö', 'oslash': 'o',
        'ugrave': 'ù', 'uacute': 'u', 'ucirc': 'û', 'uuml': 'ü',
        'yacute': 'y',
        'thorn': 't',
        'yuml': 'ÿ',
    }

    # Replacement text for decimal character references (``&#NNN;``).
    charrefs = {
        34: '"', 38: '&', 39: "'",
        60: '<', 62: '>',
        192: 'À', 193: 'A', 194: 'Â', 195: 'A', 196: 'Ä', 197: 'A',
        198: 'AE',
        199: 'Ç',
        200: 'È', 201: 'É', 202: 'Ê', 203: 'Ë',
        204: 'I', 205: 'I', 206: 'Î', 207: 'Ï',
        208: 'D',
        209: 'N',
        210: 'O', 211: 'O', 212: 'Ô', 213: 'O', 214: 'Ö', 216: 'O',
        215: 'x',
        217: 'Ù', 218: 'U', 219: 'Û', 220: 'Ü',
        221: 'Y', 222: 'T',
        223: 'sz',
        224: 'à', 225: 'a', 226: 'â', 227: 'a', 228: 'ä', 229: 'a',
        230: 'ae',
        231: 'ç',
        232: 'è', 233: 'é', 234: 'ê', 235: 'ë',
        236: 'i', 237: 'i', 238: 'î', 239: 'ï',
        240: 'e',
        241: 'n',
        242: 'o', 243: 'o', 244: 'ô', 245: 'o', 246: 'ö', 248: 'o',
        249: 'ù', 250: 'u', 251: 'û', 252: 'ü',
        253: 'y', 255: 'ÿ',
    }

    def handle_data(self, data):
        """Append a chunk of text to ``self.data``.

        :param data: text chunk, as a byte string or a unicode string
        """
        try:
            self.data += data
        except UnicodeDecodeError:
            # Joining unicode with a non-ASCII byte string fails on the
            # implicit ASCII decode. Promote whichever side is still a byte
            # string (decoded as UTF-8) and retry; the previous code only
            # converted ``data`` and raised TypeError when it was already
            # unicode.
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, 'utf8')
            if not isinstance(data, unicode):
                data = unicode(data, 'utf8')
            self.data += data

    def handle_charref(self, name):
        """Resolve a decimal character reference and append its text.

        :param name: the reference body, e.g. ``'233'`` for ``&#233;``

        Non-numeric names and values outside 0..255 are passed to
        ``unknown_charref``. Values missing from ``charrefs`` fall back to
        the matching Latin-1 character.
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(self.charrefs.get(n) or unicode(chr(n), 'latin1'))

    def start_td(self, attributes):
        """Append a single space when a ``<td>`` start tag is reported."""
        self.data += ' '

    def start_p(self, attributes):
        """Do nothing for a ``<p>`` start tag.

        NOTE(review): presumably defined so the base parser tracks ``<p>``
        and dispatches ``end_p`` -- confirm against sgmllib.
        """
        pass

    def end_p(self):
        """Append a newline when a ``</p>`` end tag is reported."""
        self.data += '\n'
|
106 |
|
107 |
|
def htmlToText(value):
    """Return the text collected by ``HTMLParser`` from an HTML string.

    ``None`` yields an empty string. Any other value is fed to a fresh
    parser, which is closed before its accumulated ``data`` is returned.
    """
    if value is not None:
        extractor = HTMLParser()
        extractor.feed(value)
        extractor.close()
        return extractor.data
    return ''