8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
10 # FOR A PARTICULAR PURPOSE. |
10 # FOR A PARTICULAR PURPOSE. |
11 # |
11 # |
12 |
12 |
13 __docformat__ = 'restructuredtext' |
13 """PyAMS_utils.html module |
|
14 |
|
15 This module provides functions which are used to convert HTML code to plain text, by extracting |
|
16 useful text and removing all HTML tags. |
|
17 """ |
|
18 |
|
19 from html.parser import HTMLParser |
|
20 from warnings import warn |
14 |
21 |
15 |
22 |
16 # import standard library |
23 __docformat__ = 'restructuredtext' |
17 from html.parser import HTMLParser |
|
18 |
|
19 # import interfaces |
|
20 |
|
21 # import packages |
|
22 |
24 |
23 |
25 |
class MyHTMLParser(HTMLParser):
    """HTML parser collecting plain-text fragments into the ``data`` attribute

    Named entities are replaced using the ``entitydefs`` table, a few tags
    are turned into whitespace (``<td>`` into a space, ``<br>`` and ``</p>``
    into a newline), and parser errors are reported as warnings.
    """

    # Text accumulated so far; instances rebind it on their first ``+=``.
    data = ''

    # Replacement text for named entities.  Several accented letters are
    # mapped to their unaccented base letter rather than to the exact
    # character.
    entitydefs = {
        'amp': '&', 'lt': '<', 'gt': '>',
        'nbsp': ' ',
        'apos': "'", 'quot': '"',
        'Agrave': 'À', 'Aacute': 'A', 'Acirc': 'Â', 'Atilde': 'A',
        'Auml': 'Ä', 'Aring': 'A',
        'AElig': 'AE',
        'Ccedil': 'Ç',
        # Fixed: the grave and acute variants used to be swapped.
        'Egrave': 'È', 'Eacute': 'É', 'Ecirc': 'Ê', 'Euml': 'Ë',
        'Igrave': 'I', 'Iacute': 'I', 'Icirc': 'I', 'Iuml': 'I',
        'Ntilde': 'N',
        'Ograve': 'O', 'Oacute': 'O', 'Ocirc': 'Ô', 'Otilde': 'O',
        # Fixed: 'Oslash' is the letter 'O', not the digit zero.
        'Ouml': 'Ö', 'Oslash': 'O',
        'Ugrave': 'Ù', 'Uacute': 'U', 'Ucirc': 'Û', 'Uuml': 'Ü',
        'Yacute': 'Y',
        'THORN': 'T',
        'agrave': 'à', 'aacute': 'a', 'acirc': 'â', 'atilde': 'a',
        'auml': 'ä', 'aring': 'a', 'aelig': 'ae',
        'ccedil': 'ç',
        'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê', 'euml': 'ë',
        'igrave': 'i', 'iacute': 'i', 'icirc': 'î', 'iuml': 'ï',
        'ntilde': 'n',
        'ograve': 'o', 'oacute': 'o', 'ocirc': 'ô', 'otilde': 'o',
        'ouml': 'ö', 'oslash': 'o',
        'ugrave': 'ù', 'uacute': 'u', 'ucirc': 'û', 'uuml': 'ü',
        'yacute': 'y',
        'thorn': 't',
        'yuml': 'ÿ',
    }

    def handle_entityref(self, name):
        """Append the replacement text of a named entity

        :param str name: entity name, without ``&`` and ``;``

        Names missing from ``entitydefs`` contribute nothing to the output.
        """
        self.data += self.entitydefs.get(name, '')

    def handle_charref(self, name):
        """Forward a numeric character reference to ``handle_data``

        :param str name: reference body, without ``&#`` and ``;``

        Only decimal references in the 0..255 range are handled; anything
        else (including hexadecimal forms such as ``x41``, which ``int()``
        rejects) is silently ignored.
        """
        try:
            int_value = int(name)
        except ValueError:
            return
        if not 0 <= int_value <= 255:
            return
        # NOTE(review): ``charrefs`` is not defined in the visible part of
        # this class; presumably a code-point -> text mapping declared
        # elsewhere -- confirm.  A missing key passes None to handle_data.
        self.handle_data(self.charrefs.get(int_value))

    def handle_starttag(self, tag, attrs):
        """Insert whitespace for layout tags

        :param str tag: tag name
        :param list attrs: tag attributes (unused)

        A ``td`` tag adds a space and a ``br`` tag adds a newline; other
        tags are ignored.
        """
        if tag == 'td':
            self.data += ' '
        elif tag == 'br':
            self.data += '\n'

    def handle_endtag(self, tag):
        """Insert a newline when a paragraph is closed

        :param str tag: tag name; only ``p`` has an effect
        """
        if tag == 'p':
            self.data += '\n'

    def error(self, message):
        """Report a parser error as a warning instead of raising

        :param str message: error description
        """
        warn(message)
103 |
112 |
104 |
113 |
105 def html_to_text(value): |
114 def html_to_text(value): |
106 """Utility function to extract text content from HTML |
115 """Utility function to extract text content from HTML |
107 |
116 |