src/pyams_utils/html.py
branch dev-tf
changeset 408 cf2304af0fab
parent 292 b338586588ad
equal deleted inserted replaced
407:0037199881fb 408:cf2304af0fab
     8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
     9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
    10 # FOR A PARTICULAR PURPOSE.
    10 # FOR A PARTICULAR PURPOSE.
    11 #
    11 #
    12 
    12 
    13 __docformat__ = 'restructuredtext'
    13 """PyAMS_utils.html module
       
    14 
       
    15 This module provides functions which are used to convert HTML code to plain text, by extracting
       
    16 useful text and removing all HTML tags.
       
    17 """
       
    18 
       
    19 from html.parser import HTMLParser
       
    20 from warnings import warn
    14 
    21 
    15 
    22 
    16 # import standard library
    23 __docformat__ = 'restructuredtext'
    17 from html.parser import HTMLParser
       
    18 
       
    19 # import interfaces
       
    20 
       
    21 # import packages
       
    22 
    24 
    23 
    25 
    24 class MyHTMLParser(HTMLParser):
    26 class MyHTMLParser(HTMLParser):
    25     """HTML parser"""
    27     """HTML parser"""
    26     data = ''
    28     data = ''
    27     entitydefs = {'amp': '&', 'lt': '<', 'gt': '>',
    29     entitydefs = {'amp': '&', 'lt': '<', 'gt': '>',
    28                   'nbsp': ' ',
    30                   'nbsp': ' ',
    29                   'apos': "'", 'quot': '"',
    31                   'apos': "'", 'quot': '"',
    30                   'Agrave': 'À', 'Aacute': 'A', 'Acirc': 'Â', 'Atilde': 'A', 'Auml': 'Ä', 'Aring': 'A',
    32                   'Agrave': 'À', 'Aacute': 'A', 'Acirc': 'Â', 'Atilde': 'A',
       
    33                   'Auml': 'Ä', 'Aring': 'A',
    31                   'AElig': 'AE',
    34                   'AElig': 'AE',
    32                   'Ccedil': 'Ç',
    35                   'Ccedil': 'Ç',
    33                   'Egrave': 'É', 'Eacute': 'È', 'Ecirc': 'Ê', 'Euml': 'Ë',
    36                   'Egrave': 'É', 'Eacute': 'È', 'Ecirc': 'Ê', 'Euml': 'Ë',
    34                   'Igrave': 'I', 'Iacute': 'I', 'Icirc': 'I', 'Iuml': 'I',
    37                   'Igrave': 'I', 'Iacute': 'I', 'Icirc': 'I', 'Iuml': 'I',
    35                   'Ntilde': 'N',
    38                   'Ntilde': 'N',
    36                   'Ograve': 'O', 'Oacute': 'O', 'Ocirc': 'Ô', 'Otilde': 'O', 'Ouml': 'Ö', 'Oslash': 'O',
    39                   'Ograve': 'O', 'Oacute': 'O', 'Ocirc': 'Ô', 'Otilde': 'O',
       
    40                   'Ouml': 'Ö', 'Oslash': '0',
    37                   'Ugrave': 'Ù', 'Uacute': 'U', 'Ucirc': 'Û', 'Uuml': 'Ü',
    41                   'Ugrave': 'Ù', 'Uacute': 'U', 'Ucirc': 'Û', 'Uuml': 'Ü',
    38                   'Yacute': 'Y',
    42                   'Yacute': 'Y',
    39                   'THORN': 'T',
    43                   'THORN': 'T',
    40                   'agrave': 'à', 'aacute': 'a', 'acirc': 'â', 'atilde': 'a', 'auml': 'ä', 'aring': 'a', 'aelig': 'ae',
    44                   'agrave': 'à', 'aacute': 'a', 'acirc': 'â', 'atilde': 'a',
       
    45                   'auml': 'ä', 'aring': 'a', 'aelig': 'ae',
    41                   'ccedil': 'ç',
    46                   'ccedil': 'ç',
    42                   'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê', 'euml': 'ë',
    47                   'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê', 'euml': 'ë',
    43                   'igrave': 'i', 'iacute': 'i', 'icirc': 'î', 'iuml': 'ï',
    48                   'igrave': 'i', 'iacute': 'i', 'icirc': 'î', 'iuml': 'ï',
    44                   'ntilde': 'n',
    49                   'ntilde': 'n',
    45                   'ograve': 'o', 'oacute': 'o', 'ocirc': 'ô', 'otilde': 'o', 'ouml': 'ö', 'oslash': 'o',
    50                   'ograve': 'o', 'oacute': 'o', 'ocirc': 'ô', 'otilde': 'o',
       
    51                   'ouml': 'ö', 'oslash': 'o',
    46                   'ugrave': 'ù', 'uacute': 'u', 'ucirc': 'û', 'uuml': 'ü',
    52                   'ugrave': 'ù', 'uacute': 'u', 'ucirc': 'û', 'uuml': 'ü',
    47                   'yacute': 'y',
    53                   'yacute': 'y',
    48                   'thorn': 't',
    54                   'thorn': 't',
    49                   'yuml': 'ÿ'}
    55                   'yuml': 'ÿ'}
    50 
    56 
    82     def handle_entityref(self, name):
    88     def handle_entityref(self, name):
    83         self.data += self.entitydefs.get(name, '')
    89         self.data += self.entitydefs.get(name, '')
    84 
    90 
    85     def handle_charref(self, name):
    91     def handle_charref(self, name):
    86         try:
    92         try:
    87             n = int(name)
    93             int_value = int(name)
    88         except ValueError:
    94         except ValueError:
    89             return
    95             return
    90         if not 0 <= n <= 255:
    96         if not 0 <= int_value <= 255:
    91             return
    97             return
    92         self.handle_data(self.charrefs.get(n))
    98         self.handle_data(self.charrefs.get(int_value))
    93 
    99 
    94     def handle_starttag(self, tag, attrs):
   100     def handle_starttag(self, tag, attrs):
    95         if tag == 'td':
   101         if tag == 'td':
    96             self.data += ' '
   102             self.data += ' '
    97         elif tag == 'br':
   103         elif tag == 'br':
    98             self.data += '\n'
   104             self.data += '\n'
    99 
   105 
   100     def handle_endtag(self, tag):
   106     def handle_endtag(self, tag):
   101         if tag == 'p':
   107         if tag == 'p':
   102             self.data += '\n'
   108             self.data += '\n'
       
   109 
       
   110     def error(self, message):
       
   111         warn(message)
   103 
   112 
   104 
   113 
   105 def html_to_text(value):
   114 def html_to_text(value):
   106     """Utility function to extract text content from HTML
   115     """Utility function to extract text content from HTML
   107 
   116 
   118 
   127 
   119     >>> html = '''<div><p>Header</p><p>This is an &lt; &#242; &gt; entity.<br /></p></div>'''
   128     >>> html = '''<div><p>Header</p><p>This is an &lt; &#242; &gt; entity.<br /></p></div>'''
   120     >>> html_to_text(html)
   129     >>> html_to_text(html)
   121     'Header\\nThis is an < ò > entity.\\n\\n'
   130     'Header\\nThis is an < ò > entity.\\n\\n'
   122 
   131 
   123     >>> html = '''<div><p>Header</p><p>This is an &lt;&nbsp;&#242;&nbsp;&gt; entity.<br /></p></div>'''
   132     >>> html = '''<div><p>Header</p><p>This is an &lt;&nbsp;&#242;&nbsp;&gt; ''' + \
       
   133                '''entity.<br /></p></div>'''
   124     >>> html_to_text(html)
   134     >>> html_to_text(html)
   125     'Header\\nThis is an <\xa0ò\xa0> entity.\\n\\n'
   135     'Header\\nThis is an <\xa0ò\xa0> entity.\\n\\n'
   126     """
   136     """
   127     if value is None:
   137     if value is None:
   128         return ''
   138         return ''