src/ztfy/utils/html.py
branch ZTK-1.1
changeset 148 d3668ecd9137
parent 70 82d8de021806
equal deleted inserted replaced
147:044dc196ec8a 148:d3668ecd9137
       
     1 ### -*- coding: utf-8 -*- ####################################################
       
     2 ##############################################################################
       
     3 #
       
     4 # Copyright (c) 2008-2010 Thierry Florac <tflorac AT ulthar.net>
       
     5 # All Rights Reserved.
       
     6 #
       
     7 # This software is subject to the provisions of the Zope Public License,
       
     8 # Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
       
     9 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
       
    10 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
       
    11 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
       
    12 # FOR A PARTICULAR PURPOSE.
       
    13 #
       
    14 ##############################################################################
       
    15 
       
    16 __docformat__ = "restructuredtext"
       
    17 
       
    18 # import standard packages
       
    19 from sgmllib import SGMLParser
       
    20 
       
    21 # import Zope3 interfaces
       
    22 
       
    23 # import local interfaces
       
    24 
       
    25 # import Zope3 packages
       
    26 
       
    27 # import local packages
       
    28 
       
    29 
       
    30 class HTMLParser(SGMLParser):
       
    31 
       
    32     data = ''
       
    33     entitydefs = { 'amp': '&', 'lt': '<', 'gt': '>',
       
    34                    'apos': "'", 'quot': '"',
       
    35                    'Agrave': 'À', 'Aacute': 'A', 'Acirc': 'Â', 'Atilde': 'A', 'Auml': 'Ä', 'Aring': 'A',
       
    36                    'AElig': 'AE',
       
    37                    'Ccedil': 'Ç',
       
    38                    'Egrave': 'É', 'Eacute': 'È', 'Ecirc': 'Ê', 'Euml': 'Ë',
       
    39                    'Igrave': 'I', 'Iacute': 'I', 'Icirc': 'I', 'Iuml': 'I',
       
    40                    'Ntilde': 'N',
       
    41                    'Ograve': 'O', 'Oacute': 'O', 'Ocirc': 'Ô', 'Otilde': 'O', 'Ouml': 'Ö', 'Oslash': 'O',
       
    42                    'Ugrave': 'Ù', 'Uacute': 'U', 'Ucirc': 'Û', 'Uuml': 'Ü',
       
    43                    'Yacute': 'Y',
       
    44                    'THORN': 'T',
       
    45                    'agrave': 'à', 'aacute': 'a', 'acirc': 'â', 'atilde': 'a', 'auml': 'ä', 'aring': 'a', 'aelig': 'ae',
       
    46                    'ccedil': 'ç',
       
    47                    'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê', 'euml': 'ë',
       
    48                    'igrave': 'i', 'iacute': 'i', 'icirc': 'î', 'iuml': 'ï',
       
    49                    'ntilde': 'n',
       
    50                    'ograve': 'o', 'oacute': 'o', 'ocirc': 'ô', 'otilde': 'o', 'ouml': 'ö', 'oslash': 'o',
       
    51                    'ugrave': 'ù', 'uacute': 'u', 'ucirc': 'û', 'uuml': 'ü',
       
    52                    'yacute': 'y',
       
    53                    'thorn': 't',
       
    54                    'yuml': 'ÿ' }
       
    55 
       
    56     charrefs = {  34 : '"', 38 : '&', 39 : "'",
       
    57                   60 : '<', 62 : '>',
       
    58                  192 : 'À', 193 : 'A', 194 : 'Â', 195 : 'A', 196 : 'Ä', 197 : 'A',
       
    59                  198 : 'AE',
       
    60                  199 : 'Ç',
       
    61                  200 : 'È', 201 : 'É', 202 : 'Ê', 203 : 'Ë',
       
    62                  204 : 'I', 205 : 'I', 206 : 'Î', 207 : 'Ï',
       
    63                  208 : 'D',
       
    64                  209 : 'N',
       
    65                  210 : 'O', 211 : 'O', 212 : 'Ô', 213 : 'O', 214 : 'Ö', 216 : 'O',
       
    66                  215 : 'x',
       
    67                  217 : 'Ù', 218 : 'U', 219 : 'Û', 220 : 'Ü',
       
    68                  221 : 'Y', 222 : 'T',
       
    69                  223 : 'sz',
       
    70                  224 : 'à', 225 : 'a', 226 : 'â', 227 : 'a', 228 : 'ä', 229 : 'a',
       
    71                  230 : 'ae',
       
    72                  231 : 'ç',
       
    73                  232 : 'è', 233 : 'é', 234 : 'ê', 235 : 'ë',
       
    74                  236 : 'i', 237 : 'i', 238 : 'î', 239 : 'ï',
       
    75                  240 : 'e',
       
    76                  241 : 'n',
       
    77                  242 : 'o', 243 : 'o', 244 : 'ô', 245 : 'o', 246 : 'ö', 248 : 'o',
       
    78                  249 : 'ù', 250 : 'u', 251 : 'û', 252 : 'ü',
       
    79                  253 : 'y', 255 : 'ÿ' }
       
    80 
       
    81     def handle_data(self, data):
       
    82         try:
       
    83             self.data += data
       
    84         except:
       
    85             self.data += unicode(data, 'utf8')
       
    86 
       
    87     def handle_charref(self, name):
       
    88         try:
       
    89             n = int(name)
       
    90         except ValueError:
       
    91             self.unknown_charref(name)
       
    92             return
       
    93         if not 0 <= n <= 255:
       
    94             self.unknown_charref(name)
       
    95             return
       
    96         self.handle_data(self.charrefs.get(n) or unicode(chr(n), 'latin1'))
       
    97 
       
    98     def start_td(self, attributes):
       
    99         self.data += ' '
       
   100 
       
   101     def start_p(self, attributes):
       
   102         pass
       
   103 
       
   104     def end_p(self):
       
   105         self.data += '\n'
       
   106 
       
   107 
       
   108 def htmlToText(value):
       
   109     """Utility function to extract text content from HTML"""
       
   110     if value is None:
       
   111         return ''
       
   112     parser = HTMLParser()
       
   113     parser.feed(value)
       
   114     parser.close()
       
   115     return parser.data