src/pyams_utils/unicode.py
branch dev-tf
changeset 408 cf2304af0fab
parent 391 49d63e4bf171
equal deleted inserted replaced
407:0037199881fb 408:cf2304af0fab
     8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
     9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
    10 # FOR A PARTICULAR PURPOSE.
    10 # FOR A PARTICULAR PURPOSE.
    11 #
    11 #
    12 
    12 
    13 __docformat__ = 'restructuredtext'
    13 """PyAMS_utils.unicode module
       
    14 
       
    15 This module provides a small set of functions which can be used to handle unicode data and
       
    16 their bytes equivalent.
       
    17 """
    14 
    18 
    15 import codecs
    19 import codecs
    16 import string
    20 import string
    17 
    21 
    18 
    22 __docformat__ = 'restructuredtext'
    19 _unicodeTransTable = {}
    23 
    20 
    24 
    21 
    25 _UNICODE_TRANS_TABLE = {}
    22 def _fillUnicodeTransTable():
    26 
       
    27 
       
    28 def _fill_unicode_trans_table():
    23     _corresp = [
    29     _corresp = [
    24         ("A", [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104]),
    30         ("A", [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104]),
    25         ("AE", [0x00C6]),
    31         ("AE", [0x00C6]),
    26         ("a", [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105]),
    32         ("a", [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105]),
    27         ("ae", [0x00E6]),
    33         ("ae", [0x00E6]),
    67         ("z", [0x017A, 0x017C, 0x017E]),
    73         ("z", [0x017A, 0x017C, 0x017E]),
    68         ("'", [0x2019])
    74         ("'", [0x2019])
    69     ]
    75     ]
    70     for char, codes in _corresp:
    76     for char, codes in _corresp:
    71         for code in codes:
    77         for code in codes:
    72             _unicodeTransTable[code] = char
    78             _UNICODE_TRANS_TABLE[code] = char
    73 
    79 
    74 
    80 
    75 _fillUnicodeTransTable()
    81 _fill_unicode_trans_table()
    76 
    82 
    77 removed_chars = '®©™…'
    83 
       
    84 _REMOVED_CHARS = '®©™…'
    78 """List of custom characters to remove from input strings"""
    85 """List of custom characters to remove from input strings"""
    79 
    86 
    80 
    87 
    81 def translate_string(s, escape_slashes=False, force_lower=True,
    88 def translate_string(value, escape_slashes=False, force_lower=True,
    82                      spaces=' ', remove_punctuation=True, keep_chars='_-.'):
    89                      spaces=' ', remove_punctuation=True, keep_chars='_-.'):
       
    90     # pylint: disable=too-many-arguments
    83     """Remove extended characters and diacritics from string and replace them with 'basic' ones
    91     """Remove extended characters and diacritics from string and replace them with 'basic' ones
    84     
    92 
    85     :param str s: text to be cleaned.
    93     :param str value: text to be translated
    86     :param boolean escape_slashes: if True, slashes are also converted
    94     :param boolean escape_slashes: if True, slashes are also converted
    87     :param boolean force_lower: if True, result is automatically converted to lower case
    95     :param boolean force_lower: if True, result is automatically converted to lower case
    88     :param str spaces: character used to replace spaces
    96     :param str spaces: character used to replace spaces
    89     :param boolean remove_punctuation: if True, all punctuation characters are removed
    97     :param boolean remove_punctuation: if True, all punctuation characters are removed
    90     :param str keep_chars: characters which may be kept in the input string
    98     :param str keep_chars: characters which may be kept in the input string
    91     :return: text without diacritics or special characters
    99     :return: text without diacritics or special characters
    92 
   100 
    93     >>> from pyams_utils.unicode import translate_string
   101     >>> from pyams_utils.unicode import translate_string
    94     >>> input = 'Ceci est un test en Français !!!'
   102     >>> input_string = 'Ceci est un test en Français !!!'
    95     >>> translate_string(input)
   103     >>> translate_string(input_string)
    96     'ceci est un test en francais'
   104     'ceci est un test en francais'
    97     >>> translate_string(input, force_lower=False)
   105     >>> translate_string(input_string, force_lower=False)
    98     'Ceci est un test en Francais'
   106     'Ceci est un test en Francais'
    99     >>> translate_string(input, spaces='-')
   107     >>> translate_string(input_string, spaces='-')
   100     'ceci-est-un-test-en-francais'
   108     'ceci-est-un-test-en-francais'
   101     >>> translate_string(input, remove_punctuation=False)
   109     >>> translate_string(input_string, remove_punctuation=False)
   102     'ceci est un test en francais !!!'
   110     'ceci est un test en francais !!!'
   103     >>> translate_string(input, keep_chars='!')
   111     >>> translate_string(input_string, keep_chars='!')
   104     'ceci est un test en francais !!!'
   112     'ceci est un test en francais !!!'
   105     """
   113     """
   106     if escape_slashes:
   114     if escape_slashes:
   107         s = s.replace("\\", "/").split("/")[-1]
   115         value = value.replace("\\", "/").split("/")[-1]
   108     s = s.strip()
   116     value = value.strip()
   109     if isinstance(s, bytes):
   117     if isinstance(value, bytes):
   110         s = s.decode("utf-8", "replace")
   118         value = value.decode("utf-8", "replace")
   111     s = s.translate(_unicodeTransTable)
   119     value = value.translate(_UNICODE_TRANS_TABLE)
   112     if remove_punctuation:
   120     if remove_punctuation:
   113         punctuation = ''.join(filter(lambda x: x not in keep_chars,
   121         punctuation = ''.join(filter(lambda x: x not in keep_chars,
   114                                      string.punctuation + removed_chars))
   122                                      string.punctuation + _REMOVED_CHARS))
   115         s = ''.join(filter(lambda x: x not in punctuation, s))
   123         value = ''.join(filter(lambda x: x not in punctuation, value))
   116     if force_lower:
   124     if force_lower:
   117         s = s.lower()
   125         value = value.lower()
   118     s = s.strip()
   126     value = value.strip()
   119     if spaces != ' ':
   127     if spaces != ' ':
   120         s = s.replace(' ', spaces)
   128         value = value.replace(' ', spaces)
   121     return s
   129     return value
   122 
   130 
   123 
   131 
   124 def nvl(value, default=''):
   132 def nvl(value, default=''):
    125     """Get specified value, or *default* (an empty string unless given) if value is *false*
    133     """Get specified value, or *default* (an empty string unless given) if value is *false*
   126     
   134 
   127     :param object value: value to be checked
   135     :param object value: value to be checked
   128     :param object default: default value to be returned if value is *false*
   136     :param object default: default value to be returned if value is *false*
   129     :return: input value, or *default* if value is *false*
   137     :return: input value, or *default* if value is *false*
   130 
   138 
   131     >>> from pyams_utils.unicode import nvl
   139     >>> from pyams_utils.unicode import nvl
   139     return value or default
   147     return value or default
   140 
   148 
   141 
   149 
   142 def uninvl(value, default='', encoding='utf-8'):
   150 def uninvl(value, default='', encoding='utf-8'):
    143     """Get specified value converted to unicode, or *default* converted to unicode if value is empty
    151     """Get specified value converted to unicode, or *default* converted to unicode if value is empty
   144     
   152 
   145     :param str/bytes value: the input to be checked
   153     :param str/bytes value: the input to be checked
   146     :param default: str; default value
   154     :param default: str; default value
   147     :param encoding: str; encoding name to use for conversion
   155     :param encoding: str; encoding name to use for conversion
   148     :return: str; value, or *default* if value is empty, converted to unicode
   156     :return: str; value, or *default* if value is empty, converted to unicode
   149 
   157 
   159     """
   167     """
   160     if isinstance(value, str):
   168     if isinstance(value, str):
   161         return value
   169         return value
   162     try:
   170     try:
   163         return codecs.decode(value or default, encoding)
   171         return codecs.decode(value or default, encoding)
   164     except:
   172     except ValueError:
   165         return codecs.decode(value or default, 'latin1')
   173         return codecs.decode(value or default, 'latin1')
   166 
   174 
   167 
   175 
   168 def unidict(value, encoding='utf-8'):
   176 def unidict(value, encoding='utf-8'):
   169     """Get specified dict with values converted to unicode
   177     """Get specified dict with values converted to unicode
   170     
   178 
   171     :param dict value: input mapping of strings which may be converted to unicode
   179     :param dict value: input mapping of strings which may be converted to unicode
   172     :param str encoding: output encoding
   180     :param str encoding: output encoding
   173     :return: dict; a new mapping with each value converted to unicode
   181     :return: dict; a new mapping with each value converted to unicode
   174 
   182 
   175     >>> from pyams_utils.unicode import unidict
   183     >>> from pyams_utils.unicode import unidict
   184     return result
   192     return result
   185 
   193 
   186 
   194 
   187 def unilist(value, encoding='utf-8'):
   195 def unilist(value, encoding='utf-8'):
   188     """Get specified list with values converted to unicode
   196     """Get specified list with values converted to unicode
   189     
   197 
   190     :param list value: input list of strings which may be converted to unicode
   198     :param list value: input list of strings which may be converted to unicode
   191     :param str encoding: output encoding
   199     :param str encoding: output encoding
   192     :return: list; a new list with each value converted to unicode
   200     :return: list; a new list with each value converted to unicode
   193 
   201 
   194     >>> from pyams_utils.unicode import unilist
   202     >>> from pyams_utils.unicode import unilist