8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
8 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
9 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS |
10 # FOR A PARTICULAR PURPOSE. |
10 # FOR A PARTICULAR PURPOSE. |
11 # |
11 # |
12 |
12 |
13 __docformat__ = 'restructuredtext' |
13 """PyAMS_utils.unicode module |
|
14 |
|
15 This module provides a small set of functions which can be used to handle unicode data and |
|
16 their bytes equivalent. |
|
17 """ |
14 |
18 |
15 import codecs |
19 import codecs |
16 import string |
20 import string |
17 |
21 |
18 |
22 __docformat__ = 'restructuredtext' |
19 _unicodeTransTable = {} |
23 |
20 |
24 |
21 |
25 _UNICODE_TRANS_TABLE = {} |
22 def _fillUnicodeTransTable(): |
26 |
|
27 |
|
28 def _fill_unicode_trans_table(): |
23 _corresp = [ |
29 _corresp = [ |
24 ("A", [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104]), |
30 ("A", [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104]), |
25 ("AE", [0x00C6]), |
31 ("AE", [0x00C6]), |
26 ("a", [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105]), |
32 ("a", [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105]), |
27 ("ae", [0x00E6]), |
33 ("ae", [0x00E6]), |
67 ("z", [0x017A, 0x017C, 0x017E]), |
73 ("z", [0x017A, 0x017C, 0x017E]), |
68 ("'", [0x2019]) |
74 ("'", [0x2019]) |
69 ] |
75 ] |
70 for char, codes in _corresp: |
76 for char, codes in _corresp: |
71 for code in codes: |
77 for code in codes: |
72 _unicodeTransTable[code] = char |
78 _UNICODE_TRANS_TABLE[code] = char |
73 |
79 |
74 |
80 |
# Populate the translation table once, when the module is imported, so that
# translate_string() can use it directly on every call.
_fill_unicode_trans_table()


_REMOVED_CHARS = '®©™…'
"""List of custom characters to remove from input strings"""
79 |
86 |
80 |
87 |
def translate_string(value, escape_slashes=False, force_lower=True,
                     spaces=' ', remove_punctuation=True, keep_chars='_-.'):
    # pylint: disable=too-many-arguments
    """Remove extended characters and diacritics from string and replace them with 'basic' ones

    :param value: text to be translated; a bytes value is first decoded as UTF-8, invalid
        sequences being replaced instead of raising an error
    :type value: str or bytes
    :param boolean escape_slashes: if True, backslashes are converted to slashes and only the
        part of the text located after the last slash is kept
    :param boolean force_lower: if True, result is automatically converted to lower case
    :param str spaces: character used to replace spaces
    :param boolean remove_punctuation: if True, all punctuation characters are removed
    :param str keep_chars: punctuation characters which are kept in the input string, even
        when *remove_punctuation* is True
    :return: text without diacritics or special characters
    :rtype: str

    >>> from pyams_utils.unicode import translate_string
    >>> input_string = 'Ceci est un test en Français !!!'
    >>> translate_string(input_string)
    'ceci est un test en francais'
    >>> translate_string(input_string, force_lower=False)
    'Ceci est un test en Francais'
    >>> translate_string(input_string, spaces='-')
    'ceci-est-un-test-en-francais'
    >>> translate_string(input_string, remove_punctuation=False)
    'ceci est un test en francais !!!'
    >>> translate_string(input_string, keep_chars='!')
    'ceci est un test en francais !!!'

    Bytes values are accepted, including when slashes are escaped:

    >>> translate_string(b'/tmp/Mon Fichier.TXT', escape_slashes=True)
    'mon fichier.txt'
    """
    # Decode before anything else: the slash handling below passes str arguments to
    # replace() and split(), which raise TypeError when called on a bytes object.
    if isinstance(value, bytes):
        value = value.decode("utf-8", "replace")
    if escape_slashes:
        # Normalize Windows separators, then keep only the last path component
        value = value.replace("\\", "/").split("/")[-1]
    value = value.strip()
    value = value.translate(_UNICODE_TRANS_TABLE)
    if remove_punctuation:
        # Built once per call as a set, so that each character of the text is checked
        # with a constant-time lookup
        removed = {char for char in string.punctuation + _REMOVED_CHARS
                   if char not in keep_chars}
        value = ''.join(char for char in value if char not in removed)
    if force_lower:
        value = value.lower()
    # Removing punctuation may leave new leading or trailing spaces
    value = value.strip()
    if spaces != ' ':
        value = value.replace(' ', spaces)
    return value
122 |
130 |
123 |
131 |
124 def nvl(value, default=''): |
132 def nvl(value, default=''): |
125 """Get specified value, or an empty string if value is empty |
133 """Get specified value, or an empty string if value is empty |
126 |
134 |
127 :param object value: value to be checked |
135 :param object value: value to be checked |
128 :param object default: default value to be returned if value is *false* |
136 :param object default: default value to be returned if value is *false* |
129 :return: input value, or *default* if value is *false* |
137 :return: input value, or *default* if value is *false* |
130 |
138 |
131 >>> from pyams_utils.unicode import nvl |
139 >>> from pyams_utils.unicode import nvl |