# HG changeset patch # User Thierry Florac # Date 1426867994 -3600 # Node ID 60cdded859d4ea97f8f8e0b90798d27ce3ac5d78 # Parent 98d0305e0587e2bcc6bbe5e17b798e67d5ca9d56 Update string translation method diff -r 98d0305e0587 -r 60cdded859d4 src/pyams_utils/doctests/README.txt --- a/src/pyams_utils/doctests/README.txt Tue Mar 17 16:05:03 2015 +0100 +++ b/src/pyams_utils/doctests/README.txt Fri Mar 20 17:13:14 2015 +0100 @@ -32,6 +32,19 @@ >>> unicode.translate_string(sample, force_lower=True, spaces='-') 'mon-titre-accentue' + >>> sample = 'Texte accentué avec "ponctuation" !' + >>> unicode.translate_string(sample, force_lower=True, spaces=' ') + 'texte accentue avec ponctuation' + >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=False, spaces=' ') + 'texte accentue avec "ponctuation" !' + >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=False, spaces='-') + 'texte-accentue-avec-"ponctuation"-!' + >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=True, spaces='-') + 'texte-accentue-avec-ponctuation' + >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=True, spaces=' ', keep_chars='!') + 'texte accentue avec ponctuation !' 
+ + If the input string can contain 'slashes' (/) or 'backslashes' (\), they are normally removed; by using the 'escape_slashes' parameter, the input string is split and only the last element is returned; this is handy for handling filenames on the Windows platform: diff -r 98d0305e0587 -r 60cdded859d4 src/pyams_utils/unicode.py --- a/src/pyams_utils/unicode.py Tue Mar 17 16:05:03 2015 +0100 +++ b/src/pyams_utils/unicode.py Fri Mar 20 17:13:14 2015 +0100 @@ -77,7 +77,8 @@ _fillUnicodeTransTable() -def translate_string(s, escape_slashes=False, force_lower=True, spaces=' ', keep_chars='_-.'): +def translate_string(s, escape_slashes=False, force_lower=True, + spaces=' ', remove_punctuation=True, keep_chars='_-.'): """Remove extended characters from string and replace them with 'basic' ones @param s: text to be cleaned. @@ -95,10 +96,12 @@ if isinstance(s, bytes): s = s.decode("utf-8", "replace") s = s.translate(_unicodeTransTable) - s = ''.join([a for a in s.translate(_unicodeTransTable) - if a.replace(' ', '-') in (string.ascii_letters + string.digits + (keep_chars or ''))]) + if remove_punctuation: + punctuation = ''.join(filter(lambda x: x not in keep_chars, string.punctuation)) + s = ''.join(filter(lambda x: x not in punctuation, s)) if force_lower: s = s.lower() + s = s.strip() if spaces != ' ': s = s.replace(' ', spaces) return s