Update string translation method
author Thierry Florac <thierry.florac@onf.fr>
Fri, 20 Mar 2015 17:13:14 +0100
changeset 25 60cdded859d4
parent 24 98d0305e0587
child 26 5e36949fce6e
Update string translation method
src/pyams_utils/doctests/README.txt
src/pyams_utils/unicode.py
--- a/src/pyams_utils/doctests/README.txt	Tue Mar 17 16:05:03 2015 +0100
+++ b/src/pyams_utils/doctests/README.txt	Fri Mar 20 17:13:14 2015 +0100
@@ -32,6 +32,19 @@
     >>> unicode.translate_string(sample, force_lower=True, spaces='-')
     'mon-titre-accentue'
 
+    >>> sample = 'Texte accentué avec "ponctuation" !'
+    >>> unicode.translate_string(sample, force_lower=True, spaces=' ')
+    'texte accentue avec ponctuation'
+    >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=False, spaces=' ')
+    'texte accentue avec "ponctuation" !'
+    >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=False, spaces='-')
+    'texte-accentue-avec-"ponctuation"-!'
+    >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=True, spaces='-')
+    'texte-accentue-avec-ponctuation'
+    >>> unicode.translate_string(sample, force_lower=True, remove_punctuation=True, spaces=' ', keep_chars='!')
+    'texte accentue avec ponctuation !'
+
+
 If the input string can contain slashes (/) or backslashes (\), they are normally removed;
 by using the 'escape_slashes' parameter, the input string is split and only the last element is
 returned; this is handy for handling filenames on the Windows platform:
--- a/src/pyams_utils/unicode.py	Tue Mar 17 16:05:03 2015 +0100
+++ b/src/pyams_utils/unicode.py	Fri Mar 20 17:13:14 2015 +0100
@@ -77,7 +77,8 @@
 _fillUnicodeTransTable()
 
 
-def translate_string(s, escape_slashes=False, force_lower=True, spaces=' ', keep_chars='_-.'):
+def translate_string(s, escape_slashes=False, force_lower=True,
+                     spaces=' ', remove_punctuation=True, keep_chars='_-.'):
     """Remove extended characters from string and replace them with 'basic' ones
     
     @param s: text to be cleaned.
@@ -95,10 +96,12 @@
     if isinstance(s, bytes):
         s = s.decode("utf-8", "replace")
     s = s.translate(_unicodeTransTable)
-    s = ''.join([a for a in s.translate(_unicodeTransTable)
-                 if a.replace(' ', '-') in (string.ascii_letters + string.digits + (keep_chars or ''))])
+    if remove_punctuation:
+        punctuation = ''.join(filter(lambda x: x not in keep_chars, string.punctuation))
+        s = ''.join(filter(lambda x: x not in punctuation, s))
     if force_lower:
         s = s.lower()
+    s = s.strip()
     if spaces != ' ':
         s = s.replace(' ', spaces)
     return s