src/pyams_catalog/nltk.py
changeset 0 15b51dd45bab
child 2 959d098e49d9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/pyams_catalog/nltk.py	Thu Mar 19 15:16:09 2015 +0100
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2008-2015 Thierry Florac <tflorac AT ulthar.net>
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+
+__docformat__ = 'restructuredtext'
+
+
+# import standard library
+
+# import interfaces
+from hypatia.text.interfaces import IPipelineElement
+
+# import packages
+import nltk
+from pyams_utils.unicode import translate_string
+from zope.interface import implementer
+
+
+@implementer(IPipelineElement)
+class NltkStemmedTextProcessor(object):
+    """NLTK based pipeline element turning strings into Snowball stems, stopwords excluded"""
+
+    def __init__(self, language='english'):
+        self.language = language
+        self.stemmer = nltk.stem.SnowballStemmer(language, ignore_stopwords=True)
+
+    def _stems(self, lst, keep_chars):
+        """Return stems of the tokens found in *lst* strings, skipping stopwords and 1-char stems"""
+        stems = []
+        for text in lst:
+            cleaned = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
+            for word in nltk.word_tokenize(cleaned, self.language):
+                if word not in self.stemmer.stopwords:
+                    stem = self.stemmer.stem(word)
+                    if stem and len(stem) > 1 and stem not in self.stemmer.stopwords:
+                        stems.append(stem)
+        return stems
+
+    def process(self, lst):
+        """Stem *lst* strings, handing ``'-`` to translate_string as characters to keep"""
+        return self._stems(lst, "'-")
+
+    def processGlob(self, lst):
+        """Same as process(), with ``*`` and ``?`` also handed over as characters to keep"""
+        return self._stems(lst, "'-*?")
+
+
+@implementer(IPipelineElement)
+class NltkFullTextProcessor(object):
+    """NLTK based pipeline element turning strings into plain tokens, without stemming"""
+
+    def __init__(self, language='english'):
+        self.language = language
+
+    def _tokens(self, lst, keep_chars):
+        """Return the tokens longer than one character found in *lst* strings"""
+        words = []
+        for text in lst:
+            cleaned = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
+            words += [tok for tok in nltk.word_tokenize(cleaned, self.language) if len(tok) > 1]
+        return words
+
+    def process(self, lst):
+        """Tokenize *lst* strings, handing ``'-`` to translate_string as characters to keep"""
+        return self._tokens(lst, "'-")
+
+    def processGlob(self, lst):
+        """Same as process(), with ``*`` and ``?`` also handed over as characters to keep"""
+        return self._tokens(lst, "'-*?")