src/pyams_catalog/nltk.py
changeset 2 959d098e49d9
parent 0 15b51dd45bab
equal deleted inserted replaced
1:5add0249012c 2:959d098e49d9
    18 # import interfaces
    18 # import interfaces
    19 from hypatia.text.interfaces import IPipelineElement
    19 from hypatia.text.interfaces import IPipelineElement
    20 
    20 
    21 # import packages
    21 # import packages
    22 import nltk
    22 import nltk
       
    23 from pyams_i18n.language import BASE_LANGUAGES
    23 from pyams_utils.unicode import translate_string
    24 from pyams_utils.unicode import translate_string
    24 from zope.interface import implementer
    25 from zope.interface import implementer
    25 
    26 
    26 
    27 
    27 @implementer(IPipelineElement)
    28 @implementer(IPipelineElement)
    28 class NltkStemmedTextProcessor(object):
    29 class NltkStemmedTextProcessor(object):
    29     """NLTK based text processor using stemmer"""
    30     """NLTK based text processor using stemmer"""
    30 
    31 
    31     def __init__(self, language='english'):
    32     def __init__(self, language='english'):
       
    33         if language in BASE_LANGUAGES:
       
    34             language = BASE_LANGUAGES[language].lower()
    32         self.language = language
    35         self.language = language
    33         self.stemmer = nltk.stem.SnowballStemmer(language, ignore_stopwords=True)
    36         self.stemmer = nltk.stem.SnowballStemmer(language, ignore_stopwords=True)
    34 
    37 
    35     def process(self, lst):
    38     def process(self, lst):
    36         result = []
    39         result = []
    56 @implementer(IPipelineElement)
    59 @implementer(IPipelineElement)
    57 class NltkFullTextProcessor(object):
    60 class NltkFullTextProcessor(object):
    58     """NLTK based full text processor"""
    61     """NLTK based full text processor"""
    59 
    62 
    60     def __init__(self, language='english'):
    63     def __init__(self, language='english'):
       
    64         if language in BASE_LANGUAGES:
       
    65             language = BASE_LANGUAGES[language].lower()
    61         self.language = language
    66         self.language = language
    62 
    67 
    63     def process(self, lst):
    68     def process(self, lst):
    64         result = []
    69         result = []
    65         for s in lst:
    70         for s in lst: