# Source code for orangecontrib.text.preprocess.preprocess
from orangecontrib.text.preprocess import FrequencyFilter
__all__ = ['Preprocessor']
class Preprocessor:
"""Holds document processing objects.
Attributes:
transformers (List([BaseTransformer]): transforms strings
tokenizer (BaseTokenizer): tokenizes string
normalizer (BaseNormalizer): normalizes tokens
filters (List[BaseTokenFilter]): filters unneeded tokens
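
    Example:
        A minimal usage sketch; ``WordPunctTokenizer`` and ``StopwordsFilter``
        are assumed to be the companion classes exported by this package:

        >>> p = Preprocessor(tokenizer=WordPunctTokenizer(),
        ...                  filters=[StopwordsFilter('english')])
        >>> corpus = p(corpus)  # doctest: +SKIP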
"""
def __init__(self, transformers=None, tokenizer=None,
normalizer=None, filters=None, ngrams_range=None, pos_tagger=None):
if callable(transformers):
transformers = [transformers]
if callable(filters):
filters = [filters]
self.transformers = transformers or []
self.tokenizer = tokenizer
self.filters = filters or []
self.normalizer = normalizer
self.ngrams_range = ngrams_range
self.pos_tagger = pos_tagger
self.progress = 0
self._report_frequency = 1
    def __call__(self, corpus, inplace=True, on_progress=None):
""" Runs preprocessing over a corpus.
Args:
corpus(orangecontrib.text.Corpus): A corpus to preprocess.
inplace(bool): Whether to create a new Corpus instance.
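
        Example:
            A sketch, assuming ``p`` is a configured Preprocessor and
            ``corpus`` an existing Corpus:

            >>> processed = p(corpus, inplace=False)  # original left intact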
"""
self.set_up()
self._on_progress = on_progress
if not inplace:
corpus = corpus.copy()
        self.progress = 1
        # Per-document tokenization accounts for the first 80% of the
        # progress bar; report roughly 80 times over the whole corpus,
        # but at least once per document for very small corpora.
        self._report_frequency = len(corpus) // 80 or 1
        self._len = len(corpus) / 80
tokens = list(map(self.process_document, corpus.documents))
corpus.store_tokens(tokens)
self.on_progress(80)
if self.ngrams_range is not None:
corpus.ngram_range = self.ngrams_range
if self.freq_filter is not None:
tokens, dictionary = self.freq_filter.fit_filter(corpus)
corpus.store_tokens(tokens, dictionary)
if self.pos_tagger:
self.pos_tagger.tag_corpus(corpus)
self.on_progress(100)
        corpus.used_preprocessor = self
        # Drop the progress callback; a stored reference to it breaks pickling.
        corpus.used_preprocessor._on_progress = None
self.tear_down()
return corpus
@property
def filters(self):
return self._filters
@filters.setter
def filters(self, filters):
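        # A FrequencyFilter needs corpus-wide statistics, so it is pulled out
        # of the per-document filters and applied separately in __call__.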
self._filters = []
self.freq_filter = None
for f in filters:
if isinstance(f, FrequencyFilter):
self.freq_filter = f
else:
self._filters.append(f)
    def process_document(self, document):
        """Transform, tokenize, normalize and filter a single document."""
        for transformer in self.transformers:
            document = transformer.transform(document)
        if self.tokenizer:
            tokens = self.tokenizer.tokenize(document)
        else:
            tokens = [document]
        if self.normalizer:
            tokens = self.normalizer(tokens)
        for token_filter in self.filters:  # avoid shadowing the built-in `filter`
            tokens = token_filter(tokens)
        self.progress += 1
        if self.progress % self._report_frequency == 0:
            self.on_progress(self.progress / self._len)
        return tokens
def on_progress(self, progress):
if self._on_progress:
self._on_progress(progress)
    def set_up(self):
""" Called before every __call__. Used for setting up tokenizer & filters. """
if self.tokenizer:
self.tokenizer.set_up()
for f in self.filters:
f.set_up()
    def tear_down(self):
""" Called after every __call__. Used for cleaning up tokenizer & filters. """
if self.tokenizer:
self.tokenizer.tear_down()
for f in self.filters:
f.tear_down()
def __str__(self):
return '\n'.join(['{}: {}'.format(name, value) for name, value in self.report()])
def report(self):
return (
('Transformers', ', '.join(str(tr) for tr in self.transformers)),
('Tokenizer', str(self.tokenizer)),
('Normalizer', str(self.normalizer)),
('Filters', ', '.join(str(f) for f in self.filters)),
('Ngrams range', str(self.ngrams_range)),
('Frequency filter', str(self.freq_filter)),
            ('POS tagger', str(self.pos_tagger)),
)
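
# Example: wiring a progress callback (a sketch). The callback receives the
# current progress as a percentage in the 0-100 range, with tokenization
# covering the first 80%:
#
#     def show_progress(percentage):
#         print('preprocessing: {:.0f}%'.format(percentage))
#
#     p = Preprocessor(tokenizer=WordPunctTokenizer())
#     corpus = p(corpus, on_progress=show_progress)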