# Source code for orangecontrib.text.tag.pos

import nltk
import numpy as np

from orangecontrib.text.util import chunkable

# Import-time side effect: ask NLTK to download the two tagger resources by
# name. `quiet=True` is passed to the downloader (presumably to suppress its
# console output — confirm against the NLTK docs). The
# 'maxent_treebank_pos_tagger' resource is the one read by the
# `nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')` call
# in the `taggers` list of this module.
nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger'], quiet=True)


class POSTagger:
    """Adapter that lets an `nltk.TaggerI`-style tagger annotate a Corpus.

    Args:
        tagger: Object exposing a ``tag_sents`` method (e.g. an
            `nltk.TaggerI` implementation).
        name (str): Human-readable label, returned by ``str()``.
    """

    def __init__(self, tagger, name='POS Tagger'):
        self.name = name
        # Only the batch-tagging entry point of the wrapped tagger is kept.
        self.tag_sents = tagger.tag_sents

    def tag_corpus(self, corpus, **kwargs):
        """Attach POS tags to every token of a corpus.

        The tags are stored on ``corpus.pos_tags`` as an object-dtype NumPy
        array built from the per-document tag lists.

        Args:
            corpus (orangecontrib.text.corpus.Corpus): A corpus instance;
                its ``tokens`` attribute is what gets tagged.
            **kwargs: Forwarded unchanged to the ``@chunkable``-decorated
                ``_tag_sents`` (presumably chunking/progress options —
                see ``orangecontrib.text.util.chunkable``).

        Returns:
            The same ``corpus`` object, with ``pos_tags`` set.
        """
        tags_per_document = self._tag_sents(corpus.tokens, **kwargs)
        corpus.pos_tags = np.array(tags_per_document, dtype=object)
        return corpus

    @chunkable
    def _tag_sents(self, documents):
        """Return, for each document, the list of tags only.

        ``self.tag_sents`` yields sequences of pairs per document; the
        element at index 1 of each pair (the tag) is kept.
        """
        return [
            [pair[1] for pair in tagged_document]
            for tagged_document in self.tag_sents(documents)
        ]

    def __str__(self):
        return self.name
class StanfordPOSTagger(nltk.StanfordPOSTagger, POSTagger):
    """Stanford POS tagger that also offers the `POSTagger` corpus API.

    `nltk.StanfordPOSTagger` is listed first among the bases, so its
    attributes take precedence over those of `POSTagger`.
    """

    name = 'Stanford POS Tagger'

    @classmethod
    def check(cls, path_to_model, path_to_jar):
        """Validate `path_to_model` and `path_to_jar` by test-running a tagger.

        A tagger is instantiated from the two paths and asked to tag an
        empty sequence; failures are reported as ``ValueError``.

        Raises:
            ValueError: in case at least one of the paths is invalid.

        Notes:
            Can raise an exception if Java Development Kit is not installed
            or not properly configured.

        Examples:
            >>> try:
            ...     StanfordPOSTagger.check('path/to/model', 'path/to/stanford.jar')
            ... except ValueError as e:
            ...     print(e)
            Could not find stanford-postagger.jar jar file at path/to/stanford.jar
        """
        try:
            probe = cls(path_to_model, path_to_jar)
            # Tagging an empty sequence exercises the setup without real input.
            probe.tag(())
        except OSError as e:
            raise ValueError('Either Java SDK not installed or some of the files are invalid.\n' + str(e))
        except LookupError as e:
            # Trim surrounding spaces, '=' characters and newlines from the message.
            raise ValueError(str(e).strip(' =\n'))

    def __str__(self):
        model = self._stanford_model
        return "{} (model: {})".format(self.name, model)
# Ready-made taggers, instantiated when this module is imported:
#   * 'Averaged Perceptron Tagger' wraps a fresh `nltk.PerceptronTagger()`;
#   * 'Treebank POS Tagger (MaxEnt)' wraps the object loaded via `nltk.data.load`
#     from 'taggers/maxent_treebank_pos_tagger/english.pickle'.
# Both are wrapped in `POSTagger`, so each entry exposes `tag_corpus` and a
# display name via `str()`.
taggers = [ POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'), POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'), 'Treebank POS Tagger (MaxEnt)'), ]