Source code for orangecontrib.text.vectorization.bagofwords

""" This module constructs a new corpus with tokens as features.

First create a corpus::

    >>> from orangecontrib.text import Corpus
    >>> corpus = Corpus.from_file('deerwester')
    >>> corpus.domain
    [ | Category] {Text}

Then create a :class:`BowVectorizer` object and call transform::

    >>> from orangecontrib.text.vectorization.bagofwords import BowVectorizer
    >>> bow = BowVectorizer()
    >>> new_corpus = bow.transform(corpus)
    >>> new_corpus.domain
    [a, abc, and, applications, binary, computer, engineering, eps, error, for,
    generation, graph, human, in, interface, intersection, iv, lab, machine,
    management, measurement, minors, of, opinion, ordering, paths, perceived,
    quasi, random, relation, response, survey, system, testing, the, time, to,
    trees, unordered, user, well, widths | Category] {Text}

"""

from collections import OrderedDict
from functools import partial

import numpy as np
from Orange.util import dummy_callback
from gensim import corpora, models, matutils
from sklearn.preprocessing import normalize

from orangecontrib.text.vectorization.base import BaseVectorizer,\
    SharedTransform, VectorizationComputeValue

class BowVectorizer(BaseVectorizer):
    """Bag-of-words vectorizer that turns corpus tokens into features.

    Three independent settings control the produced values; each is stored
    as one of the string constants defined on this class and resolved
    through the matching lookup table:

    * ``wlocal``  -- per-document term weighting, looked up in ``wlocals``.
    * ``wglobal`` -- document-frequency weighting, looked up in ``wglobals``.
    * ``norm``    -- normalization of the resulting matrix, looked up in
      ``norms``.
    """

    name = 'BoW Vectorizer'

    # Option labels. They serve both as keys of the lookup tables below and
    # as the values reported by ``report``.
    COUNT = 'Count'
    BINARY = 'Binary'
    SUBLINEAR = 'Sublinear'
    NONE = '(None)'
    IDF = 'IDF'
    SMOOTH = 'Smooth IDF'
    L1 = 'L1 (Sum of elements)'
    L2 = 'L2 (Euclidean)'

    # Local weighting: each callable receives the term frequencies ``tf``.
    wlocals = OrderedDict([
        (COUNT, lambda tf: tf),
        (
            BINARY,
            # An empty input is answered with an explicit empty int array.
            lambda tf: (
                np.greater(tf, 0).astype(int)
                if tf.size
                else np.array([], dtype=int)
            ),
        ),
        (SUBLINEAR, lambda tf: 1 + np.log(tf)),
    ])

    # Global weighting: each callable receives the document frequency ``df``
    # and the number of documents ``N``.
    wglobals = OrderedDict([
        (NONE, lambda df, N: 1),
        (IDF, lambda df, N: np.log(N/df)),
        (SMOOTH, lambda df, N: np.log(1 + N/df)),
    ])

    # Normalization: ``None`` means the matrix is left untouched.
    norms = OrderedDict([
        (NONE, None),
        (L1, partial(normalize, norm='l1')),
        (L2, partial(normalize, norm='l2')),
    ])

    def __init__(self, norm=NONE, wlocal=COUNT, wglobal=NONE):
        """Store the chosen option labels.

        Args:
            norm: key of ``norms`` selecting the normalization.
            wlocal: key of ``wlocals`` selecting the local weighting.
            wglobal: key of ``wglobals`` selecting the global weighting.
        """
        self.norm = norm
        self.wlocal = wlocal
        self.wglobal = wglobal

    def _transform(self, corpus, source_dict=None, callback=dummy_callback):
        """Compute bag-of-words features and attach them to ``corpus``.

        Args:
            corpus: corpus to vectorize.
            source_dict: optional dictionary to reuse instead of building a
                new one from ``corpus`` (presumably a gensim ``Dictionary``;
                it is used through ``doc2bow``, ``len`` and indexing).
            callback: callable invoked with the progress values 0.3, 0.6,
                0.9 and 1 as the work advances.

        Returns:
            ``corpus`` itself, unchanged, when there is no vocabulary to
            work with or the corpus has no documents; otherwise the value
            returned by ``self.add_features``.
        """
        # Nothing to do without a vocabulary (own or supplied) ...
        if not (len(corpus.dictionary) or source_dict):
            return corpus
        # ... or without any documents.
        if not len(corpus):
            return corpus

        documents = list(corpus.ngrams_iterator(' ', include_postags=True))
        if source_dict:
            dic = source_dict
        else:
            dic = corpora.Dictionary(documents, prune_at=None)
        callback(0.3)

        bows = [dic.doc2bow(document) for document in documents]
        local_weight = self.wlocals[self.wlocal]
        global_weight = self.wglobals[self.wglobal]
        model = models.TfidfModel(
            dictionary=dic,
            normalize=False,
            wlocal=local_weight,
            wglobal=global_weight,
        )
        callback(0.6)

        weighted = model[bows]
        # The sparse matrix is transposed before it is normalized and
        # handed to add_features.
        X = matutils.corpus2csc(weighted, dtype=float, num_terms=len(dic)).T
        normalizer = self.norms[self.norm]
        if normalizer:
            X = normalizer(X)
        callback(0.9)

        # One compute value per dictionary entry, all sharing one transform
        # that carries the dictionary used here.
        shared_cv = SharedTransform(
            self, corpus.used_preprocessor, source_dict=dic
        )
        compute_values = [
            VectorizationComputeValue(shared_cv, dic[token_id])
            for token_id in range(len(dic))
        ]

        corpus = self.add_features(
            corpus, X, dic, compute_values, var_attrs={'bow-feature': True}
        )
        callback(1)
        return corpus

    def report(self):
        """Return the current settings as a tuple of (label, value) pairs."""
        return (
            ('Term Frequency', self.wlocal),
            ('Document Frequency', self.wglobal),
            ('Regularization', self.norm),
        )