Source code for orangecontrib.text.nyt

import json
import math
import os
import shelve
import warnings
from datetime import date
from time import sleep
from urllib import request, parse
from http.client import HTTPException
from urllib.error import HTTPError, URLError


from Orange import data
from Orange.canvas.utils import environ
from orangecontrib.text.corpus import Corpus

SLEEP = 1
TIMEOUT = 10
MAX_DOCS = 1000
BATCH_SIZE = 10
MIN_DATE = date(1851, 1, 1)
BASE_URL = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'


class NYT:
    """ Class for fetching records from the NYT API. """

    @staticmethod
    def keywords(doc, name):
        return ', '.join([kw.get('value') for kw in doc.get('keywords', [])
                          if kw['name'] == name])

    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
    ]

    tv = data.TimeVariable('Publication Date')

    metas = [
        (data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
        (data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
        (data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
        (data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
        (data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
        (data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
        (data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
        (data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
        (data.DiscreteVariable('Word Count'), lambda doc: doc.get('word_count', None)),
    ]

    text_features = [metas[0][0], metas[1][0]]    # headline + abstract
    def __init__(self, api_key):
        """
        Args:
            api_key (str): NY Times API key.
        """
        self.api_key = api_key
        self.on_error = None
        self.on_rate_limit = None
        self.on_no_connection = None
        self.cache_path = None
        self._cache_init()
    def api_key_valid(self):
        """ Checks whether the API key given at initialization is valid. """
        url = self._encode_url('test')
        try:
            with request.urlopen(url) as connection:
                if connection.getcode() == 200:
                    return True
        except (HTTPError, URLError, HTTPException):
            return False
        return False
    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximal number of documents to return.
            on_progress (callback): Called after every downloaded batch.
            should_break (callback): Callback for breaking the computation
                before the end. If it evaluates to True, downloading is
                stopped and the documents downloaded so far are returned in
                a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO create corpus on the fly and extend, so it stops faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
            if callable(should_break) and should_break():
                break
            if go_sleep:
                sleep(SLEEP)
            data, go_sleep = self._fetch_page(query, date_from, date_to, page)
            if data is None:
                break
            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])
    def _cache_init(self):
        """ Initialize the cache in the Orange environment buffer dir. """
        path = os.path.join(environ.buffer_dir, "nytcache")
        try:
            if not os.path.exists(path):
                os.makedirs(path)
            self.cache_path = os.path.join(path, "query_cache")
        except OSError as e:
            warnings.warn('Could not initialize NYT cache: {}'.format(str(e)),
                          RuntimeWarning)

    def _cache_fetch(self, url):
        """ Fetch a URL from the cache if present. """
        with shelve.open(self.cache_path) as cache:
            if url in cache:
                return cache[url]
            else:
                return None

    def _cache_store(self, url, data):
        """ Store data for a URL in the cache. """
        with shelve.open(self.cache_path) as cache:
            cache[url] = data

    def _fetch_page(self, query, date_from, date_to, page):
        """ Fetch one page either from the cache or the web. """
        cache_url = self._encode_url(query, date_from, date_to, page,
                                     for_caching=True)
        data = self._cache_fetch(cache_url)
        if data:
            return data, False
        else:
            url = self._encode_url(query, date_from, date_to, page,
                                   for_caching=False)
            try:
                with request.urlopen(url, timeout=TIMEOUT) as conn:
                    data = conn.read().decode('utf-8')
            except HTTPError as e:
                if e.code == 403 and page > 0:
                    # Occasionally some pages return error 403 (Forbidden)
                    # while all other page numbers work just fine.
                    # Skip such pages and don't break loading!
                    warnings.warn('NYT API returned HTTPError with code 403 '
                                  '(Forbidden)! Skipping this page ...')
                    return {'response': {'docs': []}}, True
                if e.code == 429 and callable(self.on_rate_limit):
                    self.on_rate_limit()
                elif callable(self.on_error):
                    self.on_error(str(e))
                return None, False
            except URLError:
                if callable(self.on_no_connection):
                    self.on_no_connection()
                    return None, False
                raise
            data = json.loads(data)
            self._cache_store(cache_url, data)
            return data, True

    def _encode_url(self, query, date_from=None, date_to=None, page=0,
                    for_caching=False):
        """ Encode the URL for the given query, date restrictions and page
        number.

        Args:
            query (str): Search query.
            date_from (date): Date restriction.
            date_to (date): Date restriction.
            page (int): Page number.
            for_caching (bool): Whether the URL will be used for caching.
                If set, exclude BASE_URL and the API key.

        Returns:
            str: An encoded URL.
        """
        params = [  # list required to preserve order - important for caching
            ('fq', 'The New York Times'),
            ('api-key', self.api_key),
            ('q', query),
            ('page', page),
        ]
        if date_from:
            params.append(('begin_date', date_from.strftime('%Y%m%d')))
        if date_to:
            params.append(('end_date', date_to.strftime('%Y%m%d')))
        if for_caching:
            # remove the api key, return only the params
            del params[1]
            return parse.urlencode(params)
        else:
            return '{}?{}'.format(BASE_URL, parse.urlencode(params))
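
A minimal usage sketch (not part of the module above): it assumes a valid NYT API key is available, here read from a hypothetical NYT_API_KEY environment variable, and the example query and progress callback are illustrative only.

# Usage sketch, assuming NYT_API_KEY is a (hypothetical) environment variable
# holding a valid key; search() returns an Orange Corpus, or None on failure.
import os
from datetime import date

from orangecontrib.text.nyt import NYT, MIN_DATE

nyt = NYT(os.environ['NYT_API_KEY'])
if nyt.api_key_valid():
    corpus = nyt.search('ljubljana', date_from=MIN_DATE, date_to=date.today(),
                        max_docs=100,
                        on_progress=lambda done, total: print(done, '/', total))
    if corpus is not None:
        print('Retrieved', len(corpus), 'documents')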