""" This module fetches data from The Guardian API.

To use it, first create :class:`TheGuardianCredentials`:

    >>> from orangecontrib.text.guardian import TheGuardianCredentials
    >>> credentials = TheGuardianCredentials('<your-api-key>')

Then create a :class:`TheGuardianAPI` object and use it for searching:

    >>> from orangecontrib.text.guardian import TheGuardianAPI
    >>> api = TheGuardianAPI(credentials)
    >>> corpus = api.search('Slovenia', max_documents=10)
    >>> len(corpus)
    10
"""
import requests
import math
import json
from Orange import data
from orangecontrib.text.corpus import Corpus
# Search endpoint of The Guardian Content API. HTTPS is used because the API
# key is sent as a query parameter and must not travel in clear text.
BASE_URL = 'https://content.guardianapis.com/search'
# Number of articles counted per result page when turning a document limit
# into a page count. NOTE(review): requests in this module never send a
# page size, so this presumably mirrors the API's default -- confirm.
ARTICLES_PER_PAGE = 10
class TheGuardianCredentials:
    """ The Guardian API credentials. """

    def __init__(self, key):
        """
        Args:
            key (str): The Guardian API key. Use `test` for testing purposes.
        """
        self.key = key

    @property
    def valid(self):
        """ Check if given API key is valid.

        Issues a GET request to ``BASE_URL`` carrying only the key; every
        status code other than 403 is treated as a valid key.
        """
        response = requests.get(BASE_URL, {'api-key': self.key})
        return response.status_code != 403  # 403 == Forbidden

    def __eq__(self, other):
        """ Two credentials are equal when they hold the same key.

        Returns ``NotImplemented`` for foreign types so that comparisons such
        as ``credentials == None`` evaluate to ``False`` instead of raising
        ``AttributeError``.
        """
        if not isinstance(other, TheGuardianCredentials):
            return NotImplemented
        return self.key == other.key

    # Defining __eq__ already makes instances unhashable; spell it out so the
    # intent is visible (``key`` is a mutable attribute).
    __hash__ = None
class TheGuardianAPI:
    """ Search client for The Guardian API that returns results as a Corpus.

    The class-level lists describe how one result document (a dict parsed
    from the API's JSON response) is mapped to corpus columns. Every entry is
    a ``(variable, getter)`` pair where ``getter`` extracts the value from the
    document.
    """
    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
    ]

    # Kept as a named class attribute because its ``parse`` method is needed
    # inside the getter below.
    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
        (data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
        (data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
        (data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
        (tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
        (data.StringVariable('Tags'),
         lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
        (data.StringVariable('URL'), lambda doc: doc['webUrl']),
        (data.ContinuousVariable('Word Count'),
         lambda doc: doc['fields']['wordcount']),
    ]

    text_features = [metas[0][0], metas[1][0]]  # Headline + Content
    # NOTE(review): presumably relies on the corpus domain indexing metas with
    # negative indices (-1 == first meta, i.e. Headline) -- confirm.
    title_indices = [-1]  # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian Credentials.
            on_progress (callable): Function for progress reporting. Called
                with two arguments: documents processed so far and the
                expected total.
            should_break (callable): Function for early stopping. Called with
                no arguments before each additional page is fetched.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        # Fall back to no-op callbacks so callers never have to check for None.
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        """ Fetch one result page and append its documents to ``self.results``.

        When the first page is fetched, the total page count reported by the
        API is stored in ``self.pages``.
        """
        # Named ``params`` so the imported ``data`` module is not shadowed.
        params = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, params)
        parsed = json.loads(response.text)

        if page == 1:   # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        """ Build the dict of request parameters for a single result page.

        The date bounds are only included when they are not ``None``.
        """
        params = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            params['from-date'] = from_date
        if to_date is not None:
            params['to-date'] = to_date

        return params

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): A query for searching the articles by
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                of multiple consecutive search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []
        # Index of the first document added by this call; earlier entries are
        # accumulated results that must survive the trimming below.
        first_new = len(self.results)

        # The first page is always fetched; it also tells us how many pages exist.
        self._search(query, from_date, to_date)

        if max_documents:
            wanted = math.ceil(max_documents / self.per_page)
            # Never request pages beyond what the API reported, but keep the
            # count at >= 1 because the first page has already been fetched.
            pages = min(wanted, max(self.pages, 1))
        else:
            pages = self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):     # to one based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p * self.per_page, pages * self.per_page)

        if max_documents:
            # Whole pages are fetched, so drop the overshoot to honour the limit.
            del self.results[first_new + max_documents:]

        c = Corpus.from_documents(
            self.results, 'The Guardian', self.attributes, self.class_vars,
            self.metas, title_indices=self.title_indices)
        c.text_features = self.text_features
        return c
if __name__ == '__main__':
    # Manual smoke test: report whether the (empty) key is accepted, then
    # run a small search and print the resulting corpus.
    demo_credentials = TheGuardianCredentials('')
    print(demo_credentials.valid)

    guardian = TheGuardianAPI(credentials=demo_credentials)
    found = guardian.search('refugees', max_documents=10)
    print(found)