diff --git a/.gitchangelog.rc b/.gitchangelog.rc new file mode 100644 index 0000000..e5e271c --- /dev/null +++ b/.gitchangelog.rc @@ -0,0 +1,191 @@ +## +## Format +## +## ACTION: [AUDIENCE:] COMMIT_MSG [!TAG ...] +## +## Description +## +## ACTION is one of 'chg', 'fix', 'new' +## +## Is WHAT the change is about. +## +## 'chg' is for refactor, small improvement, cosmetic changes... +## 'fix' is for bug fixes +## 'new' is for new features, big improvement +## +## AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc' +## +## Is WHO is concerned by the change. +## +## 'dev' is for developers (API changes, refactors...) +## 'usr' is for final users (UI changes) +## 'pkg' is for packagers (packaging changes) +## 'test' is for testers (test only related changes) +## 'doc' is for doc guys (doc only changes) +## +## COMMIT_MSG is ... well ... the commit message itself. +## +## TAGs are additional adjective as 'refactor' 'minor' 'cosmetic' +## +## They are preceded with a '!' or a '@' (prefer the former, as the +## latter is wrongly interpreted in github.) Commonly used tags are: +## +## 'refactor' is obviously for refactoring code only +## 'minor' is for a very meaningless change (a typo, adding a comment) +## 'cosmetic' is for cosmetic driven change (re-indentation, 80-col...) +## 'wip' is for partial functionality but complete subfunctionality. +## +## Example: +## +## new: usr: support of bazaar implemented +## chg: re-indented some lines !cosmetic +## new: dev: updated code to be compatible with last version of killer lib. +## fix: pkg: updated year of licence coverage. +## new: test: added a bunch of test around user usability of feature X. +## fix: typo in spelling my name in comment. !minor +## +## Please note that multi-line commit message are supported, and only the +## first line will be considered as the "summary" of the commit message. So +## tags, and other rules only applies to the summary. 
The body of the commit +## message will be displayed in the changelog without reformatting. + + +## +## ``ignore_regexps`` is a list of regexps +## +## Any commit having its full commit message matching any regexp listed here +## will be ignored and won't be reported in the changelog. +## +ignore_regexps = [ + r'@minor', r'!minor', + r'@cosmetic', r'!cosmetic', + r'@refactor', r'!refactor', + r'@wip', r'!wip', + r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[p|P]kg:', + r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[d|D]ev:', + r'^(.{3,3}\s*:)?\s*[fF]irst commit.?\s*$', +] + + +## ``section_regexps`` is a list of 2-tuples associating a string label and a +## list of regexps +## +## Commit messages will be classified in sections thanks to this. Section +## titles are the label, and a commit is classified under this section if any +## of the regexps associated is matching. +## +section_regexps = [ + ('New', [ + r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', + ]), + ('Changes', [ + r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', + ]), + ('Fix', [ + r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', + ]), + + ('Other', None ## Match all lines + ), + +] + + +## ``body_process`` is a callable +## +## This callable will be given the original body and result will +## be used in the changelog. +## +## Available constructs are: +## +## - any python callable that takes one txt argument and returns a txt argument. +## +## - ReSub(pattern, replacement): will apply regexp substitution. +## +## - Indent(chars=" "): will indent the text with the prefix +## Please remember that template engines also get to modify the text and +## will usually indent the text themselves if needed. +## +## - Wrap(regexp=r"\n\n"): re-wrap text in separate paragraph to fill 80-Columns +## +## - noop: do nothing +## +## - ucfirst: ensure the first letter is uppercase. 
+## (usually used in the ``subject_process`` pipeline) +## +## - final_dot: ensure text finishes with a dot +## (usually used in the ``subject_process`` pipeline) +## +## - strip: remove any spaces before or after the content of the string +## +## Additionally, you can `pipe` the provided filters, for instance: +#body_process = Wrap(regexp=r'\n(?=\w+\s*:)') | Indent(chars=" ") +#body_process = Wrap(regexp=r'\n(?=\w+\s*:)') +#body_process = noop +body_process = ReSub(r'((^|\n)[A-Z]\w+(-\w+)*: .*(\n\s+.*)*)+$', r'') | strip + + +## ``subject_process`` is a callable +## +## This callable will be given the original subject and result will +## be used in the changelog. +## +## Available constructs are those listed in ``body_process`` doc. +subject_process = (strip | + ReSub(r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$', r'\4') | + ucfirst | final_dot) + + +## ``tag_filter_regexp`` is a regexp +## +## Tags that will be used for the changelog must match this regexp. +## +tag_filter_regexp = r'^v?[0-9]+\.[0-9]+(\.[0-9]+)?$' + + +## ``unreleased_version_label`` is a string +## +## This label will be used as the changelog Title of the last set of changes +## between last valid tag and HEAD if any. +unreleased_version_label = "%%version%% (unreleased)" + + +## ``output_engine`` is a callable +## +## This will change the output format of the generated changelog file +## +## Available choices are: +## +## - rest_py +## +## Legacy pure python engine, outputs ReSTructured text. +## This is the default. +## +## - mustache() +## +## Template name could be any of the available templates in +## ``templates/mustache/*.tpl``. +## Requires python package ``pystache``. +## Examples: +## - mustache("markdown") +## - mustache("restructuredtext") +## +## - makotemplate() +## +## Template name could be any of the available templates in +## ``templates/mako/*.tpl``. +## Requires python package ``mako``. 
+## Examples: +## - makotemplate("restructuredtext") +## +output_engine = rest_py +#output_engine = mustache("restructuredtext") +#output_engine = mustache("markdown") +#output_engine = makotemplate("restructuredtext") + + +## ``include_merge`` is a boolean +## +## This option tells git-log whether to include merge commits in the log. +## The default is to include them. +include_merge = True diff --git a/.gitignore b/.gitignore index 0d20b64..f721904 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.pyc +.python-version diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..d2298ed --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,135 @@ +Changelog +========= + +v0.7.2 (2017-12-02) +------------------- + +- Updated README instructions and links. [Alex Olieman] + +- Ensure that ``candidates`` returns surface forms as strings. [Alex Olieman] + +- Ensure that surface forms are always strings (merge `PR #1`_). [ShomyLiu & Alex Olieman] + +.. _PR #1: https://github.com/aolieman/pyspotlight/pull/1 + +v0.7.1 (2016-07-25) +------------------- + +- Moved the shared request logic in ``annotate`` and ``candidates`` to a + helper function. [Alex Olieman] + +- Updated setup/package files [Alex Olieman] + +- Updated README. [Luis Nell & Alex Olieman] + +v0.7.0 (2016-07-18) +------------------- + +API Changes +~~~~~~~~~~~ + +- Changed default spotter to ``'Default'`` for 0.7 compatibility. [Alex + Olieman] + +- Moved filter parameters into a ``filters`` argument. [Alex Olieman] + + * **Removed** the ``policy`` argument from ``annotate`` and ``candidates``. + * Added a types parameter, which enables server-side filtering of resources. + It also makes for a nice addition to the policy parameter. + +Additions +~~~~~~~~~ + +- Python 3 compatibility. [Alex Olieman] + +- Moved to nose2 for tests. [Alex Olieman] + +Fixes +~~~~~ + +- Updated required version of the requests package. [Alex Olieman] + +- Remove mutable default arguments. 
[Luis Nell] + +v0.6.5.2 (2013-08-27) +--------------------- + +- Add manifest so README is included on PyPI. [Luis Nell] + +v0.6.5.1 (2013-08-12) +--------------------- + +- Update README for PyPI release. [Luis Nell] + +- Upgrade to requests 1.2.3. [Luis Nell] + +- BSD License. [Luis Nell] + +- Workaround for footnotes in ``surfaceForm`` that get parsed as a list. + [Luis Nell] + +- Do not assume in ``candidates`` that ``surfaceForm`` is always a list. + [Luis Nell] + +v0.6.5 (2012-10-07) +------------------- + +API Changes +~~~~~~~~~~~ + +- Have to explicitly provide a protocol in the URL. [Luis Nell] + +Additions +~~~~~~~~~ + +- Added stuff for testing. [Luis Nell] + +- Add requirements.txt for pip. [Luis Nell] + +- Make use of requests builtin json decoding. [Luis Nell] + +Fixes +~~~~~ + +- Some README updates. [Luis Nell] + +- Add ordereddict requirement for py2.6. [Luis Nell] + +- Tests: adapt to the requests raw handling. [Luis Nell] + +- Use requests 0.14.1 from now on. [Luis Nell] + +- Fixed typos, wrong link. [Pablo Mendes] + + * Minor: We spell it DBpedia, not DBPedia :) + * Fix: Link pointed to OpenCalais, a commercial closed-source + alternative to DBpedia Spotlight + +v0.5.3 (2012-08-01) +------------------- + +- Update README to reflect the exception changes. [Luis Nell] + +- Raise requests.exceptions.HTTPError on response.status_code != 200. + [Luis Nell] + +- Prefer simplejson to json. [Luis Nell] + +- Add tests for new exception handling. [Luis Nell] + +- Add Exception Handling. [Luis Nell] + +v0.5.2 (2012-04-06) +------------------- + +- Fixes setup.py issues. v0.5.2. [Luis Nell] + +v0.5.1 (2012-03-21) +------------------- + +- Fix setup.py - push 0.5.1. [Luis Nell] + +v0.5.0 (2012-03-20) +------------------- + +- Init. [Luis Nell] diff --git a/LICENSE b/LICENSE index 1ebd153..b28ae28 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2013, Luis Nell +Modifications 2016, Alex Olieman All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, diff --git a/MANIFEST.in b/MANIFEST.in index 0c73842..36482f5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include README.rst LICENSE +include README.rst LICENSE HISTORY.rst diff --git a/README.rst b/README.rst index 4e896cf..83a8e3c 100644 --- a/README.rst +++ b/README.rst @@ -4,40 +4,45 @@ pyspotlight is a thin python wrapper around `DBpedia Spotlight`_'s `REST Interface`_. -The tested DBpedia Spotlight versions are 0.5 and 0.6.5, though it seems to also work with 0.7 as confirmed by some users. +This package is tested against DBpedia Spotlight version 0.7. As long as there are no major API overhauls, this wrapper might also -work with future versions. If you encounter a bug with a newer DBpedia version, +work with future versions. If you encounter a bug with a newer DBpedia Spotlight version, feel free to create an issue here on github. Note that we're trying to track DBpedia Spotlight release version numbers, so you can easily see which pyspotlight version has been tested with which Spotlight -release. Therefore all pyspotlight 0.5 releases are tested against -Spotlight 0.5 etc. +release. For example, all pyspotlight 0.6.x releases are compatible with +Spotlight 0.6.x, etc. While we aim for backwards-compatibility with older +Spotlight releases, it is not guaranteed. If you're using an older Spotlight +version, you may need to use an older pyspotlight version as well. -.. _`DBpedia Spotlight`: https://github.com/dbpedia-spotlight/dbpedia-spotlight#dbpedia-spotlight -.. _`REST Interface`: https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Web-service +.. _`DBpedia Spotlight`: http://www.dbpedia-spotlight.org/faq +.. _`REST Interface`: http://www.dbpedia-spotlight.org/api Installation ============ -The newest stable release can be found on the `Python Package Index (PyPi) `__. +The newest stable release can be found on the `Python Package Index (PyPI) `__. 
Therefore installation is as easy as:: pip install pyspotlight +Older releases can be installed by specifying a version:: + + pip install pyspotlight~=0.6.1 + Requirements for installation from source/github ================================================ -This module has been tested with Python 2.6 and Python 2.7. +This module has been tested with Python 2.7 and Python 3.5. As long as you use the ``setup.py`` for the installation (``python setup.py install``), you'll be fine because Python takes care of the dependencies for you. If you decide not to use the ``setup.py`` you will need the ``requests`` -library. In case you are running a Python Version older than 2.7, you will -also need to install the ``ordereddict`` module. +library. All of these packages can be found on the `Python PackageIndex`_ and easily installed via either ``easy_install`` or, `the recommended`_, ``pip``. @@ -46,7 +51,7 @@ Using ``pip`` it is especially easy because you can just do this:: pip install -r requirements.txt -and it will install all packages from that file. +and it will install all package dependencies listed in that file. .. _`Python PackageIndex`: http://pypi.python.org/ .. _`the recommended`: http://stackoverflow.com/questions/3220404/why-use-pip-over-easy-install @@ -54,25 +59,7 @@ and it will install all packages from that file. Usage ===== -if you just want to play around with spotlight, there is a running version -available under ``http://spotlight.sztaki.hu:LANG_PORT/rest/annotate``, where ``LANG_PORT`` is one of the following depending on the language you want to annotate (thx to @robert-boulanger in Issue #10):: - - LANG_PORTS = { - "english": '2222', - "german": '2226', - "dutch": '2232', - "hungarian": '2229', - "french": '2225', - "portuguese": '2228', - "italian": '2230', - "russian": '2227', - "turkish": '2235', - "spanish": '2231' - } - -(Also the public server doesn't like the ``LingPipeSpotter``, which is used by *pyspotlight* by default. 
To work around this, simply pass ``spotter='Default'`` to the ``annotate()`` call) - -Usage is simple and easy, just as is the API:: +Usage is simple and easy, just as the API is:: >>> import spotlight >>> annotations = spotlight.annotate('http://localhost/rest/annotate', @@ -86,17 +73,70 @@ Assuming we did this for the following text:: We might get this back:: - >>> annotation - [{u'URI': u'http://dbpedia.org/resource/Presidency_of_Barack_Obama', - u'offset': 0, - u'percentageOfSecondRank': -1.0, - u'similarityScore': 0.10031112283468246, - u'support': 134, - u'surfaceForm': u'President Obama', - u'types': u'DBpedia:OfficeHolder,DBpedia:Person,Schema:Person,Freebase:/book/book_subject,Freebase:/book,Freebase:/book/periodical_subject,Freebase:/media_common/quotation_subject,Freebase:/media_common'},…(truncated remaining elements)…] - -The same parameters apply to the ``spotlight.candidates`` function. - + >>> spotlight.annotate('http://localhost/rest/annotate', sample_txt) + [ + { + 'URI': 'http://dbpedia.org/resource/Presidency_of_Barack_Obama', + 'offset': 0, + 'percentageOfSecondRank': -1.0, + 'similarityScore': 0.10031112283468246, + 'support': 134, + 'surfaceForm': 'President Obama', + 'types': 'DBpedia:OfficeHolder,DBpedia:Person,Schema:Person,Freebase:/book/book_subject,Freebase:/book,Freebase:/book/periodical_subject,Freebase:/media_common/quotation_subject,Freebase:/media_common' + }, + …(truncated remaining elements)… + ] + +Any additional filter parameters that are supported by the Spotlight API +can be passed to the ``filters`` argument in a dictionary. + +For example:: + + >>> only_person_filter = { + ... 'policy': "whitelist", + ... 'types': "DBpedia:Person", + ... 'coreferenceResolution': False + ... } + + >>> spotlight.annotate( + ... "http://localhost/rest/annotate", + ... "Any collaboration between Shakira and Metallica seems highly unlikely.", + ... filters=only_person_filter + ... 
) + + [{ + 'URI': 'http://dbpedia.org/resource/Shakira', + 'offset': 26, + 'percentageOfSecondRank': 1.511934771738109e-09, + 'similarityScore': 0.9999999984880361, + 'support': 2587, + 'surfaceForm': 'Shakira', + 'types': 'Schema:MusicGroup,DBpedia:Agent,Schema:Person,DBpedia:Person,DBpedia:Artist,DBpedia:MusicalArtist' + }] + +The same parameters apply to the ``spotlight.candidates`` function, +which returns a list of all matching candidate entities rather than +only the top candidate. + +Note that the Spotlight API may support other interfaces that have not been +implemented in pyspotlight. Feel free to contribute :-)! + +Running DBpedia Spotlight +----------------------------- +If you just want to play around with Spotlight, there is an interactive demo +available at `demo.dbpedia-spotlight.org`_. To submit pyspotlight +requests to the demo servers, you may use the endpoints found in `sites.xml`_. + +.. _demo.dbpedia-spotlight.org : http://demo.dbpedia-spotlight.org/ +.. _sites.xml: http://demo.dbpedia-spotlight.org/config/sites.xml + +For any significant Spotlight usage, it is strongly recommended to run +your own server. Please follow the `installation instructions`_. + +.. _installation instructions: http://www.dbpedia-spotlight.org/faq#i-want-to-install-the-tool-how-do-i-do + +Exceptions +---------- The following exceptions can occur: * ``ValueError`` when: @@ -109,9 +149,9 @@ The following exceptions can occur: excepted. - You forgot to explicitly specify a protocol (http/https) in the API URL. - Usually the exception's message is telling you *exactly* what is wrong. If - not, I might have forgotten some error handling. So just open up an issue on - github. + Usually the exception's message tells you *exactly* what is wrong. If + not, we might have forgotten some error handling. So just open up an issue on + github if you encounter unexpected exceptions. 
* ``requests.exceptions.HTTPError`` @@ -119,17 +159,12 @@ The following exceptions can occur: if you have a load balancer like nginx in front of your spotlight cluster and there is not a single server available, so nginx throws a ``502 Bad Gateway``. - -Note that the API also supports a ``disambiguate`` interface, however I wasn't -able to get it running. Therefore there is *no* ``disambiguate`` function -available. Feel free to contribute :-)! - Tips ==== -I'd highly recommend playing around with the *confidence* and *support* values. +We highly recommend playing around with the *confidence* and *support* values. Furthermore it might be preferable to filter out more annotations by looking -at their *smiliarityScore* (read: contextual score). +at their *similarityScore* (read: contextual score). If you want to change the default values, feel free to use ``itertools.partial`` to create a little wrapper with simplified signature:: @@ -138,26 +173,31 @@ to create a little wrapper with simplified signature:: >>> from functools import partial >>> api = partial(annotate, 'http://localhost/rest/annotate', ... confidence=0.4, support=20, - ... spotter='AtLeastOneNounSelector') - >>> api('This is your test text. This function has other confidence, - ... support and uses another spotter. Furthermore all calls go - ... directl to localhost/rest/annotate.') + ... spotter='SpotXmlParser') + >>> api('This is your test text. This function uses a non-default + ... confidence, support, and spotter. Furthermore all calls go + ... directly to localhost/rest/annotate.') As you can see this reduces the function's complexity greatly. -I did not feel the need to create fancy classes, they would've just lead to -more complexity. +Pyspotlight provides an interface based on functions rather than classes, +to avoid an unnecessary layer of indirection. Tests ===== -If you want to run the tests, you will have to install ``nose`` (1.2.1) from the -package index. 
Then you can simply run ``nosetests`` from the command line in +If you want to run the tests, you will have to install ``nose2`` (~0.6) from PyPI. +Then you can simply run ``nose2`` from the command line in this or the ``spotlight/`` directory. +All development and regular dependencies can be installed with a single command:: + + pip install -r requirements-dev.txt + + Bugs ==== In case you spot a bug, please open an issue and attach the raw response you -sent. Have a look at `Issue #3`_ for a great example on how to file a bug report. +sent. Have a look at `ubergrape/pyspotlight#3`_ for an example on how to file a good bug report. -.. _`Issue #3`: https://github.com/newsgrape/pyspotlight/issues/3 +.. _`ubergrape/pyspotlight#3`: https://github.com/ubergrape/pyspotlight/issues/3 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..1e742a7 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +-r requirements.txt +nose2~=0.6 +wheel +gitchangelog~=3.0 diff --git a/requirements.txt b/requirements.txt index 5ce7838..42b529e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ -# Testing also requires nose>=1.2.1 -requests==1.2.3 +requests~=2.10 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2a9acf1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal = 1 diff --git a/setup.py b/setup.py index 64e61b7..e44fce7 100644 --- a/setup.py +++ b/setup.py @@ -2,43 +2,48 @@ # coding: utf-8 from setuptools import setup from setuptools import find_packages +from io import open classifiers = [ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Operating System :: OS Independent", - "Topic :: Software Development :: Libraries", - "Environment :: Web Environment", - "License :: OSI Approved :: BSD License", - "Development Status :: 5 - Production/Stable", + 'Intended Audience :: Developers', + 'Operating System :: OS Independent', + 'Topic :: Software Development 
:: Libraries', + 'Environment :: Web Environment', + 'License :: OSI Approved :: BSD License', + 'Development Status :: 5 - Production/Stable', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ] -requires = ["requests==1.2.3", ] - -# This might not be the best idea. -try: - import json -except ImportError: - requires.append('simplejson>=2.0') - +requires = [ + 'requests~=2.10', +] +tests_require = [ + 'nose2~=0.6', +] -# Python 2.6 does not ship with an OrderedDict implementation. -# God save the cheeseshop! -try: - from collections import OrderedDict -except ImportError: - requires.append('ordereddict>=1.1') +with open('README.rst', 'r', encoding='utf-8') as f: + readme = f.read() +with open('HISTORY.rst', 'r', encoding='utf-8') as f: + history = f.read() setup(name='pyspotlight', - version='0.6.5.2', + version='0.7.2', license='BSD', - url='https://github.com/newsgrape/pyspotlight', + url='https://github.com/aolieman/pyspotlight', + author='Luis Nell', + author_email='luis.nell@simpleloop.com', + maintainer='Alex Olieman', + maintainer_email='alex@olieman.net', packages=find_packages(), - description='Python interface to the DBPedia Spotlight REST API', - long_description=open('README.rst').read(), - keywords="dbpedia spotlight semantic", + description='Python interface to the DBpedia Spotlight REST API', + long_description=readme + '\n\n' + history, + keywords=['dbpedia spotlight', 'semantic annotation', 'entity linking'], classifiers=classifiers, install_requires=requires, + tests_require=tests_require, + test_suite='nose2.collector.collector', ) diff --git a/spotlight/__init__.py b/spotlight/__init__.py index 89d5884..0d136df 100644 --- a/spotlight/__init__.py +++ b/spotlight/__init__.py @@ -4,19 +4,11 @@ This is just a simple interface to a Spotlight API. -Tested with DBPedia Spotlight 0.5 and 0.6.5. 
- -Note that I'm trying to track Spotlight release version numbers, so you can -easily see which pyspotlight version has been tested with which Spotlight -release. - -I hope the code and the small documentation speaks for itself :-) - -If you should encounter any problems, feel free to contact me on github -(originell). I'm happy to help out with anything related to my code. +Tested with DBpedia Spotlight 0.7. """ -__version_info__ = (0, 6, 5) +__version_info__ = (0, 7, 2) __version__ = '.'.join(map(str, __version_info__)) +__url__ = 'https://github.com/aolieman/pyspotlight' import requests @@ -33,6 +25,39 @@ class SpotlightException(Exception): # Some helper functions. + +def _post_request(address, payload, filters, headers): + """ + Build the Spotlight request, POST it to the server, and return + the response's JSON body. + """ + filter_kwargs = {'policy': 'whitelist'} + filter_kwargs.update(filters or {}) + payload.update(filter_kwargs) + + reqheaders = {'accept': 'application/json'} + reqheaders.update(headers or {}) + + # It's better for the user to have to explicitly provide a protocol in the + # URL, since transmissions might happen over HTTPS or any other secure or + # faster (spdy/HTTP2 :D) channel. + if '://' not in address: + raise SpotlightException('Oops. Looks like you forgot the protocol ' + '(http/https) in your url (%s).' % address) + + response = requests.post(address, data=payload, headers=reqheaders) + + # http status codes >=400,<600 shall raise an exception. + response.raise_for_status() + + json_body = response.json() + if json_body is None: + raise SpotlightException("Spotlight's response did not contain valid " + "JSON: %s" % response.text) + + return json_body + + def _convert_number(value): """ Try to convert a string to an int or float. @@ -42,7 +67,7 @@ def _convert_number(value): # Workaround for footnotes being put into Resources.surfaceForm and then # having them parsed by the JSON parser into a list. 
(issue #4) if isinstance(value, list): - value = unicode(value) + value = str(value) try: return int(value) @@ -61,7 +86,7 @@ def _dict_cleanup(dic, dict_type=dict): That way we can avoid stack fails. """ clean = dict_type() - for key, value in dic.iteritems(): + for key, value in dic.items(): if value is None: continue @@ -70,33 +95,33 @@ def _dict_cleanup(dic, dict_type=dict): try: # If this is a string or bool, # go straight to type conversion. - if (isinstance(value, basestring) or + if (hasattr(value, 'strip') or isinstance(value, bool)): raise AttributeError # Test for an iterable (list, tuple, set) value[0] # Clean up each element in the iterable - clean[key] = [_dict_cleanup(element, dict_type) - for element in value] + clean[key] = [ + _dict_cleanup(element, dict_type) + for element in value + ] except KeyError: clean[key] = _dict_cleanup(value, dict_type) except AttributeError: - clean[key] = _convert_number(value) + if key in {'surfaceForm', 'name'}: + clean[key] = value + else: + clean[key] = _convert_number(value) return clean # Main functions. -# -# I was inspired to go back to a function based approach after seeing this -# awesome talk by Jack Diederich: Stop Writing Classes -# http://pyvideo.org/video/880/stop-writing-classes -# Most of the class-based approach had the problems he described. -# Embarrassing! + def annotate(address, text, confidence=0.0, support=0, - spotter='LingPipeSpotter', disambiguator='Default', - policy='whitelist', headers=None): + spotter='Default', disambiguator='Default', + filters=None, headers=None): """ - Annotate a text. + Get semantic annotations (i.e. entity links) from a text. Can raise :exc:`requests.exceptions.HTTPError` or :exc:`SpotlightException`, depending on where the failure is (HTTP status @@ -117,10 +142,9 @@ def annotate(address, text, confidence=0.0, support=0, :type confidence: float :param support: - Only output annotations above a given prominence (support). 
- Based on my experience I would suggest you set this to something - above 20, however your experience might vary from text to text. - :type support: int + Only output annotations above a given prominence (i.e. support, + indegree on Wikipedia). + :type support: integer :param spotter: One of spotters available on your DBPedia Spotlight server. @@ -132,9 +156,24 @@ def annotate(address, text, confidence=0.0, support=0, The disambiguator to use on the annotation. :type disambiguator: string - :param policy: - The policy to be used. - :type disambiguator: string + :param filters: + Additional parameters that collectively define a filter function. + + For example: + 'policy' (string) + The policy to be used: + 'whitelist' or 'blacklist'; + 'types' (string) + Comma-separated list of types, + i.e. 'DBpedia:Agent,Schema:Organization'; + 'sparql' (string) + Select only entities that (don't) + match with the SPARQL query result; + 'coreferenceResolution' (boolean) + Annotate coreferences: true / false. + Set to false to use types (statistical only). + + :type filters: dictionary :param headers: Additional headers to be set on the request. @@ -142,65 +181,52 @@ def annotate(address, text, confidence=0.0, support=0, :rtype: list of resources """ - payload = {'confidence': confidence, 'support': support, - 'spotter': spotter, 'disambiguator': disambiguator, - 'policy': policy, 'text': text} - reqheaders = {'accept': 'application/json'} - reqheaders.update(headers or {}) + payload = { + 'confidence': confidence, + 'support': support, + 'text': text, + 'spotter': spotter, + 'disambiguator': disambiguator + } - # Its better for the user to have to explicitly provide a protocl in the - # URL, since transmissions might happen over HTTPS or any other secure or - # faster (spdy :D) channel. - if not '://' in address: - raise SpotlightException('Oops. Looks like you forgot the protocol ' - '(http/https) in your url (%s).' 
% address) + pydict = _post_request(address, payload, filters, headers) - response = requests.post(address, data=payload, headers=reqheaders) - if response.status_code != requests.codes.ok: - # Every http code besides 200 shall raise an exception. - response.raise_for_status() - - pydict = response.json() - if pydict is None: - raise SpotlightException("Spotlight's response did not contain valid " - "JSON: %s" % response.text) - - if not 'Resources' in pydict: + if 'Resources' not in pydict: raise SpotlightException( - 'No Resources found in spotlight response: %s' % pydict) + 'No Resources found in spotlight response: %s' % pydict + ) return [_dict_cleanup(resource) for resource in pydict['Resources']] -# This is more or less a duplicate of the annotate function, with just -# the return line being the difference haha. def candidates(address, text, confidence=0.0, support=0, - spotter='LingPipeSpotter', disambiguator='Default', - policy='whitelist', headers=None): + spotter='Default', disambiguator='Default', + filters=None, headers=None): """ - Get the candidates from a text. + Get the candidate entities from a text. Uses the same arguments as :meth:`annotate`. :rtype: list of surface forms """ - payload = {'confidence': confidence, 'support': support, - 'spotter': spotter, 'disambiguator': disambiguator, - 'policy': policy, 'text': text} - reqheaders = {'accept': 'application/json'} - reqheaders.update(headers or {}) - response = requests.post(address, data=payload, headers=reqheaders) - if response.status_code != requests.codes.ok: - # Every http code besides 200 shall raise an exception. 
- response.raise_for_status() + payload = { + 'confidence': confidence, + 'support': support, + 'text': text, + 'spotter': spotter, + 'disambiguator': disambiguator + } + + pydict = _post_request(address, payload, filters, headers) - pydict = response.json() - if not 'annotation' in pydict: + if 'annotation' not in pydict: raise SpotlightException( - 'No annotations found in spotlight response: %s' % pydict) - if not 'surfaceForm' in pydict['annotation']: + 'No annotations found in spotlight response: %s' % pydict + ) + if 'surfaceForm' not in pydict['annotation']: raise SpotlightException( - 'No surface forms found in spotlight response: %s' % pydict) + 'No surface forms found in spotlight response: %s' % pydict + ) # Previously we assumed that the surfaceForm is *always* a list, however # depending on how many are returned, this does not have to be the case. @@ -210,5 +236,7 @@ def candidates(address, text, confidence=0.0, support=0, except KeyError: # However note that we will *always* return a list. 
return [_dict_cleanup(pydict['annotation']['surfaceForm']), ] - return [_dict_cleanup(form) - for form in pydict['annotation']['surfaceForm']] + return [ + _dict_cleanup(form) + for form in pydict['annotation']['surfaceForm'] + ] diff --git a/spotlight/tests.py b/spotlight/tests.py index e13716e..9d7af55 100644 --- a/spotlight/tests.py +++ b/spotlight/tests.py @@ -1,19 +1,13 @@ -SKIP_ORDERED_DICT_TESTS = False -try: - from collections import OrderedDict -except ImportError: - SKIP_ORDERED_DICT_TESTS = True - import sys - sys.stderr.write('Skipping _dict_cleanup due to OrderedDict not being ' - 'available.\n') - -from collections import namedtuple -from nose.tools import eq_, nottest, raises - +from collections import namedtuple, OrderedDict +from nose2.tools.such import helper import spotlight -@nottest +# Expose unittest assertions to function-style tests +assert_equals = helper.assertEquals +assert_raises = helper.assertRaises + + def fake_request_post(self, *args, **kwargs): RawResponse = namedtuple('RawResponse', ['reason', ]) hear_me_RawR = RawResponse(reason='Just a fake reason.') @@ -32,62 +26,62 @@ def raise_for_status(self): def test_number_convert(): - eq_(spotlight._convert_number('0'), 0) - eq_(spotlight._convert_number('0.2'), 0.2) - eq_(spotlight._convert_number(True), True) - eq_(spotlight._convert_number('evi'), 'evi') + assert_equals(spotlight._convert_number('0'), 0) + assert_equals(spotlight._convert_number('0.2'), 0.2) + assert_equals(spotlight._convert_number(True), True) + assert_equals(spotlight._convert_number('evi'), 'evi') # Testing the footnote workaround. 
- eq_(spotlight._convert_number([1]), '[1]') + assert_equals(spotlight._convert_number([1]), '[1]') -@raises(spotlight.SpotlightException) def test_protocol_missing(): - spotlight.annotate('localhost', 'asdasdasd', - headers={'fake_response': 'invalid json', - 'fake_status': 502}) + with assert_raises(spotlight.SpotlightException): + spotlight.annotate('localhost', 'asdasdasd', + headers={'fake_response': b'invalid json', + 'fake_status': 502}) -@raises(spotlight.requests.exceptions.HTTPError) def test_http_fail(): - spotlight.annotate('http://localhost', 'asdasdasd', - headers={'fake_response': 'invalid json', - 'fake_status': 502}) + with assert_raises(spotlight.requests.exceptions.HTTPError): + spotlight.annotate('http://localhost', 'asdasdasd', + headers={'fake_response': b'invalid json', + 'fake_status': 502}) -@raises(ValueError) def test_annotation_invalid_json(): - spotlight.annotate('http://localhost', 'asdasdasd', - headers={'fake_response': 'invalid json'}) + with assert_raises(ValueError): + spotlight.annotate('http://localhost', 'asdasdasd', + headers={'fake_response': b'invalid json'}) -@raises(spotlight.SpotlightException) def test_missing_resources(): - spotlight.annotate('http://localhost', 'asdasdasd', - headers={'fake_response': '{"Test": "Win"}'}) + with assert_raises(spotlight.SpotlightException): + spotlight.annotate('http://localhost', 'asdasdasd', + headers={'fake_response': b'{"Test": "Win"}'}) -@raises(ValueError) def test_candidates_invalid_json(): - spotlight.annotate('http://localhost', 'asdasdasd', - headers={'fake_response': 'invalid json'}) + with assert_raises(ValueError): + spotlight.annotate('http://localhost', 'asdasdasd', + headers={'fake_response': b'invalid json'}) -@raises(spotlight.SpotlightException) def test_missing_annotation(): - spotlight.candidates('http://localhost', 'asdasdasd', - headers={'fake_response': '{"Test": "Win"}'}) + with assert_raises(spotlight.SpotlightException): + 
spotlight.candidates('http://localhost', 'asdasdasd', + headers={'fake_response': b'{"Test": "Win"}'}) -@raises(spotlight.SpotlightException) def test_missing_surfaceForms(): - spotlight.candidates('http://localhost', 'asdasdasd', - headers={'fake_response': '{"annotation": {"Test": "Win"}}'}) + with assert_raises(spotlight.SpotlightException): + spotlight.candidates('http://localhost', 'asdasdasd', + headers={'fake_response': b'{"annotation": {"Test": "Win"}}'}) def test_single_candidate(): # Test with a single returned candidate, as was reported by issue #3. # Thanks to aolieman for the awesome test data! - data = """ + data = b""" { "annotation":{ "@text":"Industrial Design at the Technische Universiteit Delft", @@ -123,61 +117,63 @@ def test_single_candidate(): candidates = spotlight.candidates('http://localhost', 'asdasdasd', headers={'fake_response': data}) expected_out = [ - {u'resource': - [ + { + 'resource': [ { - u'finalScore': 0.8754365122251001, - u'support': 3, - u'uri': u'Technische_Universiteit_Delft', - u'label': u'Technische Universiteit Delft', - u'types': u'', - u'percentageOfSecondRank': 0.1422872887244497, - u'priorScore': 2.8799662606192636e-08, - u'contextualScore': 0.9991813164782087 + 'finalScore': 0.8754365122251001, + 'support': 3, + 'uri': 'Technische_Universiteit_Delft', + 'label': 'Technische Universiteit Delft', + 'types': '', + 'percentageOfSecondRank': 0.1422872887244497, + 'priorScore': 2.8799662606192636e-08, + 'contextualScore': 0.9991813164782087 }, { - u'finalScore': 0.12456348777489806, - u'support': 521, - u'uri': u'Delft_University_of_Technology', - u'label': u'Delft University of Technology', - u'types': u'DBpedia:Agent, Schema:Organization, DBpedia:Organisation, Schema:EducationalOrganization, DBpedia:EducationalInstitution, Schema:CollegeOrUniversity, DBpedia:University', - u'percentageOfSecondRank': 0.0, - u'priorScore': 5.001541405942121e-06, - u'contextualScore': 0.0008186418452925803 + 'finalScore': 
0.12456348777489806, + 'support': 521, + 'uri': 'Delft_University_of_Technology', + 'label': 'Delft University of Technology', + 'types': 'DBpedia:Agent, Schema:Organization, DBpedia:Organisation, Schema:EducationalOrganization, DBpedia:EducationalInstitution, Schema:CollegeOrUniversity, DBpedia:University', + 'percentageOfSecondRank': 0.0, + 'priorScore': 5.001541405942121e-06, + 'contextualScore': 0.0008186418452925803 }, - ], - u'name': u'Technische Universiteit Delft', - u'offset': 25 + ], + 'name': 'Technische Universiteit Delft', + 'offset': 25 } ] - eq_(candidates, expected_out) - - -if not SKIP_ORDERED_DICT_TESTS: - def test_dict_key_cleanup(): - dirty_dict = OrderedDict() - dirty_dict['@dirty'] = 'value' - dirty_dict['@empty'] = None # None values should be removed. - dirty_dict['@recursive'] = OrderedDict() - dirty_dict['@recursive']['tests'] = '1' - dirty_dict['@recursive']['stuff'] = OrderedDict() - more = OrderedDict() - more['something'] = 'isgoingon' - moremore = OrderedDict() - moremore['@moar'] = True - moar_iterable = [more, moremore] - dirty_dict['@recursive']['stuff'] = moar_iterable - - clean_dict = OrderedDict() - clean_dict['dirty'] = 'value' - clean_dict['recursive'] = OrderedDict() - clean_dict['recursive']['tests'] = 1 - clean_dict['recursive']['stuff'] = OrderedDict() - more = OrderedDict() - more['something'] = 'isgoingon' - moremore = OrderedDict() - moremore['moar'] = True - moar_iterable = [more, moremore] - clean_dict['recursive']['stuff'] = moar_iterable - eq_(spotlight._dict_cleanup(dirty_dict, dict_type=OrderedDict), - clean_dict) + assert_equals(candidates, expected_out) + + +def test_dict_key_cleanup(): + dirty_dict = OrderedDict() + dirty_dict['@dirty'] = 'value' + dirty_dict['@empty'] = None # None values should be removed. 
+ dirty_dict['@recursive'] = OrderedDict() + dirty_dict['@recursive']['tests'] = '1' + dirty_dict['@recursive']['surfaceForm'] = '02' + dirty_dict['@recursive']['name'] = '02' + dirty_dict['@recursive']['stuff'] = OrderedDict() + more = OrderedDict() + more['something'] = 'isgoingon' + moremore = OrderedDict() + moremore['@moar'] = True + moar_iterable = [more, moremore] + dirty_dict['@recursive']['stuff'] = moar_iterable + + clean_dict = OrderedDict() + clean_dict['dirty'] = 'value' + clean_dict['recursive'] = OrderedDict() + clean_dict['recursive']['tests'] = 1 + clean_dict['recursive']['surfaceForm'] = '02' + clean_dict['recursive']['name'] = '02' + clean_dict['recursive']['stuff'] = OrderedDict() + more = OrderedDict() + more['something'] = 'isgoingon' + moremore = OrderedDict() + moremore['moar'] = True + moar_iterable = [more, moremore] + clean_dict['recursive']['stuff'] = moar_iterable + assert_equals(spotlight._dict_cleanup(dirty_dict, dict_type=OrderedDict), clean_dict)