40 commits
8ac8b77
py3 compatibility
aolieman Apr 1, 2019
64709ab
py3 compatible example
aolieman Apr 1, 2019
563a83a
turn relative weighwords import into absolute
aolieman Apr 2, 2019
cd698f4
codestyle / consistency
aolieman Apr 2, 2019
adba97d
replace lambda with itemgetter
aolieman Apr 3, 2019
4805760
[WIP] SignificantWordsLM init and untested E-step
aolieman Apr 9, 2019
ac737a3
estimate SWLM group model with fixed initial lambdas
aolieman Apr 10, 2019
de389c9
log initial lambda values
aolieman Apr 10, 2019
c88bf9c
omit NaNs from logsum
aolieman Apr 13, 2019
fe16fe2
test SWLM model fit
aolieman Apr 13, 2019
acc074c
add a method which pairs terms with their probabilities
aolieman Apr 13, 2019
9a6757d
prevent NaNs from causing downstream errors
aolieman Apr 13, 2019
159b335
[WIP] remove the corpus and specific layers from E (unused)
aolieman Apr 13, 2019
fe1e4ac
omit NaNs when testing for convergence;
aolieman Apr 18, 2019
c3ce35b
estimate lambdas unless they are fixed
aolieman Apr 18, 2019
bf2a031
never take NaN to be the true max
aolieman Apr 18, 2019
45fec03
log final lambdas (mean over documents)
aolieman Apr 18, 2019
7e193a8
initialize corpus with higher weight than specific
aolieman Apr 19, 2019
3061327
update tests
aolieman Apr 19, 2019
993595d
move PLM parameters to `__init__` docstring
aolieman May 10, 2019
b056279
don't count terms with a cf < thresh;
aolieman May 11, 2019
f889815
give dtypes a more explicit name
aolieman May 14, 2019
99832cb
extend swlm model and add arguments
aolieman May 14, 2019
019228a
type annotations for PLM (PEP 585-ready)
aolieman May 15, 2019
d41a79a
type annotations for logsum;
aolieman May 15, 2019
5c243f9
add type annotations and SWLM class docstring;
aolieman May 15, 2019
9a6e14b
cast lambdas to prevent mypy naggery
aolieman May 17, 2019
5bfc67b
added docstrings and reference for SWLM
aolieman May 17, 2019
124d63d
correct SWLM E-step denominator
aolieman May 17, 2019
0c01da4
extend dickens example with SWLM
aolieman May 17, 2019
c32151c
test model equivalence;
aolieman May 17, 2019
d427235
use scalar lambdas when they are fixed
aolieman May 18, 2019
cc7577f
simplify tests
aolieman May 18, 2019
ea00801
updated readme with usage examples;
aolieman May 18, 2019
b2d0687
cast 1/3 to np.floating to keep mypy quiet
aolieman May 18, 2019
8a1126f
simplify installing optional dev requirements
aolieman May 18, 2019
c792a87
docs: double backticks and float
aolieman May 18, 2019
48bfc76
fix readme example syntax
aolieman May 20, 2019
72aff8c
use p_corpus as replacement for p_specific when len(docs) < 2
aolieman May 30, 2019
32acde2
make the specific terms estimator pluggable
aolieman Jun 4, 2019
3 changes: 2 additions & 1 deletion COPYING
@@ -1,5 +1,6 @@
WeighWords: a Python library for creating word weights/word clouds from text
Copyright 2011 University of Amsterdam
Copyright 2011-2019 University of Amsterdam
Copyright 2019 TinQwise Stamkracht

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the
74 changes: 67 additions & 7 deletions README.rst
@@ -9,28 +9,88 @@
Rather than use simple word frequency, it weighs words by statistical models
known as *parsimonious language models*. These models are good at picking up
the words that distinguish a text document from other documents in a
collection. The downside to this is that you can't use WeighWords to make a
word cloud of a single document; you need a bunch of document to compare to.
word cloud of a single document; you need a bunch of documents (i.e. a
background collection) to compare to.


Installation
------------

Either::
Either install the latest release from PyPI::

pip install weighwords

or::
or clone this git repository, and::

python setup.py install

or::

pip install -e .

Usage
-----
>>> import re
>>> quotes = [
... "Love all, trust a few, Do wrong to none",
... ...
... "A lover's eyes will gaze an eagle blind. "
... "A lover's ear will hear the lowest sound.",
... ]
>>> doc_tokens = [
... re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split()
... for quote in quotes
... ]
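
For instance, the last quote tokenizes to lowercased terms, with punctuation
and possessive ``'s`` suffixes stripped:

>>> doc_tokens[-1][:8]
['a', 'lover', 'eyes', 'will', 'gaze', 'an', 'eagle', 'blind']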

The ``ParsimoniousLM`` is initialized with all document tokens as a
background corpus, and subsequently takes a single document's tokens
as input. Its ``top`` method returns the top terms and their log-probabilities:

>>> from weighwords import ParsimoniousLM
>>> plm = ParsimoniousLM(doc_tokens, w=.1)
>>> plm.top(10, doc_tokens[-1])
[('lover', -1.871802261651365),
('will', -1.871802261651365),
('eyes', -2.5649494422113044),
('gaze', -2.5649494422113044),
('an', -2.5649494422113044),
('eagle', -2.5649494422113044),
('blind', -2.5649494422113044),
('ear', -2.5649494422113044),
('hear', -2.5649494422113044),
('lowest', -2.5649494422113044)]
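
The returned values are natural log-probabilities; exponentiate them to get
linear probabilities (a quick sketch, reusing the ``plm`` fitted above):

>>> import numpy as np
>>> [(t, round(float(np.exp(p)), 4)) for t, p in plm.top(3, doc_tokens[-1])]
[('lover', 0.1538), ('will', 0.1538), ('eyes', 0.0769)]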

The ``SignificantWordsLM`` is similarly initialized with a background corpus,
but subsequently takes a group of document tokens as input. Its ``group_top``
method returns the top terms and their probabilities:

>>> from weighwords import SignificantWordsLM
>>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2))
>>> swlm.group_top(10, doc_tokens[-3:])
[('in', 0.37875318027881),
('is', 0.07195732361699828),
('mortal', 0.07195732361699828),
('nature', 0.07195732361699828),
('all', 0.07110584778711342),
('we', 0.03597866180849914),
('true', 0.03597866180849914),
('lovers', 0.03597866180849914),
('strange', 0.03597866180849914),
('capers', 0.03597866180849914)]
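
The ``lambdas`` are the initial mixture weights of the corpus, group, and
specific language models; that ordering is an inference from the equivalence
tests in this PR. They are re-estimated during fitting unless
``fix_lambdas=True`` is passed, as in this sketch:

>>> fixed_top = swlm.group_top(10, doc_tokens[-3:], fix_lambdas=True)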

See ``example/dickens.py`` for a running example with more realistic data.

References
----------
D. Hiemstra, S. Robertson and H. Zaragoza (2004). `Parsimonious Language Models
D. Hiemstra, S. Robertson, and H. Zaragoza (2004). `Parsimonious Language Models
for Information Retrieval
<http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5806>`_.
Proc. SIGIR'04.

R. Kaptein, D. Hiemstra and J. Kamps (2010). `How different are Language Models
and word clouds? <http://riannekaptein.woelmuis.nl/2010/kapt-how10.pdf>`_
Proc. ECIR.
R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models
and word clouds? <http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.189.822>`_.
Proc. ECIR'10.

M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016).
`Luhn Revisited: Significant Words Language Models
<https://djoerdhiemstra.com/wp-content/uploads/cikm2016.pdf>`_.
Proc. CIKM'16.
50 changes: 37 additions & 13 deletions example/dickens.py
@@ -1,13 +1,16 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Find terms that distinguish various novels by Charles Dickens.
# Note: if the w parameter is set wisely, no stop list is needed.

from weighwords import ParsimoniousLM
import gzip
import logging
import numpy as np
import math
import re
from itertools import zip_longest

import numpy as np

from weighwords import ParsimoniousLM, SignificantWordsLM

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -21,28 +24,49 @@
]

startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """
endbook = """*** END OF THIS PROJECT GUTENBERG EBOOK """


def read_book(title, num):
"""Returns generator over words in book num"""

logger.info("Fetching terms from %s" % title)
path = "%s.txt.utf8.gz" % num
logger.info(f"Fetching terms from {title}")
path = f"{num}.txt.utf8.gz"
in_book = False
for ln in gzip.open(path):
if in_book:
for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split():
for ln in gzip.open(path, 'rt', encoding='utf8'):
if in_book and ln.startswith(endbook):
break
elif in_book:
for w in re.sub(r"[.,:;!?\"'‘’]", " ", ln).lower().split():
yield w
elif ln.startswith(startbook):
in_book = True


def grouper(iterable, n, filler=None):
"""Source: https://docs.python.org/3/library/itertools.html#itertools-recipes"""
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=filler)


book_contents = [(title, list(read_book(title, num))) for title, num in books]
corpus = [terms for title, terms in book_contents]

model = ParsimoniousLM([terms for title, terms in book_contents], w=.01)
plm = ParsimoniousLM(corpus, w=.01)
swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09))

for title, terms in book_contents:
print("Top %d words in %s:" % (top_k, title))
for term, p in model.top(top_k, terms):
print(" %s %.4f" % (term, np.exp(p)))
plm_top = plm.top(top_k, terms)
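# split the book's terms into ~10 equal-sized chunks that serve as the document group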
swlm_top = swlm.group_top(
top_k,
grouper(terms, math.ceil(len(terms) / 10)),
fix_lambdas=True,
)
print(f"\nTop {top_k} words in {title}:")
print(f"\n{'PLM term':<16} {'PLM p':<12} {'SWLM term':<16} {'SWLM p':<6}")
for (plm_t, plm_p), (swlm_t, swlm_p) in zip(plm_top, swlm_top):
print(f"{plm_t:<16} {np.exp(plm_p):<12.4f} {swlm_t:<16} {swlm_p:.4f}")
print("")



9 changes: 9 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,9 @@
# install the weighwords package for convenience
-e .

# testing framework
pytest ~= 4.5

# static type checking
mypy >= 0.701
https://github.com/numpy/numpy-stubs/archive/master.tar.gz
5 changes: 4 additions & 1 deletion setup.py
@@ -8,11 +8,14 @@
description = "Python library for creating word weights/word clouds from text",
keywords = "word cloud nlp language model",
license = "LGPL",
package_data = {"weighwords": ["py.typed"]},
packages = ["weighwords"],
install_requires = ["numpy>=1.4.0"],
install_requires = ["numpy>=1.15.0"],
tests_require = ["pytest"],
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)",
"Programming Language :: Python :: 3.7",
"Topic :: Text Processing",
]
)
43 changes: 43 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,43 @@
import re

import pytest


@pytest.fixture(scope="module")
def uniform_doc():
return ['one', 'two', 'three', 'four', 'five']


@pytest.fixture(scope="module")
def number_corpus():
return [
['one'],
['two', 'two'],
['three', 'three', 'three'],
['four', 'four', 'four', 'four'],
['five', 'five', 'five', 'five', 'five']
]


@pytest.fixture(scope="module")
def shakespeare_quotes():
quotes = [
"Love all, trust a few, Do wrong to none",
"But love that comes too late, "
"Like a remorseful pardon slowly carried, "
"To the great sender turns a sour offence.",
"If thou remember'st not the slightest folly "
"That ever love did make thee run into, "
"Thou hast not lov'd.",
"We that are true lovers run into strange capers; "
"but as all is mortal in nature, "
"so is all nature in love mortal in folly.",
"But are you so much in love as your rhymes speak? "
"Neither rhyme nor reason can express how much.",
"A lover's eyes will gaze an eagle blind. "
"A lover's ear will hear the lowest sound.",
]
return [
re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split()
for quote in quotes
]
58 changes: 58 additions & 0 deletions tests/test_equivalence.py
@@ -0,0 +1,58 @@
from itertools import chain

from weighwords import ParsimoniousLM, SignificantWordsLM
from weighwords.logsum import logsum


def test_model_equivalence(shakespeare_quotes):
weight = .1
plm = ParsimoniousLM(shakespeare_quotes, w=weight)
# initialize SWLM with weights that make it equivalent to PLM
swlm = SignificantWordsLM(
shakespeare_quotes,
lambdas=(1 - weight, weight, 0.)
)
plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes)

assert plm_terms == swlm_terms, 'PLM and SWLM are not functionally equivalent'


def test_model_non_equivalence(shakespeare_quotes):
weight = .1
plm = ParsimoniousLM(shakespeare_quotes, w=weight)
# initialize SWLM with weights that make it non-equivalent to PLM
swlm = SignificantWordsLM(
shakespeare_quotes,
lambdas=(1 - 2 * weight, weight, weight)
)
plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes)

assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent'


def get_p_corpus(language_model):
p_corpus = language_model.p_corpus.copy()
vocab = language_model.vocab
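# p_corpus holds log-probabilities, so scaling a (negative) log-probability
# by a multiplier > 1 lowers that term's corpus probability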
term_tiers = [
(1.5, ['love', 'folly', "lov'd", 'lovers', 'lover']),
(1.3, ['trust', 'remorseful', 'sour', 'offence', 'gaze']),
(1.1, ["remember'st", 'capers', 'rhyme', 'rhymes', 'eagle']),
]
for multiplier, terms in term_tiers:
for t in terms:
p_corpus[vocab[t]] *= multiplier

return p_corpus - logsum(p_corpus)


def fit_models(plm, swlm, docs):
# artificially reduce the corpus probability of selected terms
plm.p_corpus = swlm.p_corpus = get_p_corpus(plm)

top_k = 15
plm_top = plm.top(top_k, chain(*docs))
swlm_top = swlm.group_top(top_k, docs, fix_lambdas=True)
plm_terms = [term for term, log_prob in plm_top]
swlm_terms = [term for term, prob in swlm_top]

return plm_terms, swlm_terms
21 changes: 21 additions & 0 deletions tests/test_plm.py
@@ -0,0 +1,21 @@
import numpy as np

from weighwords import ParsimoniousLM


def test_document_model(number_corpus, uniform_doc):
plm = ParsimoniousLM(number_corpus, w=0.1, thresh=3)
tf, p_term = plm._document_model(uniform_doc)
assert (tf[:2] == 0).all(), \
"Terms with a corpus frequency < thresh should not be counted"
assert tf.sum() == 3, f"Expected tf.sum() to be 3, got {tf.sum()} instead"
linear_p_term = np.exp(p_term)
assert abs(linear_p_term[2:].sum() - 1) < 1e-10, \
f"All probability mass should be on the last 3 terms, got {linear_p_term} instead"


def test_document_model_out_of_vocabulary(number_corpus):
plm = ParsimoniousLM(number_corpus, w=0.1)
doc = ['two', 'or', 'three', 'unseen', 'words']
tf, p_term = plm._document_model(doc)
assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead"
37 changes: 37 additions & 0 deletions tests/test_swlm.py
@@ -0,0 +1,37 @@
import logging

from weighwords import SignificantWordsLM

logging.basicConfig(level=logging.INFO)


def test_model_fit_fixed(number_corpus, uniform_doc):
swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3))
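# pair each document with its mirror image so every combined doc has 6 terms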
doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))]
term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True)
expected_probs = {
"one": 0.0,
"two": 0.12373,
"three": 2e-5,
"four": 0.50303,
"five": 0.37322,
}
for term, p in expected_probs.items():
diff = abs(term_probs[term] - p)
assert diff < 1e-5, f"P({term}) != {p} with difference {diff}"


def test_model_fit_shifty(number_corpus, uniform_doc):
swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3))
doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))]
term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False)
expected_probs = {
"one": 0.0,
"two": 0.33322,
"three": 0.0,
"four": 0.66678,
"five": 0.0,
}
for term, p in expected_probs.items():
diff = abs(term_probs[term] - p)
assert diff < 1e-5, f"P({term}) != {p} with difference {diff}"
1 change: 1 addition & 0 deletions weighwords/__init__.py
@@ -1 +1,2 @@
from .parsimonious import ParsimoniousLM
from .significant_words import SignificantWordsLM
6 changes: 3 additions & 3 deletions weighwords/logsum.py
@@ -6,7 +6,7 @@
import numpy as np


def logsum(x):
def logsum(x: np.ndarray) -> np.ndarray:
"""Computes the sum of x assuming x is in the log domain.

Returns log(sum(exp(x))) while minimizing the possibility of
@@ -24,7 +24,7 @@ def logsum(x):
"""
# Use the max to normalize, as with the log this is what accumulates
# the less errors
vmax = x.max(axis=0)
out = np.log(np.sum(np.exp(x - vmax), axis=0))
vmax = np.nanmax(x, axis=0)
out = np.log(np.nansum(np.exp(x - vmax), axis=0))
out += vmax
return out
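
A quick sanity check of the NaN-aware ``logsum``, as a sketch (assumes this
branch is installed locally):

>>> import numpy as np
>>> from weighwords.logsum import logsum
>>> x = np.log(np.array([0.25, 0.25, 0.5]))
>>> bool(np.isclose(logsum(x), 0.0))  # log(0.25 + 0.25 + 0.5) == log(1)
True
>>> x_nan = np.array([np.log(0.5), np.nan, np.log(0.5)])
>>> bool(np.isclose(logsum(x_nan), 0.0))  # NaN entries are omitted
True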