From 8ac8b7762625a9513468e36df515ff89ba0e740d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 1 Apr 2019 17:35:42 +0200 Subject: [PATCH 01/40] py3 compatibility --- weighwords/parsimonious.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index becaa02..f8ab640 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -50,7 +50,7 @@ def __init__(self, documents, w, thresh=0): count[i] += 1 cf = np.empty(len(count), dtype=np.float) - for i, f in count.iteritems(): + for i, f in count.items(): cf[i] = f rare = (cf < thresh) cf -= rare * cf @@ -86,7 +86,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): tf, p_term = self._document_model(d) p_term = self._EM(tf, p_term, w, max_iter, eps) - terms = [(t, p_term[i]) for t, i in self.vocab.iteritems()] + terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, lambda tp: tp[1]) def _document_model(self, d): @@ -155,7 +155,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): try: old_error_settings = np.seterr(divide='ignore') p_term = np.asarray(p_term) - for i in xrange(1, max_iter + 1): + for i in range(1, max_iter + 1): # E-step p_term += w E = tf + p_term - np.logaddexp(p_corpus, p_term) From 64709ab71db605353fcdb7547f3afa5420676b3e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 1 Apr 2019 17:36:10 +0200 Subject: [PATCH 02/40] py3 compatible example --- example/dickens.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/example/dickens.py b/example/dickens.py index 3328f6f..ec5fd33 100755 --- a/example/dickens.py +++ b/example/dickens.py @@ -21,6 +21,7 @@ ] startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """ +endbook = """*** END OF THIS PROJECT GUTENBERG EBOOK """ def read_book(title, num): @@ -29,9 +30,11 @@ def read_book(title, num): logger.info("Fetching terms from %s" % title) path = "%s.txt.utf8.gz" % num in_book = False - for ln in gzip.open(path): - if in_book: - for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split(): + for ln in gzip.open(path, 'rt', encoding='utf8'): + if in_book and ln.startswith(endbook): + break + elif in_book: + for w in re.sub(r"[.,:;!?\"'‘’]", " ", ln).lower().split(): yield w elif ln.startswith(startbook): in_book = True From 563a83a559caff7b63cd432f24b3207ddceb09bd Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 2 Apr 2019 21:30:28 +0200 Subject: [PATCH 03/40] turn relative weighwords import into absolute --- weighwords/parsimonious.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index f8ab640..42c6e38 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -8,7 +8,7 @@ import logging import numpy as np -from .logsum import logsum +from weighwords.logsum import logsum logger = logging.getLogger(__name__) From cd698f4dc9e669fa39a9e546aca5d6dc314237a2 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 2 Apr 2019 21:31:25 +0200 Subject: [PATCH 04/40] codestyle / consistency --- weighwords/parsimonious.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 42c6e38..ba99cbb 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -64,7 +64,7 @@ def __init__(self, documents, w, thresh=0): np.seterr(**old_error_settings) def top(self, k, d, max_iter=50, eps=1e-5, w=None): - '''Get the top k terms of a document d and their log 
probabilities. + """Get the top k terms of a document d and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. @@ -81,7 +81,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Returns ------- t_p : list of (str, float) - ''' + """ tf, p_term = self._document_model(d) p_term = self._EM(tf, p_term, w, max_iter, eps) @@ -90,7 +90,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): return nlargest(k, terms, lambda tp: tp[1]) def _document_model(self, d): - '''Build document model. + """Build document model. Parameters ---------- @@ -105,7 +105,7 @@ def _document_model(self, d): Initial p_term is 1/n_distinct for terms with non-zero tf, 0 for terms with 0 tf. - ''' + """ logger.info('Gathering term probabilities') @@ -125,7 +125,7 @@ def _document_model(self, d): return tf, p_term def _EM(self, tf, p_term, w, max_iter, eps): - '''Expectation maximization. + """Expectation maximization. Parameters ---------- @@ -140,7 +140,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): ------- p_term : array of float A posteriori term probabilities. - ''' + """ logger.info('EM with max_iter=%d, eps=%g' % (max_iter, eps)) From adba97d5c8a32cefb0538e8c6cedfc33ba56c213 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 3 Apr 2019 11:56:31 +0200 Subject: [PATCH 05/40] replace lambda with itemgetter --- weighwords/parsimonious.py | 6 ++++-- weighwords/significant_words.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 weighwords/significant_words.py diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index ba99cbb..bc1d71b 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -6,6 +6,8 @@ from collections import defaultdict from heapq import nlargest import logging +from operator import itemgetter + import numpy as np from weighwords.logsum import logsum @@ -14,7 +16,7 @@ logger = logging.getLogger(__name__) -class ParsimoniousLM(object): +class ParsimoniousLM: """Language model for a set of documents. Constructing an object of this class fits a background model. The top @@ -87,7 +89,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): p_term = self._EM(tf, p_term, w, max_iter, eps) terms = [(t, p_term[i]) for t, i in self.vocab.items()] - return nlargest(k, terms, lambda tp: tp[1]) + return nlargest(k, terms, itemgetter(1)) def _document_model(self, d): """Build document model. diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py new file mode 100644 index 0000000..d78cfe5 --- /dev/null +++ b/weighwords/significant_words.py @@ -0,0 +1,8 @@ +import logging + +from weighwords import ParsimoniousLM + +logger = logging.getLogger(__name__) + + +class SignificantWordsLM(ParsimoniousLM): From 4805760c7e5b7f6080ef11a6508fafffa33d08c0 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 00:21:35 +0200 Subject: [PATCH 06/40] [WIP] SignificantWordsLM init and untested E-step --- weighwords/significant_words.py | 119 ++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index d78cfe5..729b5da 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,8 +1,127 @@ import logging +import numpy as np + from weighwords import ParsimoniousLM +from weighwords.logsum import logsum logger = logging.getLogger(__name__) class SignificantWordsLM(ParsimoniousLM): + """Language model for a set of documents. 
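For reference, the change in PATCH 05 above swaps `lambda tp: tp[1]` for `operator.itemgetter(1)` when ranking terms. A minimal standalone illustration of that top-k selection, with made-up (term, log-probability) pairs:

    from heapq import nlargest
    from operator import itemgetter

    terms = [("whale", -2.3), ("ship", -3.1), ("ahab", -1.7)]  # illustrative values
    # itemgetter(1) keys the comparison on the log probability, like `lambda tp: tp[1]`,
    # without creating a Python-level closure for every call
    nlargest(2, terms, key=itemgetter(1))  # [("ahab", -1.7), ("whale", -2.3)]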
+ + Constructing an object of this class fits a background model. The top + method can then be used to fit document-specific models, also for unseen + documents (with the same vocabulary as the background corpus). + + Parameters + ---------- + documents : iterable over iterable over terms + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur < thresh times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log prob of terms + """ + def __init__(self, documents, w, thresh=0): + super().__init__(documents, w, thresh=thresh) + self.lambda_corpus = None + self.lambda_group = None + self.lambda_specific = None + + def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): + document_models = [ + self._document_model(doc) + for doc in document_group + ] + doc_term_frequencies = [tf for tf, _ in document_models] + group_tf, p_group = self._group_model( + doc_term_frequencies + ) + try: + old_error_settings = np.seterr(divide='ignore') + doc_term_probs = [ + np.log(tf) - np.log(np.sum(tf)) + for tf in doc_term_frequencies + ] + finally: + np.seterr(**old_error_settings) + + p_specific = self._specific_model(doc_term_probs) + + if w is None: + w = self.w + general_w = specific_w = np.log(0.5 * (1 - w)) + group_w = np.log(w) + weights_shape = len(document_group) + self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) + self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) + self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + + + def _e_step(self, p_group, p_specific): + corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) + specific_numerator = np.add.outer(self.lambda_specific, p_specific) + group_numerator = np.add.outer(self.lambda_group, p_group) + denominator = [ + logsum(np.array([sp_corpus, sp_corpus, sp_specific])) + for sp_corpus, sp_corpus, sp_specific in zip( + corpus_numerator, + specific_numerator, + group_numerator + ) + ] + return { + 'corpus': corpus_numerator - denominator, + 'specific': specific_numerator - denominator, + 'group': group_numerator - denominator + } + + + @staticmethod + def _group_model(document_term_frequencies): + group_tf = np.array(document_term_frequencies).sum(axis=0) + + try: + old_error_settings = np.seterr(divide='ignore') + p_group = np.log(group_tf) - np.log(np.sum(group_tf)) + finally: + np.seterr(**old_error_settings) + + return group_tf, p_group + + @staticmethod + def _specific_model(document_term_probabilities): + # complement events: 1 - p + complements = [ + np.log1p(-np.exp(p_doc)) + for p_doc in document_term_probabilities + ] + # probability of term to be important in one doc, and not others + complement_products = np.array([ + document_term_probabilities[i] + complement + for i, dlm in enumerate(document_term_probabilities) + for j, complement in enumerate(complements) + if i != j + ]) + + try: + old_error_settings = np.seterr(divide='ignore') + # marginalize over all documents + p_specific = ( + logsum(complement_products) + - np.log( + np.count_nonzero(complement_products > np.log(0), axis=0) + ) + ) + finally: + np.seterr(**old_error_settings) + + return p_specific From ac737a32ff0e5f428a9cb41c6a9a2fc62a2f2976 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 15:28:04 +0200 Subject: [PATCH 07/40] estimate SWLM group model with fixed initial lambdas --- weighwords/significant_words.py | 47 
++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 729b5da..40ca15d 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,4 +1,6 @@ import logging +from heapq import nlargest +from operator import itemgetter import numpy as np @@ -35,8 +37,18 @@ def __init__(self, documents, w, thresh=0): self.lambda_corpus = None self.lambda_group = None self.lambda_specific = None + self.p_group = None + self.p_specific = None + + def group_top(self, k, document_group, **kwargs): + term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) + return nlargest(k, term_probabilities.items(), itemgetter(1)) + + def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): + if w is None: + w = self.w + assert 0 < w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" - def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): document_models = [ self._document_model(doc) for doc in document_group @@ -56,8 +68,6 @@ def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): p_specific = self._specific_model(doc_term_probs) - if w is None: - w = self.w general_w = specific_w = np.log(0.5 * (1 - w)) group_w = np.log(w) weights_shape = len(document_group) @@ -65,13 +75,37 @@ def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) + self.p_specific = p_specific + + exp_p_group = np.exp(p_group) + + return {t: exp_p_group[i] for t, i in self.vocab.items()} + + def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): + try: + old_error_settings = np.seterr(divide='ignore') + log_doc_tf = np.log(doc_tf) + for i in range(1, 1 + max_iter): + expectation = self._e_step(p_group, p_specific) + new_p_group = self._m_step(expectation, log_doc_tf) + + diff = new_p_group - p_group + p_group = new_p_group + if (diff < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') + break + finally: + np.seterr(**old_error_settings) + + return p_group def _e_step(self, p_group, p_specific): corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) denominator = [ - logsum(np.array([sp_corpus, sp_corpus, sp_specific])) + logsum(np.asarray([sp_corpus, sp_corpus, sp_specific])) for sp_corpus, sp_corpus, sp_specific in zip( corpus_numerator, specific_numerator, @@ -84,6 +118,11 @@ def _e_step(self, p_group, p_specific): 'group': group_numerator - denominator } + def _m_step(self, expectation, log_doc_tf): + group_numerator = logsum(log_doc_tf + expectation['group']) + p_group = group_numerator - logsum(group_numerator) + # TODO: estimate lambdas + return p_group @staticmethod def _group_model(document_term_frequencies): From de389c98c9a04a09628f4c8a738f61ac53849a35 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 18:13:30 +0200 Subject: [PATCH 08/40] log initial lambda values --- weighwords/significant_words.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 40ca15d..7962fb9 100644 --- 
a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -74,7 +74,10 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) - + logger.info( + f'Lambdas initialized to: Corpus={np.exp(general_w)}, ' + f'Group={w}, Specific={np.exp(specific_w)}' + ) self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific From c88bf9c2319d9b56b337f63b572eb068a55c24a7 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:01:28 +0200 Subject: [PATCH 09/40] omit NaNs from logsum --- weighwords/logsum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index 6df59cd..ed17d19 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -25,6 +25,6 @@ def logsum(x): # Use the max to normalize, as with the log this is what accumulates # the less errors vmax = x.max(axis=0) - out = np.log(np.sum(np.exp(x - vmax), axis=0)) + out = np.log(np.nansum(np.exp(x - vmax), axis=0)) out += vmax return out From fe16fe22393fe365a6b3dfc71f10c0ee108b48ea Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:07:38 +0200 Subject: [PATCH 10/40] test SWLM model fit --- tests/test_swlm.py | 184 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/test_swlm.py diff --git a/tests/test_swlm.py b/tests/test_swlm.py new file mode 100644 index 0000000..7ac0bf8 --- /dev/null +++ b/tests/test_swlm.py @@ -0,0 +1,184 @@ +import operator +from functools import reduce + +import numpy as np + +from weighwords.significant_words import SignificantWordsLM + +# data follows tests + + +def test_model_fit(): + # init an SWLM with uniform p_corpus + swlm = SignificantWordsLM([colors], w=0.1) + # deterministically generate some docs + doc_parts = np.array_split(list(zip(colors[:25], rcolors)), 5) + doc_group = [ + reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) + for z in zip(*doc_parts) + ] + # fit the modelk + term_probs = swlm.fit_parsimonious_group(doc_group) + expected_probs = { + "salmon": 0.04, + "chocolate": 0.03, + "snow": 0.02, + "tomato": 0.01, + "aqua": 0.0, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + + +colors = [ + "aliceblue", + "antiquewhite", + "aqua", + "aquamarine", + "azure", + "beige", + "bisque", + "black", + "blanchedalmond", + "blue", + "blueviolet", + "brown", + "burlywood", + "cadetblue", + "chartreuse", + "chocolate", + "coral", + "cornflowerblue", + "cornsilk", + "crimson", + "cyan", + "darkblue", + "darkcyan", + "darkgoldenrod", + "darkgray", + "darkgreen", + "darkgrey", + "darkkhaki", + "darkmagenta", + "darkolivegreen", + "darkorange", + "darkorchid", + "darkred", + "darksalmon", + "darkseagreen", + "darkslateblue", + "darkslategray", + "darkslategrey", + "darkturquoise", + "darkviolet", + "deeppink", + "deepskyblue", + "dimgray", + "dimgrey", + "dodgerblue", + "firebrick", + "floralwhite", + "forestgreen", + "fuchsia", + "gainsboro", + "ghostwhite", + "goldenrod", + "gold", + "gray", + "green", + "greenyellow", + "grey", + "honeydew", + "hotpink", + "indianred", + "indigo", + "ivory", + "khaki", + "lavenderblush", + "lavender", + "lawngreen", + 
"lemonchiffon", + "lightblue", + "lightcoral", + "lightcyan", + "lightgoldenrodyellow", + "lightgray", + "lightgreen", + "lightgrey", + "lightpink", + "lightsalmon", + "lightseagreen", + "lightskyblue", + "lightslategray", + "lightslategrey", + "lightsteelblue", + "lightyellow", + "lime", + "limegreen", + "linen", + "magenta", + "maroon", + "mediumaquamarine", + "mediumblue", + "mediumorchid", + "mediumpurple", + "mediumseagreen", + "mediumslateblue", + "mediumspringgreen", + "mediumturquoise", + "mediumvioletred", + "midnightblue", + "mintcream", + "mistyrose", + "moccasin", + "navajowhite", + "navy", + "oldlace", + "olive", + "olivedrab", + "orange", + "orangered", + "orchid", + "palegoldenrod", + "palegreen", + "paleturquoise", + "palevioletred", + "papayawhip", + "peachpuff", + "peru", + "pink", + "plum", + "powderblue", + "purple", + "rebeccapurple", + "red", + "rosybrown", + "royalblue", + "saddlebrown", + "salmon", + "sandybrown", + "seagreen", + "seashell", + "sienna", + "silver", + "skyblue", + "slateblue", + "slategray", + "slategrey", + "snow", + "springgreen", + "steelblue", + "tan", + "teal", + "thistle", + "tomato", + "turquoise", + "violet", + "wheat", + "white", + "whitesmoke", + "yellow", + "yellowgreen", +] +rcolors = reversed(colors) From acc074c1986bd138bde3d6904450f35f3d1f958d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:09:49 +0200 Subject: [PATCH 11/40] add a method which pairs terms with their probabilities --- weighwords/significant_words.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 7962fb9..cdc8678 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -81,9 +81,11 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific - exp_p_group = np.exp(p_group) + return self.get_term_probabilities(self.p_group) - return {t: exp_p_group[i] for t, i in self.vocab.items()} + def get_term_probabilities(self, log_prob_distribution): + probabilities = np.exp(log_prob_distribution) + return {t: probabilities[i] for t, i in self.vocab.items()} def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): try: From 9a6757db5cfdb85fbd39686d1bb5094538d66362 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:10:26 +0200 Subject: [PATCH 12/40] prevent NaNs from causing downstream errors --- weighwords/significant_words.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index cdc8678..ec7582e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -122,6 +122,11 @@ def _e_step(self, p_group, p_specific): 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } + # prevent NaNs from causing downstream errors + for v in out.values(): + v[np.isnan(v)] = np.NINF + + return out def _m_step(self, expectation, log_doc_tf): group_numerator = logsum(log_doc_tf + expectation['group']) @@ -162,9 +167,11 @@ def _specific_model(document_term_probabilities): p_specific = ( logsum(complement_products) - np.log( - np.count_nonzero(complement_products > np.log(0), axis=0) + np.count_nonzero(complement_products > np.NINF, axis=0) ) ) + # prevent NaNs from causing downstream errors + p_specific[np.isnan(p_specific)] = np.NINF finally: 
np.seterr(**old_error_settings) From 159b3359a5339e7c7d73fbf6abe0aca96e391b6e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:11:54 +0200 Subject: [PATCH 13/40] [WIP] remove the corpus and specific layers from E (unused) --- weighwords/significant_words.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index ec7582e..f86c10a 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -117,9 +117,9 @@ def _e_step(self, p_group, p_specific): group_numerator ) ] - return { - 'corpus': corpus_numerator - denominator, - 'specific': specific_numerator - denominator, + out = { + # 'corpus': corpus_numerator - denominator, + # 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } # prevent NaNs from causing downstream errors From fe1e4ac0621f284c7b2f22113d9429fc0ca1bccb Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Thu, 18 Apr 2019 21:53:38 +0200 Subject: [PATCH 14/40] omit NaNs when testing for convergence; (these NaNs are caused by `-inf - -inf`) --- weighwords/parsimonious.py | 7 +++---- weighwords/significant_words.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index bc1d71b..b1ae93f 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -144,7 +144,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): A posteriori term probabilities. """ - logger.info('EM with max_iter=%d, eps=%g' % (max_iter, eps)) + logger.info(f'EM with max_iter={max_iter}, eps={eps}') if w is None: w = self.w @@ -167,9 +167,8 @@ def _EM(self, tf, p_term, w, max_iter, eps): diff = new_p_term - p_term p_term = new_p_term - if (diff < eps).all(): - logger.info('EM: convergence reached after %d iterations' - % i) + if (diff[np.isfinite(diff)] < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') break finally: np.seterr(**old_error_settings) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index f86c10a..c7b2876 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -97,7 +97,7 @@ def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): diff = new_p_group - p_group p_group = new_p_group - if (diff < eps).all(): + if (diff[np.isfinite(diff)] < eps).all(): logger.info(f'EM: convergence reached after {i} iterations') break finally: From c3ce35bdba40f448821bf9d4c2b61cc2ed5a43fd Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 00:43:55 +0200 Subject: [PATCH 15/40] estimate lambdas unless they are fixed --- weighwords/significant_words.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index c7b2876..0dd12d2 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -39,16 +39,19 @@ def __init__(self, documents, w, thresh=0): self.lambda_specific = None self.p_group = None self.p_specific = None + self.fix_lambdas = False def group_top(self, k, document_group, **kwargs): term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) - def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): + def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, fix_lambdas=False): if w is None: w = self.w assert 0 
< w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" + self.fix_lambdas = fix_lambdas + document_models = [ self._document_model(doc) for doc in document_group @@ -118,8 +121,8 @@ def _e_step(self, p_group, p_specific): ) ] out = { - # 'corpus': corpus_numerator - denominator, - # 'specific': specific_numerator - denominator, + 'corpus': corpus_numerator - denominator, + 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } # prevent NaNs from causing downstream errors @@ -129,9 +132,20 @@ def _e_step(self, p_group, p_specific): return out def _m_step(self, expectation, log_doc_tf): - group_numerator = logsum(log_doc_tf + expectation['group']) + term_weighted_group = log_doc_tf + expectation['group'] + group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) - # TODO: estimate lambdas + + if self.fix_lambdas is False: + # estimate lambdas + corpus_numerator = logsum(np.transpose(log_doc_tf + expectation['corpus'])) + specific_numerator = logsum(np.transpose(log_doc_tf + expectation['specific'])) + group_numerator = logsum(np.transpose(term_weighted_group)) + denominator = logsum(np.asarray([corpus_numerator, specific_numerator, group_numerator])) + self.lambda_corpus = corpus_numerator - denominator + self.lambda_specific = specific_numerator - denominator + self.lambda_group = group_numerator - denominator + return p_group @staticmethod From bf2a03123faf284114376674141f68d81cddb01e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 00:45:31 +0200 Subject: [PATCH 16/40] never take NaN to be the true max --- weighwords/logsum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index ed17d19..ff5f0f5 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -24,7 +24,7 @@ def logsum(x): """ # Use the max to normalize, as with the log this is what accumulates # the less errors - vmax = x.max(axis=0) + vmax = np.nanmax(x, axis=0) out = np.log(np.nansum(np.exp(x - vmax), axis=0)) out += vmax return out From 45fec030389ec0a60a8460bbb3abe5dd2653bf8a Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 01:21:58 +0200 Subject: [PATCH 17/40] log final lambdas (mean over documents) --- weighwords/significant_words.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 0dd12d2..e3ea284 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -84,6 +84,12 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific + if self.fix_lambdas is False: + logger.info( + f'Final lambdas (mean): Corpus={np.mean(np.exp(self.lambda_corpus))}, ' + f'Group={np.mean(np.exp(self.lambda_group))}, ' + f'Specific={np.mean(np.exp(self.lambda_specific))}' + ) return self.get_term_probabilities(self.p_group) def get_term_probabilities(self, log_prob_distribution): From 7e193a813eb0754dc950afb4fd47bba185c9ef37 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 02:15:35 +0200 Subject: [PATCH 18/40] initialize corpus with higher weight than specific --- weighwords/significant_words.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index e3ea284..6702e45 100644 --- a/weighwords/significant_words.py +++ 
b/weighwords/significant_words.py @@ -71,7 +71,9 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, p_specific = self._specific_model(doc_term_probs) - general_w = specific_w = np.log(0.5 * (1 - w)) + # FIXME: magic constants + general_w = np.log(0.8 * (1 - w)) + specific_w = np.log(0.2 * (1 - w)) group_w = np.log(w) weights_shape = len(document_group) self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) From 3061327faa7248e04180c11e8078452a4b0b801d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 02:19:33 +0200 Subject: [PATCH 19/40] update tests --- tests/test_swlm.py | 48 +++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 7ac0bf8..9c3d305 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -1,24 +1,31 @@ +import logging import operator from functools import reduce import numpy as np +import pytest from weighwords.significant_words import SignificantWordsLM -# data follows tests +logging.basicConfig(level=logging.INFO) -def test_model_fit(): - # init an SWLM with uniform p_corpus - swlm = SignificantWordsLM([colors], w=0.1) - # deterministically generate some docs - doc_parts = np.array_split(list(zip(colors[:25], rcolors)), 5) - doc_group = [ - reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) - for z in zip(*doc_parts) - ] - # fit the modelk - term_probs = swlm.fit_parsimonious_group(doc_group) +def test_model_fit_fixed(swlm, doc_group): + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True) + expected_probs = { + "salmon": 0.04, + "chocolate": 0.03, + "snow": 0.02, + "tomato": 0.01, + "aqua": 0.0, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + + +def test_model_fit_shifty(swlm, doc_group): + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False) expected_probs = { "salmon": 0.04, "chocolate": 0.03, @@ -31,6 +38,22 @@ def test_model_fit(): assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" +@pytest.fixture(scope="module") +def swlm(): + # init an SWLM with uniform p_corpus + return SignificantWordsLM([colors], w=0.1) + + +@pytest.fixture(scope="module") +def doc_group(): + # deterministically generate some docs + doc_parts = np.array_split(list(zip(colors[:25], reversed(colors))), 5) + return [ + reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) + for z in zip(*doc_parts) + ] + + colors = [ "aliceblue", "antiquewhite", @@ -181,4 +204,3 @@ def test_model_fit(): "yellow", "yellowgreen", ] -rcolors = reversed(colors) From 993595de8dc042541b6a1f5a15448eec9dd3b1b6 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 10 May 2019 22:21:35 +0200 Subject: [PATCH 20/40] move PLM parameters to `__init__` docstring --- weighwords/parsimonious.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index b1ae93f..b9d85f7 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -22,24 +22,29 @@ class ParsimoniousLM: Constructing an object of this class fits a background model. The top method can then be used to fit document-specific models, also for unseen documents (with the same vocabulary as the background corpus). 
- - Parameters - ---------- - documents : iterable over iterable over terms - w : float - Weight of document model (1 - weight of corpus model) - thresh : int - Don't include words that occur < thresh times - - Attributes - ---------- - vocab : dict of term -> int - Mapping of terms to numeric indices - p_corpus : array of float - Log prob of terms """ def __init__(self, documents, w, thresh=0): + """ + Collect the vocabulary and fit the background model. + + Parameters + ---------- + documents : iterable over iterable over terms + All documents that should be included in the corpus model + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur fewer than `thresh` times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log probability of terms in background model (indexed by `vocab`) + + """ logger.info('Building corpus model') self.w = w @@ -66,7 +71,7 @@ def __init__(self, documents, w, thresh=0): np.seterr(**old_error_settings) def top(self, k, d, max_iter=50, eps=1e-5, w=None): - """Get the top k terms of a document d and their log probabilities. + """Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. From b056279d5724712964fcefb962d41c57b4511924 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 11 May 2019 16:04:39 +0200 Subject: [PATCH 21/40] don't count terms with a cf < thresh; ignore out-of-vocabulary terms: this prevents errors but does not "handle unseen words" as expressed in #1 --- setup.py | 2 ++ tests/test_plm.py | 38 ++++++++++++++++++++++++++++++++++++++ weighwords/parsimonious.py | 7 ++++++- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/test_plm.py diff --git a/setup.py b/setup.py index 601f5da..7826ff2 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,11 @@ license = "LGPL", packages = ["weighwords"], install_requires = ["numpy>=1.4.0"], + tests_require = ["pytest"], classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", + "Programming Language :: Python :: 3.6", "Topic :: Text Processing", ] ) diff --git a/tests/test_plm.py b/tests/test_plm.py new file mode 100644 index 0000000..14c5671 --- /dev/null +++ b/tests/test_plm.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +from weighwords import ParsimoniousLM + + +def test_document_model(number_corpus, uniform_doc): + plm = ParsimoniousLM([number_corpus], w=0.1, thresh=3) + tf, p_term = plm._document_model(uniform_doc) + assert (tf[:2] == 0).all(), \ + "Terms with a corpus frequency < thresh should not be counted" + assert tf.sum() == 3, f"Expected tf.sum() to be 3, got {tf.sum()} instead" + linear_p_term = np.exp(p_term) + assert (linear_p_term[2:].sum() - 1) < 1e-10, \ + f"All probability mass should be on the last 3 terms, got {linear_p_term} instead" + + +def test_document_model_out_of_vocabulary(number_corpus): + plm = ParsimoniousLM([number_corpus], w=0.1) + doc = ['two', 'or', 'three', 'unseen', 'words'] + tf, p_term = plm._document_model(doc) + assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead" + + +@pytest.fixture(scope="module") +def uniform_doc(): + return ['one', 'two', 'three', 'four', 'five'] + + +@pytest.fixture(scope="module") +def number_corpus(): + return [ + 'one', + 'two', 'two', + 'three', 'three', 'three', + 'four', 'four', 'four', 
'four', + 'five', 'five', 'five', 'five', 'five' + ] diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index b9d85f7..3b3bb28 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -119,7 +119,12 @@ def _document_model(self, d): tf = np.zeros(len(self.vocab), dtype=np.float) # Term frequency for tok in d: - tf[self.vocab[tok]] += 1 + term_id = self.vocab.get(tok) + if term_id: + tf[term_id] += 1 + + # ignore counts of terms with zero corpus probability + tf *= np.isfinite(self.p_corpus) n_distinct = (tf > 0).sum() From f889815a37eec23c040391808ff5ff54b9edd238 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 14 May 2019 22:57:27 +0200 Subject: [PATCH 22/40] give dtypes a more explicit name --- weighwords/parsimonious.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 3b3bb28..6a4c0cc 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -56,7 +56,7 @@ def __init__(self, documents, w, thresh=0): i = vocab.setdefault(tok, len(vocab)) count[i] += 1 - cf = np.empty(len(count), dtype=np.float) + cf = np.empty(len(count), dtype=np.double) for i, f in count.items(): cf[i] = f rare = (cf < thresh) @@ -116,7 +116,7 @@ def _document_model(self, d): logger.info('Gathering term probabilities') - tf = np.zeros(len(self.vocab), dtype=np.float) # Term frequency + tf = np.zeros(len(self.vocab), dtype=np.double) # Term frequency for tok in d: term_id = self.vocab.get(tok) From 99832cb77402d39aa0d84cc7935ed171948203ff Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 14 May 2019 23:01:46 +0200 Subject: [PATCH 23/40] extend swlm model and add arguments --- tests/test_swlm.py | 2 +- weighwords/significant_words.py | 73 +++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 9c3d305..8c8ee3b 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -41,7 +41,7 @@ def test_model_fit_shifty(swlm, doc_group): @pytest.fixture(scope="module") def swlm(): # init an SWLM with uniform p_corpus - return SignificantWordsLM([colors], w=0.1) + return SignificantWordsLM([colors], lambdas=(0.7, 0.1, 0.2)) @pytest.fixture(scope="module") diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 6702e45..94d1f8e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -32,8 +32,9 @@ class SignificantWordsLM(ParsimoniousLM): p_corpus : array of float Log prob of terms """ - def __init__(self, documents, w, thresh=0): - super().__init__(documents, w, thresh=thresh) + def __init__(self, documents, lambdas, thresh=0): + self.initial_lambdas = self.normalize_lambdas(lambdas) + super().__init__(documents, self.initial_lambdas[1], thresh=thresh) self.lambda_corpus = None self.lambda_group = None self.lambda_specific = None @@ -45,10 +46,14 @@ def group_top(self, k, document_group, **kwargs): term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) - def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, fix_lambdas=False): - if w is None: - w = self.w - assert 0 < w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" + def fit_parsimonious_group( + self, document_group, max_iter=50, eps=1e-5, lambdas=None, + fix_lambdas=False, parsimonize_specific=False, post_parsimonize=False + ): + if lambdas is None: + lambdas = 
self.initial_lambdas + else: + lambdas = self.normalize_lambdas(lambdas) self.fix_lambdas = fix_lambdas @@ -56,6 +61,8 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, self._document_model(doc) for doc in document_group ] + del document_group + doc_term_frequencies = [tf for tf, _ in document_models] group_tf, p_group = self._group_model( doc_term_frequencies @@ -70,21 +77,25 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, np.seterr(**old_error_settings) p_specific = self._specific_model(doc_term_probs) + if parsimonize_specific: + p_specific = self._EM(group_tf, p_specific, self.w, max_iter, eps) + + self.p_specific = p_specific - # FIXME: magic constants - general_w = np.log(0.8 * (1 - w)) - specific_w = np.log(0.2 * (1 - w)) - group_w = np.log(w) - weights_shape = len(document_group) - self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) - self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) - self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + weights_shape = len(document_models) + general_w, group_w, specific_w = np.log(lambdas) + self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double) + self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) + self.lambda_group = np.full(weights_shape, group_w, dtype=np.double) logger.info( - f'Lambdas initialized to: Corpus={np.exp(general_w)}, ' - f'Group={w}, Specific={np.exp(specific_w)}' + f'Lambdas initialized to: Corpus={lambdas[0]}, ' + f'Group={lambdas[1]}, Specific={lambdas[2]}' ) - self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) - self.p_specific = p_specific + self.p_group = self._estimate( + p_group, p_specific, doc_term_frequencies, max_iter, eps + ) + if post_parsimonize: + self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps) if self.fix_lambdas is False: logger.info( @@ -96,6 +107,7 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, def get_term_probabilities(self, log_prob_distribution): probabilities = np.exp(log_prob_distribution) + probabilities[np.isnan(probabilities)] = 0. 
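+        # the NaNs zeroed above stem from `-inf - -inf` subtractions in the log domain (see PATCH 14); such terms get probability zero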
return {t: probabilities[i] for t, i in self.vocab.items()} def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): @@ -146,10 +158,16 @@ def _m_step(self, expectation, log_doc_tf): if self.fix_lambdas is False: # estimate lambdas - corpus_numerator = logsum(np.transpose(log_doc_tf + expectation['corpus'])) - specific_numerator = logsum(np.transpose(log_doc_tf + expectation['specific'])) + corpus_numerator = logsum( + np.transpose(log_doc_tf + expectation['corpus']) + ) + specific_numerator = logsum( + np.transpose(log_doc_tf + expectation['specific']) + ) group_numerator = logsum(np.transpose(term_weighted_group)) - denominator = logsum(np.asarray([corpus_numerator, specific_numerator, group_numerator])) + denominator = logsum( + np.asarray([corpus_numerator, specific_numerator, group_numerator]) + ) self.lambda_corpus = corpus_numerator - denominator self.lambda_specific = specific_numerator - denominator self.lambda_group = group_numerator - denominator @@ -177,7 +195,7 @@ def _specific_model(document_term_probabilities): ] # probability of term to be important in one doc, and not others complement_products = np.array([ - document_term_probabilities[i] + complement + dlm + complement for i, dlm in enumerate(document_term_probabilities) for j, complement in enumerate(complements) if i != j @@ -198,3 +216,14 @@ def _specific_model(document_term_probabilities): np.seterr(**old_error_settings) return p_specific + + @staticmethod + def normalize_lambdas(lambdas): + assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' + lambda_sum = sum(lambdas) + if abs(lambda_sum - 1) > 1e-10: + lambdas = tuple( + w / lambda_sum + for w in lambdas + ) + return lambdas From 019228a80a97d6a83ee077c3247c244092c06dc3 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 15 May 2019 18:39:45 +0200 Subject: [PATCH 24/40] type annotations for PLM (PEP 585-ready) --- setup.py | 5 ++- weighwords/parsimonious.py | 83 +++++++++++++++++++++++++------------- weighwords/py.typed | 0 3 files changed, 58 insertions(+), 30 deletions(-) create mode 100644 weighwords/py.typed diff --git a/setup.py b/setup.py index 7826ff2..3552010 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,14 @@ description = "Python library for creating word weights/word clouds from text", keywords = "word cloud nlp language model", license = "LGPL", + package_data = {"weighwords": ["py.typed"]}, packages = ["weighwords"], - install_requires = ["numpy>=1.4.0"], + install_requires = ["numpy>=1.15.0"], tests_require = ["pytest"], classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Text Processing", ] ) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 6a4c0cc..a6709e1 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -1,12 +1,15 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2011-2013 University of Amsterdam +# Copyright 2011-2019 University of Amsterdam # Author: Lars Buitinck +from __future__ import annotations + from collections import defaultdict from heapq import nlargest import logging from operator import itemgetter +from typing import Iterable, Optional import numpy as np @@ -22,34 +25,39 @@ class ParsimoniousLM: Constructing an object of this class fits a background model. 
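The `normalize_lambdas` helper introduced in the patch above rescales the three mixture weights so they sum to one; a quick illustration with arbitrarily chosen weights:

    SignificantWordsLM.normalize_lambdas((7, 1, 2))           # -> (0.7, 0.1, 0.2)
    SignificantWordsLM.normalize_lambdas((0.9, 0.01, 0.09))   # already sums to 1, returned unchanged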
The top method can then be used to fit document-specific models, also for unseen documents (with the same vocabulary as the background corpus). - """ - def __init__(self, documents, w, thresh=0): - """ - Collect the vocabulary and fit the background model. - - Parameters - ---------- - documents : iterable over iterable over terms - All documents that should be included in the corpus model - w : float - Weight of document model (1 - weight of corpus model) - thresh : int - Don't include words that occur fewer than `thresh` times - - Attributes - ---------- - vocab : dict of term -> int - Mapping of terms to numeric indices - p_corpus : array of float - Log probability of terms in background model (indexed by `vocab`) + Parameters + ---------- + documents : iterable over iterable over terms + All documents that should be included in the corpus model + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur fewer than `thresh` times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log probability of terms in background model (indexed by `vocab`) + """ - """ + def __init__( + self, + documents: Iterable[Iterable[str]], + w: float, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" logger.info('Building corpus model') self.w = w - self.vocab = vocab = {} # Vocabulary: maps terms to numeric indices - count = defaultdict(int) # Corpus frequency + # Vocabulary: maps terms to numeric indices + vocab: dict[str, int] + self.vocab = vocab = {} + # Corpus frequency + count: dict[int, int] = defaultdict(int) for d in documents: for tok in d: @@ -70,7 +78,14 @@ def __init__(self, documents, w, thresh=0): finally: np.seterr(**old_error_settings) - def top(self, k, d, max_iter=50, eps=1e-5, w=None): + def top( + self, + k: int, + d: Iterable[str], + max_iter: int = 50, + eps: float = 1e-5, + w: Optional[float] = None + ) -> list[tuple[str, float]]: """Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term @@ -78,6 +93,10 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Parameters ---------- + k + Number of top terms to return + d + Terms that make up the document max_iter : int, optional Maximum number of iterations of EM algorithm to run. eps : float, optional @@ -88,6 +107,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Returns ------- t_p : list of (str, float) + Terms and their log-probabilities in the parsimonious model """ tf, p_term = self._document_model(d) @@ -96,7 +116,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, itemgetter(1)) - def _document_model(self, d): + def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: """Build document model. Parameters @@ -136,7 +156,14 @@ def _document_model(self, d): return tf, p_term - def _EM(self, tf, p_term, w, max_iter, eps): + def _EM( + self, + tf: Iterable[int], + p_term: Iterable[float], + w: Optional[float], + max_iter: int, + eps: float + ) -> np.ndarray: """Expectation maximization. 
Parameters diff --git a/weighwords/py.typed b/weighwords/py.typed new file mode 100644 index 0000000..e69de29 From d41a79a8b49017570f5ca342b50f979e36ce9ce8 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 15 May 2019 19:36:42 +0200 Subject: [PATCH 25/40] type annotations for logsum; format some docstrings --- weighwords/logsum.py | 2 +- weighwords/parsimonious.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index ff5f0f5..350bd26 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -6,7 +6,7 @@ import numpy as np -def logsum(x): +def logsum(x: np.ndarray) -> np.ndarray: """Computes the sum of x assuming x is in the log domain. Returns log(sum(exp(x))) while minimizing the possibility of diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index a6709e1..5e174f8 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -29,11 +29,11 @@ class ParsimoniousLM: Parameters ---------- documents : iterable over iterable over terms - All documents that should be included in the corpus model + All documents that should be included in the corpus model. w : float - Weight of document model (1 - weight of corpus model) + Weight of document model (1 - weight of corpus model). thresh : int - Don't include words that occur fewer than `thresh` times + Don't include words that occur fewer than `thresh` times. Attributes ---------- @@ -94,13 +94,13 @@ def top( Parameters ---------- k - Number of top terms to return + Number of top terms to return. d - Terms that make up the document + Terms that make up the document. max_iter : int, optional Maximum number of iterations of EM algorithm to run. eps : float, optional - Convergence threshold for EM algorithm. + Epsilon: convergence threshold for EM algorithm. w : float, optional Weight of document model; overrides value given to __init__ @@ -174,6 +174,8 @@ def _EM( Term probabilities, as returned by document_model max_iter : int Number of iterations to run. + eps : float + Epsilon: convergence threshold for EM algorithm. Returns ------- From 5c243f9dabb68202370dc29ba59e275557475dee Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Thu, 16 May 2019 01:25:46 +0200 Subject: [PATCH 26/40] add type annotations and SWLM class docstring; switched to backwards-compatible annotations, because while python may be PEP 585-ready, mypy does not deal with builtin generics yet --- weighwords/parsimonious.py | 57 +++++++------- weighwords/significant_words.py | 131 ++++++++++++++++++++++++++------ 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 5e174f8..eed0a30 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -4,12 +4,13 @@ # Author: Lars Buitinck from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized from collections import defaultdict from heapq import nlargest import logging from operator import itemgetter -from typing import Iterable, Optional +from typing import Iterable, Optional, Dict, List, Tuple import numpy as np @@ -20,7 +21,8 @@ class ParsimoniousLM: - """Language model for a set of documents. + """ + Language model for a set of documents. Constructing an object of this class fits a background model. 
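Since `logsum` (annotated in PATCH 25 above) underlies all of the EM updates, a quick numeric check of what it computes; the NaN-tolerant `nanmax`/`nansum` variants come from patches 09 and 16:

    import numpy as np
    from weighwords.logsum import logsum

    x = np.log(np.array([[0.1, 0.2],
                         [0.3, 0.4]]))
    np.exp(logsum(x))  # array([0.4, 0.6]): sums over axis 0, computed stably in log space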
The top method can then be used to fit document-specific models, also for unseen @@ -44,20 +46,20 @@ class ParsimoniousLM: """ def __init__( - self, - documents: Iterable[Iterable[str]], - w: float, - thresh: int = 0 + self, + documents: Iterable[Iterable[str]], + w: np.floating, + thresh: int = 0 ): """Collect the vocabulary and fit the background model.""" logger.info('Building corpus model') self.w = w # Vocabulary: maps terms to numeric indices - vocab: dict[str, int] + vocab: Dict[str, int] self.vocab = vocab = {} # Corpus frequency - count: dict[int, int] = defaultdict(int) + count: Dict[int, int] = defaultdict(int) for d in documents: for tok in d: @@ -74,19 +76,20 @@ def __init__( old_error_settings = np.seterr(divide='ignore') # lg P(t|C) - self.p_corpus = np.log(cf) - np.log(np.sum(cf)) + self.p_corpus: np.ndarray = np.log(cf) - np.log(np.sum(cf)) finally: np.seterr(**old_error_settings) def top( - self, - k: int, - d: Iterable[str], - max_iter: int = 50, - eps: float = 1e-5, - w: Optional[float] = None - ) -> list[tuple[str, float]]: - """Get the top `k` terms of a document `d` and their log probabilities. + self, + k: int, + d: Iterable[str], + max_iter: int = 50, + eps: float = 1e-5, + w: Optional[np.floating] = None + ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. @@ -116,8 +119,9 @@ def top( terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, itemgetter(1)) - def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: - """Build document model. + def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: + """ + Build document model. Parameters ---------- @@ -157,14 +161,15 @@ def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: return tf, p_term def _EM( - self, - tf: Iterable[int], - p_term: Iterable[float], - w: Optional[float], - max_iter: int, - eps: float + self, + tf: np.ndarray, + p_term: np.ndarray, + w: Optional[np.floating], + max_iter: int, + eps: float ) -> np.ndarray: - """Expectation maximization. + """ + Expectation maximization. Parameters ---------- diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 94d1f8e..ed2f0e7 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,6 +1,15 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + import logging from heapq import nlargest from operator import itemgetter +from typing import Iterable, Optional, Sequence, Tuple, List, Dict import numpy as np @@ -9,47 +18,97 @@ logger = logging.getLogger(__name__) +InitialLambdas = Tuple[np.floating, np.floating, np.floating] + class SignificantWordsLM(ParsimoniousLM): - """Language model for a set of documents. + """ + Language model that consists of three sub-models: - Constructing an object of this class fits a background model. The top - method can then be used to fit document-specific models, also for unseen - documents (with the same vocabulary as the background corpus). 
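Each document in the group is modelled as a three-way mixture of these sub-models; the E-step attributes every term occurrence to one of them, and the M-step re-estimates the group model (and, unless `fix_lambdas` is set, the mixture weights) from those attributions. A rough single-document, linear-space sketch of one round, not the library code, which works in log space via `logsum`:

    import numpy as np

    def em_round(tf, p_corpus, p_group, p_specific, lambdas):
        """One EM round for a single document; a linear-space sketch only."""
        lam_c, lam_g, lam_s = lambdas
        # E-step: posterior probability that a term occurrence came from the group model
        numerator = lam_g * p_group
        denominator = lam_c * p_corpus + numerator + lam_s * p_specific
        resp_group = np.divide(numerator, denominator,
                               out=np.zeros_like(numerator), where=denominator > 0)
        # M-step: re-estimate the group model from the expected counts
        expected_counts = tf * resp_group
        return expected_counts / expected_counts.sum()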
+ - Corpus model: represents term probabilities in a (large) background collection; + - Group model: parsimonious term probabilities in a group of documents; + - Specific model: represents the same group, but is biased towards terms that + occur with a high frequency in single docs, and a low frequency in others. Parameters ---------- documents : iterable over iterable over terms - w : float - Weight of document model (1 - weight of corpus model) + All documents that should be included in the corpus model. + lambdas : 3-tuple of floats + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. thresh : int - Don't include words that occur < thresh times + Don't include words that occur fewer than `thresh` times. Attributes ---------- vocab : dict of term -> int Mapping of terms to numeric indices p_corpus : array of float - Log prob of terms + Log probability of terms in background model (indexed by `vocab`) + p_group : array of float + Log probability of terms in background model (indexed by `vocab`) + p_specific : array of float + Log probability of terms in background model (indexed by `vocab`) + lambda_corpus : array of float + Log probability (weight) of corpus model for documents + lambda_group : array of float + Log probability (weight) of group model for documents + lambda_specific : array of float + Log probability (weight) of specific model for documents + + Methods + ------- + fit_parsimonious_group(document_group, ...) + Estimates a document group model, parsimonized against the corpus + and specific models. The documents may be unseen, but terms that + are not in the vocabulary will be ignored. + group_top(k, document_group, ...) + Shortcut to fit the group model and retrieve the top `k` terms. + get_term_probabilities(log_prob_distribution) + Aligns a term distribution with the vocabulary, and transforms + the term log probabilities to linear probabilities. 
+ + See Also + -------- + parsimonious.ParsimoniousLM : one-sided parsimonious model """ - def __init__(self, documents, lambdas, thresh=0): + + def __init__( + self, + documents: Iterable[Iterable[str]], + lambdas: InitialLambdas, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" self.initial_lambdas = self.normalize_lambdas(lambdas) super().__init__(documents, self.initial_lambdas[1], thresh=thresh) - self.lambda_corpus = None - self.lambda_group = None - self.lambda_specific = None - self.p_group = None - self.p_specific = None + self.lambda_corpus: Optional[np.ndarray] = None + self.lambda_group: Optional[np.ndarray] = None + self.lambda_specific: Optional[np.ndarray] = None + self.p_group: Optional[np.ndarray] = None + self.p_specific: Optional[np.ndarray] = None self.fix_lambdas = False - def group_top(self, k, document_group, **kwargs): + def group_top( + self, + k: int, + document_group: Iterable[Iterable[str]], + **kwargs + ) -> List[Tuple[str, float]]: term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) def fit_parsimonious_group( - self, document_group, max_iter=50, eps=1e-5, lambdas=None, - fix_lambdas=False, parsimonize_specific=False, post_parsimonize=False - ): + self, + document_group: Iterable[Iterable[str]], + max_iter: int = 50, + eps: float = 1e-5, + lambdas: Optional[InitialLambdas] = None, + fix_lambdas: bool = False, + parsimonize_specific: bool = False, + post_parsimonize: bool = False + ) -> Dict[str, float]: if lambdas is None: lambdas = self.initial_lambdas else: @@ -105,12 +164,22 @@ def fit_parsimonious_group( ) return self.get_term_probabilities(self.p_group) - def get_term_probabilities(self, log_prob_distribution): + def get_term_probabilities( + self, + log_prob_distribution: np.ndarray + ) -> Dict[str, float]: probabilities = np.exp(log_prob_distribution) probabilities[np.isnan(probabilities)] = 0. 
return {t: probabilities[i] for t, i in self.vocab.items()} - def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): + def _estimate( + self, + p_group: np.ndarray, + p_specific: np.ndarray, + doc_tf: Sequence[np.ndarray], + max_iter: int, + eps: float + ) -> np.ndarray: try: old_error_settings = np.seterr(divide='ignore') log_doc_tf = np.log(doc_tf) @@ -128,7 +197,11 @@ def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): return p_group - def _e_step(self, p_group, p_specific): + def _e_step( + self, + p_group: np.ndarray, + p_specific: np.ndarray + ) -> Dict[str, np.ndarray]: corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) @@ -151,7 +224,11 @@ def _e_step(self, p_group, p_specific): return out - def _m_step(self, expectation, log_doc_tf): + def _m_step( + self, + expectation: Dict[str, np.ndarray], + log_doc_tf: Sequence[np.ndarray] + ) -> np.ndarray: term_weighted_group = log_doc_tf + expectation['group'] group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) @@ -175,7 +252,9 @@ def _m_step(self, expectation, log_doc_tf): return p_group @staticmethod - def _group_model(document_term_frequencies): + def _group_model( + document_term_frequencies: Sequence[np.ndarray] + ) -> Tuple[np.ndarray, np.ndarray]: group_tf = np.array(document_term_frequencies).sum(axis=0) try: @@ -187,7 +266,9 @@ def _group_model(document_term_frequencies): return group_tf, p_group @staticmethod - def _specific_model(document_term_probabilities): + def _specific_model( + document_term_probabilities: Sequence[np.ndarray] + ) -> np.ndarray: # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) @@ -218,7 +299,7 @@ def _specific_model(document_term_probabilities): return p_specific @staticmethod - def normalize_lambdas(lambdas): + def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' lambda_sum = sum(lambdas) if abs(lambda_sum - 1) > 1e-10: From 9a6e14b1313430d0521489717031340fb86e7df9 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 15:12:13 +0200 Subject: [PATCH 27/40] cast lambdas to prevent mypy naggery --- weighwords/significant_words.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index ed2f0e7..17c72b5 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -9,7 +9,7 @@ import logging from heapq import nlargest from operator import itemgetter -from typing import Iterable, Optional, Sequence, Tuple, List, Dict +from typing import Iterable, Optional, Sequence, Tuple, List, Dict, cast import numpy as np @@ -303,8 +303,11 @@ def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' lambda_sum = sum(lambdas) if abs(lambda_sum - 1) > 1e-10: - lambdas = tuple( - w / lambda_sum - for w in lambdas + lambdas = cast( + InitialLambdas, + tuple( + w / lambda_sum + for w in lambdas + ) ) return lambdas From 5bfc67b37541ed397a52d784d14cdca1fef42459 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 17:31:30 +0200 Subject: [PATCH 28/40] added docstrings and reference for SWLM --- README.rst | 9 ++- weighwords/parsimonious.py | 17 ++++-- weighwords/significant_words.py | 105 
++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 8fc2365..96fd7cc 100644 --- a/README.rst +++ b/README.rst @@ -26,11 +26,16 @@ or:: References ---------- -D. Hiemstra, S. Robertson and H. Zaragoza (2004). `Parsimonious Language Models +D. Hiemstra, S. Robertson, and H. Zaragoza (2004). `Parsimonious Language Models for Information Retrieval `_. Proc. SIGIR'04. -R. Kaptein, D. Hiemstra and J. Kamps (2010). `How different are Language Models +R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models and word clouds? `_ Proc. ECIR. + +M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). +`Luhn Revisited: Significant Words Language Models +`_ +Proc. CKIM'16. diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index eed0a30..d8ce657 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -30,7 +30,7 @@ class ParsimoniousLM: Parameters ---------- - documents : iterable over iterable over terms + documents : iterable over iterable of str terms All documents that should be included in the corpus model. w : float Weight of document model (1 - weight of corpus model). @@ -43,6 +43,13 @@ class ParsimoniousLM: Mapping of terms to numeric indices p_corpus : array of float Log probability of terms in background model (indexed by `vocab`) + + References + ---------- + D. Hiemstra, S. Robertson, and H. Zaragoza (2004). + `Parsimonious Language Models for Information Retrieval + `_. + Proc. SIGIR'04. """ def __init__( @@ -96,9 +103,9 @@ def top( Parameters ---------- - k + k : int Number of top terms to return. - d + d : iterable of str terms Terms that make up the document. max_iter : int, optional Maximum number of iterations of EM algorithm to run. @@ -110,7 +117,7 @@ def top( Returns ------- t_p : list of (str, float) - Terms and their log-probabilities in the parsimonious model + Terms and their log-probabilities in the parsimonious model. """ tf, p_term = self._document_model(d) @@ -125,7 +132,7 @@ def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: Parameters ---------- - d : array of terms + d : iterable of str terms Returns ------- diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 17c72b5..46cef88 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -32,9 +32,9 @@ class SignificantWordsLM(ParsimoniousLM): Parameters ---------- - documents : iterable over iterable over terms + documents : iterable over iterable of str terms All documents that should be included in the corpus model. - lambdas : 3-tuple of floats + lambdas : 3-tuple of float Weight of corpus, group, and specific models. Will be normalized if the weights in the tuple don't sum to one. thresh : int @@ -72,6 +72,13 @@ class SignificantWordsLM(ParsimoniousLM): See Also -------- parsimonious.ParsimoniousLM : one-sided parsimonious model + + References + ---------- + M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). + `Luhn Revisited: Significant Words Language Models + `_ + Proc. CKIM'16. """ def __init__( @@ -96,6 +103,28 @@ def group_top( document_group: Iterable[Iterable[str]], **kwargs ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a `document_group` and their probabilities. + This is a shortcut to retrieve the top terms found by `fit_parsimonious_group`. + + Parameters + ---------- + k : int + Number of top terms to return. 
+ document_group : iterable over iterable of str terms + All documents that should be included in the group model. + kwargs + Optional keyword arguments for `fit_parsimonious_group`. + + Returns + ------- + t_p : list of (str, float) + Terms and their probabilities in the group model. + + See Also + -------- + SignificantWordsLM.fit_parsimonious_group + """ term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) @@ -109,6 +138,40 @@ def fit_parsimonious_group( parsimonize_specific: bool = False, post_parsimonize: bool = False ) -> Dict[str, float]: + """ + Estimate a document group model, and parsimonize it against fixed + corpus and specific models. The documents may be unseen, but any terms + that are not in the vocabulary will be ignored. + + Parameters + ---------- + document_group : iterable over iterable of str terms + All documents that should be included in the group model. + max_iter : int, optional + Maximum number of iterations of EM algorithm to run. + eps : float, optional + Epsilon: convergence threshold for EM algorithm. + lambdas : 3-tuple of float, optional + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. + fix_lambdas : bool, optional + Fix the weights of the three sub-models (i.e. don't estimate + lambdas as part of the M-step). + parsimonize_specific : bool, optional + Bias the specific model towards uncommon terms before applying + the EM algorithm to the group model. This generally results in + a group model that stands out less from the corpus model. + post_parsimonize : bool, optional + Bias the group model towards uncommon terms after applying + the EM algorithm. This may be used to compensate when the + frequency of common terms varies much between the documents + in the group. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the group model. + """ if lambdas is None: lambdas = self.initial_lambdas else: @@ -168,6 +231,20 @@ def get_term_probabilities( self, log_prob_distribution: np.ndarray ) -> Dict[str, float]: + """ + Align a term distribution with the vocabulary, and transform + the term log probabilities to linear probabilities. + + Parameters + ---------- + log_prob_distribution : array of float + Log probability of terms which is indexed by the vocabulary. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the (sub-)model. + """ probabilities = np.exp(log_prob_distribution) probabilities[np.isnan(probabilities)] = 0. 
return {t: probabilities[i] for t, i in self.vocab.items()} @@ -180,6 +257,7 @@ def _estimate( max_iter: int, eps: float ) -> np.ndarray: + """Apply the Expectation Maximization algorithm.""" try: old_error_settings = np.seterr(divide='ignore') log_doc_tf = np.log(doc_tf) @@ -202,6 +280,7 @@ def _e_step( p_group: np.ndarray, p_specific: np.ndarray ) -> Dict[str, np.ndarray]: + """Run an E-step.""" corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) @@ -229,6 +308,7 @@ def _m_step( expectation: Dict[str, np.ndarray], log_doc_tf: Sequence[np.ndarray] ) -> np.ndarray: + """Run an M-step.""" term_weighted_group = log_doc_tf + expectation['group'] group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) @@ -255,6 +335,7 @@ def _m_step( def _group_model( document_term_frequencies: Sequence[np.ndarray] ) -> Tuple[np.ndarray, np.ndarray]: + """Create the initial group model.""" group_tf = np.array(document_term_frequencies).sum(axis=0) try: @@ -269,6 +350,7 @@ def _group_model( def _specific_model( document_term_probabilities: Sequence[np.ndarray] ) -> np.ndarray: + """Create the fixed specific model.""" # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) @@ -300,13 +382,26 @@ def _specific_model( @staticmethod def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: + """ + Check and normalize the initial lambdas of the three sub-models. + + Parameters + ---------- + lambdas : 3-tuple of float + Weight of corpus, group, and specific models. + + Returns + ------- + lambdas : 3-tuple of float + Normalized probability of corpus, group, and specific models. 
+ """ assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' - lambda_sum = sum(lambdas) - if abs(lambda_sum - 1) > 1e-10: + total_weight = sum(lambdas) + if abs(total_weight - 1) > 1e-10: lambdas = cast( InitialLambdas, tuple( - w / lambda_sum + w / total_weight for w in lambdas ) ) From 124d63d4538f1306f53e8362ca441bb2aa53682d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 20:19:51 +0200 Subject: [PATCH 29/40] correct SWLM E-step denominator (*facepalm* for major oversight) --- weighwords/significant_words.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 46cef88..b81e08f 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -285,8 +285,8 @@ def _e_step( specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) denominator = [ - logsum(np.asarray([sp_corpus, sp_corpus, sp_specific])) - for sp_corpus, sp_corpus, sp_specific in zip( + logsum(np.asarray(doc_numerators)) + for doc_numerators in zip( corpus_numerator, specific_numerator, group_numerator From 0c01da4aa56ea94cdab21f74f31c4702f1fa920f Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 20:43:28 +0200 Subject: [PATCH 30/40] extend dickens example with SWLM --- example/dickens.py | 41 +++++++++++++++++++++++++++++++---------- weighwords/__init__.py | 1 + 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/example/dickens.py b/example/dickens.py index ec5fd33..0b334b3 100755 --- a/example/dickens.py +++ b/example/dickens.py @@ -1,13 +1,16 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Find terms that distinguish various novels by Charles Dickens. # Note: if the w parameter is set wisely, no stop list is needed. 
- -from weighwords import ParsimoniousLM import gzip import logging -import numpy as np +import math import re +from itertools import zip_longest + +import numpy as np + +from weighwords import ParsimoniousLM, SignificantWordsLM logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -27,8 +30,8 @@ def read_book(title, num): """Returns generator over words in book num""" - logger.info("Fetching terms from %s" % title) - path = "%s.txt.utf8.gz" % num + logger.info(f"Fetching terms from {title}") + path = f"{num}.txt.utf8.gz" in_book = False for ln in gzip.open(path, 'rt', encoding='utf8'): if in_book and ln.startswith(endbook): @@ -40,12 +43,30 @@ def read_book(title, num): in_book = True +def grouper(iterable, n, filler=None): + """Source: https://docs.python.org/3/library/itertools.html#itertools-recipes""" + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=filler) + + book_contents = [(title, list(read_book(title, num))) for title, num in books] +corpus = [terms for title, terms in book_contents] -model = ParsimoniousLM([terms for title, terms in book_contents], w=.01) +plm = ParsimoniousLM(corpus, w=.01) +swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09)) for title, terms in book_contents: - print("Top %d words in %s:" % (top_k, title)) - for term, p in model.top(top_k, terms): - print(" %s %.4f" % (term, np.exp(p))) + plm_top = plm.top(top_k, terms) + swlm_top = swlm.group_top( + top_k, + grouper(terms, math.ceil(len(terms) / 10)), + fix_lambdas=True, + ) + print(f"\nTop {top_k} words in {title}:") + print(f"\n{'PLM term':<16} {'PLM p':<12} {'SWLM term':<16} {'SWLM p':<6}") + for (plm_t, plm_p), (swlm_t, swlm_p) in zip(plm_top, swlm_top): + print(f"{plm_t:<16} {np.exp(plm_p):<12.4f} {swlm_t:<16} {swlm_p:.4f}") print("") + + + diff --git a/weighwords/__init__.py b/weighwords/__init__.py index f725693..40c7742 100644 --- a/weighwords/__init__.py +++ b/weighwords/__init__.py @@ -1 +1,2 @@ from .parsimonious import ParsimoniousLM +from .significant_words import SignificantWordsLM From c32151cd9a7a5bd9fd3013712c7913caecfabff3 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 21:42:36 +0200 Subject: [PATCH 31/40] test model equivalence; parsimonize specific with fixed w=1/3; format floats in lambda logging --- tests/test_equivalence.py | 85 +++++++++++++++++++++++++++++++++ tests/test_swlm.py | 2 +- weighwords/significant_words.py | 13 ++--- 3 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 tests/test_equivalence.py diff --git a/tests/test_equivalence.py b/tests/test_equivalence.py new file mode 100644 index 0000000..9cf02c7 --- /dev/null +++ b/tests/test_equivalence.py @@ -0,0 +1,85 @@ +import re +from itertools import chain + +import pytest + +from weighwords import ParsimoniousLM, SignificantWordsLM +from weighwords.logsum import logsum + + +def test_model_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - weight, weight, 0.) 
+ ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms == swlm_terms, 'PLM and SWLM are not functionally equivalent' + + +def test_model_non_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it non-equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - 2 * weight, weight, weight) + ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent' + + +@pytest.fixture(scope="module") +def shakespeare_quotes(): + quotes = [ + "Love all, trust a few, Do wrong to none", + "But love that comes too late, " + "Like a remorseful pardon slowly carried, " + "To the great sender turns a sour offence.", + "If thou remember'st not the slightest folly " + "That ever love did make thee run into, " + "Thou hast not lov'd.", + "We that are true lovers run into strange capers; " + "but as all is mortal in nature, " + "so is all nature in love mortal in folly.", + "But are you so much in love as your rhymes speak? " + "Neither rhyme nor reason can express how much.", + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] + return [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] + + +def get_p_corpus(language_model): + p_corpus = language_model.p_corpus.copy() + vocab = language_model.vocab + term_tiers = [ + (1.5, ['love', 'folly', "lov'd", 'lovers', 'lover']), + (1.3, ['trust', 'remorseful', 'sour', 'offence', 'gaze']), + (1.1, ["remember'st", 'capers', 'rhyme', 'rhymes', 'eagle']), + ] + for multiplier, terms in term_tiers: + for t in terms: + p_corpus[vocab[t]] *= multiplier + + return p_corpus - logsum(p_corpus) + + +def fit_models(plm, swlm, docs): + # artificially reduce the corpus probability of selected terms + plm.p_corpus = swlm.p_corpus = get_p_corpus(plm) + + top_k = 15 + plm_top = plm.top(top_k, chain(*docs)) + swlm_top = swlm.group_top(top_k, docs, fix_lambdas=True) + plm_terms = [term for term, log_prob in plm_top] + swlm_terms = [term for term, prob in swlm_top] + + return plm_terms, swlm_terms diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 8c8ee3b..68519ae 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from weighwords.significant_words import SignificantWordsLM +from weighwords import SignificantWordsLM logging.basicConfig(level=logging.INFO) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index b81e08f..0cb584e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -200,7 +200,7 @@ def fit_parsimonious_group( p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM(group_tf, p_specific, self.w, max_iter, eps) + p_specific = self._EM(group_tf, p_specific, 1/3, max_iter, eps) self.p_specific = p_specific @@ -210,8 +210,8 @@ def fit_parsimonious_group( self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) self.lambda_group = np.full(weights_shape, group_w, dtype=np.double) logger.info( - f'Lambdas initialized to: Corpus={lambdas[0]}, ' - f'Group={lambdas[1]}, Specific={lambdas[2]}' + f'Lambdas initialized to: Corpus={lambdas[0]:.4f}, ' + f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}' ) self.p_group = self._estimate( p_group, p_specific, doc_term_frequencies, max_iter, 
eps @@ -221,9 +221,10 @@ def fit_parsimonious_group( if self.fix_lambdas is False: logger.info( - f'Final lambdas (mean): Corpus={np.mean(np.exp(self.lambda_corpus))}, ' - f'Group={np.mean(np.exp(self.lambda_group))}, ' - f'Specific={np.mean(np.exp(self.lambda_specific))}' + f'Final lambdas (mean): ' + f'Corpus={np.mean(np.exp(self.lambda_corpus)):.4f}, ' + f'Group={np.mean(np.exp(self.lambda_group)):.4f}, ' + f'Specific={np.mean(np.exp(self.lambda_specific)):.4f}' ) return self.get_term_probabilities(self.p_group) From d4272350bc9e685bfca962e1da51ba1b5bd75d06 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 03:43:14 +0200 Subject: [PATCH 32/40] use scalar lambdas when they are fixed (this can significantly reduce the memory complexity) --- weighwords/significant_words.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 0cb584e..b20b331 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -205,6 +205,9 @@ def fit_parsimonious_group( self.p_specific = p_specific weights_shape = len(document_models) + if self.fix_lambdas: + weights_shape = 1 + general_w, group_w, specific_w = np.log(lambdas) self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double) self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) From cc7577ff3b5404326fccc10e8eed9fd1372b4eac Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 15:18:52 +0200 Subject: [PATCH 33/40] simplify tests --- tests/conftest.py | 17 ++++ tests/test_plm.py | 21 +---- tests/test_swlm.py | 205 ++++----------------------------------------- 3 files changed, 37 insertions(+), 206 deletions(-) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..17faf7f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.fixture(scope="module") +def uniform_doc(): + return ['one', 'two', 'three', 'four', 'five'] + + +@pytest.fixture(scope="module") +def number_corpus(): + return [ + ['one'], + ['two', 'two'], + ['three', 'three', 'three'], + ['four', 'four', 'four', 'four'], + ['five', 'five', 'five', 'five', 'five'] + ] diff --git a/tests/test_plm.py b/tests/test_plm.py index 14c5671..ceb9f90 100644 --- a/tests/test_plm.py +++ b/tests/test_plm.py @@ -1,11 +1,10 @@ import numpy as np -import pytest from weighwords import ParsimoniousLM def test_document_model(number_corpus, uniform_doc): - plm = ParsimoniousLM([number_corpus], w=0.1, thresh=3) + plm = ParsimoniousLM(number_corpus, w=0.1, thresh=3) tf, p_term = plm._document_model(uniform_doc) assert (tf[:2] == 0).all(), \ "Terms with a corpus frequency < thresh should not be counted" @@ -16,23 +15,7 @@ def test_document_model(number_corpus, uniform_doc): def test_document_model_out_of_vocabulary(number_corpus): - plm = ParsimoniousLM([number_corpus], w=0.1) + plm = ParsimoniousLM(number_corpus, w=0.1) doc = ['two', 'or', 'three', 'unseen', 'words'] tf, p_term = plm._document_model(doc) assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead" - - -@pytest.fixture(scope="module") -def uniform_doc(): - return ['one', 'two', 'three', 'four', 'five'] - - -@pytest.fixture(scope="module") -def number_corpus(): - return [ - 'one', - 'two', 'two', - 'three', 'three', 'three', - 'four', 'four', 'four', 'four', - 'five', 'five', 'five', 'five', 'five' - ] diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 68519ae..e6b82bc 100644 --- 
a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -1,206 +1,37 @@ import logging -import operator -from functools import reduce - -import numpy as np -import pytest from weighwords import SignificantWordsLM logging.basicConfig(level=logging.INFO) -def test_model_fit_fixed(swlm, doc_group): +def test_model_fit_fixed(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True) expected_probs = { - "salmon": 0.04, - "chocolate": 0.03, - "snow": 0.02, - "tomato": 0.01, - "aqua": 0.0, + "one": 0.0, + "two": 0.12373, + "three": 2e-5, + "four": 0.50303, + "five": 0.37322, } for term, p in expected_probs.items(): diff = abs(term_probs[term] - p) - assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" -def test_model_fit_shifty(swlm, doc_group): +def test_model_fit_shifty(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False) expected_probs = { - "salmon": 0.04, - "chocolate": 0.03, - "snow": 0.02, - "tomato": 0.01, - "aqua": 0.0, + "one": 0.0, + "two": 0.33322, + "three": 0.0, + "four": 0.66678, + "five": 0.0, } for term, p in expected_probs.items(): diff = abs(term_probs[term] - p) - assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" - - -@pytest.fixture(scope="module") -def swlm(): - # init an SWLM with uniform p_corpus - return SignificantWordsLM([colors], lambdas=(0.7, 0.1, 0.2)) - - -@pytest.fixture(scope="module") -def doc_group(): - # deterministically generate some docs - doc_parts = np.array_split(list(zip(colors[:25], reversed(colors))), 5) - return [ - reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) - for z in zip(*doc_parts) - ] - - -colors = [ - "aliceblue", - "antiquewhite", - "aqua", - "aquamarine", - "azure", - "beige", - "bisque", - "black", - "blanchedalmond", - "blue", - "blueviolet", - "brown", - "burlywood", - "cadetblue", - "chartreuse", - "chocolate", - "coral", - "cornflowerblue", - "cornsilk", - "crimson", - "cyan", - "darkblue", - "darkcyan", - "darkgoldenrod", - "darkgray", - "darkgreen", - "darkgrey", - "darkkhaki", - "darkmagenta", - "darkolivegreen", - "darkorange", - "darkorchid", - "darkred", - "darksalmon", - "darkseagreen", - "darkslateblue", - "darkslategray", - "darkslategrey", - "darkturquoise", - "darkviolet", - "deeppink", - "deepskyblue", - "dimgray", - "dimgrey", - "dodgerblue", - "firebrick", - "floralwhite", - "forestgreen", - "fuchsia", - "gainsboro", - "ghostwhite", - "goldenrod", - "gold", - "gray", - "green", - "greenyellow", - "grey", - "honeydew", - "hotpink", - "indianred", - "indigo", - "ivory", - "khaki", - "lavenderblush", - "lavender", - "lawngreen", - "lemonchiffon", - "lightblue", - "lightcoral", - "lightcyan", - "lightgoldenrodyellow", - "lightgray", - "lightgreen", - "lightgrey", - "lightpink", - "lightsalmon", - "lightseagreen", - "lightskyblue", - "lightslategray", - "lightslategrey", - "lightsteelblue", - "lightyellow", - "lime", - "limegreen", - "linen", - "magenta", - "maroon", - "mediumaquamarine", - "mediumblue", - "mediumorchid", - "mediumpurple", - "mediumseagreen", - "mediumslateblue", - "mediumspringgreen", - "mediumturquoise", - "mediumvioletred", - 
"midnightblue", - "mintcream", - "mistyrose", - "moccasin", - "navajowhite", - "navy", - "oldlace", - "olive", - "olivedrab", - "orange", - "orangered", - "orchid", - "palegoldenrod", - "palegreen", - "paleturquoise", - "palevioletred", - "papayawhip", - "peachpuff", - "peru", - "pink", - "plum", - "powderblue", - "purple", - "rebeccapurple", - "red", - "rosybrown", - "royalblue", - "saddlebrown", - "salmon", - "sandybrown", - "seagreen", - "seashell", - "sienna", - "silver", - "skyblue", - "slateblue", - "slategray", - "slategrey", - "snow", - "springgreen", - "steelblue", - "tan", - "teal", - "thistle", - "tomato", - "turquoise", - "violet", - "wheat", - "white", - "whitesmoke", - "yellow", - "yellowgreen", -] + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" From ea00801b026ebfe950e87956ba2da91bb48a3e9c Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:07:05 +0200 Subject: [PATCH 34/40] updated readme with usage examples; moved test fixture; updated copyright in license statement --- COPYING | 3 +- README.rst | 59 +++++++++++++++++++++++++++++++++++++-- tests/conftest.py | 26 +++++++++++++++++ tests/test_equivalence.py | 27 ------------------ 4 files changed, 84 insertions(+), 31 deletions(-) diff --git a/COPYING b/COPYING index c128f62..02ba3d4 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,6 @@ WeighWords: a Python library for creating word weights/word clouds from text -Copyright 2011 University of Amsterdam +Copyright 2011-2019 University of Amsterdam +Copyright 2019 TinQwise Stamkracht This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the diff --git a/README.rst b/README.rst index 96fd7cc..ea17792 100644 --- a/README.rst +++ b/README.rst @@ -9,20 +9,73 @@ Rather than use simple word frequency, it weighs words by statistical models known as *parsimonious language models*. These models are good at picking up the words that distinguish a text document from other documents in a collection. The downside to this is that you can't use WeighWords to make a -word cloud of a single document; you need a bunch of document to compare to. +word cloud of a single document; you need a bunch of documents (i.e. a +background collection) to compare to. Installation ------------ -Either:: +Either install the latest release from PyPI:: pip install weighwords -or:: +or clone this git repository, and:: python setup.py install +or:: + + pip install -e . + +Usage +----- +>>> quotes = [ + "Love all, trust a few, Do wrong to none", + ... + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] +>>> doc_tokens = [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] + +The `ParsimoniousLM` is initialized with all document tokens as a +background corpus, and subsequently takes a single document's tokens +as input. 
Its `top` method returns the top terms and their log-probabilities: + +>>> plm = ParsimoniousLM(doc_tokens, w=.1) +>>> plm.top(10, doc_tokens[-1]) +[('lover', -1.871802261651365), + ('will', -1.871802261651365), + ('eyes', -2.5649494422113044), + ('gaze', -2.5649494422113044), + ('an', -2.5649494422113044), + ('eagle', -2.5649494422113044), + ('blind', -2.5649494422113044), + ('ear', -2.5649494422113044), + ('hear', -2.5649494422113044), + ('lowest', -2.5649494422113044)] + +The `SignificantWordsLM` is similarly initialized with a background corpus, +but subsequently takes a group of document tokens as input. Its `group_top` +method returns the top terms and their probabilities: + +>>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) +>>> swlm.group_top(10, doc_tokens[-3:]) +[('in', 0.37875318027881), + ('is', 0.07195732361699828), + ('mortal', 0.07195732361699828), + ('nature', 0.07195732361699828), + ('all', 0.07110584778711342), + ('we', 0.03597866180849914), + ('true', 0.03597866180849914), + ('lovers', 0.03597866180849914), + ('strange', 0.03597866180849914), + ('capers', 0.03597866180849914)] + +See `example/dickens.py` for a running example with more realistic data. References ---------- diff --git a/tests/conftest.py b/tests/conftest.py index 17faf7f..3b68efa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +import re + import pytest @@ -15,3 +17,27 @@ def number_corpus(): ['four', 'four', 'four', 'four'], ['five', 'five', 'five', 'five', 'five'] ] + + +@pytest.fixture(scope="module") +def shakespeare_quotes(): + quotes = [ + "Love all, trust a few, Do wrong to none", + "But love that comes too late, " + "Like a remorseful pardon slowly carried, " + "To the great sender turns a sour offence.", + "If thou remember'st not the slightest folly " + "That ever love did make thee run into, " + "Thou hast not lov'd.", + "We that are true lovers run into strange capers; " + "but as all is mortal in nature, " + "so is all nature in love mortal in folly.", + "But are you so much in love as your rhymes speak? " + "Neither rhyme nor reason can express how much.", + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] + return [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] diff --git a/tests/test_equivalence.py b/tests/test_equivalence.py index 9cf02c7..4c934e6 100644 --- a/tests/test_equivalence.py +++ b/tests/test_equivalence.py @@ -1,8 +1,5 @@ -import re from itertools import chain -import pytest - from weighwords import ParsimoniousLM, SignificantWordsLM from weighwords.logsum import logsum @@ -33,30 +30,6 @@ def test_model_non_equivalence(shakespeare_quotes): assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent' -@pytest.fixture(scope="module") -def shakespeare_quotes(): - quotes = [ - "Love all, trust a few, Do wrong to none", - "But love that comes too late, " - "Like a remorseful pardon slowly carried, " - "To the great sender turns a sour offence.", - "If thou remember'st not the slightest folly " - "That ever love did make thee run into, " - "Thou hast not lov'd.", - "We that are true lovers run into strange capers; " - "but as all is mortal in nature, " - "so is all nature in love mortal in folly.", - "But are you so much in love as your rhymes speak? " - "Neither rhyme nor reason can express how much.", - "A lover's eyes will gaze an eagle blind. 
" - "A lover's ear will hear the lowest sound.", - ] - return [ - re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() - for quote in quotes - ] - - def get_p_corpus(language_model): p_corpus = language_model.p_corpus.copy() vocab = language_model.vocab From b2d0687333a81e6c9f8f556141de3a809b97679d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:13:28 +0200 Subject: [PATCH 35/40] cast 1/3 to np.floating to keep mypy quiet --- weighwords/significant_words.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index b20b331..45142b3 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -200,7 +200,13 @@ def fit_parsimonious_group( p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM(group_tf, p_specific, 1/3, max_iter, eps) + p_specific = self._EM( + group_tf, + p_specific, + cast(np.floating, 1/3), + max_iter, + eps + ) self.p_specific = p_specific From 8a1126f15c1b189c55e3c319ceb686a9fa766712 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:18:37 +0200 Subject: [PATCH 36/40] simplify installing optional dev requirements --- requirements-dev.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..7670e0e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# install the weighwords package for convenience +-e . + +# testing framework +pytest ~= 4.5 + +# static type checking +mypy >= 0.701 +https://github.com/numpy/numpy-stubs/archive/master.tar.gz From c792a875c5c9e8da736e283fe2dc247e0cae3437 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 17:04:07 +0200 Subject: [PATCH 37/40] docs: double backticks and float --- README.rst | 10 +++++----- weighwords/parsimonious.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index ea17792..6f720ae 100644 --- a/README.rst +++ b/README.rst @@ -41,9 +41,9 @@ Usage for quote in quotes ] -The `ParsimoniousLM` is initialized with all document tokens as a +The ``ParsimoniousLM`` is initialized with all document tokens as a background corpus, and subsequently takes a single document's tokens -as input. Its `top` method returns the top terms and their log-probabilities: +as input. Its ``top`` method returns the top terms and their log-probabilities: >>> plm = ParsimoniousLM(doc_tokens, w=.1) >>> plm.top(10, doc_tokens[-1]) @@ -58,8 +58,8 @@ as input. Its `top` method returns the top terms and their log-probabilities: ('hear', -2.5649494422113044), ('lowest', -2.5649494422113044)] -The `SignificantWordsLM` is similarly initialized with a background corpus, -but subsequently takes a group of document tokens as input. Its `group_top` +The ``SignificantWordsLM`` is similarly initialized with a background corpus, +but subsequently takes a group of document tokens as input. Its ``group_top`` method returns the top terms and their probabilities: >>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) @@ -75,7 +75,7 @@ method returns the top terms and their probabilities: ('strange', 0.03597866180849914), ('capers', 0.03597866180849914)] -See `example/dickens.py` for a running example with more realistic data. +See ``example/dickens.py`` for a running example with more realistic data. 
References ---------- diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index d8ce657..60ef281 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -136,7 +136,7 @@ def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: Returns ------- - tf : array of int + tf : array of float Term frequencies p_term : array of float Term log probabilities From 48bfc7609d255274ba8a9ed90e7f33528f940031 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 20 May 2019 11:36:41 +0200 Subject: [PATCH 38/40] fix readme example syntax --- README.rst | 24 +++++++++++++----------- weighwords/significant_words.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 6f720ae..c3888cb 100644 --- a/README.rst +++ b/README.rst @@ -31,20 +31,21 @@ or:: Usage ----- >>> quotes = [ - "Love all, trust a few, Do wrong to none", - ... - "A lover's eyes will gaze an eagle blind. " - "A lover's ear will hear the lowest sound.", - ] +... "Love all, trust a few, Do wrong to none", +... ... +... "A lover's eyes will gaze an eagle blind. " +... "A lover's ear will hear the lowest sound.", +... ] >>> doc_tokens = [ - re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() - for quote in quotes - ] +... re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() +... for quote in quotes +... ] The ``ParsimoniousLM`` is initialized with all document tokens as a background corpus, and subsequently takes a single document's tokens as input. Its ``top`` method returns the top terms and their log-probabilities: +>>> from weighwords import ParsimoniousLM >>> plm = ParsimoniousLM(doc_tokens, w=.1) >>> plm.top(10, doc_tokens[-1]) [('lover', -1.871802261651365), @@ -62,6 +63,7 @@ The ``SignificantWordsLM`` is similarly initialized with a background corpus, but subsequently takes a group of document tokens as input. Its ``group_top`` method returns the top terms and their probabilities: +>>> from weighwords import SignificantWordsLM >>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) >>> swlm.group_top(10, doc_tokens[-3:]) [('in', 0.37875318027881), @@ -85,10 +87,10 @@ for Information Retrieval Proc. SIGIR'04. R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models -and word clouds? `_ -Proc. ECIR. +and word clouds? `_. +Proc. ECIR'10. M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). `Luhn Revisited: Significant Words Language Models -`_ +`_. Proc. CKIM'16. diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 45142b3..9bd831e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -77,7 +77,7 @@ class SignificantWordsLM(ParsimoniousLM): ---------- M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). `Luhn Revisited: Significant Words Language Models - `_ + `_. Proc. CKIM'16. 
""" From 72aff8cf485850dcd69a584adaf95b9371753a98 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 31 May 2019 01:21:11 +0200 Subject: [PATCH 39/40] use p_corpus as replacement for p_specific when len(docs) < 2 --- weighwords/significant_words.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 9bd831e..42c7161 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -356,11 +356,18 @@ def _group_model( return group_tf, p_group - @staticmethod def _specific_model( + self, document_term_probabilities: Sequence[np.ndarray] ) -> np.ndarray: """Create the fixed specific model.""" + if len(document_term_probabilities) < 2: + logger.warning( + 'Cannot calculate `p_specific` for a single document, ' + 'using `p_corpus` as replacement.' + ) + return self.p_corpus + # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) From 32acde2e5e707786a06701c10db14e8e9ccd75a7 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 4 Jun 2019 02:52:27 +0200 Subject: [PATCH 40/40] make the specific terms estimator pluggable --- weighwords/significant_words.py | 75 +++++----------- weighwords/specific_term_estimators.py | 120 +++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 55 deletions(-) create mode 100644 weighwords/specific_term_estimators.py diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 42c7161..3e9d761 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -15,6 +15,11 @@ from weighwords import ParsimoniousLM from weighwords.logsum import logsum +from weighwords.specific_term_estimators import ( + SpecificTermEstimator, + RequiresMultipleDocuments, + mutual_exclusion, +) logger = logging.getLogger(__name__) @@ -136,7 +141,8 @@ def fit_parsimonious_group( lambdas: Optional[InitialLambdas] = None, fix_lambdas: bool = False, parsimonize_specific: bool = False, - post_parsimonize: bool = False + post_parsimonize: bool = False, + specific_estimator: SpecificTermEstimator = mutual_exclusion ) -> Dict[str, float]: """ Estimate a document group model, and parsimonize it against fixed @@ -166,6 +172,9 @@ def fit_parsimonious_group( the EM algorithm. This may be used to compensate when the frequency of common terms varies much between the documents in the group. + specific_estimator : callable, optional + Function that estimates the specific terms model based on + the document term frequencies of the doc group. Returns ------- @@ -190,26 +199,23 @@ def fit_parsimonious_group( doc_term_frequencies ) try: - old_error_settings = np.seterr(divide='ignore') - doc_term_probs = [ - np.log(tf) - np.log(np.sum(tf)) - for tf in doc_term_frequencies - ] - finally: - np.seterr(**old_error_settings) + self.p_specific = specific_estimator(doc_term_frequencies) + except RequiresMultipleDocuments: + logger.warning( + 'Cannot calculate `p_specific` for a single document, ' + 'using `p_corpus` as replacement.' 
+ ) + self.p_specific = self.p_corpus - p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM( + self.p_specific = self._EM( group_tf, - p_specific, + self.p_specific, cast(np.floating, 1/3), max_iter, eps ) - self.p_specific = p_specific - weights_shape = len(document_models) if self.fix_lambdas: weights_shape = 1 @@ -223,7 +229,7 @@ def fit_parsimonious_group( f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}' ) self.p_group = self._estimate( - p_group, p_specific, doc_term_frequencies, max_iter, eps + p_group, self.p_specific, doc_term_frequencies, max_iter, eps ) if post_parsimonize: self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps) @@ -356,47 +362,6 @@ def _group_model( return group_tf, p_group - def _specific_model( - self, - document_term_probabilities: Sequence[np.ndarray] - ) -> np.ndarray: - """Create the fixed specific model.""" - if len(document_term_probabilities) < 2: - logger.warning( - 'Cannot calculate `p_specific` for a single document, ' - 'using `p_corpus` as replacement.' - ) - return self.p_corpus - - # complement events: 1 - p - complements = [ - np.log1p(-np.exp(p_doc)) - for p_doc in document_term_probabilities - ] - # probability of term to be important in one doc, and not others - complement_products = np.array([ - dlm + complement - for i, dlm in enumerate(document_term_probabilities) - for j, complement in enumerate(complements) - if i != j - ]) - - try: - old_error_settings = np.seterr(divide='ignore') - # marginalize over all documents - p_specific = ( - logsum(complement_products) - - np.log( - np.count_nonzero(complement_products > np.NINF, axis=0) - ) - ) - # prevent NaNs from causing downstream errors - p_specific[np.isnan(p_specific)] = np.NINF - finally: - np.seterr(**old_error_settings) - - return p_specific - @staticmethod def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: """ diff --git a/weighwords/specific_term_estimators.py b/weighwords/specific_term_estimators.py new file mode 100644 index 0000000..158abd9 --- /dev/null +++ b/weighwords/specific_term_estimators.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + +import functools +import logging +from typing import Sequence, Callable + +import numpy as np +from weighwords.logsum import logsum + +logger = logging.getLogger(__name__) + +SpecificTermEstimator = Callable[[Sequence[np.ndarray]], np.ndarray] + + +class RequiresMultipleDocuments(Exception): + pass + + +def requires_multiple_docs(estimator_func: SpecificTermEstimator): + """ + Do not let the decorated function be called with fewer than two docs. 
+ + Parameters + ---------- + estimator_func : SpecificTermEstimator + + Raises + ------ + RequiresMultipleDocuments + + Returns + ------- + decorated_func : SpecificTermEstimator + """ + @functools.wraps(estimator_func) + def wrapper_func(document_term_frequencies): + if len(document_term_frequencies) < 2: + raise RequiresMultipleDocuments + + return estimator_func(document_term_frequencies) + + return wrapper_func + + +@requires_multiple_docs +def mutual_exclusion( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the mutual exclusion method.""" + doc_term_probs = [ + np.log(tf) - np.log(np.sum(tf)) + for tf in document_term_frequencies + ] + # complement events: 1 - p + complements = [ + np.log1p(-np.exp(p_doc)) + for p_doc in doc_term_probs + ] + # probability of term to be important in one doc, and not others + complement_products = np.array([ + dlm + complement + for i, dlm in enumerate(doc_term_probs) + for j, complement in enumerate(complements) + if i != j + ]) + # marginalize over all documents + p_specific = ( + logsum(complement_products) + - np.log( + np.count_nonzero(complement_products > np.NINF, axis=0) + ) + ) + # prevent NaNs from causing downstream errors + p_specific[np.isnan(p_specific)] = np.NINF + + return p_specific + + +@requires_multiple_docs +def inverse_doc_frequency( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the inverse doc frequency method.""" + idf = 1 / np.count_nonzero(document_term_frequencies, axis=0) + idf[~np.isfinite(idf)] = 0. + + # calculate normalized idf as log-probabilities + p_specific = np.log(idf) - np.log(np.sum(idf)) + + return p_specific + + +def idf_fallback_for_many_docs( + document_term_frequencies: Sequence[np.ndarray], + primary_estimator: SpecificTermEstimator, + fallback_thresh: int +): + if len(document_term_frequencies) < fallback_thresh: + estimator_func = primary_estimator + else: + estimator_func = inverse_doc_frequency + logger.warning( + f'Estimator got more than {fallback_thresh} docs:' + ' falling back to IDF for the current doc group.' + ) + + return estimator_func(document_term_frequencies) + + +me_up_to_40_docs = functools.partial( + idf_fallback_for_many_docs, + primary_estimator=mutual_exclusion, + fallback_thresh=40 +)
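These estimators plug directly into ``SignificantWordsLM.fit_parsimonious_group`` (and, through its keyword arguments, ``group_top``): any callable that maps the per-document term-frequency arrays to a log-probability array satisfies ``SpecificTermEstimator``. A minimal sketch, assuming ``corpus`` and ``docs`` are tokenized document lists as in the earlier examples and that ``docs`` holds at least two documents::

    from weighwords import SignificantWordsLM
    from weighwords.specific_term_estimators import (
        inverse_doc_frequency,
        me_up_to_40_docs,
    )

    swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09))

    # IDF-based specific terms instead of the default mutual exclusion
    idf_terms = swlm.fit_parsimonious_group(
        docs, specific_estimator=inverse_doc_frequency
    )

    # mutual exclusion, falling back to IDF for groups of more than 40 documents
    top_terms = swlm.group_top(10, docs, specific_estimator=me_up_to_40_docs)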