diff --git a/COPYING b/COPYING index c128f62..02ba3d4 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,6 @@ WeighWords: a Python library for creating word weights/word clouds from text -Copyright 2011 University of Amsterdam +Copyright 2011-2019 University of Amsterdam +Copyright 2019 TinQwise Stamkracht This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the diff --git a/README.rst b/README.rst index 8fc2365..c3888cb 100644 --- a/README.rst +++ b/README.rst @@ -9,28 +9,88 @@ Rather than use simple word frequency, it weighs words by statistical models known as *parsimonious language models*. These models are good at picking up the words that distinguish a text document from other documents in a collection. The downside to this is that you can't use WeighWords to make a -word cloud of a single document; you need a bunch of document to compare to. +word cloud of a single document; you need a bunch of documents (i.e. a +background collection) to compare to. Installation ------------ -Either:: +Either install the latest release from PyPI:: pip install weighwords -or:: +or clone this git repository, and:: python setup.py install +or:: + + pip install -e . + +Usage +----- +>>> quotes = [ +... "Love all, trust a few, Do wrong to none", +... ... +... "A lover's eyes will gaze an eagle blind. " +... "A lover's ear will hear the lowest sound.", +... ] +>>> doc_tokens = [ +... re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() +... for quote in quotes +... ] + +The ``ParsimoniousLM`` is initialized with all document tokens as a +background corpus, and subsequently takes a single document's tokens +as input. Its ``top`` method returns the top terms and their log-probabilities: + +>>> from weighwords import ParsimoniousLM +>>> plm = ParsimoniousLM(doc_tokens, w=.1) +>>> plm.top(10, doc_tokens[-1]) +[('lover', -1.871802261651365), + ('will', -1.871802261651365), + ('eyes', -2.5649494422113044), + ('gaze', -2.5649494422113044), + ('an', -2.5649494422113044), + ('eagle', -2.5649494422113044), + ('blind', -2.5649494422113044), + ('ear', -2.5649494422113044), + ('hear', -2.5649494422113044), + ('lowest', -2.5649494422113044)] + +The ``SignificantWordsLM`` is similarly initialized with a background corpus, +but subsequently takes a group of document tokens as input. Its ``group_top`` +method returns the top terms and their probabilities: + +>>> from weighwords import SignificantWordsLM +>>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) +>>> swlm.group_top(10, doc_tokens[-3:]) +[('in', 0.37875318027881), + ('is', 0.07195732361699828), + ('mortal', 0.07195732361699828), + ('nature', 0.07195732361699828), + ('all', 0.07110584778711342), + ('we', 0.03597866180849914), + ('true', 0.03597866180849914), + ('lovers', 0.03597866180849914), + ('strange', 0.03597866180849914), + ('capers', 0.03597866180849914)] + +See ``example/dickens.py`` for a running example with more realistic data. References ---------- -D. Hiemstra, S. Robertson and H. Zaragoza (2004). `Parsimonious Language Models +D. Hiemstra, S. Robertson, and H. Zaragoza (2004). `Parsimonious Language Models for Information Retrieval `_. Proc. SIGIR'04. -R. Kaptein, D. Hiemstra and J. Kamps (2010). `How different are Language Models -and word clouds? `_ -Proc. ECIR. +R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models +and word clouds? `_. +Proc. ECIR'10. + +M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). +`Luhn Revisited: Significant Words Language Models +`_. +Proc. CKIM'16. diff --git a/example/dickens.py b/example/dickens.py index 3328f6f..0b334b3 100755 --- a/example/dickens.py +++ b/example/dickens.py @@ -1,13 +1,16 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Find terms that distinguish various novels by Charles Dickens. # Note: if the w parameter is set wisely, no stop list is needed. - -from weighwords import ParsimoniousLM import gzip import logging -import numpy as np +import math import re +from itertools import zip_longest + +import numpy as np + +from weighwords import ParsimoniousLM, SignificantWordsLM logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -21,28 +24,49 @@ ] startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """ +endbook = """*** END OF THIS PROJECT GUTENBERG EBOOK """ def read_book(title, num): """Returns generator over words in book num""" - logger.info("Fetching terms from %s" % title) - path = "%s.txt.utf8.gz" % num + logger.info(f"Fetching terms from {title}") + path = f"{num}.txt.utf8.gz" in_book = False - for ln in gzip.open(path): - if in_book: - for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split(): + for ln in gzip.open(path, 'rt', encoding='utf8'): + if in_book and ln.startswith(endbook): + break + elif in_book: + for w in re.sub(r"[.,:;!?\"'‘’]", " ", ln).lower().split(): yield w elif ln.startswith(startbook): in_book = True +def grouper(iterable, n, filler=None): + """Source: https://docs.python.org/3/library/itertools.html#itertools-recipes""" + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=filler) + + book_contents = [(title, list(read_book(title, num))) for title, num in books] +corpus = [terms for title, terms in book_contents] -model = ParsimoniousLM([terms for title, terms in book_contents], w=.01) +plm = ParsimoniousLM(corpus, w=.01) +swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09)) for title, terms in book_contents: - print("Top %d words in %s:" % (top_k, title)) - for term, p in model.top(top_k, terms): - print(" %s %.4f" % (term, np.exp(p))) + plm_top = plm.top(top_k, terms) + swlm_top = swlm.group_top( + top_k, + grouper(terms, math.ceil(len(terms) / 10)), + fix_lambdas=True, + ) + print(f"\nTop {top_k} words in {title}:") + print(f"\n{'PLM term':<16} {'PLM p':<12} {'SWLM term':<16} {'SWLM p':<6}") + for (plm_t, plm_p), (swlm_t, swlm_p) in zip(plm_top, swlm_top): + print(f"{plm_t:<16} {np.exp(plm_p):<12.4f} {swlm_t:<16} {swlm_p:.4f}") print("") + + + diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..7670e0e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# install the weighwords package for convenience +-e . + +# testing framework +pytest ~= 4.5 + +# static type checking +mypy >= 0.701 +https://github.com/numpy/numpy-stubs/archive/master.tar.gz diff --git a/setup.py b/setup.py index 601f5da..3552010 100644 --- a/setup.py +++ b/setup.py @@ -8,11 +8,14 @@ description = "Python library for creating word weights/word clouds from text", keywords = "word cloud nlp language model", license = "LGPL", + package_data = {"weighwords": ["py.typed"]}, packages = ["weighwords"], - install_requires = ["numpy>=1.4.0"], + install_requires = ["numpy>=1.15.0"], + tests_require = ["pytest"], classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", + "Programming Language :: Python :: 3.7", "Topic :: Text Processing", ] ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..3b68efa --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,43 @@ +import re + +import pytest + + +@pytest.fixture(scope="module") +def uniform_doc(): + return ['one', 'two', 'three', 'four', 'five'] + + +@pytest.fixture(scope="module") +def number_corpus(): + return [ + ['one'], + ['two', 'two'], + ['three', 'three', 'three'], + ['four', 'four', 'four', 'four'], + ['five', 'five', 'five', 'five', 'five'] + ] + + +@pytest.fixture(scope="module") +def shakespeare_quotes(): + quotes = [ + "Love all, trust a few, Do wrong to none", + "But love that comes too late, " + "Like a remorseful pardon slowly carried, " + "To the great sender turns a sour offence.", + "If thou remember'st not the slightest folly " + "That ever love did make thee run into, " + "Thou hast not lov'd.", + "We that are true lovers run into strange capers; " + "but as all is mortal in nature, " + "so is all nature in love mortal in folly.", + "But are you so much in love as your rhymes speak? " + "Neither rhyme nor reason can express how much.", + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] + return [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] diff --git a/tests/test_equivalence.py b/tests/test_equivalence.py new file mode 100644 index 0000000..4c934e6 --- /dev/null +++ b/tests/test_equivalence.py @@ -0,0 +1,58 @@ +from itertools import chain + +from weighwords import ParsimoniousLM, SignificantWordsLM +from weighwords.logsum import logsum + + +def test_model_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - weight, weight, 0.) + ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms == swlm_terms, 'PLM and SWLM are not functionally equivalent' + + +def test_model_non_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it non-equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - 2 * weight, weight, weight) + ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent' + + +def get_p_corpus(language_model): + p_corpus = language_model.p_corpus.copy() + vocab = language_model.vocab + term_tiers = [ + (1.5, ['love', 'folly', "lov'd", 'lovers', 'lover']), + (1.3, ['trust', 'remorseful', 'sour', 'offence', 'gaze']), + (1.1, ["remember'st", 'capers', 'rhyme', 'rhymes', 'eagle']), + ] + for multiplier, terms in term_tiers: + for t in terms: + p_corpus[vocab[t]] *= multiplier + + return p_corpus - logsum(p_corpus) + + +def fit_models(plm, swlm, docs): + # artificially reduce the corpus probability of selected terms + plm.p_corpus = swlm.p_corpus = get_p_corpus(plm) + + top_k = 15 + plm_top = plm.top(top_k, chain(*docs)) + swlm_top = swlm.group_top(top_k, docs, fix_lambdas=True) + plm_terms = [term for term, log_prob in plm_top] + swlm_terms = [term for term, prob in swlm_top] + + return plm_terms, swlm_terms diff --git a/tests/test_plm.py b/tests/test_plm.py new file mode 100644 index 0000000..ceb9f90 --- /dev/null +++ b/tests/test_plm.py @@ -0,0 +1,21 @@ +import numpy as np + +from weighwords import ParsimoniousLM + + +def test_document_model(number_corpus, uniform_doc): + plm = ParsimoniousLM(number_corpus, w=0.1, thresh=3) + tf, p_term = plm._document_model(uniform_doc) + assert (tf[:2] == 0).all(), \ + "Terms with a corpus frequency < thresh should not be counted" + assert tf.sum() == 3, f"Expected tf.sum() to be 3, got {tf.sum()} instead" + linear_p_term = np.exp(p_term) + assert (linear_p_term[2:].sum() - 1) < 1e-10, \ + f"All probability mass should be on the last 3 terms, got {linear_p_term} instead" + + +def test_document_model_out_of_vocabulary(number_corpus): + plm = ParsimoniousLM(number_corpus, w=0.1) + doc = ['two', 'or', 'three', 'unseen', 'words'] + tf, p_term = plm._document_model(doc) + assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead" diff --git a/tests/test_swlm.py b/tests/test_swlm.py new file mode 100644 index 0000000..e6b82bc --- /dev/null +++ b/tests/test_swlm.py @@ -0,0 +1,37 @@ +import logging + +from weighwords import SignificantWordsLM + +logging.basicConfig(level=logging.INFO) + + +def test_model_fit_fixed(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True) + expected_probs = { + "one": 0.0, + "two": 0.12373, + "three": 2e-5, + "four": 0.50303, + "five": 0.37322, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" + + +def test_model_fit_shifty(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False) + expected_probs = { + "one": 0.0, + "two": 0.33322, + "three": 0.0, + "four": 0.66678, + "five": 0.0, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" diff --git a/weighwords/__init__.py b/weighwords/__init__.py index f725693..40c7742 100644 --- a/weighwords/__init__.py +++ b/weighwords/__init__.py @@ -1 +1,2 @@ from .parsimonious import ParsimoniousLM +from .significant_words import SignificantWordsLM diff --git a/weighwords/logsum.py b/weighwords/logsum.py index 6df59cd..350bd26 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -6,7 +6,7 @@ import numpy as np -def logsum(x): +def logsum(x: np.ndarray) -> np.ndarray: """Computes the sum of x assuming x is in the log domain. Returns log(sum(exp(x))) while minimizing the possibility of @@ -24,7 +24,7 @@ def logsum(x): """ # Use the max to normalize, as with the log this is what accumulates # the less errors - vmax = x.max(axis=0) - out = np.log(np.sum(np.exp(x - vmax), axis=0)) + vmax = np.nanmax(x, axis=0) + out = np.log(np.nansum(np.exp(x - vmax), axis=0)) out += vmax return out diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index becaa02..60ef281 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -1,21 +1,28 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2011-2013 University of Amsterdam +# Copyright 2011-2019 University of Amsterdam # Author: Lars Buitinck +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + from collections import defaultdict from heapq import nlargest import logging +from operator import itemgetter +from typing import Iterable, Optional, Dict, List, Tuple + import numpy as np -from .logsum import logsum +from weighwords.logsum import logsum logger = logging.getLogger(__name__) -class ParsimoniousLM(object): - """Language model for a set of documents. +class ParsimoniousLM: + """ + Language model for a set of documents. Constructing an object of this class fits a background model. The top method can then be used to fit document-specific models, also for unseen @@ -23,34 +30,51 @@ class ParsimoniousLM(object): Parameters ---------- - documents : iterable over iterable over terms + documents : iterable over iterable of str terms + All documents that should be included in the corpus model. w : float - Weight of document model (1 - weight of corpus model) + Weight of document model (1 - weight of corpus model). thresh : int - Don't include words that occur < thresh times + Don't include words that occur fewer than `thresh` times. Attributes ---------- vocab : dict of term -> int Mapping of terms to numeric indices p_corpus : array of float - Log prob of terms + Log probability of terms in background model (indexed by `vocab`) + + References + ---------- + D. Hiemstra, S. Robertson, and H. Zaragoza (2004). + `Parsimonious Language Models for Information Retrieval + `_. + Proc. SIGIR'04. """ - def __init__(self, documents, w, thresh=0): + def __init__( + self, + documents: Iterable[Iterable[str]], + w: np.floating, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" logger.info('Building corpus model') self.w = w - self.vocab = vocab = {} # Vocabulary: maps terms to numeric indices - count = defaultdict(int) # Corpus frequency + # Vocabulary: maps terms to numeric indices + vocab: Dict[str, int] + self.vocab = vocab = {} + # Corpus frequency + count: Dict[int, int] = defaultdict(int) for d in documents: for tok in d: i = vocab.setdefault(tok, len(vocab)) count[i] += 1 - cf = np.empty(len(count), dtype=np.float) - for i, f in count.iteritems(): + cf = np.empty(len(count), dtype=np.double) + for i, f in count.items(): cf[i] = f rare = (cf < thresh) cf -= rare * cf @@ -59,60 +83,79 @@ def __init__(self, documents, w, thresh=0): old_error_settings = np.seterr(divide='ignore') # lg P(t|C) - self.p_corpus = np.log(cf) - np.log(np.sum(cf)) + self.p_corpus: np.ndarray = np.log(cf) - np.log(np.sum(cf)) finally: np.seterr(**old_error_settings) - def top(self, k, d, max_iter=50, eps=1e-5, w=None): - '''Get the top k terms of a document d and their log probabilities. + def top( + self, + k: int, + d: Iterable[str], + max_iter: int = 50, + eps: float = 1e-5, + w: Optional[np.floating] = None + ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. Parameters ---------- + k : int + Number of top terms to return. + d : iterable of str terms + Terms that make up the document. max_iter : int, optional Maximum number of iterations of EM algorithm to run. eps : float, optional - Convergence threshold for EM algorithm. + Epsilon: convergence threshold for EM algorithm. w : float, optional Weight of document model; overrides value given to __init__ Returns ------- t_p : list of (str, float) - ''' + Terms and their log-probabilities in the parsimonious model. + """ tf, p_term = self._document_model(d) p_term = self._EM(tf, p_term, w, max_iter, eps) - terms = [(t, p_term[i]) for t, i in self.vocab.iteritems()] - return nlargest(k, terms, lambda tp: tp[1]) + terms = [(t, p_term[i]) for t, i in self.vocab.items()] + return nlargest(k, terms, itemgetter(1)) - def _document_model(self, d): - '''Build document model. + def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: + """ + Build document model. Parameters ---------- - d : array of terms + d : iterable of str terms Returns ------- - tf : array of int + tf : array of float Term frequencies p_term : array of float Term log probabilities Initial p_term is 1/n_distinct for terms with non-zero tf, 0 for terms with 0 tf. - ''' + """ logger.info('Gathering term probabilities') - tf = np.zeros(len(self.vocab), dtype=np.float) # Term frequency + tf = np.zeros(len(self.vocab), dtype=np.double) # Term frequency for tok in d: - tf[self.vocab[tok]] += 1 + term_id = self.vocab.get(tok) + if term_id: + tf[term_id] += 1 + + # ignore counts of terms with zero corpus probability + tf *= np.isfinite(self.p_corpus) n_distinct = (tf > 0).sum() @@ -124,8 +167,16 @@ def _document_model(self, d): return tf, p_term - def _EM(self, tf, p_term, w, max_iter, eps): - '''Expectation maximization. + def _EM( + self, + tf: np.ndarray, + p_term: np.ndarray, + w: Optional[np.floating], + max_iter: int, + eps: float + ) -> np.ndarray: + """ + Expectation maximization. Parameters ---------- @@ -135,14 +186,16 @@ def _EM(self, tf, p_term, w, max_iter, eps): Term probabilities, as returned by document_model max_iter : int Number of iterations to run. + eps : float + Epsilon: convergence threshold for EM algorithm. Returns ------- p_term : array of float A posteriori term probabilities. - ''' + """ - logger.info('EM with max_iter=%d, eps=%g' % (max_iter, eps)) + logger.info(f'EM with max_iter={max_iter}, eps={eps}') if w is None: w = self.w @@ -155,7 +208,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): try: old_error_settings = np.seterr(divide='ignore') p_term = np.asarray(p_term) - for i in xrange(1, max_iter + 1): + for i in range(1, max_iter + 1): # E-step p_term += w E = tf + p_term - np.logaddexp(p_corpus, p_term) @@ -165,9 +218,8 @@ def _EM(self, tf, p_term, w, max_iter, eps): diff = new_p_term - p_term p_term = new_p_term - if (diff < eps).all(): - logger.info('EM: convergence reached after %d iterations' - % i) + if (diff[np.isfinite(diff)] < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') break finally: np.seterr(**old_error_settings) diff --git a/weighwords/py.typed b/weighwords/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py new file mode 100644 index 0000000..3e9d761 --- /dev/null +++ b/weighwords/significant_words.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + +import logging +from heapq import nlargest +from operator import itemgetter +from typing import Iterable, Optional, Sequence, Tuple, List, Dict, cast + +import numpy as np + +from weighwords import ParsimoniousLM +from weighwords.logsum import logsum +from weighwords.specific_term_estimators import ( + SpecificTermEstimator, + RequiresMultipleDocuments, + mutual_exclusion, +) + +logger = logging.getLogger(__name__) + +InitialLambdas = Tuple[np.floating, np.floating, np.floating] + + +class SignificantWordsLM(ParsimoniousLM): + """ + Language model that consists of three sub-models: + + - Corpus model: represents term probabilities in a (large) background collection; + - Group model: parsimonious term probabilities in a group of documents; + - Specific model: represents the same group, but is biased towards terms that + occur with a high frequency in single docs, and a low frequency in others. + + Parameters + ---------- + documents : iterable over iterable of str terms + All documents that should be included in the corpus model. + lambdas : 3-tuple of float + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. + thresh : int + Don't include words that occur fewer than `thresh` times. + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log probability of terms in background model (indexed by `vocab`) + p_group : array of float + Log probability of terms in background model (indexed by `vocab`) + p_specific : array of float + Log probability of terms in background model (indexed by `vocab`) + lambda_corpus : array of float + Log probability (weight) of corpus model for documents + lambda_group : array of float + Log probability (weight) of group model for documents + lambda_specific : array of float + Log probability (weight) of specific model for documents + + Methods + ------- + fit_parsimonious_group(document_group, ...) + Estimates a document group model, parsimonized against the corpus + and specific models. The documents may be unseen, but terms that + are not in the vocabulary will be ignored. + group_top(k, document_group, ...) + Shortcut to fit the group model and retrieve the top `k` terms. + get_term_probabilities(log_prob_distribution) + Aligns a term distribution with the vocabulary, and transforms + the term log probabilities to linear probabilities. + + See Also + -------- + parsimonious.ParsimoniousLM : one-sided parsimonious model + + References + ---------- + M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). + `Luhn Revisited: Significant Words Language Models + `_. + Proc. CKIM'16. + """ + + def __init__( + self, + documents: Iterable[Iterable[str]], + lambdas: InitialLambdas, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" + self.initial_lambdas = self.normalize_lambdas(lambdas) + super().__init__(documents, self.initial_lambdas[1], thresh=thresh) + self.lambda_corpus: Optional[np.ndarray] = None + self.lambda_group: Optional[np.ndarray] = None + self.lambda_specific: Optional[np.ndarray] = None + self.p_group: Optional[np.ndarray] = None + self.p_specific: Optional[np.ndarray] = None + self.fix_lambdas = False + + def group_top( + self, + k: int, + document_group: Iterable[Iterable[str]], + **kwargs + ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a `document_group` and their probabilities. + This is a shortcut to retrieve the top terms found by `fit_parsimonious_group`. + + Parameters + ---------- + k : int + Number of top terms to return. + document_group : iterable over iterable of str terms + All documents that should be included in the group model. + kwargs + Optional keyword arguments for `fit_parsimonious_group`. + + Returns + ------- + t_p : list of (str, float) + Terms and their probabilities in the group model. + + See Also + -------- + SignificantWordsLM.fit_parsimonious_group + """ + term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) + return nlargest(k, term_probabilities.items(), itemgetter(1)) + + def fit_parsimonious_group( + self, + document_group: Iterable[Iterable[str]], + max_iter: int = 50, + eps: float = 1e-5, + lambdas: Optional[InitialLambdas] = None, + fix_lambdas: bool = False, + parsimonize_specific: bool = False, + post_parsimonize: bool = False, + specific_estimator: SpecificTermEstimator = mutual_exclusion + ) -> Dict[str, float]: + """ + Estimate a document group model, and parsimonize it against fixed + corpus and specific models. The documents may be unseen, but any terms + that are not in the vocabulary will be ignored. + + Parameters + ---------- + document_group : iterable over iterable of str terms + All documents that should be included in the group model. + max_iter : int, optional + Maximum number of iterations of EM algorithm to run. + eps : float, optional + Epsilon: convergence threshold for EM algorithm. + lambdas : 3-tuple of float, optional + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. + fix_lambdas : bool, optional + Fix the weights of the three sub-models (i.e. don't estimate + lambdas as part of the M-step). + parsimonize_specific : bool, optional + Bias the specific model towards uncommon terms before applying + the EM algorithm to the group model. This generally results in + a group model that stands out less from the corpus model. + post_parsimonize : bool, optional + Bias the group model towards uncommon terms after applying + the EM algorithm. This may be used to compensate when the + frequency of common terms varies much between the documents + in the group. + specific_estimator : callable, optional + Function that estimates the specific terms model based on + the document term frequencies of the doc group. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the group model. + """ + if lambdas is None: + lambdas = self.initial_lambdas + else: + lambdas = self.normalize_lambdas(lambdas) + + self.fix_lambdas = fix_lambdas + + document_models = [ + self._document_model(doc) + for doc in document_group + ] + del document_group + + doc_term_frequencies = [tf for tf, _ in document_models] + group_tf, p_group = self._group_model( + doc_term_frequencies + ) + try: + self.p_specific = specific_estimator(doc_term_frequencies) + except RequiresMultipleDocuments: + logger.warning( + 'Cannot calculate `p_specific` for a single document, ' + 'using `p_corpus` as replacement.' + ) + self.p_specific = self.p_corpus + + if parsimonize_specific: + self.p_specific = self._EM( + group_tf, + self.p_specific, + cast(np.floating, 1/3), + max_iter, + eps + ) + + weights_shape = len(document_models) + if self.fix_lambdas: + weights_shape = 1 + + general_w, group_w, specific_w = np.log(lambdas) + self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double) + self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) + self.lambda_group = np.full(weights_shape, group_w, dtype=np.double) + logger.info( + f'Lambdas initialized to: Corpus={lambdas[0]:.4f}, ' + f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}' + ) + self.p_group = self._estimate( + p_group, self.p_specific, doc_term_frequencies, max_iter, eps + ) + if post_parsimonize: + self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps) + + if self.fix_lambdas is False: + logger.info( + f'Final lambdas (mean): ' + f'Corpus={np.mean(np.exp(self.lambda_corpus)):.4f}, ' + f'Group={np.mean(np.exp(self.lambda_group)):.4f}, ' + f'Specific={np.mean(np.exp(self.lambda_specific)):.4f}' + ) + return self.get_term_probabilities(self.p_group) + + def get_term_probabilities( + self, + log_prob_distribution: np.ndarray + ) -> Dict[str, float]: + """ + Align a term distribution with the vocabulary, and transform + the term log probabilities to linear probabilities. + + Parameters + ---------- + log_prob_distribution : array of float + Log probability of terms which is indexed by the vocabulary. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the (sub-)model. + """ + probabilities = np.exp(log_prob_distribution) + probabilities[np.isnan(probabilities)] = 0. + return {t: probabilities[i] for t, i in self.vocab.items()} + + def _estimate( + self, + p_group: np.ndarray, + p_specific: np.ndarray, + doc_tf: Sequence[np.ndarray], + max_iter: int, + eps: float + ) -> np.ndarray: + """Apply the Expectation Maximization algorithm.""" + try: + old_error_settings = np.seterr(divide='ignore') + log_doc_tf = np.log(doc_tf) + for i in range(1, 1 + max_iter): + expectation = self._e_step(p_group, p_specific) + new_p_group = self._m_step(expectation, log_doc_tf) + + diff = new_p_group - p_group + p_group = new_p_group + if (diff[np.isfinite(diff)] < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') + break + finally: + np.seterr(**old_error_settings) + + return p_group + + def _e_step( + self, + p_group: np.ndarray, + p_specific: np.ndarray + ) -> Dict[str, np.ndarray]: + """Run an E-step.""" + corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) + specific_numerator = np.add.outer(self.lambda_specific, p_specific) + group_numerator = np.add.outer(self.lambda_group, p_group) + denominator = [ + logsum(np.asarray(doc_numerators)) + for doc_numerators in zip( + corpus_numerator, + specific_numerator, + group_numerator + ) + ] + out = { + 'corpus': corpus_numerator - denominator, + 'specific': specific_numerator - denominator, + 'group': group_numerator - denominator + } + # prevent NaNs from causing downstream errors + for v in out.values(): + v[np.isnan(v)] = np.NINF + + return out + + def _m_step( + self, + expectation: Dict[str, np.ndarray], + log_doc_tf: Sequence[np.ndarray] + ) -> np.ndarray: + """Run an M-step.""" + term_weighted_group = log_doc_tf + expectation['group'] + group_numerator = logsum(term_weighted_group) + p_group = group_numerator - logsum(group_numerator) + + if self.fix_lambdas is False: + # estimate lambdas + corpus_numerator = logsum( + np.transpose(log_doc_tf + expectation['corpus']) + ) + specific_numerator = logsum( + np.transpose(log_doc_tf + expectation['specific']) + ) + group_numerator = logsum(np.transpose(term_weighted_group)) + denominator = logsum( + np.asarray([corpus_numerator, specific_numerator, group_numerator]) + ) + self.lambda_corpus = corpus_numerator - denominator + self.lambda_specific = specific_numerator - denominator + self.lambda_group = group_numerator - denominator + + return p_group + + @staticmethod + def _group_model( + document_term_frequencies: Sequence[np.ndarray] + ) -> Tuple[np.ndarray, np.ndarray]: + """Create the initial group model.""" + group_tf = np.array(document_term_frequencies).sum(axis=0) + + try: + old_error_settings = np.seterr(divide='ignore') + p_group = np.log(group_tf) - np.log(np.sum(group_tf)) + finally: + np.seterr(**old_error_settings) + + return group_tf, p_group + + @staticmethod + def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: + """ + Check and normalize the initial lambdas of the three sub-models. + + Parameters + ---------- + lambdas : 3-tuple of float + Weight of corpus, group, and specific models. + + Returns + ------- + lambdas : 3-tuple of float + Normalized probability of corpus, group, and specific models. + """ + assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' + total_weight = sum(lambdas) + if abs(total_weight - 1) > 1e-10: + lambdas = cast( + InitialLambdas, + tuple( + w / total_weight + for w in lambdas + ) + ) + return lambdas diff --git a/weighwords/specific_term_estimators.py b/weighwords/specific_term_estimators.py new file mode 100644 index 0000000..158abd9 --- /dev/null +++ b/weighwords/specific_term_estimators.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + +import functools +import logging +from typing import Sequence, Callable + +import numpy as np +from weighwords.logsum import logsum + +logger = logging.getLogger(__name__) + +SpecificTermEstimator = Callable[[Sequence[np.ndarray]], np.ndarray] + + +class RequiresMultipleDocuments(Exception): + pass + + +def requires_multiple_docs(estimator_func: SpecificTermEstimator): + """ + Do not let the decorated function be called with fewer than two docs. + + Parameters + ---------- + estimator_func : SpecificTermEstimator + + Raises + ------ + RequiresMultipleDocuments + + Returns + ------- + decorated_func : SpecificTermEstimator + """ + @functools.wraps(estimator_func) + def wrapper_func(document_term_frequencies): + if len(document_term_frequencies) < 2: + raise RequiresMultipleDocuments + + return estimator_func(document_term_frequencies) + + return wrapper_func + + +@requires_multiple_docs +def mutual_exclusion( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the mutual exclusion method.""" + doc_term_probs = [ + np.log(tf) - np.log(np.sum(tf)) + for tf in document_term_frequencies + ] + # complement events: 1 - p + complements = [ + np.log1p(-np.exp(p_doc)) + for p_doc in doc_term_probs + ] + # probability of term to be important in one doc, and not others + complement_products = np.array([ + dlm + complement + for i, dlm in enumerate(doc_term_probs) + for j, complement in enumerate(complements) + if i != j + ]) + # marginalize over all documents + p_specific = ( + logsum(complement_products) + - np.log( + np.count_nonzero(complement_products > np.NINF, axis=0) + ) + ) + # prevent NaNs from causing downstream errors + p_specific[np.isnan(p_specific)] = np.NINF + + return p_specific + + +@requires_multiple_docs +def inverse_doc_frequency( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the inverse doc frequency method.""" + idf = 1 / np.count_nonzero(document_term_frequencies, axis=0) + idf[~np.isfinite(idf)] = 0. + + # calculate normalized idf as log-probabilities + p_specific = np.log(idf) - np.log(np.sum(idf)) + + return p_specific + + +def idf_fallback_for_many_docs( + document_term_frequencies: Sequence[np.ndarray], + primary_estimator: SpecificTermEstimator, + fallback_thresh: int +): + if len(document_term_frequencies) < fallback_thresh: + estimator_func = primary_estimator + else: + estimator_func = inverse_doc_frequency + logger.warning( + f'Estimator got more than {fallback_thresh} docs:' + ' falling back to IDF for the current doc group.' + ) + + return estimator_func(document_term_frequencies) + + +me_up_to_40_docs = functools.partial( + idf_fallback_for_many_docs, + primary_estimator=mutual_exclusion, + fallback_thresh=40 +)