From 8ac8b7762625a9513468e36df515ff89ba0e740d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 1 Apr 2019 17:35:42 +0200 Subject: [PATCH 01/40] py3 compatibility --- weighwords/parsimonious.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index becaa02..f8ab640 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -50,7 +50,7 @@ def __init__(self, documents, w, thresh=0): count[i] += 1 cf = np.empty(len(count), dtype=np.float) - for i, f in count.iteritems(): + for i, f in count.items(): cf[i] = f rare = (cf < thresh) cf -= rare * cf @@ -86,7 +86,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): tf, p_term = self._document_model(d) p_term = self._EM(tf, p_term, w, max_iter, eps) - terms = [(t, p_term[i]) for t, i in self.vocab.iteritems()] + terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, lambda tp: tp[1]) def _document_model(self, d): @@ -155,7 +155,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): try: old_error_settings = np.seterr(divide='ignore') p_term = np.asarray(p_term) - for i in xrange(1, max_iter + 1): + for i in range(1, max_iter + 1): # E-step p_term += w E = tf + p_term - np.logaddexp(p_corpus, p_term) From 64709ab71db605353fcdb7547f3afa5420676b3e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 1 Apr 2019 17:36:10 +0200 Subject: [PATCH 02/40] py3 compatible example --- example/dickens.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/example/dickens.py b/example/dickens.py index 3328f6f..ec5fd33 100755 --- a/example/dickens.py +++ b/example/dickens.py @@ -21,6 +21,7 @@ ] startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """ +endbook = """*** END OF THIS PROJECT GUTENBERG EBOOK """ def read_book(title, num): @@ -29,9 +30,11 @@ def read_book(title, num): logger.info("Fetching terms from %s" % title) path = "%s.txt.utf8.gz" % num in_book = False - for ln in gzip.open(path): - if in_book: - for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split(): + for ln in gzip.open(path, 'rt', encoding='utf8'): + if in_book and ln.startswith(endbook): + break + elif in_book: + for w in re.sub(r"[.,:;!?\"'‘’]", " ", ln).lower().split(): yield w elif ln.startswith(startbook): in_book = True From 563a83a559caff7b63cd432f24b3207ddceb09bd Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 2 Apr 2019 21:30:28 +0200 Subject: [PATCH 03/40] turn relative weighwords import into absolute --- weighwords/parsimonious.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index f8ab640..42c6e38 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -8,7 +8,7 @@ import logging import numpy as np -from .logsum import logsum +from weighwords.logsum import logsum logger = logging.getLogger(__name__) From cd698f4dc9e669fa39a9e546aca5d6dc314237a2 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 2 Apr 2019 21:31:25 +0200 Subject: [PATCH 04/40] codestyle / consistency --- weighwords/parsimonious.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 42c6e38..ba99cbb 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -64,7 +64,7 @@ def __init__(self, documents, w, thresh=0): np.seterr(**old_error_settings) def top(self, k, d, max_iter=50, eps=1e-5, w=None): - '''Get the top k terms of a document d and their log 
probabilities. + """Get the top k terms of a document d and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. @@ -81,7 +81,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Returns ------- t_p : list of (str, float) - ''' + """ tf, p_term = self._document_model(d) p_term = self._EM(tf, p_term, w, max_iter, eps) @@ -90,7 +90,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): return nlargest(k, terms, lambda tp: tp[1]) def _document_model(self, d): - '''Build document model. + """Build document model. Parameters ---------- @@ -105,7 +105,7 @@ def _document_model(self, d): Initial p_term is 1/n_distinct for terms with non-zero tf, 0 for terms with 0 tf. - ''' + """ logger.info('Gathering term probabilities') @@ -125,7 +125,7 @@ def _document_model(self, d): return tf, p_term def _EM(self, tf, p_term, w, max_iter, eps): - '''Expectation maximization. + """Expectation maximization. Parameters ---------- @@ -140,7 +140,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): ------- p_term : array of float A posteriori term probabilities. - ''' + """ logger.info('EM with max_iter=%d, eps=%g' % (max_iter, eps)) From adba97d5c8a32cefb0538e8c6cedfc33ba56c213 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 3 Apr 2019 11:56:31 +0200 Subject: [PATCH 05/40] replace lambda with itemgetter --- weighwords/parsimonious.py | 6 ++++-- weighwords/significant_words.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 weighwords/significant_words.py diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index ba99cbb..bc1d71b 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -6,6 +6,8 @@ from collections import defaultdict from heapq import nlargest import logging +from operator import itemgetter + import numpy as np from weighwords.logsum import logsum @@ -14,7 +16,7 @@ logger = logging.getLogger(__name__) -class ParsimoniousLM(object): +class ParsimoniousLM: """Language model for a set of documents. Constructing an object of this class fits a background model. The top @@ -87,7 +89,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): p_term = self._EM(tf, p_term, w, max_iter, eps) terms = [(t, p_term[i]) for t, i in self.vocab.items()] - return nlargest(k, terms, lambda tp: tp[1]) + return nlargest(k, terms, itemgetter(1)) def _document_model(self, d): """Build document model. diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py new file mode 100644 index 0000000..d78cfe5 --- /dev/null +++ b/weighwords/significant_words.py @@ -0,0 +1,8 @@ +import logging + +from weighwords import ParsimoniousLM + +logger = logging.getLogger(__name__) + + +class SignificantWordsLM(ParsimoniousLM): From 4805760c7e5b7f6080ef11a6508fafffa33d08c0 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 00:21:35 +0200 Subject: [PATCH 06/40] [WIP] SignificantWordsLM init and untested E-step --- weighwords/significant_words.py | 119 ++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index d78cfe5..729b5da 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,8 +1,127 @@ import logging +import numpy as np + from weighwords import ParsimoniousLM +from weighwords.logsum import logsum logger = logging.getLogger(__name__) class SignificantWordsLM(ParsimoniousLM): + """Language model for a set of documents. 
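For reference, the change in PATCH 05 above swaps `lambda tp: tp[1]` for `operator.itemgetter(1)` when ranking terms. A minimal standalone illustration of that top-k selection, with made-up (term, log-probability) pairs:

    from heapq import nlargest
    from operator import itemgetter

    terms = [("whale", -2.3), ("ship", -3.1), ("ahab", -1.7)]  # illustrative values
    # itemgetter(1) keys the comparison on the log probability, like `lambda tp: tp[1]`,
    # without creating a Python-level closure for every call
    nlargest(2, terms, key=itemgetter(1))  # [("ahab", -1.7), ("whale", -2.3)]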
+ + Constructing an object of this class fits a background model. The top + method can then be used to fit document-specific models, also for unseen + documents (with the same vocabulary as the background corpus). + + Parameters + ---------- + documents : iterable over iterable over terms + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur < thresh times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log prob of terms + """ + def __init__(self, documents, w, thresh=0): + super().__init__(documents, w, thresh=thresh) + self.lambda_corpus = None + self.lambda_group = None + self.lambda_specific = None + + def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): + document_models = [ + self._document_model(doc) + for doc in document_group + ] + doc_term_frequencies = [tf for tf, _ in document_models] + group_tf, p_group = self._group_model( + doc_term_frequencies + ) + try: + old_error_settings = np.seterr(divide='ignore') + doc_term_probs = [ + np.log(tf) - np.log(np.sum(tf)) + for tf in doc_term_frequencies + ] + finally: + np.seterr(**old_error_settings) + + p_specific = self._specific_model(doc_term_probs) + + if w is None: + w = self.w + general_w = specific_w = np.log(0.5 * (1 - w)) + group_w = np.log(w) + weights_shape = len(document_group) + self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) + self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) + self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + + + def _e_step(self, p_group, p_specific): + corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) + specific_numerator = np.add.outer(self.lambda_specific, p_specific) + group_numerator = np.add.outer(self.lambda_group, p_group) + denominator = [ + logsum(np.array([sp_corpus, sp_corpus, sp_specific])) + for sp_corpus, sp_corpus, sp_specific in zip( + corpus_numerator, + specific_numerator, + group_numerator + ) + ] + return { + 'corpus': corpus_numerator - denominator, + 'specific': specific_numerator - denominator, + 'group': group_numerator - denominator + } + + + @staticmethod + def _group_model(document_term_frequencies): + group_tf = np.array(document_term_frequencies).sum(axis=0) + + try: + old_error_settings = np.seterr(divide='ignore') + p_group = np.log(group_tf) - np.log(np.sum(group_tf)) + finally: + np.seterr(**old_error_settings) + + return group_tf, p_group + + @staticmethod + def _specific_model(document_term_probabilities): + # complement events: 1 - p + complements = [ + np.log1p(-np.exp(p_doc)) + for p_doc in document_term_probabilities + ] + # probability of term to be important in one doc, and not others + complement_products = np.array([ + document_term_probabilities[i] + complement + for i, dlm in enumerate(document_term_probabilities) + for j, complement in enumerate(complements) + if i != j + ]) + + try: + old_error_settings = np.seterr(divide='ignore') + # marginalize over all documents + p_specific = ( + logsum(complement_products) + - np.log( + np.count_nonzero(complement_products > np.log(0), axis=0) + ) + ) + finally: + np.seterr(**old_error_settings) + + return p_specific From ac737a32ff0e5f428a9cb41c6a9a2fc62a2f2976 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 15:28:04 +0200 Subject: [PATCH 07/40] estimate SWLM group model with fixed initial lambdas --- weighwords/significant_words.py | 47 
++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 729b5da..40ca15d 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,4 +1,6 @@ import logging +from heapq import nlargest +from operator import itemgetter import numpy as np @@ -35,8 +37,18 @@ def __init__(self, documents, w, thresh=0): self.lambda_corpus = None self.lambda_group = None self.lambda_specific = None + self.p_group = None + self.p_specific = None + + def group_top(self, k, document_group, **kwargs): + term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) + return nlargest(k, term_probabilities.items(), itemgetter(1)) + + def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): + if w is None: + w = self.w + assert 0 < w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" - def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): document_models = [ self._document_model(doc) for doc in document_group @@ -56,8 +68,6 @@ def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): p_specific = self._specific_model(doc_term_probs) - if w is None: - w = self.w general_w = specific_w = np.log(0.5 * (1 - w)) group_w = np.log(w) weights_shape = len(document_group) @@ -65,13 +75,37 @@ def group_top(self, k, document_group, max_iter=50, eps=1e-5, w=None): self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) + self.p_specific = p_specific + + exp_p_group = np.exp(p_group) + + return {t: exp_p_group[i] for t, i in self.vocab.items()} + + def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): + try: + old_error_settings = np.seterr(divide='ignore') + log_doc_tf = np.log(doc_tf) + for i in range(1, 1 + max_iter): + expectation = self._e_step(p_group, p_specific) + new_p_group = self._m_step(expectation, log_doc_tf) + + diff = new_p_group - p_group + p_group = new_p_group + if (diff < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') + break + finally: + np.seterr(**old_error_settings) + + return p_group def _e_step(self, p_group, p_specific): corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) denominator = [ - logsum(np.array([sp_corpus, sp_corpus, sp_specific])) + logsum(np.asarray([sp_corpus, sp_corpus, sp_specific])) for sp_corpus, sp_corpus, sp_specific in zip( corpus_numerator, specific_numerator, @@ -84,6 +118,11 @@ def _e_step(self, p_group, p_specific): 'group': group_numerator - denominator } + def _m_step(self, expectation, log_doc_tf): + group_numerator = logsum(log_doc_tf + expectation['group']) + p_group = group_numerator - logsum(group_numerator) + # TODO: estimate lambdas + return p_group @staticmethod def _group_model(document_term_frequencies): From de389c98c9a04a09628f4c8a738f61ac53849a35 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 10 Apr 2019 18:13:30 +0200 Subject: [PATCH 08/40] log initial lambda values --- weighwords/significant_words.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 40ca15d..7962fb9 100644 --- 
a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -74,7 +74,10 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) - + logger.info( + f'Lambdas initialized to: Corpus={np.exp(general_w)}, ' + f'Group={w}, Specific={np.exp(specific_w)}' + ) self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific From c88bf9c2319d9b56b337f63b572eb068a55c24a7 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:01:28 +0200 Subject: [PATCH 09/40] omit NaNs from logsum --- weighwords/logsum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index 6df59cd..ed17d19 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -25,6 +25,6 @@ def logsum(x): # Use the max to normalize, as with the log this is what accumulates # the less errors vmax = x.max(axis=0) - out = np.log(np.sum(np.exp(x - vmax), axis=0)) + out = np.log(np.nansum(np.exp(x - vmax), axis=0)) out += vmax return out From fe16fe22393fe365a6b3dfc71f10c0ee108b48ea Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:07:38 +0200 Subject: [PATCH 10/40] test SWLM model fit --- tests/test_swlm.py | 184 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/test_swlm.py diff --git a/tests/test_swlm.py b/tests/test_swlm.py new file mode 100644 index 0000000..7ac0bf8 --- /dev/null +++ b/tests/test_swlm.py @@ -0,0 +1,184 @@ +import operator +from functools import reduce + +import numpy as np + +from weighwords.significant_words import SignificantWordsLM + +# data follows tests + + +def test_model_fit(): + # init an SWLM with uniform p_corpus + swlm = SignificantWordsLM([colors], w=0.1) + # deterministically generate some docs + doc_parts = np.array_split(list(zip(colors[:25], rcolors)), 5) + doc_group = [ + reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) + for z in zip(*doc_parts) + ] + # fit the modelk + term_probs = swlm.fit_parsimonious_group(doc_group) + expected_probs = { + "salmon": 0.04, + "chocolate": 0.03, + "snow": 0.02, + "tomato": 0.01, + "aqua": 0.0, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + + +colors = [ + "aliceblue", + "antiquewhite", + "aqua", + "aquamarine", + "azure", + "beige", + "bisque", + "black", + "blanchedalmond", + "blue", + "blueviolet", + "brown", + "burlywood", + "cadetblue", + "chartreuse", + "chocolate", + "coral", + "cornflowerblue", + "cornsilk", + "crimson", + "cyan", + "darkblue", + "darkcyan", + "darkgoldenrod", + "darkgray", + "darkgreen", + "darkgrey", + "darkkhaki", + "darkmagenta", + "darkolivegreen", + "darkorange", + "darkorchid", + "darkred", + "darksalmon", + "darkseagreen", + "darkslateblue", + "darkslategray", + "darkslategrey", + "darkturquoise", + "darkviolet", + "deeppink", + "deepskyblue", + "dimgray", + "dimgrey", + "dodgerblue", + "firebrick", + "floralwhite", + "forestgreen", + "fuchsia", + "gainsboro", + "ghostwhite", + "goldenrod", + "gold", + "gray", + "green", + "greenyellow", + "grey", + "honeydew", + "hotpink", + "indianred", + "indigo", + "ivory", + "khaki", + "lavenderblush", + "lavender", + "lawngreen", + 
"lemonchiffon", + "lightblue", + "lightcoral", + "lightcyan", + "lightgoldenrodyellow", + "lightgray", + "lightgreen", + "lightgrey", + "lightpink", + "lightsalmon", + "lightseagreen", + "lightskyblue", + "lightslategray", + "lightslategrey", + "lightsteelblue", + "lightyellow", + "lime", + "limegreen", + "linen", + "magenta", + "maroon", + "mediumaquamarine", + "mediumblue", + "mediumorchid", + "mediumpurple", + "mediumseagreen", + "mediumslateblue", + "mediumspringgreen", + "mediumturquoise", + "mediumvioletred", + "midnightblue", + "mintcream", + "mistyrose", + "moccasin", + "navajowhite", + "navy", + "oldlace", + "olive", + "olivedrab", + "orange", + "orangered", + "orchid", + "palegoldenrod", + "palegreen", + "paleturquoise", + "palevioletred", + "papayawhip", + "peachpuff", + "peru", + "pink", + "plum", + "powderblue", + "purple", + "rebeccapurple", + "red", + "rosybrown", + "royalblue", + "saddlebrown", + "salmon", + "sandybrown", + "seagreen", + "seashell", + "sienna", + "silver", + "skyblue", + "slateblue", + "slategray", + "slategrey", + "snow", + "springgreen", + "steelblue", + "tan", + "teal", + "thistle", + "tomato", + "turquoise", + "violet", + "wheat", + "white", + "whitesmoke", + "yellow", + "yellowgreen", +] +rcolors = reversed(colors) From acc074c1986bd138bde3d6904450f35f3d1f958d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:09:49 +0200 Subject: [PATCH 11/40] add a method which pairs terms with their probabilities --- weighwords/significant_words.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 7962fb9..cdc8678 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -81,9 +81,11 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific - exp_p_group = np.exp(p_group) + return self.get_term_probabilities(self.p_group) - return {t: exp_p_group[i] for t, i in self.vocab.items()} + def get_term_probabilities(self, log_prob_distribution): + probabilities = np.exp(log_prob_distribution) + return {t: probabilities[i] for t, i in self.vocab.items()} def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): try: From 9a6757db5cfdb85fbd39686d1bb5094538d66362 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:10:26 +0200 Subject: [PATCH 12/40] prevent NaNs from causing downstream errors --- weighwords/significant_words.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index cdc8678..ec7582e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -122,6 +122,11 @@ def _e_step(self, p_group, p_specific): 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } + # prevent NaNs from causing downstream errors + for v in out.values(): + v[np.isnan(v)] = np.NINF + + return out def _m_step(self, expectation, log_doc_tf): group_numerator = logsum(log_doc_tf + expectation['group']) @@ -162,9 +167,11 @@ def _specific_model(document_term_probabilities): p_specific = ( logsum(complement_products) - np.log( - np.count_nonzero(complement_products > np.log(0), axis=0) + np.count_nonzero(complement_products > np.NINF, axis=0) ) ) + # prevent NaNs from causing downstream errors + p_specific[np.isnan(p_specific)] = np.NINF finally: 
np.seterr(**old_error_settings) From 159b3359a5339e7c7d73fbf6abe0aca96e391b6e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 13 Apr 2019 18:11:54 +0200 Subject: [PATCH 13/40] [WIP] remove the corpus and specific layers from E (unused) --- weighwords/significant_words.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index ec7582e..f86c10a 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -117,9 +117,9 @@ def _e_step(self, p_group, p_specific): group_numerator ) ] - return { - 'corpus': corpus_numerator - denominator, - 'specific': specific_numerator - denominator, + out = { + # 'corpus': corpus_numerator - denominator, + # 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } # prevent NaNs from causing downstream errors From fe1e4ac0621f284c7b2f22113d9429fc0ca1bccb Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Thu, 18 Apr 2019 21:53:38 +0200 Subject: [PATCH 14/40] omit NaNs when testing for convergence; (these NaNs are caused by `-inf - -inf`) --- weighwords/parsimonious.py | 7 +++---- weighwords/significant_words.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index bc1d71b..b1ae93f 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -144,7 +144,7 @@ def _EM(self, tf, p_term, w, max_iter, eps): A posteriori term probabilities. """ - logger.info('EM with max_iter=%d, eps=%g' % (max_iter, eps)) + logger.info(f'EM with max_iter={max_iter}, eps={eps}') if w is None: w = self.w @@ -167,9 +167,8 @@ def _EM(self, tf, p_term, w, max_iter, eps): diff = new_p_term - p_term p_term = new_p_term - if (diff < eps).all(): - logger.info('EM: convergence reached after %d iterations' - % i) + if (diff[np.isfinite(diff)] < eps).all(): + logger.info(f'EM: convergence reached after {i} iterations') break finally: np.seterr(**old_error_settings) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index f86c10a..c7b2876 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -97,7 +97,7 @@ def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): diff = new_p_group - p_group p_group = new_p_group - if (diff < eps).all(): + if (diff[np.isfinite(diff)] < eps).all(): logger.info(f'EM: convergence reached after {i} iterations') break finally: From c3ce35bdba40f448821bf9d4c2b61cc2ed5a43fd Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 00:43:55 +0200 Subject: [PATCH 15/40] estimate lambdas unless they are fixed --- weighwords/significant_words.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index c7b2876..0dd12d2 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -39,16 +39,19 @@ def __init__(self, documents, w, thresh=0): self.lambda_specific = None self.p_group = None self.p_specific = None + self.fix_lambdas = False def group_top(self, k, document_group, **kwargs): term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) - def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None): + def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, fix_lambdas=False): if w is None: w = self.w assert 0 
< w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" + self.fix_lambdas = fix_lambdas + document_models = [ self._document_model(doc) for doc in document_group @@ -118,8 +121,8 @@ def _e_step(self, p_group, p_specific): ) ] out = { - # 'corpus': corpus_numerator - denominator, - # 'specific': specific_numerator - denominator, + 'corpus': corpus_numerator - denominator, + 'specific': specific_numerator - denominator, 'group': group_numerator - denominator } # prevent NaNs from causing downstream errors @@ -129,9 +132,20 @@ def _e_step(self, p_group, p_specific): return out def _m_step(self, expectation, log_doc_tf): - group_numerator = logsum(log_doc_tf + expectation['group']) + term_weighted_group = log_doc_tf + expectation['group'] + group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) - # TODO: estimate lambdas + + if self.fix_lambdas is False: + # estimate lambdas + corpus_numerator = logsum(np.transpose(log_doc_tf + expectation['corpus'])) + specific_numerator = logsum(np.transpose(log_doc_tf + expectation['specific'])) + group_numerator = logsum(np.transpose(term_weighted_group)) + denominator = logsum(np.asarray([corpus_numerator, specific_numerator, group_numerator])) + self.lambda_corpus = corpus_numerator - denominator + self.lambda_specific = specific_numerator - denominator + self.lambda_group = group_numerator - denominator + return p_group @staticmethod From bf2a03123faf284114376674141f68d81cddb01e Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 00:45:31 +0200 Subject: [PATCH 16/40] never take NaN to be the true max --- weighwords/logsum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index ed17d19..ff5f0f5 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -24,7 +24,7 @@ def logsum(x): """ # Use the max to normalize, as with the log this is what accumulates # the less errors - vmax = x.max(axis=0) + vmax = np.nanmax(x, axis=0) out = np.log(np.nansum(np.exp(x - vmax), axis=0)) out += vmax return out From 45fec030389ec0a60a8460bbb3abe5dd2653bf8a Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 01:21:58 +0200 Subject: [PATCH 17/40] log final lambdas (mean over documents) --- weighwords/significant_words.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 0dd12d2..e3ea284 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -84,6 +84,12 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) self.p_specific = p_specific + if self.fix_lambdas is False: + logger.info( + f'Final lambdas (mean): Corpus={np.mean(np.exp(self.lambda_corpus))}, ' + f'Group={np.mean(np.exp(self.lambda_group))}, ' + f'Specific={np.mean(np.exp(self.lambda_specific))}' + ) return self.get_term_probabilities(self.p_group) def get_term_probabilities(self, log_prob_distribution): From 7e193a813eb0754dc950afb4fd47bba185c9ef37 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 02:15:35 +0200 Subject: [PATCH 18/40] initialize corpus with higher weight than specific --- weighwords/significant_words.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index e3ea284..6702e45 100644 --- a/weighwords/significant_words.py +++ 
b/weighwords/significant_words.py @@ -71,7 +71,9 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, p_specific = self._specific_model(doc_term_probs) - general_w = specific_w = np.log(0.5 * (1 - w)) + # FIXME: magic constants + general_w = np.log(0.8 * (1 - w)) + specific_w = np.log(0.2 * (1 - w)) group_w = np.log(w) weights_shape = len(document_group) self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) From 3061327faa7248e04180c11e8078452a4b0b801d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 19 Apr 2019 02:19:33 +0200 Subject: [PATCH 19/40] update tests --- tests/test_swlm.py | 48 +++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 7ac0bf8..9c3d305 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -1,24 +1,31 @@ +import logging import operator from functools import reduce import numpy as np +import pytest from weighwords.significant_words import SignificantWordsLM -# data follows tests +logging.basicConfig(level=logging.INFO) -def test_model_fit(): - # init an SWLM with uniform p_corpus - swlm = SignificantWordsLM([colors], w=0.1) - # deterministically generate some docs - doc_parts = np.array_split(list(zip(colors[:25], rcolors)), 5) - doc_group = [ - reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) - for z in zip(*doc_parts) - ] - # fit the modelk - term_probs = swlm.fit_parsimonious_group(doc_group) +def test_model_fit_fixed(swlm, doc_group): + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True) + expected_probs = { + "salmon": 0.04, + "chocolate": 0.03, + "snow": 0.02, + "tomato": 0.01, + "aqua": 0.0, + } + for term, p in expected_probs.items(): + diff = abs(term_probs[term] - p) + assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + + +def test_model_fit_shifty(swlm, doc_group): + term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False) expected_probs = { "salmon": 0.04, "chocolate": 0.03, @@ -31,6 +38,22 @@ def test_model_fit(): assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" +@pytest.fixture(scope="module") +def swlm(): + # init an SWLM with uniform p_corpus + return SignificantWordsLM([colors], w=0.1) + + +@pytest.fixture(scope="module") +def doc_group(): + # deterministically generate some docs + doc_parts = np.array_split(list(zip(colors[:25], reversed(colors))), 5) + return [ + reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) + for z in zip(*doc_parts) + ] + + colors = [ "aliceblue", "antiquewhite", @@ -181,4 +204,3 @@ def test_model_fit(): "yellow", "yellowgreen", ] -rcolors = reversed(colors) From 993595de8dc042541b6a1f5a15448eec9dd3b1b6 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 10 May 2019 22:21:35 +0200 Subject: [PATCH 20/40] move PLM parameters to `__init__` docstring --- weighwords/parsimonious.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index b1ae93f..b9d85f7 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -22,24 +22,29 @@ class ParsimoniousLM: Constructing an object of this class fits a background model. The top method can then be used to fit document-specific models, also for unseen documents (with the same vocabulary as the background corpus). 
- - Parameters - ---------- - documents : iterable over iterable over terms - w : float - Weight of document model (1 - weight of corpus model) - thresh : int - Don't include words that occur < thresh times - - Attributes - ---------- - vocab : dict of term -> int - Mapping of terms to numeric indices - p_corpus : array of float - Log prob of terms """ def __init__(self, documents, w, thresh=0): + """ + Collect the vocabulary and fit the background model. + + Parameters + ---------- + documents : iterable over iterable over terms + All documents that should be included in the corpus model + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur fewer than `thresh` times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log probability of terms in background model (indexed by `vocab`) + + """ logger.info('Building corpus model') self.w = w @@ -66,7 +71,7 @@ def __init__(self, documents, w, thresh=0): np.seterr(**old_error_settings) def top(self, k, d, max_iter=50, eps=1e-5, w=None): - """Get the top k terms of a document d and their log probabilities. + """Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. From b056279d5724712964fcefb962d41c57b4511924 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 11 May 2019 16:04:39 +0200 Subject: [PATCH 21/40] don't count terms with a cf < thresh; ignore out-of-vocabulary terms: this prevents errors but does not "handle unseen words" as expressed in #1 --- setup.py | 2 ++ tests/test_plm.py | 38 ++++++++++++++++++++++++++++++++++++++ weighwords/parsimonious.py | 7 ++++++- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/test_plm.py diff --git a/setup.py b/setup.py index 601f5da..7826ff2 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,11 @@ license = "LGPL", packages = ["weighwords"], install_requires = ["numpy>=1.4.0"], + tests_require = ["pytest"], classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", + "Programming Language :: Python :: 3.6", "Topic :: Text Processing", ] ) diff --git a/tests/test_plm.py b/tests/test_plm.py new file mode 100644 index 0000000..14c5671 --- /dev/null +++ b/tests/test_plm.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +from weighwords import ParsimoniousLM + + +def test_document_model(number_corpus, uniform_doc): + plm = ParsimoniousLM([number_corpus], w=0.1, thresh=3) + tf, p_term = plm._document_model(uniform_doc) + assert (tf[:2] == 0).all(), \ + "Terms with a corpus frequency < thresh should not be counted" + assert tf.sum() == 3, f"Expected tf.sum() to be 3, got {tf.sum()} instead" + linear_p_term = np.exp(p_term) + assert (linear_p_term[2:].sum() - 1) < 1e-10, \ + f"All probability mass should be on the last 3 terms, got {linear_p_term} instead" + + +def test_document_model_out_of_vocabulary(number_corpus): + plm = ParsimoniousLM([number_corpus], w=0.1) + doc = ['two', 'or', 'three', 'unseen', 'words'] + tf, p_term = plm._document_model(doc) + assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead" + + +@pytest.fixture(scope="module") +def uniform_doc(): + return ['one', 'two', 'three', 'four', 'five'] + + +@pytest.fixture(scope="module") +def number_corpus(): + return [ + 'one', + 'two', 'two', + 'three', 'three', 'three', + 'four', 'four', 'four', 
'four', + 'five', 'five', 'five', 'five', 'five' + ] diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index b9d85f7..3b3bb28 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -119,7 +119,12 @@ def _document_model(self, d): tf = np.zeros(len(self.vocab), dtype=np.float) # Term frequency for tok in d: - tf[self.vocab[tok]] += 1 + term_id = self.vocab.get(tok) + if term_id: + tf[term_id] += 1 + + # ignore counts of terms with zero corpus probability + tf *= np.isfinite(self.p_corpus) n_distinct = (tf > 0).sum() From f889815a37eec23c040391808ff5ff54b9edd238 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 14 May 2019 22:57:27 +0200 Subject: [PATCH 22/40] give dtypes a more explicit name --- weighwords/parsimonious.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 3b3bb28..6a4c0cc 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -56,7 +56,7 @@ def __init__(self, documents, w, thresh=0): i = vocab.setdefault(tok, len(vocab)) count[i] += 1 - cf = np.empty(len(count), dtype=np.float) + cf = np.empty(len(count), dtype=np.double) for i, f in count.items(): cf[i] = f rare = (cf < thresh) @@ -116,7 +116,7 @@ def _document_model(self, d): logger.info('Gathering term probabilities') - tf = np.zeros(len(self.vocab), dtype=np.float) # Term frequency + tf = np.zeros(len(self.vocab), dtype=np.double) # Term frequency for tok in d: term_id = self.vocab.get(tok) From 99832cb77402d39aa0d84cc7935ed171948203ff Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 14 May 2019 23:01:46 +0200 Subject: [PATCH 23/40] extend swlm model and add arguments --- tests/test_swlm.py | 2 +- weighwords/significant_words.py | 73 +++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 9c3d305..8c8ee3b 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -41,7 +41,7 @@ def test_model_fit_shifty(swlm, doc_group): @pytest.fixture(scope="module") def swlm(): # init an SWLM with uniform p_corpus - return SignificantWordsLM([colors], w=0.1) + return SignificantWordsLM([colors], lambdas=(0.7, 0.1, 0.2)) @pytest.fixture(scope="module") diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 6702e45..94d1f8e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -32,8 +32,9 @@ class SignificantWordsLM(ParsimoniousLM): p_corpus : array of float Log prob of terms """ - def __init__(self, documents, w, thresh=0): - super().__init__(documents, w, thresh=thresh) + def __init__(self, documents, lambdas, thresh=0): + self.initial_lambdas = self.normalize_lambdas(lambdas) + super().__init__(documents, self.initial_lambdas[1], thresh=thresh) self.lambda_corpus = None self.lambda_group = None self.lambda_specific = None @@ -45,10 +46,14 @@ def group_top(self, k, document_group, **kwargs): term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) - def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, fix_lambdas=False): - if w is None: - w = self.w - assert 0 < w < 1, f"invalid w={w}; `w` needs a value between 0.0 and 1.0" + def fit_parsimonious_group( + self, document_group, max_iter=50, eps=1e-5, lambdas=None, + fix_lambdas=False, parsimonize_specific=False, post_parsimonize=False + ): + if lambdas is None: + lambdas = 
self.initial_lambdas + else: + lambdas = self.normalize_lambdas(lambdas) self.fix_lambdas = fix_lambdas @@ -56,6 +61,8 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, self._document_model(doc) for doc in document_group ] + del document_group + doc_term_frequencies = [tf for tf, _ in document_models] group_tf, p_group = self._group_model( doc_term_frequencies @@ -70,21 +77,25 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, np.seterr(**old_error_settings) p_specific = self._specific_model(doc_term_probs) + if parsimonize_specific: + p_specific = self._EM(group_tf, p_specific, self.w, max_iter, eps) + + self.p_specific = p_specific - # FIXME: magic constants - general_w = np.log(0.8 * (1 - w)) - specific_w = np.log(0.2 * (1 - w)) - group_w = np.log(w) - weights_shape = len(document_group) - self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.float) - self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.float) - self.lambda_group = np.full(weights_shape, group_w, dtype=np.float) + weights_shape = len(document_models) + general_w, group_w, specific_w = np.log(lambdas) + self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double) + self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) + self.lambda_group = np.full(weights_shape, group_w, dtype=np.double) logger.info( - f'Lambdas initialized to: Corpus={np.exp(general_w)}, ' - f'Group={w}, Specific={np.exp(specific_w)}' + f'Lambdas initialized to: Corpus={lambdas[0]}, ' + f'Group={lambdas[1]}, Specific={lambdas[2]}' ) - self.p_group = self._estimate(p_group, p_specific, doc_term_frequencies, max_iter, eps) - self.p_specific = p_specific + self.p_group = self._estimate( + p_group, p_specific, doc_term_frequencies, max_iter, eps + ) + if post_parsimonize: + self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps) if self.fix_lambdas is False: logger.info( @@ -96,6 +107,7 @@ def fit_parsimonious_group(self, document_group, max_iter=50, eps=1e-5, w=None, def get_term_probabilities(self, log_prob_distribution): probabilities = np.exp(log_prob_distribution) + probabilities[np.isnan(probabilities)] = 0. 
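+        # the NaNs zeroed above stem from `-inf - -inf` subtractions in the log domain (see PATCH 14); such terms get probability zero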
return {t: probabilities[i] for t, i in self.vocab.items()} def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): @@ -146,10 +158,16 @@ def _m_step(self, expectation, log_doc_tf): if self.fix_lambdas is False: # estimate lambdas - corpus_numerator = logsum(np.transpose(log_doc_tf + expectation['corpus'])) - specific_numerator = logsum(np.transpose(log_doc_tf + expectation['specific'])) + corpus_numerator = logsum( + np.transpose(log_doc_tf + expectation['corpus']) + ) + specific_numerator = logsum( + np.transpose(log_doc_tf + expectation['specific']) + ) group_numerator = logsum(np.transpose(term_weighted_group)) - denominator = logsum(np.asarray([corpus_numerator, specific_numerator, group_numerator])) + denominator = logsum( + np.asarray([corpus_numerator, specific_numerator, group_numerator]) + ) self.lambda_corpus = corpus_numerator - denominator self.lambda_specific = specific_numerator - denominator self.lambda_group = group_numerator - denominator @@ -177,7 +195,7 @@ def _specific_model(document_term_probabilities): ] # probability of term to be important in one doc, and not others complement_products = np.array([ - document_term_probabilities[i] + complement + dlm + complement for i, dlm in enumerate(document_term_probabilities) for j, complement in enumerate(complements) if i != j @@ -198,3 +216,14 @@ def _specific_model(document_term_probabilities): np.seterr(**old_error_settings) return p_specific + + @staticmethod + def normalize_lambdas(lambdas): + assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' + lambda_sum = sum(lambdas) + if abs(lambda_sum - 1) > 1e-10: + lambdas = tuple( + w / lambda_sum + for w in lambdas + ) + return lambdas From 019228a80a97d6a83ee077c3247c244092c06dc3 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 15 May 2019 18:39:45 +0200 Subject: [PATCH 24/40] type annotations for PLM (PEP 585-ready) --- setup.py | 5 ++- weighwords/parsimonious.py | 83 +++++++++++++++++++++++++------------- weighwords/py.typed | 0 3 files changed, 58 insertions(+), 30 deletions(-) create mode 100644 weighwords/py.typed diff --git a/setup.py b/setup.py index 7826ff2..3552010 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,14 @@ description = "Python library for creating word weights/word clouds from text", keywords = "word cloud nlp language model", license = "LGPL", + package_data = {"weighwords": ["py.typed"]}, packages = ["weighwords"], - install_requires = ["numpy>=1.4.0"], + install_requires = ["numpy>=1.15.0"], tests_require = ["pytest"], classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Text Processing", ] ) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 6a4c0cc..a6709e1 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -1,12 +1,15 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2011-2013 University of Amsterdam +# Copyright 2011-2019 University of Amsterdam # Author: Lars Buitinck +from __future__ import annotations + from collections import defaultdict from heapq import nlargest import logging from operator import itemgetter +from typing import Iterable, Optional import numpy as np @@ -22,34 +25,39 @@ class ParsimoniousLM: Constructing an object of this class fits a background model. 
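The `normalize_lambdas` helper introduced in the patch above rescales the three mixture weights so they sum to one; a quick illustration with arbitrarily chosen weights:

    SignificantWordsLM.normalize_lambdas((7, 1, 2))           # -> (0.7, 0.1, 0.2)
    SignificantWordsLM.normalize_lambdas((0.9, 0.01, 0.09))   # already sums to 1, returned unchanged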
The top method can then be used to fit document-specific models, also for unseen documents (with the same vocabulary as the background corpus). - """ - def __init__(self, documents, w, thresh=0): - """ - Collect the vocabulary and fit the background model. - - Parameters - ---------- - documents : iterable over iterable over terms - All documents that should be included in the corpus model - w : float - Weight of document model (1 - weight of corpus model) - thresh : int - Don't include words that occur fewer than `thresh` times - - Attributes - ---------- - vocab : dict of term -> int - Mapping of terms to numeric indices - p_corpus : array of float - Log probability of terms in background model (indexed by `vocab`) + Parameters + ---------- + documents : iterable over iterable over terms + All documents that should be included in the corpus model + w : float + Weight of document model (1 - weight of corpus model) + thresh : int + Don't include words that occur fewer than `thresh` times + + Attributes + ---------- + vocab : dict of term -> int + Mapping of terms to numeric indices + p_corpus : array of float + Log probability of terms in background model (indexed by `vocab`) + """ - """ + def __init__( + self, + documents: Iterable[Iterable[str]], + w: float, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" logger.info('Building corpus model') self.w = w - self.vocab = vocab = {} # Vocabulary: maps terms to numeric indices - count = defaultdict(int) # Corpus frequency + # Vocabulary: maps terms to numeric indices + vocab: dict[str, int] + self.vocab = vocab = {} + # Corpus frequency + count: dict[int, int] = defaultdict(int) for d in documents: for tok in d: @@ -70,7 +78,14 @@ def __init__(self, documents, w, thresh=0): finally: np.seterr(**old_error_settings) - def top(self, k, d, max_iter=50, eps=1e-5, w=None): + def top( + self, + k: int, + d: Iterable[str], + max_iter: int = 50, + eps: float = 1e-5, + w: Optional[float] = None + ) -> list[tuple[str, float]]: """Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term @@ -78,6 +93,10 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Parameters ---------- + k + Number of top terms to return + d + Terms that make up the document max_iter : int, optional Maximum number of iterations of EM algorithm to run. eps : float, optional @@ -88,6 +107,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): Returns ------- t_p : list of (str, float) + Terms and their log-probabilities in the parsimonious model """ tf, p_term = self._document_model(d) @@ -96,7 +116,7 @@ def top(self, k, d, max_iter=50, eps=1e-5, w=None): terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, itemgetter(1)) - def _document_model(self, d): + def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: """Build document model. Parameters @@ -136,7 +156,14 @@ def _document_model(self, d): return tf, p_term - def _EM(self, tf, p_term, w, max_iter, eps): + def _EM( + self, + tf: Iterable[int], + p_term: Iterable[float], + w: Optional[float], + max_iter: int, + eps: float + ) -> np.ndarray: """Expectation maximization. 
Parameters diff --git a/weighwords/py.typed b/weighwords/py.typed new file mode 100644 index 0000000..e69de29 From d41a79a8b49017570f5ca342b50f979e36ce9ce8 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Wed, 15 May 2019 19:36:42 +0200 Subject: [PATCH 25/40] type annotations for logsum; format some docstrings --- weighwords/logsum.py | 2 +- weighwords/parsimonious.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/weighwords/logsum.py b/weighwords/logsum.py index ff5f0f5..350bd26 100644 --- a/weighwords/logsum.py +++ b/weighwords/logsum.py @@ -6,7 +6,7 @@ import numpy as np -def logsum(x): +def logsum(x: np.ndarray) -> np.ndarray: """Computes the sum of x assuming x is in the log domain. Returns log(sum(exp(x))) while minimizing the possibility of diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index a6709e1..5e174f8 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -29,11 +29,11 @@ class ParsimoniousLM: Parameters ---------- documents : iterable over iterable over terms - All documents that should be included in the corpus model + All documents that should be included in the corpus model. w : float - Weight of document model (1 - weight of corpus model) + Weight of document model (1 - weight of corpus model). thresh : int - Don't include words that occur fewer than `thresh` times + Don't include words that occur fewer than `thresh` times. Attributes ---------- @@ -94,13 +94,13 @@ def top( Parameters ---------- k - Number of top terms to return + Number of top terms to return. d - Terms that make up the document + Terms that make up the document. max_iter : int, optional Maximum number of iterations of EM algorithm to run. eps : float, optional - Convergence threshold for EM algorithm. + Epsilon: convergence threshold for EM algorithm. w : float, optional Weight of document model; overrides value given to __init__ @@ -174,6 +174,8 @@ def _EM( Term probabilities, as returned by document_model max_iter : int Number of iterations to run. + eps : float + Epsilon: convergence threshold for EM algorithm. Returns ------- From 5c243f9dabb68202370dc29ba59e275557475dee Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Thu, 16 May 2019 01:25:46 +0200 Subject: [PATCH 26/40] add type annotations and SWLM class docstring; switched to backwards-compatible annotations, because while python may be PEP 585-ready, mypy does not deal with builtin generics yet --- weighwords/parsimonious.py | 57 +++++++------- weighwords/significant_words.py | 131 ++++++++++++++++++++++++++------ 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index 5e174f8..eed0a30 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -4,12 +4,13 @@ # Author: Lars Buitinck from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized from collections import defaultdict from heapq import nlargest import logging from operator import itemgetter -from typing import Iterable, Optional +from typing import Iterable, Optional, Dict, List, Tuple import numpy as np @@ -20,7 +21,8 @@ class ParsimoniousLM: - """Language model for a set of documents. + """ + Language model for a set of documents. Constructing an object of this class fits a background model. 
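Since `logsum` (annotated in PATCH 25 above) underlies all of the EM updates, a quick numeric check of what it computes; the NaN-tolerant `nanmax`/`nansum` variants come from patches 09 and 16:

    import numpy as np
    from weighwords.logsum import logsum

    x = np.log(np.array([[0.1, 0.2],
                         [0.3, 0.4]]))
    np.exp(logsum(x))  # array([0.4, 0.6]): sums over axis 0, computed stably in log space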
The top method can then be used to fit document-specific models, also for unseen @@ -44,20 +46,20 @@ class ParsimoniousLM: """ def __init__( - self, - documents: Iterable[Iterable[str]], - w: float, - thresh: int = 0 + self, + documents: Iterable[Iterable[str]], + w: np.floating, + thresh: int = 0 ): """Collect the vocabulary and fit the background model.""" logger.info('Building corpus model') self.w = w # Vocabulary: maps terms to numeric indices - vocab: dict[str, int] + vocab: Dict[str, int] self.vocab = vocab = {} # Corpus frequency - count: dict[int, int] = defaultdict(int) + count: Dict[int, int] = defaultdict(int) for d in documents: for tok in d: @@ -74,19 +76,20 @@ def __init__( old_error_settings = np.seterr(divide='ignore') # lg P(t|C) - self.p_corpus = np.log(cf) - np.log(np.sum(cf)) + self.p_corpus: np.ndarray = np.log(cf) - np.log(np.sum(cf)) finally: np.seterr(**old_error_settings) def top( - self, - k: int, - d: Iterable[str], - max_iter: int = 50, - eps: float = 1e-5, - w: Optional[float] = None - ) -> list[tuple[str, float]]: - """Get the top `k` terms of a document `d` and their log probabilities. + self, + k: int, + d: Iterable[str], + max_iter: int = 50, + eps: float = 1e-5, + w: Optional[np.floating] = None + ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a document `d` and their log probabilities. Uses the Expectation Maximization (EM) algorithm to estimate term probabilities. @@ -116,8 +119,9 @@ def top( terms = [(t, p_term[i]) for t, i in self.vocab.items()] return nlargest(k, terms, itemgetter(1)) - def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: - """Build document model. + def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: + """ + Build document model. Parameters ---------- @@ -157,14 +161,15 @@ def _document_model(self, d: Iterable[str]) -> tuple[np.ndarray, np.ndarray]: return tf, p_term def _EM( - self, - tf: Iterable[int], - p_term: Iterable[float], - w: Optional[float], - max_iter: int, - eps: float + self, + tf: np.ndarray, + p_term: np.ndarray, + w: Optional[np.floating], + max_iter: int, + eps: float ) -> np.ndarray: - """Expectation maximization. + """ + Expectation maximization. Parameters ---------- diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 94d1f8e..ed2f0e7 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -1,6 +1,15 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + import logging from heapq import nlargest from operator import itemgetter +from typing import Iterable, Optional, Sequence, Tuple, List, Dict import numpy as np @@ -9,47 +18,97 @@ logger = logging.getLogger(__name__) +InitialLambdas = Tuple[np.floating, np.floating, np.floating] + class SignificantWordsLM(ParsimoniousLM): - """Language model for a set of documents. + """ + Language model that consists of three sub-models: - Constructing an object of this class fits a background model. The top - method can then be used to fit document-specific models, also for unseen - documents (with the same vocabulary as the background corpus). 
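Each document in the group is modelled as a three-way mixture of these sub-models; the E-step attributes every term occurrence to one of them, and the M-step re-estimates the group model (and, unless `fix_lambdas` is set, the mixture weights) from those attributions. A rough single-document, linear-space sketch of one round, not the library code, which works in log space via `logsum`:

    import numpy as np

    def em_round(tf, p_corpus, p_group, p_specific, lambdas):
        """One EM round for a single document; a linear-space sketch only."""
        lam_c, lam_g, lam_s = lambdas
        # E-step: posterior probability that a term occurrence came from the group model
        numerator = lam_g * p_group
        denominator = lam_c * p_corpus + numerator + lam_s * p_specific
        resp_group = np.divide(numerator, denominator,
                               out=np.zeros_like(numerator), where=denominator > 0)
        # M-step: re-estimate the group model from the expected counts
        expected_counts = tf * resp_group
        return expected_counts / expected_counts.sum()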
+ - Corpus model: represents term probabilities in a (large) background collection; + - Group model: parsimonious term probabilities in a group of documents; + - Specific model: represents the same group, but is biased towards terms that + occur with a high frequency in single docs, and a low frequency in others. Parameters ---------- documents : iterable over iterable over terms - w : float - Weight of document model (1 - weight of corpus model) + All documents that should be included in the corpus model. + lambdas : 3-tuple of floats + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. thresh : int - Don't include words that occur < thresh times + Don't include words that occur fewer than `thresh` times. Attributes ---------- vocab : dict of term -> int Mapping of terms to numeric indices p_corpus : array of float - Log prob of terms + Log probability of terms in background model (indexed by `vocab`) + p_group : array of float + Log probability of terms in background model (indexed by `vocab`) + p_specific : array of float + Log probability of terms in background model (indexed by `vocab`) + lambda_corpus : array of float + Log probability (weight) of corpus model for documents + lambda_group : array of float + Log probability (weight) of group model for documents + lambda_specific : array of float + Log probability (weight) of specific model for documents + + Methods + ------- + fit_parsimonious_group(document_group, ...) + Estimates a document group model, parsimonized against the corpus + and specific models. The documents may be unseen, but terms that + are not in the vocabulary will be ignored. + group_top(k, document_group, ...) + Shortcut to fit the group model and retrieve the top `k` terms. + get_term_probabilities(log_prob_distribution) + Aligns a term distribution with the vocabulary, and transforms + the term log probabilities to linear probabilities. 
+ + See Also + -------- + parsimonious.ParsimoniousLM : one-sided parsimonious model """ - def __init__(self, documents, lambdas, thresh=0): + + def __init__( + self, + documents: Iterable[Iterable[str]], + lambdas: InitialLambdas, + thresh: int = 0 + ): + """Collect the vocabulary and fit the background model.""" self.initial_lambdas = self.normalize_lambdas(lambdas) super().__init__(documents, self.initial_lambdas[1], thresh=thresh) - self.lambda_corpus = None - self.lambda_group = None - self.lambda_specific = None - self.p_group = None - self.p_specific = None + self.lambda_corpus: Optional[np.ndarray] = None + self.lambda_group: Optional[np.ndarray] = None + self.lambda_specific: Optional[np.ndarray] = None + self.p_group: Optional[np.ndarray] = None + self.p_specific: Optional[np.ndarray] = None self.fix_lambdas = False - def group_top(self, k, document_group, **kwargs): + def group_top( + self, + k: int, + document_group: Iterable[Iterable[str]], + **kwargs + ) -> List[Tuple[str, float]]: term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) def fit_parsimonious_group( - self, document_group, max_iter=50, eps=1e-5, lambdas=None, - fix_lambdas=False, parsimonize_specific=False, post_parsimonize=False - ): + self, + document_group: Iterable[Iterable[str]], + max_iter: int = 50, + eps: float = 1e-5, + lambdas: Optional[InitialLambdas] = None, + fix_lambdas: bool = False, + parsimonize_specific: bool = False, + post_parsimonize: bool = False + ) -> Dict[str, float]: if lambdas is None: lambdas = self.initial_lambdas else: @@ -105,12 +164,22 @@ def fit_parsimonious_group( ) return self.get_term_probabilities(self.p_group) - def get_term_probabilities(self, log_prob_distribution): + def get_term_probabilities( + self, + log_prob_distribution: np.ndarray + ) -> Dict[str, float]: probabilities = np.exp(log_prob_distribution) probabilities[np.isnan(probabilities)] = 0. 
return {t: probabilities[i] for t, i in self.vocab.items()} - def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): + def _estimate( + self, + p_group: np.ndarray, + p_specific: np.ndarray, + doc_tf: Sequence[np.ndarray], + max_iter: int, + eps: float + ) -> np.ndarray: try: old_error_settings = np.seterr(divide='ignore') log_doc_tf = np.log(doc_tf) @@ -128,7 +197,11 @@ def _estimate(self, p_group, p_specific, doc_tf, max_iter, eps): return p_group - def _e_step(self, p_group, p_specific): + def _e_step( + self, + p_group: np.ndarray, + p_specific: np.ndarray + ) -> Dict[str, np.ndarray]: corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) @@ -151,7 +224,11 @@ def _e_step(self, p_group, p_specific): return out - def _m_step(self, expectation, log_doc_tf): + def _m_step( + self, + expectation: Dict[str, np.ndarray], + log_doc_tf: Sequence[np.ndarray] + ) -> np.ndarray: term_weighted_group = log_doc_tf + expectation['group'] group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) @@ -175,7 +252,9 @@ def _m_step(self, expectation, log_doc_tf): return p_group @staticmethod - def _group_model(document_term_frequencies): + def _group_model( + document_term_frequencies: Sequence[np.ndarray] + ) -> Tuple[np.ndarray, np.ndarray]: group_tf = np.array(document_term_frequencies).sum(axis=0) try: @@ -187,7 +266,9 @@ def _group_model(document_term_frequencies): return group_tf, p_group @staticmethod - def _specific_model(document_term_probabilities): + def _specific_model( + document_term_probabilities: Sequence[np.ndarray] + ) -> np.ndarray: # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) @@ -218,7 +299,7 @@ def _specific_model(document_term_probabilities): return p_specific @staticmethod - def normalize_lambdas(lambdas): + def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' lambda_sum = sum(lambdas) if abs(lambda_sum - 1) > 1e-10: From 9a6e14b1313430d0521489717031340fb86e7df9 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 15:12:13 +0200 Subject: [PATCH 27/40] cast lambdas to prevent mypy naggery --- weighwords/significant_words.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index ed2f0e7..17c72b5 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -9,7 +9,7 @@ import logging from heapq import nlargest from operator import itemgetter -from typing import Iterable, Optional, Sequence, Tuple, List, Dict +from typing import Iterable, Optional, Sequence, Tuple, List, Dict, cast import numpy as np @@ -303,8 +303,11 @@ def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' lambda_sum = sum(lambdas) if abs(lambda_sum - 1) > 1e-10: - lambdas = tuple( - w / lambda_sum - for w in lambdas + lambdas = cast( + InitialLambdas, + tuple( + w / lambda_sum + for w in lambdas + ) ) return lambdas From 5bfc67b37541ed397a52d784d14cdca1fef42459 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 17:31:30 +0200 Subject: [PATCH 28/40] added docstrings and reference for SWLM --- README.rst | 9 ++- weighwords/parsimonious.py | 17 ++++-- weighwords/significant_words.py | 105 
++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 8fc2365..96fd7cc 100644 --- a/README.rst +++ b/README.rst @@ -26,11 +26,16 @@ or:: References ---------- -D. Hiemstra, S. Robertson and H. Zaragoza (2004). `Parsimonious Language Models +D. Hiemstra, S. Robertson, and H. Zaragoza (2004). `Parsimonious Language Models for Information Retrieval `_. Proc. SIGIR'04. -R. Kaptein, D. Hiemstra and J. Kamps (2010). `How different are Language Models +R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models and word clouds? `_ Proc. ECIR. + +M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). +`Luhn Revisited: Significant Words Language Models +`_ +Proc. CKIM'16. diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index eed0a30..d8ce657 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -30,7 +30,7 @@ class ParsimoniousLM: Parameters ---------- - documents : iterable over iterable over terms + documents : iterable over iterable of str terms All documents that should be included in the corpus model. w : float Weight of document model (1 - weight of corpus model). @@ -43,6 +43,13 @@ class ParsimoniousLM: Mapping of terms to numeric indices p_corpus : array of float Log probability of terms in background model (indexed by `vocab`) + + References + ---------- + D. Hiemstra, S. Robertson, and H. Zaragoza (2004). + `Parsimonious Language Models for Information Retrieval + `_. + Proc. SIGIR'04. """ def __init__( @@ -96,9 +103,9 @@ def top( Parameters ---------- - k + k : int Number of top terms to return. - d + d : iterable of str terms Terms that make up the document. max_iter : int, optional Maximum number of iterations of EM algorithm to run. @@ -110,7 +117,7 @@ def top( Returns ------- t_p : list of (str, float) - Terms and their log-probabilities in the parsimonious model + Terms and their log-probabilities in the parsimonious model. """ tf, p_term = self._document_model(d) @@ -125,7 +132,7 @@ def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: Parameters ---------- - d : array of terms + d : iterable of str terms Returns ------- diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 17c72b5..46cef88 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -32,9 +32,9 @@ class SignificantWordsLM(ParsimoniousLM): Parameters ---------- - documents : iterable over iterable over terms + documents : iterable over iterable of str terms All documents that should be included in the corpus model. - lambdas : 3-tuple of floats + lambdas : 3-tuple of float Weight of corpus, group, and specific models. Will be normalized if the weights in the tuple don't sum to one. thresh : int @@ -72,6 +72,13 @@ class SignificantWordsLM(ParsimoniousLM): See Also -------- parsimonious.ParsimoniousLM : one-sided parsimonious model + + References + ---------- + M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). + `Luhn Revisited: Significant Words Language Models + `_ + Proc. CKIM'16. """ def __init__( @@ -96,6 +103,28 @@ def group_top( document_group: Iterable[Iterable[str]], **kwargs ) -> List[Tuple[str, float]]: + """ + Get the top `k` terms of a `document_group` and their probabilities. + This is a shortcut to retrieve the top terms found by `fit_parsimonious_group`. + + Parameters + ---------- + k : int + Number of top terms to return. 
+ document_group : iterable over iterable of str terms + All documents that should be included in the group model. + kwargs + Optional keyword arguments for `fit_parsimonious_group`. + + Returns + ------- + t_p : list of (str, float) + Terms and their probabilities in the group model. + + See Also + -------- + SignificantWordsLM.fit_parsimonious_group + """ term_probabilities = self.fit_parsimonious_group(document_group, **kwargs) return nlargest(k, term_probabilities.items(), itemgetter(1)) @@ -109,6 +138,40 @@ def fit_parsimonious_group( parsimonize_specific: bool = False, post_parsimonize: bool = False ) -> Dict[str, float]: + """ + Estimate a document group model, and parsimonize it against fixed + corpus and specific models. The documents may be unseen, but any terms + that are not in the vocabulary will be ignored. + + Parameters + ---------- + document_group : iterable over iterable of str terms + All documents that should be included in the group model. + max_iter : int, optional + Maximum number of iterations of EM algorithm to run. + eps : float, optional + Epsilon: convergence threshold for EM algorithm. + lambdas : 3-tuple of float, optional + Weight of corpus, group, and specific models. Will be normalized + if the weights in the tuple don't sum to one. + fix_lambdas : bool, optional + Fix the weights of the three sub-models (i.e. don't estimate + lambdas as part of the M-step). + parsimonize_specific : bool, optional + Bias the specific model towards uncommon terms before applying + the EM algorithm to the group model. This generally results in + a group model that stands out less from the corpus model. + post_parsimonize : bool, optional + Bias the group model towards uncommon terms after applying + the EM algorithm. This may be used to compensate when the + frequency of common terms varies much between the documents + in the group. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the group model. + """ if lambdas is None: lambdas = self.initial_lambdas else: @@ -168,6 +231,20 @@ def get_term_probabilities( self, log_prob_distribution: np.ndarray ) -> Dict[str, float]: + """ + Align a term distribution with the vocabulary, and transform + the term log probabilities to linear probabilities. + + Parameters + ---------- + log_prob_distribution : array of float + Log probability of terms which is indexed by the vocabulary. + + Returns + ------- + t_p_map : dict of term -> float + Dictionary of terms and their probabilities in the (sub-)model. + """ probabilities = np.exp(log_prob_distribution) probabilities[np.isnan(probabilities)] = 0. 
return {t: probabilities[i] for t, i in self.vocab.items()} @@ -180,6 +257,7 @@ def _estimate( max_iter: int, eps: float ) -> np.ndarray: + """Apply the Expectation Maximization algorithm.""" try: old_error_settings = np.seterr(divide='ignore') log_doc_tf = np.log(doc_tf) @@ -202,6 +280,7 @@ def _e_step( p_group: np.ndarray, p_specific: np.ndarray ) -> Dict[str, np.ndarray]: + """Run an E-step.""" corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus) specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) @@ -229,6 +308,7 @@ def _m_step( expectation: Dict[str, np.ndarray], log_doc_tf: Sequence[np.ndarray] ) -> np.ndarray: + """Run an M-step.""" term_weighted_group = log_doc_tf + expectation['group'] group_numerator = logsum(term_weighted_group) p_group = group_numerator - logsum(group_numerator) @@ -255,6 +335,7 @@ def _m_step( def _group_model( document_term_frequencies: Sequence[np.ndarray] ) -> Tuple[np.ndarray, np.ndarray]: + """Create the initial group model.""" group_tf = np.array(document_term_frequencies).sum(axis=0) try: @@ -269,6 +350,7 @@ def _group_model( def _specific_model( document_term_probabilities: Sequence[np.ndarray] ) -> np.ndarray: + """Create the fixed specific model.""" # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) @@ -300,13 +382,26 @@ def _specific_model( @staticmethod def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: + """ + Check and normalize the initial lambdas of the three sub-models. + + Parameters + ---------- + lambdas : 3-tuple of float + Weight of corpus, group, and specific models. + + Returns + ------- + lambdas : 3-tuple of float + Normalized probability of corpus, group, and specific models. 
+ """ assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}' - lambda_sum = sum(lambdas) - if abs(lambda_sum - 1) > 1e-10: + total_weight = sum(lambdas) + if abs(total_weight - 1) > 1e-10: lambdas = cast( InitialLambdas, tuple( - w / lambda_sum + w / total_weight for w in lambdas ) ) From 124d63d4538f1306f53e8362ca441bb2aa53682d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 20:19:51 +0200 Subject: [PATCH 29/40] correct SWLM E-step denominator (*facepalm* for major oversight) --- weighwords/significant_words.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 46cef88..b81e08f 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -285,8 +285,8 @@ def _e_step( specific_numerator = np.add.outer(self.lambda_specific, p_specific) group_numerator = np.add.outer(self.lambda_group, p_group) denominator = [ - logsum(np.asarray([sp_corpus, sp_corpus, sp_specific])) - for sp_corpus, sp_corpus, sp_specific in zip( + logsum(np.asarray(doc_numerators)) + for doc_numerators in zip( corpus_numerator, specific_numerator, group_numerator From 0c01da4aa56ea94cdab21f74f31c4702f1fa920f Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 20:43:28 +0200 Subject: [PATCH 30/40] extend dickens example with SWLM --- example/dickens.py | 41 +++++++++++++++++++++++++++++++---------- weighwords/__init__.py | 1 + 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/example/dickens.py b/example/dickens.py index ec5fd33..0b334b3 100755 --- a/example/dickens.py +++ b/example/dickens.py @@ -1,13 +1,16 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Find terms that distinguish various novels by Charles Dickens. # Note: if the w parameter is set wisely, no stop list is needed. 
- -from weighwords import ParsimoniousLM import gzip import logging -import numpy as np +import math import re +from itertools import zip_longest + +import numpy as np + +from weighwords import ParsimoniousLM, SignificantWordsLM logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -27,8 +30,8 @@ def read_book(title, num): """Returns generator over words in book num""" - logger.info("Fetching terms from %s" % title) - path = "%s.txt.utf8.gz" % num + logger.info(f"Fetching terms from {title}") + path = f"{num}.txt.utf8.gz" in_book = False for ln in gzip.open(path, 'rt', encoding='utf8'): if in_book and ln.startswith(endbook): @@ -40,12 +43,30 @@ def read_book(title, num): in_book = True +def grouper(iterable, n, filler=None): + """Source: https://docs.python.org/3/library/itertools.html#itertools-recipes""" + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=filler) + + book_contents = [(title, list(read_book(title, num))) for title, num in books] +corpus = [terms for title, terms in book_contents] -model = ParsimoniousLM([terms for title, terms in book_contents], w=.01) +plm = ParsimoniousLM(corpus, w=.01) +swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09)) for title, terms in book_contents: - print("Top %d words in %s:" % (top_k, title)) - for term, p in model.top(top_k, terms): - print(" %s %.4f" % (term, np.exp(p))) + plm_top = plm.top(top_k, terms) + swlm_top = swlm.group_top( + top_k, + grouper(terms, math.ceil(len(terms) / 10)), + fix_lambdas=True, + ) + print(f"\nTop {top_k} words in {title}:") + print(f"\n{'PLM term':<16} {'PLM p':<12} {'SWLM term':<16} {'SWLM p':<6}") + for (plm_t, plm_p), (swlm_t, swlm_p) in zip(plm_top, swlm_top): + print(f"{plm_t:<16} {np.exp(plm_p):<12.4f} {swlm_t:<16} {swlm_p:.4f}") print("") + + + diff --git a/weighwords/__init__.py b/weighwords/__init__.py index f725693..40c7742 100644 --- a/weighwords/__init__.py +++ b/weighwords/__init__.py @@ -1 +1,2 @@ from .parsimonious import ParsimoniousLM +from .significant_words import SignificantWordsLM From c32151cd9a7a5bd9fd3013712c7913caecfabff3 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 17 May 2019 21:42:36 +0200 Subject: [PATCH 31/40] test model equivalence; parsimonize specific with fixed w=1/3; format floats in lambda logging --- tests/test_equivalence.py | 85 +++++++++++++++++++++++++++++++++ tests/test_swlm.py | 2 +- weighwords/significant_words.py | 13 ++--- 3 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 tests/test_equivalence.py diff --git a/tests/test_equivalence.py b/tests/test_equivalence.py new file mode 100644 index 0000000..9cf02c7 --- /dev/null +++ b/tests/test_equivalence.py @@ -0,0 +1,85 @@ +import re +from itertools import chain + +import pytest + +from weighwords import ParsimoniousLM, SignificantWordsLM +from weighwords.logsum import logsum + + +def test_model_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - weight, weight, 0.) 
+ ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms == swlm_terms, 'PLM and SWLM are not functionally equivalent' + + +def test_model_non_equivalence(shakespeare_quotes): + weight = .1 + plm = ParsimoniousLM(shakespeare_quotes, w=weight) + # initialize SWLM with weights that make it non-equivalent to PLM + swlm = SignificantWordsLM( + shakespeare_quotes, + lambdas=(1 - 2 * weight, weight, weight) + ) + plm_terms, swlm_terms = fit_models(plm, swlm, shakespeare_quotes) + + assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent' + + +@pytest.fixture(scope="module") +def shakespeare_quotes(): + quotes = [ + "Love all, trust a few, Do wrong to none", + "But love that comes too late, " + "Like a remorseful pardon slowly carried, " + "To the great sender turns a sour offence.", + "If thou remember'st not the slightest folly " + "That ever love did make thee run into, " + "Thou hast not lov'd.", + "We that are true lovers run into strange capers; " + "but as all is mortal in nature, " + "so is all nature in love mortal in folly.", + "But are you so much in love as your rhymes speak? " + "Neither rhyme nor reason can express how much.", + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] + return [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] + + +def get_p_corpus(language_model): + p_corpus = language_model.p_corpus.copy() + vocab = language_model.vocab + term_tiers = [ + (1.5, ['love', 'folly', "lov'd", 'lovers', 'lover']), + (1.3, ['trust', 'remorseful', 'sour', 'offence', 'gaze']), + (1.1, ["remember'st", 'capers', 'rhyme', 'rhymes', 'eagle']), + ] + for multiplier, terms in term_tiers: + for t in terms: + p_corpus[vocab[t]] *= multiplier + + return p_corpus - logsum(p_corpus) + + +def fit_models(plm, swlm, docs): + # artificially reduce the corpus probability of selected terms + plm.p_corpus = swlm.p_corpus = get_p_corpus(plm) + + top_k = 15 + plm_top = plm.top(top_k, chain(*docs)) + swlm_top = swlm.group_top(top_k, docs, fix_lambdas=True) + plm_terms = [term for term, log_prob in plm_top] + swlm_terms = [term for term, prob in swlm_top] + + return plm_terms, swlm_terms diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 8c8ee3b..68519ae 100644 --- a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from weighwords.significant_words import SignificantWordsLM +from weighwords import SignificantWordsLM logging.basicConfig(level=logging.INFO) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index b81e08f..0cb584e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -200,7 +200,7 @@ def fit_parsimonious_group( p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM(group_tf, p_specific, self.w, max_iter, eps) + p_specific = self._EM(group_tf, p_specific, 1/3, max_iter, eps) self.p_specific = p_specific @@ -210,8 +210,8 @@ def fit_parsimonious_group( self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) self.lambda_group = np.full(weights_shape, group_w, dtype=np.double) logger.info( - f'Lambdas initialized to: Corpus={lambdas[0]}, ' - f'Group={lambdas[1]}, Specific={lambdas[2]}' + f'Lambdas initialized to: Corpus={lambdas[0]:.4f}, ' + f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}' ) self.p_group = self._estimate( p_group, p_specific, doc_term_frequencies, max_iter, 
eps @@ -221,9 +221,10 @@ def fit_parsimonious_group( if self.fix_lambdas is False: logger.info( - f'Final lambdas (mean): Corpus={np.mean(np.exp(self.lambda_corpus))}, ' - f'Group={np.mean(np.exp(self.lambda_group))}, ' - f'Specific={np.mean(np.exp(self.lambda_specific))}' + f'Final lambdas (mean): ' + f'Corpus={np.mean(np.exp(self.lambda_corpus)):.4f}, ' + f'Group={np.mean(np.exp(self.lambda_group)):.4f}, ' + f'Specific={np.mean(np.exp(self.lambda_specific)):.4f}' ) return self.get_term_probabilities(self.p_group) From d4272350bc9e685bfca962e1da51ba1b5bd75d06 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 03:43:14 +0200 Subject: [PATCH 32/40] use scalar lambdas when they are fixed (this can significantly reduce the memory complexity) --- weighwords/significant_words.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 0cb584e..b20b331 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -205,6 +205,9 @@ def fit_parsimonious_group( self.p_specific = p_specific weights_shape = len(document_models) + if self.fix_lambdas: + weights_shape = 1 + general_w, group_w, specific_w = np.log(lambdas) self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double) self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double) From cc7577ff3b5404326fccc10e8eed9fd1372b4eac Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 15:18:52 +0200 Subject: [PATCH 33/40] simplify tests --- tests/conftest.py | 17 ++++ tests/test_plm.py | 21 +---- tests/test_swlm.py | 205 ++++----------------------------------------- 3 files changed, 37 insertions(+), 206 deletions(-) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..17faf7f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.fixture(scope="module") +def uniform_doc(): + return ['one', 'two', 'three', 'four', 'five'] + + +@pytest.fixture(scope="module") +def number_corpus(): + return [ + ['one'], + ['two', 'two'], + ['three', 'three', 'three'], + ['four', 'four', 'four', 'four'], + ['five', 'five', 'five', 'five', 'five'] + ] diff --git a/tests/test_plm.py b/tests/test_plm.py index 14c5671..ceb9f90 100644 --- a/tests/test_plm.py +++ b/tests/test_plm.py @@ -1,11 +1,10 @@ import numpy as np -import pytest from weighwords import ParsimoniousLM def test_document_model(number_corpus, uniform_doc): - plm = ParsimoniousLM([number_corpus], w=0.1, thresh=3) + plm = ParsimoniousLM(number_corpus, w=0.1, thresh=3) tf, p_term = plm._document_model(uniform_doc) assert (tf[:2] == 0).all(), \ "Terms with a corpus frequency < thresh should not be counted" @@ -16,23 +15,7 @@ def test_document_model(number_corpus, uniform_doc): def test_document_model_out_of_vocabulary(number_corpus): - plm = ParsimoniousLM([number_corpus], w=0.1) + plm = ParsimoniousLM(number_corpus, w=0.1) doc = ['two', 'or', 'three', 'unseen', 'words'] tf, p_term = plm._document_model(doc) assert tf.sum() == 2, f"Unseen words should be ignored, got {tf} instead" - - -@pytest.fixture(scope="module") -def uniform_doc(): - return ['one', 'two', 'three', 'four', 'five'] - - -@pytest.fixture(scope="module") -def number_corpus(): - return [ - 'one', - 'two', 'two', - 'three', 'three', 'three', - 'four', 'four', 'four', 'four', - 'five', 'five', 'five', 'five', 'five' - ] diff --git a/tests/test_swlm.py b/tests/test_swlm.py index 68519ae..e6b82bc 100644 --- 
a/tests/test_swlm.py +++ b/tests/test_swlm.py @@ -1,206 +1,37 @@ import logging -import operator -from functools import reduce - -import numpy as np -import pytest from weighwords import SignificantWordsLM logging.basicConfig(level=logging.INFO) -def test_model_fit_fixed(swlm, doc_group): +def test_model_fit_fixed(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=True) expected_probs = { - "salmon": 0.04, - "chocolate": 0.03, - "snow": 0.02, - "tomato": 0.01, - "aqua": 0.0, + "one": 0.0, + "two": 0.12373, + "three": 2e-5, + "four": 0.50303, + "five": 0.37322, } for term, p in expected_probs.items(): diff = abs(term_probs[term] - p) - assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" -def test_model_fit_shifty(swlm, doc_group): +def test_model_fit_shifty(number_corpus, uniform_doc): + swlm = SignificantWordsLM([uniform_doc], lambdas=(1/3, 1/3, 1/3)) + doc_group = [l + r for l, r in zip(number_corpus, reversed(number_corpus))] term_probs = swlm.fit_parsimonious_group(doc_group, fix_lambdas=False) expected_probs = { - "salmon": 0.04, - "chocolate": 0.03, - "snow": 0.02, - "tomato": 0.01, - "aqua": 0.0, + "one": 0.0, + "two": 0.33322, + "three": 0.0, + "four": 0.66678, + "five": 0.0, } for term, p in expected_probs.items(): diff = abs(term_probs[term] - p) - assert diff < 1e-10, f"P({term}) != {p} with difference {diff}" - - -@pytest.fixture(scope="module") -def swlm(): - # init an SWLM with uniform p_corpus - return SignificantWordsLM([colors], lambdas=(0.7, 0.1, 0.2)) - - -@pytest.fixture(scope="module") -def doc_group(): - # deterministically generate some docs - doc_parts = np.array_split(list(zip(colors[:25], reversed(colors))), 5) - return [ - reduce(operator.add, [i * list(d) for i, d in enumerate(z)]) - for z in zip(*doc_parts) - ] - - -colors = [ - "aliceblue", - "antiquewhite", - "aqua", - "aquamarine", - "azure", - "beige", - "bisque", - "black", - "blanchedalmond", - "blue", - "blueviolet", - "brown", - "burlywood", - "cadetblue", - "chartreuse", - "chocolate", - "coral", - "cornflowerblue", - "cornsilk", - "crimson", - "cyan", - "darkblue", - "darkcyan", - "darkgoldenrod", - "darkgray", - "darkgreen", - "darkgrey", - "darkkhaki", - "darkmagenta", - "darkolivegreen", - "darkorange", - "darkorchid", - "darkred", - "darksalmon", - "darkseagreen", - "darkslateblue", - "darkslategray", - "darkslategrey", - "darkturquoise", - "darkviolet", - "deeppink", - "deepskyblue", - "dimgray", - "dimgrey", - "dodgerblue", - "firebrick", - "floralwhite", - "forestgreen", - "fuchsia", - "gainsboro", - "ghostwhite", - "goldenrod", - "gold", - "gray", - "green", - "greenyellow", - "grey", - "honeydew", - "hotpink", - "indianred", - "indigo", - "ivory", - "khaki", - "lavenderblush", - "lavender", - "lawngreen", - "lemonchiffon", - "lightblue", - "lightcoral", - "lightcyan", - "lightgoldenrodyellow", - "lightgray", - "lightgreen", - "lightgrey", - "lightpink", - "lightsalmon", - "lightseagreen", - "lightskyblue", - "lightslategray", - "lightslategrey", - "lightsteelblue", - "lightyellow", - "lime", - "limegreen", - "linen", - "magenta", - "maroon", - "mediumaquamarine", - "mediumblue", - "mediumorchid", - "mediumpurple", - "mediumseagreen", - "mediumslateblue", - "mediumspringgreen", - "mediumturquoise", - "mediumvioletred", - 
"midnightblue", - "mintcream", - "mistyrose", - "moccasin", - "navajowhite", - "navy", - "oldlace", - "olive", - "olivedrab", - "orange", - "orangered", - "orchid", - "palegoldenrod", - "palegreen", - "paleturquoise", - "palevioletred", - "papayawhip", - "peachpuff", - "peru", - "pink", - "plum", - "powderblue", - "purple", - "rebeccapurple", - "red", - "rosybrown", - "royalblue", - "saddlebrown", - "salmon", - "sandybrown", - "seagreen", - "seashell", - "sienna", - "silver", - "skyblue", - "slateblue", - "slategray", - "slategrey", - "snow", - "springgreen", - "steelblue", - "tan", - "teal", - "thistle", - "tomato", - "turquoise", - "violet", - "wheat", - "white", - "whitesmoke", - "yellow", - "yellowgreen", -] + assert diff < 1e-5, f"P({term}) != {p} with difference {diff}" From ea00801b026ebfe950e87956ba2da91bb48a3e9c Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:07:05 +0200 Subject: [PATCH 34/40] updated readme with usage examples; moved test fixture; updated copyright in license statement --- COPYING | 3 +- README.rst | 59 +++++++++++++++++++++++++++++++++++++-- tests/conftest.py | 26 +++++++++++++++++ tests/test_equivalence.py | 27 ------------------ 4 files changed, 84 insertions(+), 31 deletions(-) diff --git a/COPYING b/COPYING index c128f62..02ba3d4 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,6 @@ WeighWords: a Python library for creating word weights/word clouds from text -Copyright 2011 University of Amsterdam +Copyright 2011-2019 University of Amsterdam +Copyright 2019 TinQwise Stamkracht This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the diff --git a/README.rst b/README.rst index 96fd7cc..ea17792 100644 --- a/README.rst +++ b/README.rst @@ -9,20 +9,73 @@ Rather than use simple word frequency, it weighs words by statistical models known as *parsimonious language models*. These models are good at picking up the words that distinguish a text document from other documents in a collection. The downside to this is that you can't use WeighWords to make a -word cloud of a single document; you need a bunch of document to compare to. +word cloud of a single document; you need a bunch of documents (i.e. a +background collection) to compare to. Installation ------------ -Either:: +Either install the latest release from PyPI:: pip install weighwords -or:: +or clone this git repository, and:: python setup.py install +or:: + + pip install -e . + +Usage +----- +>>> quotes = [ + "Love all, trust a few, Do wrong to none", + ... + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] +>>> doc_tokens = [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] + +The `ParsimoniousLM` is initialized with all document tokens as a +background corpus, and subsequently takes a single document's tokens +as input. 
Its `top` method returns the top terms and their log-probabilities: + +>>> plm = ParsimoniousLM(doc_tokens, w=.1) +>>> plm.top(10, doc_tokens[-1]) +[('lover', -1.871802261651365), + ('will', -1.871802261651365), + ('eyes', -2.5649494422113044), + ('gaze', -2.5649494422113044), + ('an', -2.5649494422113044), + ('eagle', -2.5649494422113044), + ('blind', -2.5649494422113044), + ('ear', -2.5649494422113044), + ('hear', -2.5649494422113044), + ('lowest', -2.5649494422113044)] + +The `SignificantWordsLM` is similarly initialized with a background corpus, +but subsequently takes a group of document tokens as input. Its `group_top` +method returns the top terms and their probabilities: + +>>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) +>>> swlm.group_top(10, doc_tokens[-3:]) +[('in', 0.37875318027881), + ('is', 0.07195732361699828), + ('mortal', 0.07195732361699828), + ('nature', 0.07195732361699828), + ('all', 0.07110584778711342), + ('we', 0.03597866180849914), + ('true', 0.03597866180849914), + ('lovers', 0.03597866180849914), + ('strange', 0.03597866180849914), + ('capers', 0.03597866180849914)] + +See `example/dickens.py` for a running example with more realistic data. References ---------- diff --git a/tests/conftest.py b/tests/conftest.py index 17faf7f..3b68efa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +import re + import pytest @@ -15,3 +17,27 @@ def number_corpus(): ['four', 'four', 'four', 'four'], ['five', 'five', 'five', 'five', 'five'] ] + + +@pytest.fixture(scope="module") +def shakespeare_quotes(): + quotes = [ + "Love all, trust a few, Do wrong to none", + "But love that comes too late, " + "Like a remorseful pardon slowly carried, " + "To the great sender turns a sour offence.", + "If thou remember'st not the slightest folly " + "That ever love did make thee run into, " + "Thou hast not lov'd.", + "We that are true lovers run into strange capers; " + "but as all is mortal in nature, " + "so is all nature in love mortal in folly.", + "But are you so much in love as your rhymes speak? " + "Neither rhyme nor reason can express how much.", + "A lover's eyes will gaze an eagle blind. " + "A lover's ear will hear the lowest sound.", + ] + return [ + re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() + for quote in quotes + ] diff --git a/tests/test_equivalence.py b/tests/test_equivalence.py index 9cf02c7..4c934e6 100644 --- a/tests/test_equivalence.py +++ b/tests/test_equivalence.py @@ -1,8 +1,5 @@ -import re from itertools import chain -import pytest - from weighwords import ParsimoniousLM, SignificantWordsLM from weighwords.logsum import logsum @@ -33,30 +30,6 @@ def test_model_non_equivalence(shakespeare_quotes): assert plm_terms != swlm_terms, 'PLM and SWLM should not be functionally equivalent' -@pytest.fixture(scope="module") -def shakespeare_quotes(): - quotes = [ - "Love all, trust a few, Do wrong to none", - "But love that comes too late, " - "Like a remorseful pardon slowly carried, " - "To the great sender turns a sour offence.", - "If thou remember'st not the slightest folly " - "That ever love did make thee run into, " - "Thou hast not lov'd.", - "We that are true lovers run into strange capers; " - "but as all is mortal in nature, " - "so is all nature in love mortal in folly.", - "But are you so much in love as your rhymes speak? " - "Neither rhyme nor reason can express how much.", - "A lover's eyes will gaze an eagle blind. 
" - "A lover's ear will hear the lowest sound.", - ] - return [ - re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() - for quote in quotes - ] - - def get_p_corpus(language_model): p_corpus = language_model.p_corpus.copy() vocab = language_model.vocab From b2d0687333a81e6c9f8f556141de3a809b97679d Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:13:28 +0200 Subject: [PATCH 35/40] cast 1/3 to np.floating to keep mypy quiet --- weighwords/significant_words.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index b20b331..45142b3 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -200,7 +200,13 @@ def fit_parsimonious_group( p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM(group_tf, p_specific, 1/3, max_iter, eps) + p_specific = self._EM( + group_tf, + p_specific, + cast(np.floating, 1/3), + max_iter, + eps + ) self.p_specific = p_specific From 8a1126f15c1b189c55e3c319ceb686a9fa766712 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 16:18:37 +0200 Subject: [PATCH 36/40] simplify installing optional dev requirements --- requirements-dev.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..7670e0e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# install the weighwords package for convenience +-e . + +# testing framework +pytest ~= 4.5 + +# static type checking +mypy >= 0.701 +https://github.com/numpy/numpy-stubs/archive/master.tar.gz From c792a875c5c9e8da736e283fe2dc247e0cae3437 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Sat, 18 May 2019 17:04:07 +0200 Subject: [PATCH 37/40] docs: double backticks and float --- README.rst | 10 +++++----- weighwords/parsimonious.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index ea17792..6f720ae 100644 --- a/README.rst +++ b/README.rst @@ -41,9 +41,9 @@ Usage for quote in quotes ] -The `ParsimoniousLM` is initialized with all document tokens as a +The ``ParsimoniousLM`` is initialized with all document tokens as a background corpus, and subsequently takes a single document's tokens -as input. Its `top` method returns the top terms and their log-probabilities: +as input. Its ``top`` method returns the top terms and their log-probabilities: >>> plm = ParsimoniousLM(doc_tokens, w=.1) >>> plm.top(10, doc_tokens[-1]) @@ -58,8 +58,8 @@ as input. Its `top` method returns the top terms and their log-probabilities: ('hear', -2.5649494422113044), ('lowest', -2.5649494422113044)] -The `SignificantWordsLM` is similarly initialized with a background corpus, -but subsequently takes a group of document tokens as input. Its `group_top` +The ``SignificantWordsLM`` is similarly initialized with a background corpus, +but subsequently takes a group of document tokens as input. Its ``group_top`` method returns the top terms and their probabilities: >>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) @@ -75,7 +75,7 @@ method returns the top terms and their probabilities: ('strange', 0.03597866180849914), ('capers', 0.03597866180849914)] -See `example/dickens.py` for a running example with more realistic data. +See ``example/dickens.py`` for a running example with more realistic data. 
References ---------- diff --git a/weighwords/parsimonious.py b/weighwords/parsimonious.py index d8ce657..60ef281 100644 --- a/weighwords/parsimonious.py +++ b/weighwords/parsimonious.py @@ -136,7 +136,7 @@ def _document_model(self, d: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]: Returns ------- - tf : array of int + tf : array of float Term frequencies p_term : array of float Term log probabilities From 48bfc7609d255274ba8a9ed90e7f33528f940031 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Mon, 20 May 2019 11:36:41 +0200 Subject: [PATCH 38/40] fix readme example syntax --- README.rst | 24 +++++++++++++----------- weighwords/significant_words.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 6f720ae..c3888cb 100644 --- a/README.rst +++ b/README.rst @@ -31,20 +31,21 @@ or:: Usage ----- >>> quotes = [ - "Love all, trust a few, Do wrong to none", - ... - "A lover's eyes will gaze an eagle blind. " - "A lover's ear will hear the lowest sound.", - ] +... "Love all, trust a few, Do wrong to none", +... ... +... "A lover's eyes will gaze an eagle blind. " +... "A lover's ear will hear the lowest sound.", +... ] >>> doc_tokens = [ - re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() - for quote in quotes - ] +... re.sub(r"[.,:;!?\"‘’]|'s\b", " ", quote).lower().split() +... for quote in quotes +... ] The ``ParsimoniousLM`` is initialized with all document tokens as a background corpus, and subsequently takes a single document's tokens as input. Its ``top`` method returns the top terms and their log-probabilities: +>>> from weighwords import ParsimoniousLM >>> plm = ParsimoniousLM(doc_tokens, w=.1) >>> plm.top(10, doc_tokens[-1]) [('lover', -1.871802261651365), @@ -62,6 +63,7 @@ The ``SignificantWordsLM`` is similarly initialized with a background corpus, but subsequently takes a group of document tokens as input. Its ``group_top`` method returns the top terms and their probabilities: +>>> from weighwords import SignificantWordsLM >>> swlm = SignificantWordsLM(doc_tokens, lambdas=(.7, .1, .2)) >>> swlm.group_top(10, doc_tokens[-3:]) [('in', 0.37875318027881), @@ -85,10 +87,10 @@ for Information Retrieval Proc. SIGIR'04. R. Kaptein, D. Hiemstra, and J. Kamps (2010). `How different are Language Models -and word clouds? `_ -Proc. ECIR. +and word clouds? `_. +Proc. ECIR'10. M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). `Luhn Revisited: Significant Words Language Models -`_ +`_. Proc. CKIM'16. diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 45142b3..9bd831e 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -77,7 +77,7 @@ class SignificantWordsLM(ParsimoniousLM): ---------- M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016). `Luhn Revisited: Significant Words Language Models - `_ + `_. Proc. CKIM'16. 
""" From 72aff8cf485850dcd69a584adaf95b9371753a98 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Fri, 31 May 2019 01:21:11 +0200 Subject: [PATCH 39/40] use p_corpus as replacement for p_specific when len(docs) < 2 --- weighwords/significant_words.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 9bd831e..42c7161 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -356,11 +356,18 @@ def _group_model( return group_tf, p_group - @staticmethod def _specific_model( + self, document_term_probabilities: Sequence[np.ndarray] ) -> np.ndarray: """Create the fixed specific model.""" + if len(document_term_probabilities) < 2: + logger.warning( + 'Cannot calculate `p_specific` for a single document, ' + 'using `p_corpus` as replacement.' + ) + return self.p_corpus + # complement events: 1 - p complements = [ np.log1p(-np.exp(p_doc)) From 32acde2e5e707786a06701c10db14e8e9ccd75a7 Mon Sep 17 00:00:00 2001 From: Alex Olieman Date: Tue, 4 Jun 2019 02:52:27 +0200 Subject: [PATCH 40/40] make the specific terms estimator pluggable --- weighwords/significant_words.py | 75 +++++----------- weighwords/specific_term_estimators.py | 120 +++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 55 deletions(-) create mode 100644 weighwords/specific_term_estimators.py diff --git a/weighwords/significant_words.py b/weighwords/significant_words.py index 42c7161..3e9d761 100644 --- a/weighwords/significant_words.py +++ b/weighwords/significant_words.py @@ -15,6 +15,11 @@ from weighwords import ParsimoniousLM from weighwords.logsum import logsum +from weighwords.specific_term_estimators import ( + SpecificTermEstimator, + RequiresMultipleDocuments, + mutual_exclusion, +) logger = logging.getLogger(__name__) @@ -136,7 +141,8 @@ def fit_parsimonious_group( lambdas: Optional[InitialLambdas] = None, fix_lambdas: bool = False, parsimonize_specific: bool = False, - post_parsimonize: bool = False + post_parsimonize: bool = False, + specific_estimator: SpecificTermEstimator = mutual_exclusion ) -> Dict[str, float]: """ Estimate a document group model, and parsimonize it against fixed @@ -166,6 +172,9 @@ def fit_parsimonious_group( the EM algorithm. This may be used to compensate when the frequency of common terms varies much between the documents in the group. + specific_estimator : callable, optional + Function that estimates the specific terms model based on + the document term frequencies of the doc group. Returns ------- @@ -190,26 +199,23 @@ def fit_parsimonious_group( doc_term_frequencies ) try: - old_error_settings = np.seterr(divide='ignore') - doc_term_probs = [ - np.log(tf) - np.log(np.sum(tf)) - for tf in doc_term_frequencies - ] - finally: - np.seterr(**old_error_settings) + self.p_specific = specific_estimator(doc_term_frequencies) + except RequiresMultipleDocuments: + logger.warning( + 'Cannot calculate `p_specific` for a single document, ' + 'using `p_corpus` as replacement.' 
+ ) + self.p_specific = self.p_corpus - p_specific = self._specific_model(doc_term_probs) if parsimonize_specific: - p_specific = self._EM( + self.p_specific = self._EM( group_tf, - p_specific, + self.p_specific, cast(np.floating, 1/3), max_iter, eps ) - self.p_specific = p_specific - weights_shape = len(document_models) if self.fix_lambdas: weights_shape = 1 @@ -223,7 +229,7 @@ def fit_parsimonious_group( f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}' ) self.p_group = self._estimate( - p_group, p_specific, doc_term_frequencies, max_iter, eps + p_group, self.p_specific, doc_term_frequencies, max_iter, eps ) if post_parsimonize: self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps) @@ -356,47 +362,6 @@ def _group_model( return group_tf, p_group - def _specific_model( - self, - document_term_probabilities: Sequence[np.ndarray] - ) -> np.ndarray: - """Create the fixed specific model.""" - if len(document_term_probabilities) < 2: - logger.warning( - 'Cannot calculate `p_specific` for a single document, ' - 'using `p_corpus` as replacement.' - ) - return self.p_corpus - - # complement events: 1 - p - complements = [ - np.log1p(-np.exp(p_doc)) - for p_doc in document_term_probabilities - ] - # probability of term to be important in one doc, and not others - complement_products = np.array([ - dlm + complement - for i, dlm in enumerate(document_term_probabilities) - for j, complement in enumerate(complements) - if i != j - ]) - - try: - old_error_settings = np.seterr(divide='ignore') - # marginalize over all documents - p_specific = ( - logsum(complement_products) - - np.log( - np.count_nonzero(complement_products > np.NINF, axis=0) - ) - ) - # prevent NaNs from causing downstream errors - p_specific[np.isnan(p_specific)] = np.NINF - finally: - np.seterr(**old_error_settings) - - return p_specific - @staticmethod def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas: """ diff --git a/weighwords/specific_term_estimators.py b/weighwords/specific_term_estimators.py new file mode 100644 index 0000000..158abd9 --- /dev/null +++ b/weighwords/specific_term_estimators.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# Copyright 2019 TinQwise Stamkracht, University of Amsterdam +# Author: Alex Olieman + +from __future__ import annotations +# TODO: remove redundant typing imports once PEP 585 is finalized + +import functools +import logging +from typing import Sequence, Callable + +import numpy as np +from weighwords.logsum import logsum + +logger = logging.getLogger(__name__) + +SpecificTermEstimator = Callable[[Sequence[np.ndarray]], np.ndarray] + + +class RequiresMultipleDocuments(Exception): + pass + + +def requires_multiple_docs(estimator_func: SpecificTermEstimator): + """ + Do not let the decorated function be called with fewer than two docs. 
+ + Parameters + ---------- + estimator_func : SpecificTermEstimator + + Raises + ------ + RequiresMultipleDocuments + + Returns + ------- + decorated_func : SpecificTermEstimator + """ + @functools.wraps(estimator_func) + def wrapper_func(document_term_frequencies): + if len(document_term_frequencies) < 2: + raise RequiresMultipleDocuments + + return estimator_func(document_term_frequencies) + + return wrapper_func + + +@requires_multiple_docs +def mutual_exclusion( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the mutual exclusion method.""" + doc_term_probs = [ + np.log(tf) - np.log(np.sum(tf)) + for tf in document_term_frequencies + ] + # complement events: 1 - p + complements = [ + np.log1p(-np.exp(p_doc)) + for p_doc in doc_term_probs + ] + # probability of term to be important in one doc, and not others + complement_products = np.array([ + dlm + complement + for i, dlm in enumerate(doc_term_probs) + for j, complement in enumerate(complements) + if i != j + ]) + # marginalize over all documents + p_specific = ( + logsum(complement_products) + - np.log( + np.count_nonzero(complement_products > np.NINF, axis=0) + ) + ) + # prevent NaNs from causing downstream errors + p_specific[np.isnan(p_specific)] = np.NINF + + return p_specific + + +@requires_multiple_docs +def inverse_doc_frequency( + document_term_frequencies: Sequence[np.ndarray] +) -> np.ndarray: + """Estimate the fixed specific model with the inverse doc frequency method.""" + idf = 1 / np.count_nonzero(document_term_frequencies, axis=0) + idf[~np.isfinite(idf)] = 0. + + # calculate normalized idf as log-probabilities + p_specific = np.log(idf) - np.log(np.sum(idf)) + + return p_specific + + +def idf_fallback_for_many_docs( + document_term_frequencies: Sequence[np.ndarray], + primary_estimator: SpecificTermEstimator, + fallback_thresh: int +): + if len(document_term_frequencies) < fallback_thresh: + estimator_func = primary_estimator + else: + estimator_func = inverse_doc_frequency + logger.warning( + f'Estimator got more than {fallback_thresh} docs:' + ' falling back to IDF for the current doc group.' + ) + + return estimator_func(document_term_frequencies) + + +me_up_to_40_docs = functools.partial( + idf_fallback_for_many_docs, + primary_estimator=mutual_exclusion, + fallback_thresh=40 +)
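These estimators plug directly into ``SignificantWordsLM.fit_parsimonious_group`` (and, through its keyword arguments, ``group_top``): any callable that maps the per-document term-frequency arrays to a log-probability array satisfies ``SpecificTermEstimator``. A minimal sketch, assuming ``corpus`` and ``docs`` are tokenized document lists as in the earlier examples and that ``docs`` holds at least two documents::

    from weighwords import SignificantWordsLM
    from weighwords.specific_term_estimators import (
        inverse_doc_frequency,
        me_up_to_40_docs,
    )

    swlm = SignificantWordsLM(corpus, lambdas=(.9, .01, .09))

    # IDF-based specific terms instead of the default mutual exclusion
    idf_terms = swlm.fit_parsimonious_group(
        docs, specific_estimator=inverse_doc_frequency
    )

    # mutual exclusion, falling back to IDF for groups of more than 40 documents
    top_terms = swlm.group_top(10, docs, specific_estimator=me_up_to_40_docs)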