diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/bdi.py b/code/bdi.py index 0913bed..5a09a8a 100644 --- a/code/bdi.py +++ b/code/bdi.py @@ -2,7 +2,7 @@ from os import listdir from collections import defaultdict import utils -import Queue +import queue import argparse from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -55,23 +55,23 @@ def compute_dbi(embs, clus_map, center_map): def output_dbi(dbi_scores): dbi_by_lvl = {} - all_dbi = [x[0] for x in dbi_scores.values()] + all_dbi = [x[0] for x in list(dbi_scores.values())] - print 'Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi)) + print('Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi))) - for x in dbi_scores.values(): + for x in list(dbi_scores.values()): if x[1] not in dbi_by_lvl: dbi_by_lvl[x[1]] = [] dbi_by_lvl[x[1]].append(x[0]) for lvl in dbi_by_lvl: dbis = dbi_by_lvl[lvl] - print 'Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis)) + print('Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis))) def recursion(root, lvl): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, 1, '*')) dbi_scores = {} @@ -102,7 +102,7 @@ def recursion(root, lvl): # handle current dbi = compute_dbi(embs, clus_map, hier_map) - print 'Computing DBI for %s: %f' % (c_name, dbi) + print('Computing DBI for %s: %f' % (c_name, dbi)) dbi_scores[c_name] = (dbi, level) output_dbi(dbi_scores) diff --git a/code/case_ranker.py b/code/case_ranker.py index 22da68c..e27cece 100644 --- a/code/case_ranker.py +++ b/code/case_ranker.py @@ -8,80 +8,81 @@ import utils import operator + def read_caseolap_result(case_file): - phrase_map = {} - cell_map = {} - - cell_cnt = 0 - with open(case_file) as f: - for line in f: - cell_cnt += 1 - segments = line.strip('\r\n ').split('\t') - cell_id, phs_str = segments[0], segments[1][1:-1] - cell_map[cell_id] = [] - segments = phs_str.split(', ') - for ph_score in segments: - parts = ph_score.split('|') - ph, score = parts[0], float(parts[1]) - if ph not in phrase_map: - phrase_map[ph] = {} - phrase_map[ph][cell_id] = score - cell_map[cell_id].append((ph, score)) - - return phrase_map, cell_map, cell_cnt + phrase_map = {} + cell_map = {} + + cell_cnt = 0 + with open(case_file) as f: + for line in f: + cell_cnt += 1 + segments = line.strip('\r\n ').split('\t') + cell_id, phs_str = segments[0], segments[1][1:-1] + cell_map[cell_id] = [] + segments = phs_str.split(', ') + for ph_score in segments: + parts = ph_score.split('|') + ph, score = parts[0], float(parts[1]) + if ph not in phrase_map: + phrase_map[ph] = {} + phrase_map[ph][cell_id] = score + cell_map[cell_id].append((ph, score)) + + return phrase_map, cell_map, cell_cnt def rank_phrase(case_file): + ph_dist_map = {} + smoothing_factor = 0.0 - ph_dist_map = {} - smoothing_factor = 0.0 - - phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + print(cell_cnt) + unif = [1.0 / cell_cnt] * cell_cnt - unif = [1.0 / cell_cnt] * cell_cnt + for ph in phrase_map: + ph_vec = [x[1] for x in phrase_map[ph].items()] + if len(ph_vec) < cell_cnt: + ph_vec += [0] * (cell_cnt - len(ph_vec)) + # smoothing + ph_vec = [x + smoothing_factor for x in ph_vec] + ph_vec = utils.l1_normalize(ph_vec) + ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) - for ph in phrase_map: - ph_vec = [x[1] for x in phrase_map[ph].iteritems()] - if len(ph_vec) < cell_cnt: - ph_vec += [0] * 
(cell_cnt - len(ph_vec)) - # smoothing - ph_vec = [x + smoothing_factor for x in ph_vec] - ph_vec = utils.l1_normalize(ph_vec) - ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) + ranked_list = sorted(list(ph_dist_map.items()), key=operator.itemgetter(1), reverse=True) - ranked_list = sorted(ph_dist_map.items(), key=operator.itemgetter(1), reverse=True) - - return ranked_list + return ranked_list def write_keywords(o_file, ranked_list, thres): - with open(o_file, 'w+') as g: - for ph in ranked_list: - if ph[1] > thres: - g.write('%s\n' % (ph[0])) - tmp_file = o_file + '-score.txt' - with open(tmp_file, 'w+') as g: - for ph in ranked_list: - g.write('%s\t%f\n' % (ph[0], ph[1])) + with open(o_file, 'w+') as g: + for ph in ranked_list: + if ph[1] > thres: + g.write('%s\n' % (ph[0])) + tmp_file = o_file + '-score.txt' + with open(tmp_file, 'w+') as g: + for ph in ranked_list: + g.write('%s\t%f\n' % (ph[0], ph[1])) -def main_rank_phrase(input_f, output_f, thres): - ranked_list = rank_phrase(input_f) - write_keywords(output_f, ranked_list, thres) - print("[CaseOLAP] Finish pushing general terms up") -if __name__ == "__main__": - parser = argparse.ArgumentParser(prog='case_ranker.py', \ - description='Ranks the distinctiveness score using caseolap result.') - parser.add_argument('-folder', required=True, \ - help='The folder that stores the file.') - parser.add_argument('-iter', required=True, \ - help='Iteration index.') - parser.add_argument('-thres', required=True, \ - help='The files used.') - args = parser.parse_args() - - input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) - output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) +def main_rank_phrase(input_f, output_f, thres): + ranked_list = rank_phrase(input_f) + write_keywords(output_f, ranked_list, thres) + print("[CaseOLAP] Finish pushing general terms up") - main_rank_phrase(input_f, output_f, float(args.thres)) +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='case_ranker.py', \ + description='Ranks the distinctiveness score using caseolap result.') + parser.add_argument('-folder', required=True, \ + help='The folder that stores the file.') + parser.add_argument('-iter', required=True, \ + help='Iteration index.') + parser.add_argument('-thres', required=True, \ + help='The files used.') + args = parser.parse_args() + + input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) + output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) + + main_rank_phrase(input_f, output_f, float(args.thres)) diff --git a/code/caseslim.py b/code/caseslim.py index 65a5cfb..0b5b076 100644 --- a/code/caseslim.py +++ b/code/caseslim.py @@ -66,7 +66,7 @@ def compute(self, score_type='ALL'): group = [(self_df, self.max_df, self_cnt, sum_self)] self.context_groups[phrase] = [] - for phrase_group, phrase_values in self.phrase_cnt_context.items(): + for phrase_group, phrase_values in list(self.phrase_cnt_context.items()): context_df = self.phrase_df_context[phrase_group].get(phrase, 0) sum_context = self.sum_cnt_context[phrase_group] context_cnt = phrase_values.get(phrase, 0) @@ -167,7 +167,7 @@ def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores=N self.sum_cnt = sum(self.phrase_cnt.values()) self.sum_cnt_context = {} self.global_scores = global_scores - for group, docs in context_doc_groups.items(): + for group, docs in list(context_doc_groups.items()): self.phrase_cnt_context[group], self.phrase_df_context[group] = self.agg_phrase_cnt_df(freq_data, docs) if len(self.phrase_df_context[group]) > 0: 
self.max_df_context[group] = max(self.phrase_df_context[group].values()) @@ -248,7 +248,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): of = open(o_file, 'w+') for cell in cells: - print('[CaseOLAP] Running CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Running CaseOLAP for cell: %s' % cell)) selected_docs = cells[cell] context_doc_groups = copy.copy(cells) @@ -260,7 +260,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): phr_str = ', '.join([ph[0] + '|' + str(ph[1]) for ph in top_phrases if ph[0] in target_phs]) of.write('[%s]\n' % phr_str) - print('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell)) def main_caseolap(link_f, cell_f, token_f, output_f): diff --git a/code/cluster-preprocess.py b/code/cluster-preprocess.py index c547540..44ea0eb 100644 --- a/code/cluster-preprocess.py +++ b/code/cluster-preprocess.py @@ -87,7 +87,7 @@ def gen_doc_keyword_cnt_file(doc_file, keyword_cnt_file): def counter_to_string(counter): elements = [] - for k, v in counter.items(): + for k, v in list(counter.items()): elements.append(k) elements.append(v) return '\t'.join([str(e) for e in elements]) @@ -129,7 +129,7 @@ def main(raw_dir, input_dir, init_dir): # input_dir = '/shared/data/czhang82/projects/local-embedding/sp/input/' # init_dir = '/shared/data/czhang82/projects/local-embedding/sp/init/' if __name__ == '__main__': - corpusName = sys.argv[1] + corpusName = 'dblp' raw_dir = '../data/'+corpusName+'/raw/' input_dir = '../data/'+corpusName+'/input/' init_dir = '../data/'+corpusName+'/init/' diff --git a/code/cluster.py b/code/cluster.py index 9bc25ff..5e89e0f 100644 --- a/code/cluster.py +++ b/code/cluster.py @@ -28,7 +28,7 @@ def fit(self): self.membership = labels self.center_ids = self.gen_center_idx() self.inertia_scores = self.clus.inertia_ - print('Clustering concentration score:', self.inertia_scores) + print(('Clustering concentration score:', self.inertia_scores)) # find the idx of each cluster center def gen_center_idx(self): @@ -58,13 +58,13 @@ def calc_cosine(self, vec_a, vec_b): def run_clustering(full_data, doc_id_file, filter_keyword_file, n_cluster, parent_direcotry, parent_description,\ cluster_keyword_file, hierarchy_file, doc_membership_file): dataset = SubDataSet(full_data, doc_id_file, filter_keyword_file) - print('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) ## TODO: change later here for n_cluster selection from a range clus = Clusterer(dataset.embeddings, n_cluster) clus.fit() - print('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) dataset.write_cluster_members(clus, cluster_keyword_file, parent_direcotry) center_names = dataset.write_cluster_centers(clus, parent_description, hierarchy_file) dataset.write_document_membership(clus, doc_membership_file, parent_direcotry) - print('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description)) return center_names diff --git a/code/compress.py b/code/compress.py index d444ad5..4fff169 100644 --- a/code/compress.py +++ b/code/compress.py @@ -1,7 
+1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -37,12 +37,12 @@ def parse_reidx(reidx_f): if len(pd_map[ph]) > 0: ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph])) - print 'Inverted Index file read.' + print('Inverted Index file read.') def get_rep(folder, c_id, N): - print('Start get representative phrases for %s, %s ========================' % (folder, c_id)) + print(('Start get representative phrases for %s, %s ========================' % (folder, c_id))) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -72,7 +72,7 @@ def get_rep(folder, c_id, N): emb_dist = utils.cossim(embs[ph], embs[cur_label]) ph_scores[ph] = score * emb_dist - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases and ph in kws: @@ -81,7 +81,7 @@ def get_rep(folder, c_id, N): break elif ph_idf == None: - print 'looking at embeddings for %s' % folder + print('looking at embeddings for %s' % folder) ph_f = '%s/embeddings.txt' % par_folder kw_f = '%s/keywords.txt' % par_folder @@ -119,7 +119,7 @@ def get_rep(folder, c_id, N): result_phrases.append(ph) else: # Using TF-IDF to generate - print 'looking at tf-idf for %s' % folder + print('looking at tf-idf for %s' % folder) d_clus_f = '%s/paper_cluster.txt' % par_folder kw_clus_f = '%s/cluster_keywords.txt' % par_folder docs = [] @@ -148,7 +148,7 @@ def get_rep(folder, c_id, N): continue ph_scores[ph] = 1 + math.log(ph_scores[ph]) ph_scores[ph] *= ph_idf[ph] - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases: @@ -161,7 +161,7 @@ def get_rep(folder, c_id, N): def recursion(root, o_file, N): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, '*')) g = open(o_file, 'w+') diff --git a/code/dataset.py b/code/dataset.py index f167919..600adf5 100644 --- a/code/dataset.py +++ b/code/dataset.py @@ -76,7 +76,7 @@ def load_keywords(self, keyword_file, full_data): if keyword in full_data.embeddings: keywords.append(keyword) else: - print(keyword, ' not in the embedding file') + print((keyword, ' not in the embedding file')) return keywords def gen_keyword_id(self): @@ -210,4 +210,4 @@ def assign_document(self, doc_membership): keyword_file = data_dir + 'input/candidates.txt' embedding_file = data_dir + 'input/embeddings.txt' dataset = DataSet(embedding_file, document_file, keyword_file) - print(len(dataset.get_candidate_embeddings())) + print((len(dataset.get_candidate_embeddings()))) diff --git a/code/embedding_deprecated/fetch_papers_relevant.py b/code/embedding_deprecated/fetch_papers_relevant.py index ab5e0cf..d070b1a 100644 --- a/code/embedding_deprecated/fetch_papers_relevant.py +++ b/code/embedding_deprecated/fetch_papers_relevant.py @@ -21,19 +21,19 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): targets, target_set = [term], set([term]) - for i in xrange(N): + for i in range(N): q, s = relevant_dict[term][i][0], relevant_dict[term][i][1] if s > thread and q not in target_set: targets.append(q) target_set.add(q) paper_set = set() - print "Query :", term, len(targets), "---" + print("Query :", term, len(targets), "---") for t in targets: if 
t not in tp_map: - print "(%s)" % t, + print("(%s)" % t, end=' ') continue paper_set = paper_set | tp_map[t] - print "(%s, %d) " % (t, len(paper_set)), + print("(%s, %d) " % (t, len(paper_set)), end=' ') return paper_set @@ -71,15 +71,15 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): for term in query_terms: relevant_dict[term] = [] - print "%s/%s" % (args.relevant, term) + print("%s/%s" % (args.relevant, term)) data = open("%s/%s" % (args.relevant, term)) - print term + print(term) for line in data: values = line.strip().split() relevant_dict[term].append((values[0], float(values[1]))) - print values[0], + print(values[0], end=' ') all_terms.add(values[0]) - print "\n" + print("\n") # read paper-term term_paper_map = dict() @@ -93,13 +93,13 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): if term not in term_paper_map: term_paper_map[term] = set() term_paper_map[term].add(paper) - print "Finish reading paper_term.net" + print("Finish reading paper_term.net") data = open(args.query) for line in data: queries = line.strip().split("#") paper_set = None - for i in xrange(len(queries)): + for i in range(len(queries)): if i == 0: paper_set = fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) @@ -107,9 +107,9 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): paper_set = paper_set | fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) - print "For query :", queries[i], - print "Before :", len(term_paper_map[queries[i]]), - print "After :", len(paper_set) + print("For query :", queries[i], end=' ') + print("Before :", len(term_paper_map[queries[i]]), end=' ') + print("After :", len(paper_set)) path = "%s/%s" % (args.output, "_".join(queries)) diff --git a/code/embedding_deprecated/metrics.py b/code/embedding_deprecated/metrics.py index 8658cc3..07b1761 100644 --- a/code/embedding_deprecated/metrics.py +++ b/code/embedding_deprecated/metrics.py @@ -29,23 +29,23 @@ def precision(candidates, ground_truth): p_5, p_10, p_20 = 0, 0, 0 true_positive = 0 - for i in xrange(5): + for i in range(5): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_5 = float(true_positive) / 5.0 - for i in xrange(5, 10): + for i in range(5, 10): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_10 = float(true_positive) / 10.0 - for i in xrange(10, 20): + for i in range(10, 20): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_20 = float(true_positive) / 20.0 true_positive = 0 - for i in xrange(min(R, C)): + for i in range(min(R, C)): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 @@ -58,7 +58,7 @@ def MAP(candidates, ground_truth): C = len(candidates) R = len(ground_truth) true_positive, precision, total_precision = 0, 0, 0 - for n in xrange(1, min(C, R) + 1): + for n in range(1, min(C, R) + 1): if candidates[n - 1] in ground_truth: true_positive += 1 precision = float(true_positive) / float(n) @@ -68,7 +68,7 @@ def MAP(candidates, ground_truth): def RR(candidates, ground_truth): result = 0 nCandidates = len(candidates) - for n in xrange(nCandidates): + for n in range(nCandidates): if candidates[n] in ground_truth: result = n + 1 break @@ -77,7 +77,7 @@ def RR(candidates, ground_truth): def NDCG(candidates, ground_truth, K = 20): DCG, IDCG = 0, 0 ndcg_5, ndcg_10, ndcg_20 = 0, 0, 0 - for i in xrange(K): + for i in range(K): x = 1.0 / log(i + 2, 2) IDCG += x if i < len(candidates) and candidates[i] in ground_truth: @@ -93,7 +93,7 @@ def 
NDCG(candidates, ground_truth, K = 20): def bpref(candidates, ground_truth): R = len(candidates) false_positive, bpref, k = 0, 0, 0 - for n in xrange(R): + for n in range(R): if candidates[n] not in ground_truth: false_positive += 1 else: @@ -104,11 +104,11 @@ def bpref(candidates, ground_truth): return bpref / float(k) def printResults(name, result): - print name, - print "%.4f" % (np.mean(result)), + print(name, end=' ') + print("%.4f" % (np.mean(result)), end=' ') for res in result: - print "%.4f" % res, - print + print("%.4f" % res, end=' ') + print() if __name__ == "__main__": parser = argparse.ArgumentParser(prog='metrics.py', \ @@ -131,11 +131,11 @@ def printResults(name, result): for line in query_data: query = line.split("\n")[0].replace(" ", "_") queries.append(query) - print "measure overall", " ".join(queries) + print("measure overall", " ".join(queries)) Precision_5, Precision_10, Precision_20, Precision_R, l_MAP, l_MRR, l_bpref = \ - list([[] for i in xrange(7)]) - NDCG_5, NDCG_10, NDCG_20 = list([[] for i in xrange(3)]) + list([[] for i in range(7)]) + NDCG_5, NDCG_10, NDCG_20 = list([[] for i in range(3)]) for query in queries: candidate_file = input_folder + "/" + query + "/author.weights" @@ -145,7 +145,7 @@ def printResults(name, result): for line in data: value = line.split("\n")[0].split() scores[value[0]] = float(value[1]) - scores = sorted(scores.items(), key=lambda x:x[1], reverse=True) + scores = sorted(list(scores.items()), key=lambda x:x[1], reverse=True) candidates = [x[0] for x in scores] diff --git a/code/eval.py b/code/eval.py index bd67933..ff239a5 100644 --- a/code/eval.py +++ b/code/eval.py @@ -13,7 +13,7 @@ def handler(folder): if True: files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'intrusion_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'intrusion_gold.txt') @@ -51,9 +51,9 @@ def handler(folder): # print idx - print '\nIntrusion results!!' + print('\nIntrusion results!!') for method in total: - print '%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method])) # files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'isa_exp' in f] @@ -95,7 +95,7 @@ def handler(folder): # subdomain result files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'subdomain_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'subdomain_gold.txt') @@ -129,9 +129,9 @@ def handler(folder): if value == 'n': incorrect[method] += 1 - print '\nSubdomain results!!' 
+ print('\nSubdomain results!!') for method in total: - print '%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method])) if __name__ == "__main__": diff --git a/code/gen_eval.py b/code/gen_eval.py index 1407708..cdc8926 100644 --- a/code/gen_eval.py +++ b/code/gen_eval.py @@ -59,11 +59,11 @@ def gen_isa_pairs(tax, isa_N, case_N): p_phs = '|'.join(p_node.ph_list[:isa_N]) rmd_phs = '|'.join(rmd_node.ph_list[:isa_N]) if len(p_phs) == 0 or len(rmd_phs) == 0: - print tax - print n_phs - print node.name - print p_node.name - print rmd_node.name + print(tax) + print(n_phs) + print(node.name) + print(p_node.name) + print(rmd_node.name) exit(1) order = random.choice([0, 1]) p_id = 0 @@ -122,7 +122,7 @@ def handler(folder, output, N, isa_N, case_N): subdomain_map = {} for tax_name in taxs: - print tax_name + print(tax_name) # generate intrusion pairs # intru_f = '%s/%s.intrusion' % (output, tax_name) intru_maps[tax_name] = gen_intrusion_pairs(taxs[tax_name], N, case_N) diff --git a/code/labeling.py b/code/labeling.py index f05ef0d..4234e9e 100644 --- a/code/labeling.py +++ b/code/labeling.py @@ -1,13 +1,13 @@ import argparse import utils import operator -import Queue +import queue from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename from case_ranker import read_caseolap_result, rank_phrase def label_emb_centric(folder, c_id): - print 'Start labeling for %s, %s ========================' % (folder, c_id) + print('Start labeling for %s, %s ========================' % (folder, c_id)) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -18,7 +18,7 @@ def label_emb_centric(folder, c_id): # generate word2vec phrases embs = utils.load_embeddings(emb_f) if cur_label not in embs: - print 'Error!!!' 
+ print('Error!!!') exit(1) N = 100 worst = -100 @@ -58,8 +58,8 @@ def label_emb_centric(folder, c_id): label_cands[ph] = score - ranked_list = sorted(label_cands.items(), key=operator.itemgetter(1), reverse=True) - print ranked_list + ranked_list = sorted(list(label_cands.items()), key=operator.itemgetter(1), reverse=True) + print(ranked_list) return ranked_list[0][0] @@ -129,7 +129,7 @@ def label_emb_centric(folder, c_id): def recursion(root): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1)) label_map = {} @@ -149,10 +149,10 @@ def recursion(root): l = label_emb_centric(c_folder, str(c_id)) cur_label = basename(c_folder) label_map[cur_label] = l - print 'label for %s is: %s\n' % (c_folder, l) + print('label for %s is: %s\n' % (c_folder, l)) except: - for (o_l, l) in label_map.items(): - print '%s ==> %s' % (o_l, l) + for (o_l, l) in list(label_map.items()): + print('%s ==> %s' % (o_l, l)) if __name__ == "__main__": diff --git a/code/local_embedding_training.py b/code/local_embedding_training.py index 7d6fcca..dcab154 100644 --- a/code/local_embedding_training.py +++ b/code/local_embedding_training.py @@ -10,7 +10,7 @@ import os def read_files(folder, parent): - print("[Local-embedding] Reading file:", parent) + print(("[Local-embedding] Reading file:", parent)) emb_file = '%s/embeddings.txt' % folder hier_file = '%s/hierarchy.txt' % folder keyword_file = '%s/keywords.txt' % folder ## here only consider those remaining keywords @@ -106,7 +106,7 @@ def run_word2vec(pd_map, docs, cates, folder): for ph in cates[cate]: c_docs = c_docs.union(pd_map[ph]) - print('Starting cell %s with %d docs.' % (cate, len(c_docs))) + print(('Starting cell %s with %d docs.' % (cate, len(c_docs)))) # save file # sub_folder = '%s/%s' % (folder, cate) diff --git a/code/main.py b/code/main.py index b7a0ff7..e464c02 100644 --- a/code/main.py +++ b/code/main.py @@ -49,20 +49,22 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ n_expand, level, caseolap=True, local_embedding=True): if level > MAX_LEVEL: return - print('============================= Running level ', level, ' and node ', parent, '=============================') + print(('============================= Running level ', level, ' and node ', parent, '=============================')) start = time.time() df = DataFiles(input_dir, node_dir) ## TODO: Everytime we need to read-in the whole corpus, which can be slow. 
full_data = DataSet(df.embedding_file, df.doc_file) end = time.time() - print('[Main] Done reading the full data using time %s seconds' % (end-start)) + print(('[Main] Done reading the full data using time %s seconds' % (end-start))) # filter the keywords if caseolap is False: try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent, \ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) + print(e) print('Clustering not finished.') return copyfile(df.seed_keyword_file, df.filtered_keyword_file) @@ -74,7 +76,8 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,\ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) print('Clustering not finished.') return @@ -82,7 +85,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file) main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre) end = time.time() - print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start)) + print(("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))) # prepare the embedding for child level if level < MAX_LEVEL: @@ -96,7 +99,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ start = time.time() main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand) end = time.time() - print("[Main] Finish running local embedding training using %s (seconds)" % (end - start)) + print(("[Main] Finish running local embedding training using %s (seconds)" % (end - start))) for child in children: recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter, \ diff --git a/code/paras.py b/code/paras.py index 027ae53..f29e54b 100644 --- a/code/paras.py +++ b/code/paras.py @@ -99,14 +99,14 @@ def load_sp_params(): def load_dblp_params_method(): pd = dict() - pd['data_dir'] = '/shared/data/jiaming/local-embedding/data/dblp/' + pd['data_dir'] = '/home/sasce/PycharmProjects/taxogen/data/dblp/' pd['doc_file'] = pd['data_dir'] + 'input/papers.txt' pd['doc_keyword_cnt_file'] = pd['data_dir'] + 'input/keyword_cnt.txt' pd['input_dir'] = pd['data_dir'] + 'input/' pd['root_node_dir'] = pd['data_dir'] + 'cluster/' pd['n_cluster'] = 5 - pd['filter_thre'] = 0.25 - pd['n_expand'] = 100 - pd['n_cluster_iter'] = 2 + pd['filter_thre'] = 0.15 + pd['n_expand'] = 200 + pd['n_cluster_iter'] = 5 return pd diff --git a/code/postprocess/redundancy.py b/code/postprocess/redundancy.py index 256810c..af505ea 100644 --- a/code/postprocess/redundancy.py +++ b/code/postprocess/redundancy.py @@ -31,7 +31,7 @@ def load_taxonomy(tax_file): def compute_pmi(inverted_index, tax): values = [] - for node, keywords in tax.items(): + for node, keywords in list(tax.items()): pairs = set(frozenset([i, j]) for i in keywords for j in keywords if i != j) for p in pairs: pair = list(p) @@ -54,7 +54,7 @@ def main(idx_file, taxonomy_dir, output_file): inverted_index = load_inverted_index(idx_file) # print len(inverted_index) taxonomy_files = get_tax_filenames(taxonomy_dir) - print taxonomy_files + print(taxonomy_files) with open(output_file, 'a') as fout: for tax_file in taxonomy_files: tax = load_taxonomy(tax_file) diff --git 
a/code/postprocess/visualize.py b/code/postprocess/visualize.py index 87ed098..f63f99a 100644 --- a/code/postprocess/visualize.py +++ b/code/postprocess/visualize.py @@ -13,7 +13,7 @@ def load_nodes(node_file, min_level=1, max_level=3, prefix_list=['*']): node_content = [] nodes[node_id] = node_content prune_nodes = {} - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): # prune nodes if not has_one_prefix(node_id, prefix_list): continue @@ -43,11 +43,11 @@ def is_exact_prefix(s, prefix): def gen_edges(nodes): - node_ids = nodes.keys() + node_ids = list(nodes.keys()) node_ids.sort(key=lambda x: len(x)) edges = [] - for i in xrange(len(nodes) - 1): - for j in xrange(i+1, len(nodes)): + for i in range(len(nodes) - 1): + for j in range(i+1, len(nodes)): if is_parent(node_ids[i], node_ids[j]): edges.append([node_ids[i], node_ids[j]]) return edges @@ -75,7 +75,7 @@ def gen_node_label(node_id, node_content): def draw(nodes, edges, output_file): d = Digraph(node_attr={'shape': 'record'}) - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): d.node(node_id, gen_node_label(node_id, node_content)) for e in edges: d.edge(e[0], e[1]) diff --git a/code/preprocess.py b/code/preprocess.py index 10446bc..8499e5e 100644 --- a/code/preprocess.py +++ b/code/preprocess.py @@ -17,7 +17,7 @@ def get_candidates(folder, o_file): phrase = line.split(' ')[0] quality_phrases.add(phrase) - print('Quality phrase count: ' + str(len(quality_phrases))) + print(('Quality phrase count: ' + str(len(quality_phrases)))) with open(o_file, 'w+') as g: for phrase in quality_phrases: @@ -49,7 +49,7 @@ def get_reidx_file(text, cand_f, o_file): pd_map[t].add(str(idx)) idx += 1 if idx % 10000 == 0: - print("[Construct Inverted Index] Parse %s documents" % idx) + print(("[Construct Inverted Index] Parse %s documents" % idx)) with open(o_file, 'w+') as g: for ph in pd_map: diff --git a/code/preprocess/AutoPhraseOutput.py b/code/preprocess/AutoPhraseOutput.py index cf7b8e0..3b72141 100644 --- a/code/preprocess/AutoPhraseOutput.py +++ b/code/preprocess/AutoPhraseOutput.py @@ -67,7 +67,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -88,7 +88,7 @@ def load_phrase_to_pos_sequence(self, input_path=""): def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -103,8 +103,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -113,15 +113,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score 
= self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/SegPhraseOutput.py b/code/preprocess/SegPhraseOutput.py index d869a0f..ccd9a68 100644 --- a/code/preprocess/SegPhraseOutput.py +++ b/code/preprocess/SegPhraseOutput.py @@ -65,7 +65,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -83,11 +83,11 @@ def load_phrase_to_pos_sequence(self, input_path=""): phrase = seg[0] pos_sequence = eval(seg[1]) self.phrase_to_pos_sequence[phrase] = pos_sequence - print("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence)) + print(("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence))) def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -102,8 +102,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -112,15 +112,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score = self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/main.py b/code/preprocess/main.py index b19a78c..8431e81 100644 --- a/code/preprocess/main.py +++ b/code/preprocess/main.py @@ -3,9 +3,9 @@ __description__: Based on AutoPhrase output, generate 1) papers.txt, 2) keywords.txt __latest_update__: Auguest 2, 2017 ''' -from config import * -from AutoPhraseOutput import AutoPhraseOutput -from SegPhraseOutput import SegPhraseOutput +from .config import * +from .AutoPhraseOutput import AutoPhraseOutput +from .SegPhraseOutput import SegPhraseOutput import re from pattern.en import parsetree from pattern.en import parse diff --git a/code/run.sh b/code/run.sh old mode 100644 new mode 100755 index 1bcf2ff..8f0b7db --- a/code/run.sh +++ b/code/run.sh @@ -2,14 +2,16 @@ ## Name of the input corpus corpusName=dblp ## Name of the taxonomy -taxonName=our-l3-0.25 
+taxonName=our-l3-0.15
 ## If need preprocessing from raw input, set it to be 1, otherwise, set 0
-FIRST_RUN=${FIRST_RUN:- 0}
+FIRST_RUN=${FIRST_RUN:- 1}
+
+source '/home/sasce/.cache/pypoetry/virtualenvs/taxogen-Q1ywnX6T-py3.8/bin/activate'
 
 if [ $FIRST_RUN -eq 1 ]; then
     echo 'Start data preprocessing'
     ## compile word2vec for embedding learning
-    gcc word2vec.c -o word2veec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result
+    gcc word2vec.c -o word2vec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result
 
     ## create initial folder if not exist
     if [ ! -d ../data/$corpusName/init ]; then
diff --git a/code/sim_emb.py b/code/sim_emb.py
index fc5ff17..0367eef 100644
--- a/code/sim_emb.py
+++ b/code/sim_emb.py
@@ -1,7 +1,7 @@
 import argparse
 import utils
 import operator
-import Queue
+import queue
 import math
 from os import listdir
 from os.path import isfile, join, isdir, abspath, dirname, basename, exists
@@ -28,7 +28,7 @@ def compare(a_f, b_f):
     b_emb_f = '%s/%s' % (b_f, emb_f)
 
     if not exists(a_emb_f) or not exists(b_emb_f):
-        print 'Embedding file not found'
+        print('Embedding file not found')
         exit(1)
 
     embs_a = load_embeddings(a_emb_f)
@@ -37,9 +37,9 @@ def compare(a_f, b_f):
     embs_groups = [embs_a, embs_b]
 
     while 1:
-        n_name = raw_input("Enter your node: ")
+        n_name = input("Enter your node: ")
         if n_name not in embs_a or n_name not in embs_b:
-            print '%s not found' % n_name
+            print('%s not found' % n_name)
 
         for embs in embs_groups:
             t_emb = embs[n_name]
@@ -47,11 +47,11 @@ def compare(a_f, b_f):
         for key in embs:
             sim_map[key] = utils.cossim(t_emb, embs[key])
 
-        sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
+        sim_map = sorted(list(sim_map.items()), key=operator.itemgetter(1), reverse=True)
 
         output_str = '\n'.join([sim_map[i][0] + '\t' + str(sim_map[i][1]) for i in range(10)])
         # print sim_map[:10]
-        print output_str
-        print 'group finished\n'
+        print(output_str)
+        print('group finished\n')
diff --git a/code/spherecluster/__init__.py b/code/spherecluster/__init__.py
new file mode 100644
index 0000000..5a1e5aa
--- /dev/null
+++ b/code/spherecluster/__init__.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import
+from .spherical_kmeans import SphericalKMeans
+from .von_mises_fisher_mixture import VonMisesFisherMixture
+from .util import sample_vMF
+"""
+@author = 'Jason Laska',
+@author_email = 'jason@claralabs.com',
+@url = 'https://github.com/jasonlaska/spherecluster/'
+@edited = 'https://github.com/rfayat/spherecluster'
+"""
+
+__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"]
diff --git a/code/spherecluster/spherical_kmeans.py b/code/spherecluster/spherical_kmeans.py
new file mode 100644
index 0000000..882eba1
--- /dev/null
+++ b/code/spherecluster/spherical_kmeans.py
@@ -0,0 +1,424 @@
+import warnings
+
+import numpy as np
+import scipy.sparse as sp
+from joblib import Parallel, delayed
+
+from sklearn.cluster import KMeans
+
+from sklearn.cluster import _k_means_fast as _k_means
+from sklearn.cluster import _k_means_lloyd
+from sklearn.cluster._kmeans import (
+    _check_sample_weight,
+    _labels_inertia,
+    _tolerance,
+)
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array, check_random_state
+from sklearn.utils.extmath import row_norms, squared_norm
+from sklearn.utils.validation import _num_samples
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+
+
+def _spherical_kmeans_single_lloyd(
+    X,
+    n_clusters,
+    centers_init,
+    sample_weight=None,
+    max_iter=300,
+    init="k-means++",
+
verbose=False, + x_squared_norms=None, + random_state=None, + tol=1e-4, + n_threads=1, +): + """ + Modified from sklearn.cluster.k_means_.k_means_single_lloyd. + """ + random_state = check_random_state(random_state) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + best_labels, best_inertia, best_centers = None, None, None + + # Allocate memory to store the distances for each sample to its + # closer center for reallocation in case of ties + distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + + # Buffers to avoid new allocations at each iteration. + centers = centers_init + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + labels_old = labels.copy() + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_sparse + _inertia = _k_means._inertia_sparse + else: + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_dense + _inertia = _k_means._inertia_dense + + # iterations + for i in range(max_iter): + centers_old = centers.copy() + + # labels assignment + # TODO: _labels_inertia should be done with cosine distance + # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized + # this doesn't really matter. + labels, inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + centers, + ) + lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new, + weight_in_clusters, labels, center_shift, n_threads) + + centers, centers_new = centers_new, centers + # l2-normalize centers (this is the main contibution here) + centers = normalize(centers) + + if verbose: + print("Iteration %2d, inertia %.3f" % (i, inertia)) + + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + + center_shift_total = squared_norm(centers_old - centers) + if center_shift_total <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (i, center_shift_total, tol) + ) + break + + if center_shift_total > 0: + # rerun E-step in case of non-convergence so that predicted labels + # match cluster centers + best_labels, best_inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + best_centers, + ) + + return best_labels, best_inertia, best_centers, i + 1 + + +def spherical_k_means( + X, + n_clusters, + centers_init, + sample_weight=None, + init="k-means++", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + n_jobs=1, + algorithm="auto", + return_n_iter=False, +): + """Modified from sklearn.cluster.k_means_.k_means. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + # avoid forcing order when copy_x=False + order = "C" if copy_x else None + X = check_array( + X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x + ) + # verify that the number of samples given is larger than k + if _num_samples(X) < n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) + ) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). + for it in range(n_init): + # run a k-means once + labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( + X, + n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + random_state=random_state, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_spherical_kmeans_single_lloyd)( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if return_n_iter: + return best_centers, best_labels, best_inertia, best_n_iter + else: + return best_centers, best_labels, best_inertia + + +class SphericalKMeans(KMeans): + """Spherical K-Means clustering + + Modfication of sklearn.cluster.KMeans where cluster centers are normalized + (projected onto the sphere) in each iteration. + + Parameters + ---------- + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init : {'k-means++', 'random' or an ndarray} + Method for initialization, defaults to 'k-means++': + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + 'random': choose k observations (rows) at random from data for + the initial centroids. 
+ If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + """ + + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + n_jobs=1, + verbose=0, + random_state=None, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.n_jobs = n_jobs + self.normalize = normalize + + def _check_params(self, X): + # n_jobs + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 1.0 (renaming of 0.25).", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + + # n_init + if self.n_init <= 0: + raise ValueError( + f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + + # max_iter + if self.max_iter <= 0: + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead.") + + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={self.n_clusters}.") + + # tol + self._tol = _tolerance(X, self.tol) + + # init + if not (hasattr(self.init, '__array__') or callable(self.init) + or (isinstance(self.init, str) + and self.init in ["k-means++", "random"])): + raise ValueError( + f"init should be either 'k-means++', 'random', a ndarray or a " + f"callable, got '{self.init}' instead.") + + if hasattr(self.init, '__array__') and self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) + self._n_init = 1 + + + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + + X : array-like or sparse matrix, shape=(n_samples, n_features) + + y : Ignored + not used, present here for API consistency by convention. 
+ + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. If None, all observations + are assigned equal weight (default: None) + """ + if self.normalize: + X = normalize(X) + self._check_params(X) + + random_state = check_random_state(self.random_state) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + self._validate_center_shape(X, init) + + # TODO: add check that all data is unit-normalized + x_squared_norms = row_norms(X, squared=True) + centers_init = self._init_centroids( + X, + init=self.init, + random_state=random_state, + x_squared_norms=x_squared_norms + ) + + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( + X, + n_clusters=self.n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + return_n_iter=True, + ) + + return self diff --git a/code/spherecluster/util.py b/code/spherecluster/util.py new file mode 100644 index 0000000..bb68261 --- /dev/null +++ b/code/spherecluster/util.py @@ -0,0 +1,57 @@ +""" +Generate multivariate von Mises Fisher samples. + +This solution originally appears here: +http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python + +Also see: + +Sampling from vMF on S^2: + https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf + http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf +""" +import numpy as np + + +def sample_vMF(mu, kappa, num_samples): + """Generate num_samples N-dimensional samples from von Mises Fisher + distribution around center mu \in R^N with concentration kappa. + """ + dim = len(mu) + result = np.zeros((num_samples, dim)) + for nn in range(num_samples): + # sample offset from center (on sphere) with spread kappa + w = _sample_weight(kappa, dim) + + # sample a point v on the unit sphere that's orthogonal to mu + v = _sample_orthonormal_to(mu) + + # compute new point + result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu + + return result + + +def _sample_weight(kappa, dim): + """Rejection sampling scheme for sampling distance from center on + surface of the sphere. + """ + dim = dim - 1 # since S^{n-1} + b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) + x = (1. - b) / (1. + b) + c = kappa * x + dim * np.log(1 - x ** 2) + + while True: + z = np.random.beta(dim / 2., dim / 2.) + w = (1. - (1. + b) * z) / (1. - (1. - b) * z) + u = np.random.uniform(low=0, high=1) + if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): + return w + + +def _sample_orthonormal_to(mu): + """Sample point on sphere orthogonal to mu.""" + v = np.random.randn(mu.shape[0]) + proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) + orthto = v - proj_mu_v + return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/von_mises_fisher_mixture.py new file mode 100644 index 0000000..7193f35 --- /dev/null +++ b/code/spherecluster/von_mises_fisher_mixture.py @@ -0,0 +1,946 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed +from numpy import i0 # modified Bessel function of first kind order 0, I_0 +from scipy.special import iv # modified Bessel function of first kind, I_v +from scipy.special import logsumexp + + +from sklearn.cluster import KMeans +from sklearn.cluster._kmeans import _tolerance, _kmeans_plusplus +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state, as_float_array +from sklearn.utils.extmath import squared_norm +from sklearn.utils.validation import FLOAT_DTYPES +from sklearn.utils.validation import check_is_fitted +from . import spherical_kmeans +# _init_centroids + +MAX_CONTENTRATION = 1e10 + + +def _inertia_from_labels(X, centers, labels): + """Compute inertia with cosine distance using known labels. + """ + n_examples, n_features = X.shape + inertia = np.zeros((n_examples,)) + for ee in range(n_examples): + inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) + + return np.sum(inertia) + + +def _labels_inertia(X, centers): + """Compute labels and inertia with cosine distance. + """ + n_examples, n_features = X.shape + n_clusters, n_features = centers.shape + + labels = np.zeros((n_examples,)) + inertia = np.zeros((n_examples,)) + + for ee in range(n_examples): + dists = np.zeros((n_clusters,)) + for cc in range(n_clusters): + dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) + + labels[ee] = np.argmin(dists) + inertia[ee] = dists[int(labels[ee])] + + return labels, np.sum(inertia) + + +def _vmf_log(X, kappa, mu): + """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + n_examples, n_features = X.shape + return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) + + +def _vmf_normalize(kappa, dim): + """Compute normalization constant using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + num = np.power(kappa, dim / 2.0 - 1.0) + + if dim / 2.0 - 1.0 < 1e-15: + denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) + else: + denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) + + if np.isinf(num): + raise ValueError("VMF scaling numerator was inf.") + + if np.isinf(denom): + raise ValueError("VMF scaling denominator was inf.") + + if np.abs(denom) < 1e-15: + raise ValueError("VMF scaling denominator was 0.") + + return num / denom + + +def _log_H_asymptotic(nu, kappa): + """Compute the Amos-type upper bound asymptotic approximation on H where + log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
+ + See "lH_asymptotic <-" in movMF.R and utility function implementation notes + from https://cran.r-project.org/web/packages/movMF/index.html + """ + beta = np.sqrt((nu + 0.5) ** 2) + kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) + return _S(kappa, nu + 0.5, beta) + ( + _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) + ) + + +def _S(kappa, alpha, beta): + """Compute the antiderivative of the Amos-type bound G on the modified + Bessel function ratio. + + Note: Handles scalar kappa, alpha, and beta only. + + See "S <-" in movMF.R and utility function implementation notes from + https://cran.r-project.org/web/packages/movMF/index.html + """ + kappa = 1.0 * np.abs(kappa) + alpha = 1.0 * alpha + beta = 1.0 * np.abs(beta) + a_plus_b = alpha + beta + u = np.sqrt(kappa ** 2 + beta ** 2) + if alpha == 0: + alpha_scale = 0 + else: + alpha_scale = alpha * np.log((alpha + u) / a_plus_b) + + return u - beta - alpha_scale + + +def _vmf_log_asymptotic(X, kappa, mu): + """Compute log(f(x|theta)) via Amos approximation + + log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) + + where theta = kappa * X, \|theta\| = kappa. + + Computing _vmf_log helps with numerical stability / loss of precision for + for large values of kappa and n_features. + + See utility function implementation notes in movMF.R from + https://cran.r-project.org/web/packages/movMF/index.html + """ + n_examples, n_features = X.shape + log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) + + return log_vfm + + +def _log_likelihood(X, centers, weights, concentrations): + if len(np.shape(X)) != 2: + X = X.reshape((1, len(X))) + + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + return posterior + + +def _init_unit_centers(X, n_clusters, random_state, init): + """Initializes unit norm centers. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + init: (string) one of + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
+    """
+    n_examples, n_features = np.shape(X)
+    if isinstance(init, np.ndarray):
+        n_init_clusters, n_init_features = init.shape
+        assert n_init_clusters == n_clusters
+        assert n_init_features == n_features
+
+        # ensure unit normed centers
+        centers = init
+        for cc in range(n_clusters):
+            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])
+
+        return centers
+
+    elif init == "spherical-k-means":
+        labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd(
+            X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++"
+        )
+
+        return centers
+
+    elif init == "random":
+        centers = np.random.randn(n_clusters, n_features)
+        for cc in range(n_clusters):
+            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])
+
+        return centers
+
+    elif init == "k-means++":
+        centers, _ = _kmeans_plusplus(X, n_clusters,
+                                      random_state=random_state,
+                                      x_squared_norms=np.ones((n_examples,)))
+
+        for cc in range(n_clusters):
+            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])
+
+        return centers
+
+    elif init == "random-orthonormal":
+        centers = np.random.randn(n_clusters, n_features)
+        q, r = np.linalg.qr(centers.T, mode="reduced")
+
+        return q.T
+
+    elif init == "random-class":
+        centers = np.zeros((n_clusters, n_features))
+        for cc in range(n_clusters):
+            while np.linalg.norm(centers[cc, :]) == 0:
+                labels = np.random.randint(0, n_clusters, n_examples)
+                centers[cc, :] = X[labels == cc, :].sum(axis=0)
+
+        for cc in range(n_clusters):
+            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])
+
+        return centers
+
+
+def _expectation(X, centers, weights, concentrations, posterior_type="soft"):
+    """Compute the log-likelihood of each datapoint being in each cluster.
+
+    Parameters
+    ----------
+    centers (mu) : array, [n_centers x n_features]
+    weights (alpha) : array, [n_centers, ] (alpha)
+    concentrations (kappa) : array, [n_centers, ]
+
+    Returns
+    ----------
+    posterior : array, [n_centers, n_examples]
+    """
+    n_examples, n_features = np.shape(X)
+    n_clusters, _ = centers.shape
+
+    if n_features <= 50:  # works up to about 50 before numerically unstable
+        vmf_f = _vmf_log
+    else:
+        vmf_f = _vmf_log_asymptotic
+
+    f_log = np.zeros((n_clusters, n_examples))
+    for cc in range(n_clusters):
+        f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :])
+
+    posterior = np.zeros((n_clusters, n_examples))
+    if posterior_type == "soft":
+        weights_log = np.log(weights)
+        posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log
+        for ee in range(n_examples):
+            posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee]))
+
+    elif posterior_type == "hard":
+        weights_log = np.log(weights)
+        weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log
+        for ee in range(n_examples):
+            posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0
+
+    return posterior
+
+
+def _maximization(X, posterior, force_weights=None):
+    """Estimate new centers, weights, and concentrations from the posterior.
+
+    Parameters
+    ----------
+    posterior : array, [n_centers, n_examples]
+        The posterior matrix from the expectation step.
+
+    force_weights : None or array, [n_centers, ]
+        If None is passed, will estimate weights.
+        If an array is passed, will use instead of estimating.
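+
+    Note: concentrations (kappa) are estimated with the Banerjee et al. (2005)
+    approximation kappa ~= (rbar * d - rbar**3) / (1 - rbar**2), where rbar is
+    the mean resultant length of the posterior-weighted data.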
+
+    Returns
+    ----------
+    centers (mu) : array, [n_centers x n_features]
+    weights (alpha) : array, [n_centers, ] (alpha)
+    concentrations (kappa) : array, [n_centers, ]
+    """
+    n_examples, n_features = X.shape
+    n_clusters, n_examples = posterior.shape
+    concentrations = np.zeros((n_clusters,))
+    centers = np.zeros((n_clusters, n_features))
+    if force_weights is None:
+        weights = np.zeros((n_clusters,))
+
+    for cc in range(n_clusters):
+        # update weights (alpha)
+        if force_weights is None:
+            weights[cc] = np.mean(posterior[cc, :])
+        else:
+            weights = force_weights
+
+        # update centers (mu)
+        X_scaled = X.copy()
+        if sp.issparse(X):
+            X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr))
+        else:
+            for ee in range(n_examples):
+                X_scaled[ee, :] *= posterior[cc, ee]
+
+        centers[cc, :] = X_scaled.sum(axis=0)
+
+        # normalize centers
+        center_norm = np.linalg.norm(centers[cc, :])
+        if center_norm > 1e-8:
+            centers[cc, :] = centers[cc, :] / center_norm
+
+        # update concentration (kappa) [TODO: add other kappa approximations]
+        rbar = center_norm / (n_examples * weights[cc])
+        concentrations[cc] = rbar * n_features - np.power(rbar, 3.0)
+        if np.abs(rbar - 1.0) < 1e-10:
+            concentrations[cc] = MAX_CONTENTRATION
+        else:
+            concentrations[cc] /= 1.0 - np.power(rbar, 2.0)
+
+        # let python know we can free this (good for large dense X)
+        del X_scaled
+
+    return centers, weights, concentrations
+
+
+def _movMF(
+    X,
+    n_clusters,
+    posterior_type="soft",
+    force_weights=None,
+    max_iter=300,
+    verbose=False,
+    init="random-class",
+    random_state=None,
+    tol=1e-6,
+):
+    """Mixture of von Mises Fisher clustering.
+
+    Implements the algorithms (i) and (ii) from
+
+    "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions"
+    by Banerjee, Dhillon, Ghosh, and Sra.
+
+    TODO: Currently only supports Banerjee et al 2005 approximation of kappa;
+    however, there are numerous other approximations, see _update_params.
+
+    Attribution
+    ----------
+    Approximation of log-vmf distribution function from movMF R-package.
+
+    movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions
+    by Kurt Hornik, Bettina Grun, 2014
+
+    Find more at:
+        https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf
+        https://cran.r-project.org/web/packages/movMF/index.html
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of
+        centroids to generate.
+
+    posterior_type: 'soft' or 'hard'
+        Type of posterior computed in expectation step.
+        See note about attribute: self.posterior_
+
+    force_weights : None or array [n_clusters, ]
+        If None, the algorithm will estimate the weights.
+        If an array of weights, algorithm will estimate concentrations and
+        centers with given weights.
+
+    max_iter : int, default: 300
+        Maximum number of iterations of the k-means algorithm for a
+        single run.
+
+    n_init : int, default: 10
+        Number of times the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of
+        n_init consecutive runs in terms of inertia.
+
+    init: (string) one of
+        random-class [default]: random class assignment & centroid computation
+        k-means++ : uses sklearn k-means++ initialization algorithm
+        spherical-k-means : use centroids from one pass of spherical k-means
+        random : random unit norm vectors
+        random-orthonormal : random orthonormal vectors
+        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+    tol : float, default: 1e-6
+        Relative tolerance with regards to inertia to declare convergence
+
+    n_jobs : int
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+        If -1 all CPUs are used. If 1 is given, no parallel computing code is
+        used at all, which is useful for debugging. For n_jobs below -1,
+        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
+        are used.
+
+    random_state : integer or numpy.RandomState, optional
+        The generator used to initialize the centers. If an integer is
+        given, it fixes the seed. Defaults to the global numpy random
+        number generator.
+
+    verbose : int, default 0
+        Verbosity mode.
+
+    copy_x : boolean, default True
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
+    """
+    random_state = check_random_state(random_state)
+    n_examples, n_features = np.shape(X)
+
+    # init centers (mus)
+    centers = _init_unit_centers(X, n_clusters, random_state, init)
+
+    # init weights (alphas)
+    if force_weights is None:
+        weights = np.ones((n_clusters,))
+        weights = weights / np.sum(weights)
+    else:
+        weights = force_weights
+
+    # init concentrations (kappas)
+    concentrations = np.ones((n_clusters,))
+
+    if verbose:
+        print("Initialization complete")
+
+    for iter in range(max_iter):
+        centers_prev = centers.copy()
+
+        # expectation step
+        posterior = _expectation(
+            X, centers, weights, concentrations, posterior_type=posterior_type
+        )
+
+        # maximization step
+        centers, weights, concentrations = _maximization(
+            X, posterior, force_weights=force_weights
+        )
+
+        # check convergence
+        tolcheck = squared_norm(centers_prev - centers)
+        if tolcheck <= tol:
+            if verbose:
+                print(
+                    "Converged at iteration %d: "
+                    "center shift %e within tolerance %e" % (iter, tolcheck, tol)
+                )
+            break
+
+    # labels come for free via posterior
+    labels = np.zeros((n_examples,))
+    for ee in range(n_examples):
+        labels[ee] = np.argmax(posterior[:, ee])
+
+    inertia = _inertia_from_labels(X, centers, labels)
+
+    return centers, weights, concentrations, posterior, labels, inertia
+
+
+def movMF(
+    X,
+    n_clusters,
+    posterior_type="soft",
+    force_weights=None,
+    n_init=10,
+    n_jobs=1,
+    max_iter=300,
+    verbose=False,
+    init="random-class",
+    random_state=None,
+    tol=1e-6,
+    copy_x=True,
+):
+    """Wrapper for parallelization of _movMF and running n_init times.
+    """
+    if n_init <= 0:
+        raise ValueError(
+            "Invalid number of initializations."
+            " n_init=%d must be bigger than zero." % n_init
+        )
+    random_state = check_random_state(random_state)
+
+    if max_iter <= 0:
+        raise ValueError(
+            "Number of iterations should be a positive number,"
+            " got %d instead" % max_iter
+        )
+
+    best_inertia = np.infty
+    X = as_float_array(X, copy=copy_x)
+    tol = _tolerance(X, tol)
+
+    if hasattr(init, "__array__"):
+        if n_init != 1:
+            warnings.warn(
+                "Explicit initial center position passed: "
+                "performing only one init in k-means instead of n_init=%d" % n_init,
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            n_init = 1
+
+    # defaults
+    best_centers = None
+    best_labels = None
+    best_weights = None
+    best_concentrations = None
+    best_posterior = None
+    best_inertia = None
+
+    if n_jobs == 1:
+        # For a single thread, less memory is needed if we just store one set
+        # of the best results (as opposed to one set per run per thread).
+        for it in range(n_init):
+            # cluster on the sphere
+            (centers, weights, concentrations, posterior, labels, inertia) = _movMF(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=random_state,
+                tol=tol,
+            )
+
+            # determine if these results are the best so far
+            if best_inertia is None or inertia < best_inertia:
+                best_centers = centers.copy()
+                best_labels = labels.copy()
+                best_weights = weights.copy()
+                best_concentrations = concentrations.copy()
+                best_posterior = posterior.copy()
+                best_inertia = inertia
+    else:
+        # parallelisation of movMF runs
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        results = Parallel(n_jobs=n_jobs, verbose=0)(
+            delayed(_movMF)(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=seed,
+                tol=tol,
+            )
+            for seed in seeds
+        )
+
+        # Get results with the lowest inertia
+        centers, weights, concentrations, posteriors, labels, inertia = zip(*results)
+        best = np.argmin(inertia)
+        best_labels = labels[best]
+        best_inertia = inertia[best]
+        best_centers = centers[best]
+        best_concentrations = concentrations[best]
+        best_posterior = posteriors[best]
+        best_weights = weights[best]
+
+    return (
+        best_centers,
+        best_labels,
+        best_inertia,
+        best_weights,
+        best_concentrations,
+        best_posterior,
+    )
+
+
+class VonMisesFisherMixture(KMeans):
+    """Estimator for Mixture of von Mises Fisher clustering on the unit sphere.
+
+    Implements the algorithms (i) and (ii) from
+
+    "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions"
+    by Banerjee, Dhillon, Ghosh, and Sra.
+
+    TODO: Currently only supports Banerjee et al 2005 approximation of kappa;
+    however, there are numerous other approximations, see _update_params.
+
+    Attribution
+    ----------
+    Approximation of log-vmf distribution function from movMF R-package.
+
+    movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions
+    by Kurt Hornik, Bettina Grun, 2014
+
+    Find more at:
+        https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf
+        https://cran.r-project.org/web/packages/movMF/index.html
+
+    Basic sklearn scaffolding from sklearn.cluster.KMeans.
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 5
+        The number of clusters to form as well as the number of
+        centroids to generate.
+
+    posterior_type: 'soft' or 'hard'
+        Type of posterior computed in expectation step.
+        See note about attribute: self.posterior_
+
+    force_weights : None or array [n_clusters, ]
+        If None, the algorithm will estimate the weights.
+        If an array of weights, algorithm will estimate concentrations and
+        centers with given weights.
+
+    max_iter : int, default: 300
+        Maximum number of iterations of the k-means algorithm for a
+        single run.
+
+    n_init : int, default: 10
+        Number of times the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of
+        n_init consecutive runs in terms of inertia.
+
+    init: (string) one of
+        random-class [default]: random class assignment & centroid computation
+        k-means++ : uses sklearn k-means++ initialization algorithm
+        spherical-k-means : use centroids from one pass of spherical k-means
+        random : random unit norm vectors
+        random-orthonormal : random orthonormal vectors
+        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+    tol : float, default: 1e-6
+        Relative tolerance with regards to inertia to declare convergence
+
+    n_jobs : int
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+        If -1 all CPUs are used. If 1 is given, no parallel computing code is
+        used at all, which is useful for debugging. For n_jobs below -1,
+        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
+        are used.
+
+    random_state : integer or numpy.RandomState, optional
+        The generator used to initialize the centers. If an integer is
+        given, it fixes the seed. Defaults to the global numpy random
+        number generator.
+
+    verbose : int, default 0
+        Verbosity mode.
+
+    copy_x : boolean, default True
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
+
+    normalize : boolean, default True
+        Normalize the input to have unit norm.
+
+    Attributes
+    ----------
+
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers
+
+    labels_ :
+        Labels of each point
+
+    inertia_ : float
+        Sum of distances of samples to their closest cluster center.
+
+    weights_ : array, [n_clusters,]
+        Weights of each cluster in vMF distribution (alpha).
+
+    concentrations_ : array [n_clusters,]
+        Concentration parameter for each cluster (kappa).
+        Larger values correspond to more concentrated clusters.
+
+    posterior_ : array, [n_clusters, n_examples]
+        Each column corresponds to the posterior distribution for an example.
+
+        If posterior_type='hard' is used, there will only be one non-zero per
+        column, its index corresponding to the example's cluster label.
+
+        If posterior_type='soft' is used, this matrix will be dense and the
+        column values correspond to soft clustering weights.
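+
+    Examples
+    ----------
+    A minimal usage sketch (rows of X are normalized internally when
+    normalize=True; `X` here stands for a hypothetical (n_samples, n_features)
+    array):
+
+        vmf = VonMisesFisherMixture(n_clusters=3, posterior_type='soft')
+        vmf.fit(X)
+        print(vmf.labels_, vmf.concentrations_)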
+    """
+
+    def __init__(
+        self,
+        n_clusters=5,
+        posterior_type="soft",
+        force_weights=None,
+        n_init=10,
+        n_jobs=1,
+        max_iter=300,
+        verbose=False,
+        init="random-class",
+        random_state=None,
+        tol=1e-6,
+        copy_x=True,
+        normalize=True,
+    ):
+        self.n_clusters = n_clusters
+        self.posterior_type = posterior_type
+        self.force_weights = force_weights
+        self.n_init = n_init
+        self.n_jobs = n_jobs
+        self.max_iter = max_iter
+        self.verbose = verbose
+        self.init = init
+        self.random_state = random_state
+        self.tol = tol
+        self.copy_x = copy_x
+        self.normalize = normalize
+
+    def _check_force_weights(self):
+        if self.force_weights is None:
+            return
+
+        if len(self.force_weights) != self.n_clusters:
+            raise ValueError(
+                (
+                    "len(force_weights)={} but must equal "
+                    "n_clusters={}".format(len(self.force_weights), self.n_clusters)
+                )
+            )
+
+    def _check_fit_data(self, X):
+        """Verify that the number of samples given is larger than k"""
+        X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
+        n_samples, n_features = X.shape
+        if X.shape[0] < self.n_clusters:
+            raise ValueError(
+                "n_samples=%d should be >= n_clusters=%d"
+                % (X.shape[0], self.n_clusters)
+            )
+
+        for ee in range(n_samples):
+            if sp.issparse(X):
+                n = sp.linalg.norm(X[ee, :])
+            else:
+                n = np.linalg.norm(X[ee, :])
+
+            if np.abs(n - 1.0) > 1e-4:
+                raise ValueError("Data l2-norm must be 1, found {}".format(n))
+
+        return X
+
+    def _check_test_data(self, X):
+        X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES)
+        n_samples, n_features = X.shape
+        expected_n_features = self.cluster_centers_.shape[1]
+        if not n_features == expected_n_features:
+            raise ValueError(
+                "Incorrect number of features. "
+                "Got %d features, expected %d" % (n_features, expected_n_features)
+            )
+
+        for ee in range(n_samples):
+            if sp.issparse(X):
+                n = sp.linalg.norm(X[ee, :])
+            else:
+                n = np.linalg.norm(X[ee, :])
+
+            if np.abs(n - 1.0) > 1e-4:
+                raise ValueError("Data l2-norm must be 1, found {}".format(n))
+
+        return X
+
+    def fit(self, X, y=None):
+        """Compute mixture of von Mises Fisher clustering.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+        """
+        if self.normalize:
+            X = normalize(X)
+
+        # Validate init array
+        init = self.init
+        if hasattr(init, "__array__"):
+            init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
+            self._validate_center_shape(X, init)
+
+        self._check_force_weights()
+        random_state = check_random_state(self.random_state)
+        X = self._check_fit_data(X)
+
+        (
+            self.cluster_centers_,
+            self.labels_,
+            self.inertia_,
+            self.weights_,
+            self.concentrations_,
+            self.posterior_,
+        ) = movMF(
+            X,
+            self.n_clusters,
+            posterior_type=self.posterior_type,
+            force_weights=self.force_weights,
+            n_init=self.n_init,
+            n_jobs=self.n_jobs,
+            max_iter=self.max_iter,
+            verbose=self.verbose,
+            init=self.init,
+            random_state=random_state,
+            tol=self.tol,
+            copy_x=self.copy_x,
+        )
+
+        return self
+
+    def fit_predict(self, X, y=None):
+        """Compute cluster centers and predict cluster index for each sample.
+        Convenience method; equivalent to calling fit(X) followed by
+        predict(X).
+        """
+        return self.fit(X).labels_
+
+    def fit_transform(self, X, y=None):
+        """Compute clustering and transform X to cluster-distance space.
+        Equivalent to fit(X).transform(X), but more efficiently implemented.
+        """
+        # Currently, this just skips a copy of the data if it is not in
+        # np.array or CSR format already.
+        # XXX This skips _check_test_data, which may change the dtype;
+        # we should refactor the input validation.
+        return self.fit(X)._transform(X)
+
+    def transform(self, X, y=None):
+        """Transform X to a cluster-distance space.
+        In the new space, each dimension is the cosine distance to the cluster
+        centers. Note that even if X is sparse, the array returned by
+        `transform` will typically be dense.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            New data to transform.
+
+        Returns
+        -------
+        X_new : array, shape [n_samples, k]
+            X transformed in the new space.
+        """
+        if self.normalize:
+            X = normalize(X)
+
+        check_is_fitted(self)
+        X = self._check_test_data(X)
+        return self._transform(X)
+
+    def _transform(self, X):
+        """guts of transform method; no input validation"""
+        return cosine_distances(X, self.cluster_centers_)
+
+    def predict(self, X):
+        """Predict the closest cluster each sample in X belongs to.
+        In the vector quantization literature, `cluster_centers_` is called
+        the code book and each value returned by `predict` is the index of
+        the closest code in the code book.
+
+        Note: Does not check that each point is on the sphere.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            New data to predict.
+
+        Returns
+        -------
+        labels : array, shape [n_samples,]
+            Index of the cluster each sample belongs to.
+        """
+        if self.normalize:
+            X = normalize(X)
+
+        check_is_fitted(self)
+
+        X = self._check_test_data(X)
+        return _labels_inertia(X, self.cluster_centers_)[0]
+
+    def score(self, X, y=None):
+        """Inertia score (sum of all distances to closest cluster).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            New data.
+
+        Returns
+        -------
+        score : float
+            Larger score is better.
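+
+            (This is the negated inertia, so values are <= 0 and approach 0
+            for a tighter fit.)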
+        """
+        if self.normalize:
+            X = normalize(X)
+
+        check_is_fitted(self)
+        X = self._check_test_data(X)
+        return -_labels_inertia(X, self.cluster_centers_)[1]
+
+    def log_likelihood(self, X):
+        check_is_fitted(self)
+
+        return _log_likelihood(
+            X, self.cluster_centers_, self.weights_, self.concentrations_
+        )
diff --git a/code/taxonomy.py b/code/taxonomy.py
index 3c5eb20..00794cd 100644
--- a/code/taxonomy.py
+++ b/code/taxonomy.py
@@ -53,7 +53,7 @@ def find_node(self, name):
         return None
 
     def sample_a_node(self):
-        return random.choice(self.all_nodes.values())
+        return random.choice(list(self.all_nodes.values()))
diff --git a/code/utils.py b/code/utils.py
index 085e722..ca29d88 100644
--- a/code/utils.py
+++ b/code/utils.py
@@ -31,7 +31,7 @@ def avg_weighted_colors(color_list, c_size):
     for (color, weight) in color_list:
         w_color = [x * weight for x in color]
         # print w_color
-        result_color = map(operator.add, result_color, w_color)
+        result_color = list(map(operator.add, result_color, w_color))
         # print result_color
 
     return l1_normalize(result_color)
@@ -57,7 +57,7 @@ def cossim(p, q):
 
 def euclidean_distance(p, q):
     if len(p) != len(q):
-        print 'Euclidean distance error: p, q have different length'
+        print('Euclidean distance error: p, q have different length')
 
     distance = 0
@@ -69,7 +69,7 @@ def euclidean_distance(p, q):
 
 def euclidean_cluster(ps, c):
     if len(ps) == 0 or c == None:
-        print 'Cluster is empty'
+        print('Cluster is empty')
 
     distance = 0
@@ -83,7 +83,7 @@ def euclidean_cluster(ps, c):
 
 def dot_product(p, q):
     if len(p) != len(q):
-        print 'KL divergence error: p, q have different length'
+        print('KL divergence error: p, q have different length')
 
     p_len = q_len = mix_len = 0
@@ -122,7 +122,7 @@ def avg_emb_with_distinct(ele_map, embs_from, dist_map, vec_size):
     avg_emb = [0] * vec_size
     t_weight = 0
-    for key, value in ele_map.iteritems():
+    for key, value in ele_map.items():
         t_emb = embs_from[key]
         w = value * dist_map[key]
         for i in range(vec_size):
@@ -140,7 +140,7 @@ def avg_emb(ele_map, embs_from, vec_size):
     avg_emb = [0] * vec_size
     t_weight = 0
-    for key, value in ele_map.iteritems():
+    for key, value in ele_map.items():
         t_emb = embs_from[key]
         w = value
         for i in range(vec_size):
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3e630e5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[tool.poetry]
+name = "taxogen"
+version = "0.1.0"
+description = ""
+authors = ["Cezar Sas "]
+
+[tool.poetry.dependencies]
+python = ">=3.8,<3.11"
+numpy = "^1.21.4"
+scipy = "^1.7.3"
+scikit-learn = "0.24.2"
+PyYAML = "^6.0"
+
+
+[tool.poetry.dev-dependencies]
+2to3 = "^1.0"
+
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"