diff --git a/.gitignore b/.gitignore index 66b4a19..4fc984c 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,7 @@ ENV/ *.app ./code/word2vec + +code/spherecluster +.vscode +code/word2vec \ No newline at end of file diff --git a/code/case_ranker.py b/code/case_ranker.py index 22da68c..a1c63eb 100644 --- a/code/case_ranker.py +++ b/code/case_ranker.py @@ -41,7 +41,7 @@ def rank_phrase(case_file): unif = [1.0 / cell_cnt] * cell_cnt for ph in phrase_map: - ph_vec = [x[1] for x in phrase_map[ph].iteritems()] + ph_vec = [x[1] for x in phrase_map[ph].items()] if len(ph_vec) < cell_cnt: ph_vec += [0] * (cell_cnt - len(ph_vec)) # smoothing diff --git a/code/compress.py b/code/compress.py index d444ad5..06bd485 100644 --- a/code/compress.py +++ b/code/compress.py @@ -1,7 +1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -37,7 +37,7 @@ def parse_reidx(reidx_f): if len(pd_map[ph]) > 0: ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph])) - print 'Inverted Index file read.' + print('Inverted Index file read.') @@ -81,7 +81,7 @@ def get_rep(folder, c_id, N): break elif ph_idf == None: - print 'looking at embeddings for %s' % folder + print('looking at embeddings for %s' % folder) ph_f = '%s/embeddings.txt' % par_folder kw_f = '%s/keywords.txt' % par_folder @@ -119,7 +119,7 @@ def get_rep(folder, c_id, N): result_phrases.append(ph) else: # Using TF-IDF to generate - print 'looking at tf-idf for %s' % folder + print ('looking at tf-idf for %s' % folder) d_clus_f = '%s/paper_cluster.txt' % par_folder kw_clus_f = '%s/cluster_keywords.txt' % par_folder docs = [] @@ -161,7 +161,7 @@ def get_rep(folder, c_id, N): def recursion(root, o_file, N): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, '*')) g = open(o_file, 'w+') diff --git a/code/main.py b/code/main.py index b7a0ff7..d986408 100644 --- a/code/main.py +++ b/code/main.py @@ -62,8 +62,9 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent, \ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as error: print('Clustering not finished.') + print(error) return copyfile(df.seed_keyword_file, df.filtered_keyword_file) else: @@ -74,8 +75,9 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,\ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as error: print('Clustering not finished.') + print(error) return start = time.time() @@ -110,7 +112,7 @@ def main(opt): n_expand = opt['n_expand'] n_cluster_iter = opt['n_cluster_iter'] level = 0 - + # our method root_dir = opt['data_dir'] + 'our-l3-0.15/' copy_tree(init_dir, root_dir) diff --git a/code/paras.py b/code/paras.py index 027ae53..98c71a0 100644 --- a/code/paras.py +++ b/code/paras.py @@ -99,7 +99,7 @@ def load_sp_params(): def load_dblp_params_method(): pd = dict() - pd['data_dir'] = '/shared/data/jiaming/local-embedding/data/dblp/' + pd['data_dir'] = '../data/dblp/' pd['doc_file'] = pd['data_dir'] + 'input/papers.txt' pd['doc_keyword_cnt_file'] = pd['data_dir'] + 'input/keyword_cnt.txt' pd['input_dir'] = pd['data_dir'] + 'input/' diff --git a/code/run.sh b/code/run.sh old mode 100644 new mode 100755 index 1bcf2ff..21c30d3 --- a/code/run.sh +++ b/code/run.sh @@ -9,7 +9,7 @@ FIRST_RUN=${FIRST_RUN:- 0} if [ $FIRST_RUN -eq 1 ]; then echo 'Start data preprocessing' ## compile word2vec for embedding learning - gcc word2vec.c -o word2veec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result + gcc word2vec.c -o word2vec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result ## create initial folder if not exist if [ ! -d ../data/$corpusName/init ]; then diff --git a/code/utils.py b/code/utils.py index 085e722..2b0192d 100644 --- a/code/utils.py +++ b/code/utils.py @@ -57,7 +57,7 @@ def cossim(p, q): def euclidean_distance(p, q): if len(p) != len(q): - print 'Euclidean distance error: p, q have different length' + print('Euclidean distance error: p, q have different length') distance = 0 @@ -69,7 +69,7 @@ def euclidean_distance(p, q): def euclidean_cluster(ps, c): if len(ps) == 0 or c == None: - print 'Cluster is empty' + print('Cluster is empty') distance = 0 @@ -83,7 +83,7 @@ def euclidean_cluster(ps, c): def dot_product(p, q): if len(p) != len(q): - print 'KL divergence error: p, q have different length' + print('KL divergence error: p, q have different length') p_len = q_len = mix_len = 0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ca3274b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +numpy==1.19.5 +scipy +scikit-learn==0.22 +PyYAML \ No newline at end of file