Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,7 @@ ENV/
*.app

./code/word2vec

code/spherecluster
.vscode
code/word2vec
2 changes: 1 addition & 1 deletion code/case_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def rank_phrase(case_file):
unif = [1.0 / cell_cnt] * cell_cnt

for ph in phrase_map:
ph_vec = [x[1] for x in phrase_map[ph].iteritems()]
ph_vec = [x[1] for x in phrase_map[ph].items()]
if len(ph_vec) < cell_cnt:
ph_vec += [0] * (cell_cnt - len(ph_vec))
# smoothing
Expand Down
10 changes: 5 additions & 5 deletions code/compress.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import utils
import operator
import Queue
import queue
import math
from os import listdir
from os.path import isfile, join, isdir, abspath, dirname, basename, exists
Expand Down Expand Up @@ -37,7 +37,7 @@ def parse_reidx(reidx_f):
if len(pd_map[ph]) > 0:
ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph]))

print 'Inverted Index file read.'
print('Inverted Index file read.')



Expand Down Expand Up @@ -81,7 +81,7 @@ def get_rep(folder, c_id, N):
break

elif ph_idf == None:
print 'looking at embeddings for %s' % folder
print('looking at embeddings for %s' % folder)

ph_f = '%s/embeddings.txt' % par_folder
kw_f = '%s/keywords.txt' % par_folder
Expand Down Expand Up @@ -119,7 +119,7 @@ def get_rep(folder, c_id, N):
result_phrases.append(ph)
else:
# Using TF-IDF to generate
print 'looking at tf-idf for %s' % folder
print ('looking at tf-idf for %s' % folder)
d_clus_f = '%s/paper_cluster.txt' % par_folder
kw_clus_f = '%s/cluster_keywords.txt' % par_folder
docs = []
Expand Down Expand Up @@ -161,7 +161,7 @@ def get_rep(folder, c_id, N):

def recursion(root, o_file, N):

q = Queue.Queue()
q = queue.Queue()
q.put((root, -1, '*'))

g = open(o_file, 'w+')
Expand Down
8 changes: 5 additions & 3 deletions code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\
try:
children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent, \
df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
except:
except Exception as error:
print('Clustering not finished.')
print(error)
return
copyfile(df.seed_keyword_file, df.filtered_keyword_file)
else:
Expand All @@ -74,8 +75,9 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\
try:
children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,\
df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
except:
except Exception as error:
print('Clustering not finished.')
print(error)
return

start = time.time()
Expand Down Expand Up @@ -110,7 +112,7 @@ def main(opt):
n_expand = opt['n_expand']
n_cluster_iter = opt['n_cluster_iter']
level = 0

# our method
root_dir = opt['data_dir'] + 'our-l3-0.15/'
copy_tree(init_dir, root_dir)
Expand Down
2 changes: 1 addition & 1 deletion code/paras.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def load_sp_params():

def load_dblp_params_method():
pd = dict()
pd['data_dir'] = '/shared/data/jiaming/local-embedding/data/dblp/'
pd['data_dir'] = '../data/dblp/'
pd['doc_file'] = pd['data_dir'] + 'input/papers.txt'
pd['doc_keyword_cnt_file'] = pd['data_dir'] + 'input/keyword_cnt.txt'
pd['input_dir'] = pd['data_dir'] + 'input/'
Expand Down
2 changes: 1 addition & 1 deletion code/run.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ FIRST_RUN=${FIRST_RUN:- 0}
if [ $FIRST_RUN -eq 1 ]; then
echo 'Start data preprocessing'
## compile word2vec for embedding learning
gcc word2vec.c -o word2veec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result
gcc word2vec.c -o word2vec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result

## create initial folder if not exist
if [ ! -d ../data/$corpusName/init ]; then
Expand Down
6 changes: 3 additions & 3 deletions code/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def cossim(p, q):

def euclidean_distance(p, q):
if len(p) != len(q):
print 'Euclidean distance error: p, q have different length'
print('Euclidean distance error: p, q have different length')

distance = 0

Expand All @@ -69,7 +69,7 @@ def euclidean_distance(p, q):

def euclidean_cluster(ps, c):
if len(ps) == 0 or c == None:
print 'Cluster is empty'
print('Cluster is empty')

distance = 0

Expand All @@ -83,7 +83,7 @@ def euclidean_cluster(ps, c):

def dot_product(p, q):
if len(p) != len(q):
print 'KL divergence error: p, q have different length'
print('KL divergence error: p, q have different length')

p_len = q_len = mix_len = 0

Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
numpy==1.19.5
scipy
scikit-learn==0.22
PyYAML