
Python 3 upgrade and a new feature to eliminate three types of Python comments (1. #, 2. """comment""", and 3. '''comment''') #44

Open · wants to merge 3 commits into master
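The diffs below are mechanical 2to3-style conversions. As a condensed, runnable illustration of the Python 3 idioms the commits adopt (the data here is hypothetical; the real code operates on clone-group dicts and file paths):

    # print is now a function; dict views, zip and range are lazy in Python 3.
    d = {'a.java': 3, 'b.java': 1}
    print("rows processed: ", len(d))
    for k, v in d.items():                        # .items() replaces .iteritems()
        pass
    exists = 'a.java' in d                        # 'in' replaces .has_key()
    pairs = list(zip(range(1, len(d) + 1), d))    # wrap lazy zip in list() when a list is needed
    try:
        open('no-such-file.txt')
    except IOError as e:                          # 'except E as e' replaces 'except E, e'
        print("Error: {error}".format(error=e))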
8 changes: 4 additions & 4 deletions clone-detector/analyze.py
@@ -32,13 +32,13 @@ def populate_distinct_clone_groups_count(self):
                 self.clone_groups[rhsFile] = 1
             count += 1
             if (count % print_per_k) == 0:
-                print "rows processed: ", count
-                print "rows processed: ", count
+                print("rows processed: ", count)
+                print("rows processed: ", count)

     def print_dict(self, dict_to_print):
         print("clones of each file:")
         with open("results.txt", 'w') as resultfile:
-            for key, val in sorted(dict_to_print.items(), key=lambda x:-x[1]):
+            for key, val in sorted(list(dict_to_print.items()), key=lambda x:-x[1]):
                 resultfile.write("{key},{val}\n".format(key=key, val=val))

 if __name__ == '__main__':
@@ -47,4 +47,4 @@ def print_dict(self, dict_to_print):
     # analyzer.get_count_of_distinct_files_that_have_clones()
     analyzer.populate_distinct_clone_groups_count()
     analyzer.print_dict(analyzer.clone_groups)
-    print "count of distinct files that have clones", len(analyzer.clone_groups.keys())
+    print("count of distinct files that have clones", len(list(analyzer.clone_groups.keys())))
2 changes: 1 addition & 1 deletion clone-detector/controller.py
@@ -3,7 +3,7 @@

 @author: saini
 '''
-from __future__ import absolute_import, division, print_function, unicode_literals
+
 import subprocess
 import sys
 import os
20 changes: 10 additions & 10 deletions clone-detector/unevensplit.py
@@ -27,10 +27,10 @@ def split(self):
         """
         count=0
         line_limit = self.base_x
-        print "line_limit is ", line_limit
+        print("line_limit is ", line_limit)
         file_count=1
         try:
-            print "creating split ",file_count
+            print("creating split ",file_count)
             self.outfile = open("query_{part}.file".format(part=file_count),'w')
             with open(self.input_filename,'r') as inputfile:
                 for row in inputfile:
@@ -42,29 +42,29 @@
                         file_count+=1
                         count =0
                         line_limit =line_limit + math.ceil(0.5*self.base_x)
-                        print "line_limit is ", line_limit
-                        print "creating split ",file_count
+                        print("line_limit is ", line_limit)
+                        print("creating split ",file_count)
                         self.outfile = open("query_{part}.file".format(part=file_count),'w')
                     self.outfile.write(row)
                     count+=1
             self.outfile.flush()
             self.outfile.close()
-        except IOError, e:
-            print "Error: {error}".format(error=e)
+        except IOError as e:
+            print("Error: {error}".format(error=e))
             sys.exit(1)


     def get_num_lines_in_input_file(self):
         with open(self.input_filename) as f:
             for i, l in enumerate(f):
                 pass
-        print "total lines in the inputfile: {0} ".format(i+1)
+        print("total lines in the inputfile: {0} ".format(i+1))
         return i + 1

     def find_base_x(self):
         # formula for S = x + x+.5x + x+2*.5x...x + (N-1)*.5x
         self.base_x= math.ceil(float(2*self.total_lines)/(float((self.split_count+1)*(self.split_count+2)/2) - 1))
-        print "base_x is ", self.base_x
+        print("base_x is ", self.base_x)

 if __name__ == '__main__':

@@ -73,7 +73,7 @@ def find_base_x(self):
     split_count = int(sys.argv[2])
     params= {'split_count':split_count,
             'input_filename' : input_file}
-    print "spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count)
+    print("spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count))
    splitter = Spliter(params)
    splitter.split()
-    print "splitting done!"
+    print("splitting done!")
4 changes: 2 additions & 2 deletions tokenizers/block-level/separate-file-block-stats.py
@@ -11,6 +11,6 @@
     elif line.startswith('f'):
         file_info.write(line)
     else:
-        print "error", line
+        print("error", line)

-print "Done with: ", readfile
+print("Done with: ", readfile)
26 changes: 13 additions & 13 deletions tokenizers/block-level/tokenizer.py
@@ -16,7 +16,7 @@
 try:
     from configparser import ConfigParser
 except ImportError:
-    from ConfigParser import ConfigParser # ver. < 3.0
+    from configparser import ConfigParser # ver. < 3.0

 MULTIPLIER = 50000000

@@ -152,7 +152,7 @@ def tokenize_files(file_string, comment_inline_pattern, comment_open_close_patte

     t_time = dt.datetime.now()
     #SourcererCC formatting
-    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.iteritems()])
+    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.items()])
     t_time = (dt.datetime.now() - t_time).microseconds

     # MD5
@@ -278,7 +278,7 @@ def tokenize_blocks(file_string, comment_inline_pattern, comment_open_close_patt
     tokens_count_unique = len(block_string_for_tokenization)
     t_time = dt.datetime.now()
     #SourcererCC formatting
-    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.iteritems()])
+    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.items()])
     token_time += (dt.datetime.now() - t_time).microseconds
     # MD5
     h_time = dt.datetime.now()
@@ -322,7 +322,7 @@ def process_file_contents(file_string, proj_id, file_id, container_path,

     # file stats start with a letter 'f'
     FILE_stats_file.write('f' + ','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n')
-    blocks_data = zip(range(10000,99999),blocks_data)
+    blocks_data = list(zip(list(range(10000,99999)),blocks_data))

     logging.warning('Finished step2 on process_file_contents');

@@ -678,7 +678,7 @@ def start_child(processes, global_queue, proj_paths, batch, project_format):
     paths_batch = proj_paths[:batch]
     del proj_paths[:batch]

-    print("Starting new process %s" % (pid))
+    print(("Starting new process %s" % (pid)))
     p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, ))
     processes[pid][0] = p
     p.start()
@@ -690,7 +690,7 @@ def kill_child(processes, pid, n_files_processed):
     processes[pid][0] = None
     processes[pid][1] += n_files_processed

-    print("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count))
+    print(("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count)))

 def active_process_count(processes):
     count = 0
@@ -717,26 +717,26 @@ def active_process_count(processes):
         for line in f:
             line_split = line[:-1].split(',') # [:-1] to strip final character which is '\n'
             prio_proj_paths.append((line_split[0],line_split[4]))
-    prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths)
+    prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths)+init_proj_id)), prio_proj_paths))

     proj_paths = []

     if project_format in ['zipblocks']: # zipblocks will diverge the process flow on process_file()
-        print('\''+project_format+'\''+'format')
+        print(('\''+project_format+'\''+'format'))
         with open(FILE_projects_list) as f:
             for line in f:
                 proj_paths.append(line[:-1])
-        proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths))
+        proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths))

     if project_format in ['folderblocks']: # folderblocks will diverge the process flow on process_file()
-        print('\''+project_format+'\''+'format')
+        print(('\''+project_format+'\''+'format'))
         with open(FILE_projects_list) as f:
             for line in f:
                 proj_paths.append(line[:-1])
-        proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths))
+        proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths))

     if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs):
-        print('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!')
+        print(('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!'))
         sys.exit(1)
     else:
         os.makedirs(PATH_stats_file_folder)
@@ -774,5 +774,5 @@ def active_process_count(processes):
             kill_child(processes, pid, n_files_processed)

     p_elapsed = dt.datetime.now() - p_start
-    print("*** All done. %s files in %s" % (file_count, p_elapsed))
+    print(("*** All done. %s files in %s" % (file_count, p_elapsed)))

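The iteritems-to-items changes above sit inside SourcererCC's token serialization, which joins each token with its frequency using the @@::@@ separator. A minimal sketch with hypothetical token counts (output order assumes Python 3.7+ insertion-ordered dicts):

    # file_string_for_tokenization maps token -> occurrence count (hypothetical values).
    file_string_for_tokenization = {'if': 3, 'return': 1, 'x': 7}
    tokens = ','.join(['{}@@::@@{}'.format(k, v)
                       for k, v in file_string_for_tokenization.items()])
    print(tokens)  # if@@::@@3,return@@::@@1,x@@::@@7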
2 changes: 2 additions & 0 deletions tokenizers/file-level/config.ini
@@ -18,6 +18,8 @@ separators = ; . [ ] ( ) ~ ! - + & * / %% < > & ^ | ? { } = # , \ : $ " '
 comment_inline = //
 comment_open_tag = /*
 comment_close_tag = */
+second_comment_open_tag = """
+second_comment_close_tag = """
 ;.java
 File_extensions = .java
 ;.cpp .hpp .c .h .C .cc .CPP .c++ .cp
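The two new keys give the tokenizer a second open/close pair so it can drop Python's triple-quoted comments as well as # lines, the feature named in the PR title. A minimal sketch of config-driven stripping for all three comment types (the regex construction is my illustration, not the PR's exact code, and it ignores tags that appear inside string literals):

    import re

    # Hypothetical values mirroring config.ini above.
    comment_inline = '#'
    open_close_pairs = [('"""', '"""'), ("'''", "'''")]

    def strip_python_comments(source):
        # Remove block comments first: """...""" and '''...''' across lines.
        for open_tag, close_tag in open_close_pairs:
            pattern = re.escape(open_tag) + r'.*?' + re.escape(close_tag)
            source = re.sub(pattern, '', source, flags=re.DOTALL)
        # Then inline comments: from '#' to end of line.
        return re.sub(re.escape(comment_inline) + r'.*', '', source)

    print(strip_python_comments('x = 1  # set x\n"""doc"""\ny = 2\n'))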
38 changes: 19 additions & 19 deletions tokenizers/file-level/db-importer/clone_finder.py
@@ -15,7 +15,7 @@
 log_path = 'LOG-db-clonefinder.log'

 if os.path.isfile(log_path):
-    print 'ERROR: Log file:',log_path,'already exists'
+    print('ERROR: Log file:',log_path,'already exists')
     sys.exit(1)

 FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
@@ -29,7 +29,7 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object):
     try:
         query = """SELECT fileId, projectId, f.fileHash, tokenHash FROM files as f
                    JOIN stats as s ON f.fileHash=s.fileHash
-                   WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(token_hashes.keys()) + "'", project_id)
+                   WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(list(token_hashes.keys())) + "'", project_id)
         res = db_object.execute(query);
         logging.info(query)
         for (file_id, projectId, fileHash, tokenHash, ) in res:
@@ -39,8 +39,8 @@
                 files_clones[f].add((str(file_id), projectId))

     except Exception as e:
-        print 'Error on findAllTokenHashClones'
-        print e
+        print('Error on findAllTokenHashClones')
+        print(e)
         sys.exit(1)

@@ -68,7 +68,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug):

         if debug == 'all':
             logging.debug('## After round 1')
-            for k, v in files_clones.iteritems():
+            for k, v in files_clones.items():
                 if len(v) > 0:
                     logging.debug('%s-%s', k, v)

@@ -80,7 +80,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug):

         project_file_set = {}
         clone_set = set()
-        for fid, clones in files_clones.iteritems():
+        for fid, clones in files_clones.items():
             project_counted = False
             for clone in clones:
                 projectId = clone[1]
@@ -90,22 +90,22 @@
                     project_file_set[projectId].add(fid)

         # How many of this project's files are present in each of the other project?
-        for pid, file_list in project_file_set.iteritems():
+        for pid, file_list in project_file_set.items():
             percentage_clone_projects_counter[pid] = len(file_list)


         # How many of the other projects files are present in this project?
         for clone in clone_set:
             projectId = clone[1]
-            if percentage_host_projects_counter.has_key(projectId):
+            if projectId in percentage_host_projects_counter:
                 percentage_host_projects_counter[projectId] += 1
             else:
                 percentage_host_projects_counter[projectId] = 1

         if len(percentage_host_projects_counter) > 0:
             # The key k (projects) should be the same between
             # percentage_clone_projects_counter and percentage_host_projects_counter
-            for k, v in percentage_host_projects_counter.iteritems():
+            for k, v in percentage_host_projects_counter.items():

                 percent_cloning = float(percentage_clone_projects_counter[k]*100)/total_files
                 percent_host = float(v*100)/project_file_counts[k]
@@ -116,16 +116,16 @@

                 if debug == 'all' or debug == 'final':
                     if True:#(percent_cloning > 99) and (str(project_id) != k):
-                        print 'Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]'
+                        print('Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]')

                 else:
                     db_object.insert_projectClones(project_id, percentage_clone_projects_counter[k], total_files, float("{0:.2f}".format(percent_cloning)),
                                                    k, v, project_file_counts[k],
                                                    float("{0:.2f}".format(percent_host)))

     except Exception as e:
-        print 'Error on find_clones_for_project'
-        print e
+        print('Error on find_clones_for_project')
+        print(e)
         traceback.print_exc()
         sys.exit(1)

@@ -157,8 +157,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c
             db_object.flush_projectClones()

     except Exception as e:
-        print 'Error in clone_finder.start_process'
-        print e
+        print('Error in clone_finder.start_process')
+        print(e)
         sys.exit(1)

     finally:
@@ -216,14 +216,14 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c

     project_ids = []

-    for projectId in project_file_counts.keys():
+    for projectId in list(project_file_counts.keys()):
         project_ids.append(projectId)
         pair_number += 1

-    project_ids = [ project_ids[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
+    project_ids = [ project_ids[i::N_PROCESSES] for i in range(N_PROCESSES) ]

     processes = []
-    for process_num in xrange(N_PROCESSES):
+    for process_num in range(N_PROCESSES):
         p = Process(name='Process '+str(process_num), target=start_process,
                     args=(process_num, project_ids[process_num], DB_user, DB_name, DB_pass, project_file_counts, host, ))
         processes.append(p)
@@ -232,8 +232,8 @@
     [p.join() for p in processes]

 except Exception as e:
-    print 'Error in clone_finder.__main__'
-    print e
+    print('Error in clone_finder.__main__')
+    print(e)
     sys.exit(1)

 finally:
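For orientation, find_clones_for_project measures overlap in both directions: percent_cloning is the share of this project's files found in project k, and percent_host is the share of k's files found here. A worked example with hypothetical counts:

    total_files = 40         # files in this project
    clone_counter_k = 10     # percentage_clone_projects_counter[k]: our files seen in k
    v = 25                   # percentage_host_projects_counter[k]: k's files seen here
    project_k_files = 200    # project_file_counts[k]

    percent_cloning = float(clone_counter_k * 100) / total_files   # 25.0
    percent_host = float(v * 100) / project_k_files                # 12.5
    print(percent_cloning, percent_host)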
4 changes: 2 additions & 2 deletions tokenizers/file-level/db-importer/db.py
@@ -295,7 +295,7 @@ def insert_file(self, file_id, proj_id, file_path, file_url, file_hash, flush =

         # Prepare the complete list
         if autoID:
-            self.files = map(lambda (a, b, c, d, e): (b, c, d, e), self.files)
+            self.files = [(a_b_c_d_e[1], a_b_c_d_e[2], a_b_c_d_e[3], a_b_c_d_e[4]) for a_b_c_d_e in self.files]
         flist = ','.join(self.files)

         self.check_connection()
@@ -442,7 +442,7 @@ def project_exists(self, proj_path):
     def sanitize_string(self, string_input):
         # To clean non-ascii characters
         printable = set(string.printable)
-        string_res = filter(lambda x: x in printable, string_input)
+        string_res = [x for x in string_input if x in printable]
         return (string_res[:DB_MAX_STRING_SIZE])
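One subtlety in the sanitize_string conversion: Python 2's filter() on a str returned a str, but the list comprehension in the new code yields a list of characters, so the function now returns a list slice rather than a string. A join-based variant (my sketch, not part of the PR) preserves the original return type:

    import string

    DB_MAX_STRING_SIZE = 4000  # hypothetical stand-in for the module constant

    def sanitize_string(string_input):
        # Keep only printable ASCII and return a str, as the Python 2 code did.
        printable = set(string.printable)
        string_res = ''.join(x for x in string_input if x in printable)
        return string_res[:DB_MAX_STRING_SIZE]

    print(sanitize_string('abc\x00déf'))  # -> 'abcdf'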