From 3aea332c75c45dc0866bfca5586e86ab3b7823c6 Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:20:42 -0700 Subject: [PATCH 1/3] upgrading python version from 2 to 3 --- clone-detector/analyze.py | 8 +- clone-detector/controller.py | 2 +- clone-detector/unevensplit.py | 20 +- .../block-level/separate-file-block-stats.py | 4 +- tokenizers/block-level/tokenizer.py | 26 +- tokenizers/file-level/tokenizer-unit-test.py | 162 +-- tokenizers/file-level/tokenizer.py | 1000 +++++++++-------- 7 files changed, 655 insertions(+), 567 deletions(-) diff --git a/clone-detector/analyze.py b/clone-detector/analyze.py index 2e2e7be68..2ef55d721 100644 --- a/clone-detector/analyze.py +++ b/clone-detector/analyze.py @@ -32,13 +32,13 @@ def populate_distinct_clone_groups_count(self): self.clone_groups[rhsFile] = 1 count += 1 if (count % print_per_k) == 0: - print "rows processed: ", count - print "rows processed: ", count + print("rows processed: ", count) + print("rows processed: ", count) def print_dict(self, dict_to_print): print("clones of each file:") with open("results.txt", 'w') as resultfile: - for key, val in sorted(dict_to_print.items(), key=lambda x:-x[1]): + for key, val in sorted(list(dict_to_print.items()), key=lambda x:-x[1]): resultfile.write("{key},{val}\n".format(key=key, val=val)) if __name__ == '__main__': @@ -47,4 +47,4 @@ def print_dict(self, dict_to_print): # analyzer.get_count_of_distinct_files_that_have_clones() analyzer.populate_distinct_clone_groups_count() analyzer.print_dict(analyzer.clone_groups) - print "count of distinct files that have clones", len(analyzer.clone_groups.keys()) + print("count of distinct files that have clones", len(list(analyzer.clone_groups.keys()))) diff --git a/clone-detector/controller.py b/clone-detector/controller.py index 7a424dfd8..527bdde2f 100644 --- a/clone-detector/controller.py +++ b/clone-detector/controller.py @@ -3,7 +3,7 @@ @author: saini ''' -from __future__ import absolute_import, division, print_function, unicode_literals + import subprocess import sys import os diff --git a/clone-detector/unevensplit.py b/clone-detector/unevensplit.py index cec985cab..ee630f9f2 100644 --- a/clone-detector/unevensplit.py +++ b/clone-detector/unevensplit.py @@ -27,10 +27,10 @@ def split(self): """ count=0 line_limit = self.base_x - print "line_limit is ", line_limit + print("line_limit is ", line_limit) file_count=1 try: - print "creating split ",file_count + print("creating split ",file_count) self.outfile = open("query_{part}.file".format(part=file_count),'w') with open(self.input_filename,'r') as inputfile: for row in inputfile: @@ -42,15 +42,15 @@ def split(self): file_count+=1 count =0 line_limit =line_limit + math.ceil(0.5*self.base_x) - print "line_limit is ", line_limit - print "creating split ",file_count + print("line_limit is ", line_limit) + print("creating split ",file_count) self.outfile = open("query_{part}.file".format(part=file_count),'w') self.outfile.write(row) count+=1 self.outfile.flush() self.outfile.close() - except IOError, e: - print "Error: {error}".format(error=e) + except IOError as e: + print("Error: {error}".format(error=e)) sys.exit(1) @@ -58,13 +58,13 @@ def get_num_lines_in_input_file(self): with open(self.input_filename) as f: for i, l in enumerate(f): pass - print "total lines in the inputfile: {0} ".format(i+1) + print("total lines in the inputfile: {0} ".format(i+1)) return i + 1 def find_base_x(self): # formula for S = x + x+.5x + x+2*.5x...x + (N-1)*.5x self.base_x= 
math.ceil(float(2*self.total_lines)/(float((self.split_count+1)*(self.split_count+2)/2) - 1)) - print "base_x is ", self.base_x + print("base_x is ", self.base_x) if __name__ == '__main__': @@ -73,7 +73,7 @@ def find_base_x(self): split_count = int(sys.argv[2]) params= {'split_count':split_count, 'input_filename' : input_file} - print "spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count) + print("spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count)) splitter = Spliter(params) splitter.split() - print "splitting done!" \ No newline at end of file + print("splitting done!") \ No newline at end of file diff --git a/tokenizers/block-level/separate-file-block-stats.py b/tokenizers/block-level/separate-file-block-stats.py index c61ee7aed..df32234c5 100644 --- a/tokenizers/block-level/separate-file-block-stats.py +++ b/tokenizers/block-level/separate-file-block-stats.py @@ -11,6 +11,6 @@ elif line.startswith('f'): file_info.write(line) else: - print "error", line + print("error", line) - print "Done with: ", readfile + print("Done with: ", readfile) diff --git a/tokenizers/block-level/tokenizer.py b/tokenizers/block-level/tokenizer.py index 45e52bf68..1c6e18645 100644 --- a/tokenizers/block-level/tokenizer.py +++ b/tokenizers/block-level/tokenizer.py @@ -16,7 +16,7 @@ try: from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from configparser import ConfigParser # ver. < 3.0 MULTIPLIER = 50000000 @@ -152,7 +152,7 @@ def tokenize_files(file_string, comment_inline_pattern, comment_open_close_patte t_time = dt.datetime.now() #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.iteritems()]) + tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.items()]) t_time = (dt.datetime.now() - t_time).microseconds # MD5 @@ -278,7 +278,7 @@ def tokenize_blocks(file_string, comment_inline_pattern, comment_open_close_patt tokens_count_unique = len(block_string_for_tokenization) t_time = dt.datetime.now() #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.iteritems()]) + tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.items()]) token_time += (dt.datetime.now() - t_time).microseconds # MD5 h_time = dt.datetime.now() @@ -322,7 +322,7 @@ def process_file_contents(file_string, proj_id, file_id, container_path, # file stats start with a letter 'f' FILE_stats_file.write('f' + ','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n') - blocks_data = zip(range(10000,99999),blocks_data) + blocks_data = list(zip(list(range(10000,99999)),blocks_data)) logging.warning('Finished step2 on process_file_contents'); @@ -678,7 +678,7 @@ def start_child(processes, global_queue, proj_paths, batch, project_format): paths_batch = proj_paths[:batch] del proj_paths[:batch] - print("Starting new process %s" % (pid)) + print(("Starting new process %s" % (pid))) p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, )) processes[pid][0] = p p.start() @@ -690,7 +690,7 @@ def kill_child(processes, pid, n_files_processed): processes[pid][0] = None processes[pid][1] += n_files_processed - print("Process %s finished, %s files processed (%s). 
Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count)) + print(("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count))) def active_process_count(processes): count = 0 @@ -717,26 +717,26 @@ def active_process_count(processes): for line in f: line_split = line[:-1].split(',') # [:-1] to strip final character which is '\n' prio_proj_paths.append((line_split[0],line_split[4])) - prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths) + prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths)+init_proj_id)), prio_proj_paths)) proj_paths = [] if project_format in ['zipblocks']: # zipblocks will diverge the process flow on process_file() - print('\''+project_format+'\''+'format') + print(('\''+project_format+'\''+'format')) with open(FILE_projects_list) as f: for line in f: proj_paths.append(line[:-1]) - proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths)) + proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths)) if project_format in ['folderblocks']: # folderblocks will diverge the process flow on process_file() - print('\''+project_format+'\''+'format') + print(('\''+project_format+'\''+'format')) with open(FILE_projects_list) as f: for line in f: proj_paths.append(line[:-1]) - proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths)) + proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths)) if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs): - print('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!') + print(('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!')) sys.exit(1) else: os.makedirs(PATH_stats_file_folder) @@ -774,5 +774,5 @@ def active_process_count(processes): kill_child(processes, pid, n_files_processed) p_elapsed = dt.datetime.now() - p_start - print("*** All done. %s files in %s" % (file_count, p_elapsed)) + print(("*** All done. %s files in %s" % (file_count, p_elapsed))) diff --git a/tokenizers/file-level/tokenizer-unit-test.py b/tokenizers/file-level/tokenizer-unit-test.py index 8f9485764..169cc0f19 100644 --- a/tokenizers/file-level/tokenizer-unit-test.py +++ b/tokenizers/file-level/tokenizer-unit-test.py @@ -5,20 +5,20 @@ import collections import sys import unittest -import tokenizer +import tokenizer as tokenizer import hashlib try: from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from ConfigParser import ConfigParser # ver. < 3.0 config = ConfigParser() # parse existing file try: config.read('config.ini') except IOError: - print 'ERROR - Config settings not found. Usage: $python this-script.py config.ini' + print('ERROR - Config settings not found. Usage: $python this-script.py config.ini') sys.exit() separators = config.get('Language', 'separators').strip('"').split(' ') @@ -27,12 +27,15 @@ comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) comment_open_close_pattern = comment_open_tag + '.*?' 
+ comment_close_tag - +second_comment_open_tag = re.escape(config.get('Language', 'second_comment_open_tag')) +second_comment_close_tag = re.escape(config.get('Language', 'second_comment_close_tag')) +second_comment_open_close_pattern = second_comment_open_tag + '.*?' + second_comment_close_tag REGEX = re.compile('.+?@@::@@+\d') + class TestParser(unittest.TestCase): - #Input is something like: @#@print@@::@@1,include@@::@@1,sys@@::@@1 + # Input is something like: @#@print@@::@@1,include@@::@@1,sys@@::@@1 def assert_common_properties(self, list_tokens_string): self.assertTrue(list_tokens_string.startswith('@#@')) @@ -42,103 +45,124 @@ def assert_common_properties(self, list_tokens_string): self.assertTrue(REGEX.match(pair)) def test_line_counts_1(self): - input = """ line 1 + input = b""" line 1 line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats - self.assertEqual(lines,3) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,3) + self.assertEqual(lines, 3) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 3) def test_line_counts_2(self): - input = """ line 1 + input = b""" line 1 line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats - self.assertEqual(lines,3) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,3) + self.assertEqual(lines, 3) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 3) def test_line_counts_3(self): - input = """ line 1 + input = b""" line 1 - // line 2 + # line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - - self.assertEqual(lines,5) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,2) + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + print(lines, LOC, SLOC) + self.assertEqual(lines, 5) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 2) def test_comments(self): - input = "// Hello\n // World" - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - - self.assertEqual(lines,2) - self.assertEqual(LOC,2) - self.assertEqual(SLOC,0) - - self.assertEqual(tokens_count_total,0) - self.assertEqual(tokens_count_unique,0) + input = b"# Hello\n # World" + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, 
tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 2) + self.assertEqual(LOC, 2) + self.assertEqual(SLOC, 1) + + self.assertEqual(tokens_count_total, 0) + self.assertEqual(tokens_count_unique, 0) self.assert_common_properties(tokens) def test_multiline_comment(self): - input = '/* this is a \n comment */ /* Last one */ ' - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - - self.assertEqual(lines,2) - self.assertEqual(LOC,2) - self.assertEqual(SLOC,0) - - self.assertEqual(tokens_count_total,0) - self.assertEqual(tokens_count_unique,0) + input = b'""" this is a \n comment """ """ Last one """' + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 2) + self.assertEqual(LOC, 2) + self.assertEqual(SLOC, 1) + + self.assertEqual(tokens_count_total, 0) + self.assertEqual(tokens_count_unique, 0) self.assert_common_properties(tokens) def test_simple_file(self): - input = u"""#include GLFW_INCLUDE_GLU - #include - #include - - /* Random function */ + input = u"""include GLFW_INCLUDE_GLU + include + include + + '''Random function''' static void glfw_key_callback(int key, int scancode, int action, int mod){ if(glfw_key_callback){ - // Comment here + # Comment here input_event_queue->push(inputaction); } printf("%s", "asciiじゃない文字"); }""".encode("utf-8") - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - self.assertEqual(lines,12) - self.assertEqual(LOC,11) - self.assertEqual(SLOC,9) - - self.assertEqual(tokens_count_total,27) - self.assertEqual(tokens_count_unique,21) + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 12) + self.assertEqual(LOC, 11) + self.assertEqual(SLOC, 9) + + self.assertEqual(tokens_count_total, 27) + self.assertEqual(tokens_count_unique, 21) self.assert_common_properties(tokens) - hard_tokens = set(['int@@::@@4','void@@::@@1','cstdio@@::@@1','action@@::@@1','static@@::@@1','key@@::@@1','glfw_key_callback@@::@@1','mod@@::@@1','if@@::@@1','glfw3@@::@@1','scancode@@::@@1','h@@::@@1','GLFW_INCLUDE_GLU@@::@@1','input_event_queue@@::@@2','GLFW@@::@@1','push@@::@@1','inputaction@@::@@1','include@@::@@3']) + hard_tokens = set(['int@@::@@4', 'void@@::@@1', 'cstdio@@::@@1', 'action@@::@@1', 'static@@::@@1', 'key@@::@@1', + 'glfw_key_callback@@::@@1', 'mod@@::@@1', 'if@@::@@1', 'glfw3@@::@@1', 'scancode@@::@@1', + 'h@@::@@1', 'GLFW_INCLUDE_GLU@@::@@1', 'input_event_queue@@::@@2', 'GLFW@@::@@1', + 'push@@::@@1', 'inputaction@@::@@1', 'include@@::@@3']) this_tokens = set(tokens[3:].split(',')) - self.assertTrue(len(hard_tokens - this_tokens),0) + self.assertTrue(len(hard_tokens - this_tokens), 
0) m = hashlib.md5() - m.update(tokens[3:]) - self.assertEqual(m.hexdigest(),token_hash) + + m.update(tokens[3:].encode('utf-8')) + self.assertEqual(m.hexdigest(), token_hash) + if __name__ == '__main__': unittest.main() diff --git a/tokenizers/file-level/tokenizer.py b/tokenizers/file-level/tokenizer.py index b61be834b..9ad0c7912 100644 --- a/tokenizers/file-level/tokenizer.py +++ b/tokenizers/file-level/tokenizer.py @@ -10,11 +10,14 @@ import datetime as dt import zipfile import javalang +import pathlib +import shutil +from subprocess import call try: - from configparser import ConfigParser + from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from configparser import ConfigParser # ver. < 3.0 MULTIPLIER = 50000000 @@ -33,395 +36,442 @@ comment_inline_pattern = comment_inline + '.*?$' comment_open_tag = '' comment_close_tag = '' +second_comment_open_tag = '' +second_comment_close_tag = '' comment_open_close_pattern = comment_open_tag + '.*?' + comment_close_tag file_extensions = '.none' file_count = 0 +shutil.rmtree('files_stats') +shutil.rmtree('bookkeeping_projs') +shutil.rmtree('files_tokens') +shutil.rmtree('logs') + + def read_config(): - global N_PROCESSES, PROJECTS_BATCH, FILE_projects_list, FILE_priority_projects - global PATH_stats_file_folder, PATH_bookkeeping_proj_folder, PATH_tokens_file_folder, PATH_logs - global separators, comment_inline, comment_inline_pattern, comment_open_tag, comment_close_tag, comment_open_close_pattern - global file_extensions - - global init_file_id - global init_proj_id - - # instantiate - config = ConfigParser() - - # parse existing file - try: - config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)) , 'config.ini')) - except IOError: - print 'ERROR - Config settings not found. Usage: $python this-script.py config-file.ini' - sys.exit() - - # Get info from config.ini into global variables - N_PROCESSES = config.getint('Main', 'N_PROCESSES') - PROJECTS_BATCH = config.getint('Main', 'PROJECTS_BATCH') - FILE_projects_list = config.get('Main', 'FILE_projects_list') - if config.has_option('Main', 'FILE_priority_projects'): - FILE_priority_projects = config.get('Main', 'FILE_priority_projects') - PATH_stats_file_folder = config.get('Folders/Files', 'PATH_stats_file_folder') - PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder') - PATH_tokens_file_folder = config.get('Folders/Files', 'PATH_tokens_file_folder') - PATH_logs = config.get('Folders/Files', 'PATH_logs') - - # Reading Language settings - separators = config.get('Language', 'separators').strip('"').split(' ') - comment_inline = re.escape(config.get('Language', 'comment_inline')) - comment_inline_pattern = comment_inline + '.*?$' - comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) - comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) - comment_open_close_pattern = comment_open_tag + '.*?' 
+ comment_close_tag - file_extensions = config.get('Language', 'File_extensions').split(' ') - - # Reading config settings - init_file_id = config.getint('Config', 'init_file_id') - init_proj_id = config.getint('Config', 'init_proj_id') - -def tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, separators): - - final_stats = 'ERROR' - final_tokens = 'ERROR' - - file_hash = 'ERROR' - lines = 'ERROR' - LOC = 'ERROR' - SLOC = 'ERROR' - - h_time = dt.datetime.now() - m = hashlib.md5() - m.update(file_string) - file_hash = m.hexdigest() - hash_time = (dt.datetime.now() - h_time).microseconds - - lines = file_string.count('\n') - if not file_string.endswith('\n'): - lines += 1 - file_string = "".join([s for s in file_string.splitlines(True) if s.strip()]) - - LOC = file_string.count('\n') - if not file_string.endswith('\n'): - LOC += 1 - - re_time = dt.datetime.now() - # Remove tagged comments - file_string = re.sub(comment_open_close_pattern, '', file_string, flags=re.DOTALL) - # Remove end of line comments - file_string = re.sub(comment_inline_pattern, '', file_string, flags=re.MULTILINE) - re_time = (dt.datetime.now() - re_time).microseconds - - file_string = "".join([s for s in file_string.splitlines(True) if s.strip()]).strip() - - SLOC = file_string.count('\n') - if file_string != '' and not file_string.endswith('\n'): - SLOC += 1 - - final_stats = (file_hash,lines,LOC,SLOC) - - # Rather a copy of the file string here for tokenization - file_string_for_tokenization = file_string.decode('utf-8') - - #Transform separators into spaces (remove them) - s_time = dt.datetime.now() - for x in separators: - file_string_for_tokenization = file_string_for_tokenization.replace(x,' ') - s_time = (dt.datetime.now() - s_time).microseconds - - ##Create a list of tokens - file_string_for_tokenization = file_string_for_tokenization.split() - ## Total number of tokens - tokens_count_total = len(file_string_for_tokenization) - ##Count occurrences - file_string_for_tokenization = collections.Counter(file_string_for_tokenization) - ##Converting Counter to dict because according to StackOverflow is better - file_string_for_tokenization=dict(file_string_for_tokenization) - ## Unique number of tokens - tokens_count_unique = len(file_string_for_tokenization) - - t_time = dt.datetime.now() - #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k.encode('utf-8'), v) - for k,v in file_string_for_tokenization.iteritems()]) - t_time = (dt.datetime.now() - t_time).microseconds - - # MD5 - h_time = dt.datetime.now() - m = hashlib.md5() - m.update(tokens) - hash_time += (dt.datetime.now() - h_time).microseconds - - final_tokens = (tokens_count_total,tokens_count_unique,m.hexdigest(),'@#@'+tokens) - - return (final_stats, final_tokens, [s_time, t_time, hash_time, re_time]) - -def process_file_contents(file_string, proj_id, file_id, container_path, - file_path, file_bytes, proj_url, FILE_tokens_file, FILE_stats_file, logging): - - logging.info('Attempting to process_file_contents '+os.path.join(container_path, file_path)) - - global file_count - file_count += 1 - - (final_stats, final_tokens, file_parsing_times) = tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, separators) - - (file_hash,lines,LOC,SLOC) = final_stats - - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - file_url = proj_url + '/' + file_path[7:].replace(' ','%20') - file_path = os.path.join(container_path, file_path) + global N_PROCESSES, PROJECTS_BATCH, 
FILE_projects_list, FILE_priority_projects + global PATH_stats_file_folder, PATH_bookkeeping_proj_folder, PATH_tokens_file_folder, PATH_logs + global separators, comment_inline, comment_inline_pattern, comment_open_tag, comment_close_tag, \ + comment_open_close_pattern, second_comment_open_tag, second_comment_close_tag, second_comment_open_close_pattern + global file_extensions - ww_time = dt.datetime.now() - FILE_stats_file.write(','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n') - w_time = (dt.datetime.now() - ww_time).microseconds + global init_file_id + global init_proj_id + + # instantiate + config = ConfigParser() + + # parse existing file + try: + config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) + except IOError: + print('ERROR - Config settings not found. Usage: $python this-script.py config-file.ini') + sys.exit() + + # Get info from config.ini into global variables + N_PROCESSES = config.getint('Main', 'N_PROCESSES') + + PROJECTS_BATCH = config.getint('Main', 'PROJECTS_BATCH') + FILE_projects_list = config.get('Main', 'FILE_projects_list') + + print(FILE_projects_list, 'FILE_projects_list') + if config.has_option('Main', 'FILE_priority_projects'): + FILE_priority_projects = config.get('Main', 'FILE_priority_projects') + + PATH_stats_file_folder = config.get('Folders/Files', 'PATH_stats_file_folder') + PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder') + PATH_tokens_file_folder = config.get('Folders/Files', 'PATH_tokens_file_folder') + PATH_logs = config.get('Folders/Files', 'PATH_logs') + + # Reading Language settings + separators = config.get('Language', 'separators').strip('"').split(' ') + comment_inline = re.escape(config.get('Language', 'comment_inline')) + comment_inline_pattern = comment_inline + '.*?$' + comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) + comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) + comment_open_close_pattern = comment_open_tag + '.*?' + comment_close_tag + + second_comment_open_tag = re.escape(config.get('Language', 'second_comment_open_tag')) + second_comment_close_tag = re.escape(config.get('Language', 'second_comment_close_tag')) + second_comment_open_close_pattern = second_comment_open_tag + '.*?' 
+ second_comment_close_tag + + file_extensions = config.get('Language', 'File_extensions').split(' ') + + # Reading config settings + init_file_id = config.getint('Config', 'init_file_id') + init_proj_id = config.getint('Config', 'init_proj_id') + + +def tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, second_comment_open_close_pattern, + separators): + final_stats = 'ERROR' + final_tokens = 'ERROR' + + file_hash = 'ERROR' + lines = 'ERROR' + LOC = 'ERROR' + SLOC = 'ERROR' + + h_time = dt.datetime.now() + m = hashlib.md5() + + m.update(file_string) + + file_hash = m.hexdigest() + + hash_time = (dt.datetime.now() - h_time).microseconds + print(type(file_string), 'file_string') + lines = file_string.count(b"\n") + print(lines, 'file_string') + if not file_string.endswith(b'\n'): + lines += 1 + + file_string = b"".join([s for s in file_string.splitlines(True) if s.strip()]) + + LOC = file_string.count(b'\n') + if not file_string.endswith(b'\n'): + LOC += 1 + + re_time = dt.datetime.now() + + # Remove tagged comments + file_string = re.sub(bytes(comment_open_close_pattern, 'utf-8'), b'', file_string, flags=re.DOTALL) + # Remove second end of line comments + file_string = re.sub(bytes(second_comment_open_close_pattern, 'utf-8'), b'', file_string, flags=re.DOTALL) + # Remove end of line comments + file_string = re.sub(bytes(comment_inline_pattern, 'utf-8'), b'', file_string, flags=re.MULTILINE) + + re_time = (dt.datetime.now() - re_time).microseconds + + file_string = b"".join([s for s in file_string.splitlines(True) if s.strip()]).strip() + print(file_string) + SLOC = file_string.count(b'\n') + if file_string != '' and not file_string.endswith(b'\n'): + SLOC += 1 + + final_stats = (file_hash, lines, LOC, SLOC) + print(file_string, 'file_string file_string file_string') + # Rather a copy of the file string here for tokenization + file_string_for_tokenization = file_string.decode('utf-8') - ww_time = dt.datetime.now() - FILE_tokens_file.write(','.join([proj_id,str(file_id),str(tokens_count_total),str(tokens_count_unique),token_hash+tokens]) + '\n') - w_time += (dt.datetime.now() - ww_time).microseconds + # Transform separators into spaces (remove them) + s_time = dt.datetime.now() + for x in separators: + file_string_for_tokenization = file_string_for_tokenization.replace(x, ' ') + s_time = (dt.datetime.now() - s_time).microseconds - logging.info('Successfully ran process_file_contents '+os.path.join(container_path, file_path)) + ##Create a list of tokens + file_string_for_tokenization = file_string_for_tokenization.split() + ## Total number of tokens + tokens_count_total = len(file_string_for_tokenization) + ##Count occurrences + file_string_for_tokenization = collections.Counter(file_string_for_tokenization) + ##Converting Counter to dict because according to StackOverflow is better + file_string_for_tokenization = dict(file_string_for_tokenization) + ## Unique number of tokens - return file_parsing_times + [w_time] # [s_time, t_time, w_time, hash_time, re_time] + tokens_count_unique = len(file_string_for_tokenization) -def process_regular_folder(args, folder_path, files): - process_num, proj_id, proj_path, proj_url, base_file_id, \ - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times = args + t_time = dt.datetime.now() + # SourcererCC formatting + tokens = ','.join(['{}@@::@@{}'.format(k.encode('utf-8'), v) + for k, v in file_string_for_tokenization.items()]) + t_time = (dt.datetime.now() - t_time).microseconds - file_time = string_time = 
tokens_time = hash_time = write_time = regex_time = 0 - all_files = files - - # Filter them by the correct extension - aux = [] - for extension in file_extensions: - aux.extend([x for x in all_files if x.endswith(extension)]) - all_files = aux - - # This is very strange, but I did find some paths with newlines, - # so I am simply eliminates them - all_files = [x for x in all_files if '\n' not in x] - - for file_path in all_files: - file_id = process_num*MULTIPLIER + base_file_id + file_count - print "<%s, %s, %s>" %(file_id, folder_path, file_path) - file_path = os.path.join(folder_path, file_path) - - with open(file_path) as f: - f_time = dt.datetime.now() - file_string = f.read() - f_time = (dt.datetime.now() - f_time).microseconds - - times_c = process_file_contents(file_string, proj_id, file_id, "", file_path, str(os.path.getsize(file_path)), - proj_url, FILE_tokens_file, FILE_stats_file, logging) - times[0] += f_time - times[1] += times_c[0] - times[2] += times_c[1] - times[3] += times_c[4] - times[4] += times_c[2] - times[5] += times_c[3] - - -def process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): - zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 - - try: - with tarfile.open(tar_file,'r|*') as my_tar_file: - - for f in my_tar_file: - if not f.isfile(): - continue - - file_path = f.name - # Filter by the correct extension - if not os.path.splitext(f.name)[1] in file_extensions: - continue - - # This is very strange, but I did find some paths with newlines, - # so I am simply ignoring them - if '\n' in file_path: - continue - - file_id = process_num*MULTIPLIER + base_file_id + file_count - - file_bytes=str(f.size) - - z_time = dt.datetime.now() - try: - myfile = my_tar_file.extractfile(f) - except: - logging.warning('Unable to open file (1) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+ - '> (process '+str(process_num)+')') - break - zip_time += (dt.datetime.now() - z_time).microseconds - - if myfile is None: - logging.warning('Unable to open file (2) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+ - '> (process '+str(process_num)+')') - break - - f_time = dt.datetime.now() - file_string = myfile.read() - file_time += (dt.datetime.now() - f_time).microseconds - - times = process_file_contents(file_string, proj_id, file_id, tar_file, file_path, file_bytes, - proj_url, FILE_tokens_file, FILE_stats_file, logging) - string_time += times[0] - tokens_time += times[1] - write_time += times[4] - hash_time += times[2] - regex_time += times[3] - -# if (file_count % 50) == 0: -# logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', -# zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - - except Exception as e: - logging.warning('Unable to open tar on <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - logging.warning(e) - return - - return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - -def process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): - zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 - - logging.info('Attempting to process_zip_ball '+zip_file) - - try: - with zipfile.ZipFile(proj_path,'r') as my_file: - - for file in my_file.infolist(): - - if not 
os.path.splitext(file.filename)[1] in file_extensions: - continue - - file_path = file.filename - - # This is very strange, but I did find some paths with newlines, - # so I am simply ignoring them - if '\n' in file_path: - continue - - file_id = process_num*MULTIPLIER + base_file_id + file_count - - file_bytes=str(file.file_size) - - z_time = dt.datetime.now() - try: - my_zip_file = my_file.open(file.filename,'r') - except: - logging.warning('Unable to open file (1) <'+os.path.join(proj_path,file)+'> (process '+str(process_num)+')') - break - zip_time += (dt.datetime.now() - z_time).microseconds - - if my_zip_file is None: - logging.warning('Unable to open file (2) <'+os.path.join(proj_path,file)+'> (process '+str(process_num)+')') - break - - f_time = dt.datetime.now() - file_string = my_zip_file.read() - file_time += (dt.datetime.now() - f_time).microseconds - - times = process_file_contents(file_string, proj_id, file_id, zip_file, file_path, file_bytes, - proj_url, FILE_tokens_file, FILE_stats_file, logging) - - string_time += times[0] - tokens_time += times[1] - write_time += times[4] - hash_time += times[2] - regex_time += times[3] - -# if (file_count % 50) == 0: -# logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', -# zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - - except Exception as e: - logging.warning('Unable to open zip on <'+proj_path+'> (process '+str(process_num)+')') - logging.warning(e) - return - - logging.info('Successfully ran process_zip_ball '+zip_file) - return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - -def process_one_project(process_num, proj_id, proj_path, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format): - - p_start = dt.datetime.now() - - if project_format == 'leidos': - proj_path = proj_path - proj_url = 'None' - - logging.info('Starting leidos project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - - if not os.path.isdir(proj_path): - logging.warning('Unable to open project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - return - - # Search for tar files with _code in them - tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if os.path.isfile(os.path.join(proj_path, f))] - tar_files = [f for f in tar_files if '_code' in f] - if(len(tar_files) != 1): - logging.warning('Tar not found on <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - times = [0,0,0,0,0,0,0] - os.path.walk(proj_path, process_regular_folder, - (process_num, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times)) - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - zip_time = 0 - else: - tar_file = tar_files[0] - times = process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) - if times is not None: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - else: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = (-1,-1,-1,-1,-1,-1,-1) + # MD5 + h_time = dt.datetime.now() + m = hashlib.md5() + m.update(bytes(tokens, 'utf-8')) + hash_time += (dt.datetime.now() - h_time).microseconds - FILE_bookkeeping_proj.write(proj_id+',\"'+proj_path+'\",\"'+proj_url+'\"\n') + final_tokens = (tokens_count_total, 
tokens_count_unique, m.hexdigest(), '@#@' + tokens) - if project_format in ['zip']: - proj_url = 'NULL' + return (final_stats, final_tokens, [s_time, t_time, hash_time, re_time]) - logging.info('Starting zip project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - if not os.path.isfile(proj_path): - logging.warning('Unable to open project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - return +def process_file_contents(file_string, proj_id, file_id, container_path, + file_path, file_bytes, proj_url, FILE_tokens_file, FILE_stats_file, logging): + logging.info('Attempting to process_file_contents ' + os.path.join(container_path, file_path)) - zip_file = proj_path - times = process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) - if times is not None: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - else: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = (-1,-1,-1,-1,-1,-1,-1) + global file_count + file_count += 1 + + (final_stats, final_tokens, file_parsing_times) = tokenize_files(file_string, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, separators) - FILE_bookkeeping_proj.write(proj_id+',\"'+proj_path+'\",\"'+proj_url+'\"\n') + (file_hash, lines, LOC, SLOC) = final_stats + + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + file_url = proj_url + '/' + file_path[7:].replace(' ', '%20') + file_path = os.path.join(container_path, file_path) + + ww_time = dt.datetime.now() + FILE_stats_file.write(','.join( + [proj_id, str(file_id), '\"' + file_path + '\"', '\"' + file_url + '\"', '\"' + file_hash + '\"', file_bytes, + str(lines), str(LOC), str(SLOC)]) + '\n') + w_time = (dt.datetime.now() - ww_time).microseconds + + ww_time = dt.datetime.now() + FILE_tokens_file.write(','.join( + [proj_id, str(file_id), str(tokens_count_total), str(tokens_count_unique), token_hash + tokens]) + '\n') + w_time += (dt.datetime.now() - ww_time).microseconds + + logging.info('Successfully ran process_file_contents ' + os.path.join(container_path, file_path)) + + return file_parsing_times + [w_time] # [s_time, t_time, w_time, hash_time, re_time] + + +def process_regular_folder(args, folder_path, files): + process_num, proj_id, proj_path, proj_url, base_file_id, \ + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times = args + + file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + all_files = files + + # Filter them by the correct extension + aux = [] + for extension in file_extensions: + aux.extend([x for x in all_files if x.endswith(extension)]) + all_files = aux + + # This is very strange, but I did find some paths with newlines, + # so I am simply eliminates them + all_files = [x for x in all_files if '\n' not in x] + + for file_path in all_files: + file_id = process_num * MULTIPLIER + base_file_id + file_count + print("<%s, %s, %s>" % (file_id, folder_path, file_path)) + file_path = os.path.join(folder_path, file_path) + + with open(file_path) as f: + f_time = dt.datetime.now() + file_string = f.read() + f_time = (dt.datetime.now() - f_time).microseconds + + times_c = process_file_contents(file_string, proj_id, file_id, "", file_path, + str(os.path.getsize(file_path)), + proj_url, FILE_tokens_file, FILE_stats_file, logging) + times[0] += f_time + times[1] += times_c[0] + times[2] += 
times_c[1] + times[3] += times_c[4] + times[4] += times_c[2] + times[5] += times_c[3] + + +def process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): + zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + + try: + with tarfile.open(tar_file, 'r|*') as my_tar_file: + + for f in my_tar_file: + if not f.isfile(): + continue + + file_path = f.name + # Filter by the correct extension + if not os.path.splitext(f.name)[1] in file_extensions: + continue + + # This is very strange, but I did find some paths with newlines, + # so I am simply ignoring them + if '\n' in file_path: + continue + + file_id = process_num * MULTIPLIER + base_file_id + file_count + + file_bytes = str(f.size) + + z_time = dt.datetime.now() + try: + myfile = my_tar_file.extractfile(f) + except: + logging.warning( + 'Unable to open file (1) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, + file_path) + + '> (process ' + str(process_num) + ')') + break + zip_time += (dt.datetime.now() - z_time).microseconds + + if myfile is None: + logging.warning( + 'Unable to open file (2) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, + file_path) + + '> (process ' + str(process_num) + ')') + break + + f_time = dt.datetime.now() + file_string = myfile.read() + file_time += (dt.datetime.now() - f_time).microseconds + + times = process_file_contents(file_string, proj_id, file_id, tar_file, file_path, file_bytes, + proj_url, FILE_tokens_file, FILE_stats_file, logging) + string_time += times[0] + tokens_time += times[1] + write_time += times[4] + hash_time += times[2] + regex_time += times[3] + + # if (file_count % 50) == 0: + # logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', + # zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + except Exception as e: + logging.warning('Unable to open tar on <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + logging.warning(e) + return + + return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + +def process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): + zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + + logging.info('Attempting to process_zip_ball ' + zip_file) + + try: + with zipfile.ZipFile(proj_path, 'r') as my_file: + for file in my_file.infolist(): + if not os.path.splitext(file.filename)[1] in file_extensions: + continue + + file_path = file.filename + # This is very strange, but I did find some paths with newlines, + # so I am simply ignoring them + if '\n' in file_path: + continue + file_id = process_num * MULTIPLIER + base_file_id + file_count + file_bytes = str(file.file_size) + z_time = dt.datetime.now() + try: + my_zip_file = my_file.open(file.filename, 'r') + except: + logging.warning( + 'Unable to open file (1) <' + os.path.join(proj_path, file) + '> (process ' + str( + process_num) + ')') + break + zip_time += (dt.datetime.now() - z_time).microseconds + if my_zip_file is None: + logging.warning( + 'Unable to open file (2) <' + os.path.join(proj_path, file) + '> (process ' + str( + process_num) + ')') + break + + f_time = dt.datetime.now() + file_string = my_zip_file.read() + + file_time += (dt.datetime.now() - f_time).microseconds + + times = 
process_file_contents(file_string, proj_id, file_id, zip_file, file_path, file_bytes, + proj_url, FILE_tokens_file, FILE_stats_file, logging) + + string_time += times[0] + tokens_time += times[1] + write_time += times[4] + hash_time += times[2] + regex_time += times[3] + + # if (file_count % 50) == 0: + # logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', + # zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + + except Exception as e: + logging.warning('Unable to open zip on <' + proj_path + '> (process ' + str(process_num) + ')') + logging.warning(e) + return + + logging.info('Successfully ran process_zip_ball ' + zip_file) + return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + +def process_one_project(process_num, proj_id, proj_path, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format): + p_start = dt.datetime.now() + + if project_format == 'leidos': + proj_path = proj_path + proj_url = 'None' + + logging.info('Starting leidos project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + + if not os.path.isdir(proj_path): + logging.warning( + 'Unable to open project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + return + + # Search for tar files with _code in them + tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if + os.path.isfile(os.path.join(proj_path, f))] + tar_files = [f for f in tar_files if '_code' in f] + if (len(tar_files) != 1): + logging.warning('Tar not found on <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + times = [0, 0, 0, 0, 0, 0, 0] + os.path.walk(proj_path, process_regular_folder, + (process_num, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times)) + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + zip_time = 0 + else: + tar_file = tar_files[0] + times = process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) + if times is not None: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + else: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( + -1, -1, -1, -1, -1, -1, -1) + + FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') + + if project_format in ['zip']: + proj_url = 'NULL' + logging.info('Starting zip project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + if not os.path.isfile(proj_path): + logging.warning( + 'Unable to open project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + return + + zip_file = proj_path + + times = process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) + if times is not None: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + else: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( + -1, -1, -1, -1, -1, -1, -1) + + FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') + + p_elapsed = dt.datetime.now() - p_start + logging.info('Project finished <%s,%s> (process %s)', proj_id, proj_path, process_num) + logging.info( 
+ ' (%s): Total: %smicros | Zip: %s Read: %s Separators: %smicros Tokens: %smicros Write: %smicros Hash: %s regex: %s', + process_num, p_elapsed, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - p_elapsed = dt.datetime.now() - p_start - logging.info('Project finished <%s,%s> (process %s)', proj_id, proj_path, process_num) - logging.info(' (%s): Total: %smicros | Zip: %s Read: %s Separators: %smicros Tokens: %smicros Write: %smicros Hash: %s regex: %s', - process_num, p_elapsed, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) def process_projects(process_num, list_projects, base_file_id, global_queue, project_format): - if platform.system() =='Windows': + if platform.system() == 'Windows': read_config() # Logging code FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' - logging.basicConfig(level=logging.DEBUG,format=FORMAT) - file_handler = logging.FileHandler(os.path.join(PATH_logs,'LOG-'+str(process_num)+'.log')) + logging.basicConfig(level=logging.DEBUG, format=FORMAT) + file_handler = logging.FileHandler(os.path.join(PATH_logs, 'LOG-' + str(process_num) + '.log')) file_handler.setFormatter(logging.Formatter(FORMAT)) logging.getLogger().addHandler(file_handler) - FILE_files_stats_file = os.path.join(PATH_stats_file_folder,'files-stats-'+str(process_num)+'.stats') - FILE_bookkeeping_proj_name = os.path.join(PATH_bookkeeping_proj_folder,'bookkeeping-proj-'+str(process_num)+'.projs') - FILE_files_tokens_file = os.path.join(PATH_tokens_file_folder,'files-tokens-'+str(process_num)+'.tokens') + FILE_files_stats_file = os.path.join(PATH_stats_file_folder, 'files-stats-' + str(process_num) + '.stats') + FILE_bookkeeping_proj_name = os.path.join(PATH_bookkeeping_proj_folder, + 'bookkeeping-proj-' + str(process_num) + '.projs') + FILE_files_tokens_file = os.path.join(PATH_tokens_file_folder, 'files-tokens-' + str(process_num) + '.tokens') global file_count file_count = 0 @@ -431,127 +481,141 @@ def process_projects(process_num, list_projects, base_file_id, global_queue, pro logging.info("Process %s starting", process_num) p_start = dt.datetime.now() for proj_id, proj_path in list_projects: - process_one_project(process_num, str(proj_id), proj_path, base_file_id, + process_one_project(process_num, str(proj_id), proj_path, base_file_id, FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format) p_elapsed = (dt.datetime.now() - p_start).seconds - logging.info('Process %s finished. %s files in %ss.', + logging.info('Process %s finished. %s files in %ss.', process_num, file_count, p_elapsed) # Let parent know global_queue.put((process_num, file_count)) sys.exit(0) + def start_child(processes, global_queue, proj_paths, batch, project_format): - # This is a blocking get. If the queue is empty, it waits - pid, n_files_processed = global_queue.get() - # OK, one of the processes finished. Let's get its data and kill it - kill_child(processes, pid, n_files_processed) + # This is a blocking get. If the queue is empty, it waits + pid, n_files_processed = global_queue.get() + # OK, one of the processes finished. 
Let's get its data and kill it + kill_child(processes, pid, n_files_processed) - # Get a new batch of project paths ready - paths_batch = proj_paths[:batch] - del proj_paths[:batch] + # Get a new batch of project paths ready + paths_batch = proj_paths[:batch] + del proj_paths[:batch] + + print("Starting new process %s" % (pid)) + p = Process(name='Process ' + str(pid), target=process_projects, + args=(pid, paths_batch, processes[pid][1], global_queue, project_format,)) + processes[pid][0] = p + p.start() - print "Starting new process %s" % (pid) - p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, )) - processes[pid][0] = p - p.start() def kill_child(processes, pid, n_files_processed): - global file_count - file_count += n_files_processed - if processes[pid][0] != None: - processes[pid][0] = None - processes[pid][1] += n_files_processed - - print "Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count) + global file_count + file_count += n_files_processed + if processes[pid][0] != None: + processes[pid][0] = None + processes[pid][1] += n_files_processed -def active_process_count(processes): - count = 0 - for p in processes: - if p[0] != None: - count +=1 - return count + print("Process %s finished, %s files processed (%s). Current total: %s" % ( + pid, n_files_processed, processes[pid][1], file_count)) -if __name__ == '__main__': - global project_format - project_format = sys.argv[1] # 'zip' or 'leidos' - - if project_format not in ['zip','leidos']: - print "ERROR - Please insert archive format, 'zip', 'leidos'!" - sys.exit() - - read_config() - p_start = dt.datetime.now() - - prio_proj_paths = [] - if FILE_priority_projects != None: - with open(FILE_priority_projects) as f: - for line in f: - line_split = line.strip('\n') # [:-1] to strip final character which is '\n' - prio_proj_paths.append(line_split) - prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths) - - proj_paths = [] - if project_format == 'leidos': - print '\'',project_format,'\'','format' - with open(FILE_projects_list) as f: - for line in f: - prio = False - line_split = line.strip('\n') # [:-1] to strip final character which is '\n' - for p in prio_proj_paths: - if p[1][0] == line_split: - prio = True - print "Project %s is in priority list" % line_split - if not prio: - proj_paths.append(line_split) - proj_paths = zip(range(1, len(proj_paths)+1), proj_paths) - if project_format in ['zip']: - print '\'',project_format,'\'','format' - with open(FILE_projects_list) as f: - for line in f: - proj_paths.append(line[:-1]) - proj_paths = zip(range(1, len(proj_paths)+1), proj_paths) - - if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs): - print 'ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!' 
- sys.exit(1) - else: - os.makedirs(PATH_stats_file_folder) - os.makedirs(PATH_bookkeeping_proj_folder) - os.makedirs(PATH_tokens_file_folder) - os.makedirs(PATH_logs) - - #Split list of projects into N_PROCESSES lists - #proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] - - # Multiprocessing with N_PROCESSES - # [process, file_count] - processes = [[None, init_file_id] for i in xrange(N_PROCESSES)] - # Multiprocessing shared variable instance for recording file_id - #file_id_global_var = Value('i', 1) - # The queue for processes to communicate back to the parent (this process) - # Initialize it with N_PROCESSES number of (process_id, n_files_processed) - global_queue = Queue() - for i in xrange(N_PROCESSES): - global_queue.put((i, 0)) - - # Start the priority projects - print "*** Starting priority projects..." - while len(prio_proj_paths) > 0: - start_child(processes, global_queue, prio_proj_paths, 1, project_format) - - # Start all other projects - print "*** Starting regular projects..." - while len(proj_paths) > 0: - start_child(processes, global_queue, proj_paths, PROJECTS_BATCH, project_format) - - print "*** No more projects to process. Waiting for children to finish..." - while active_process_count(processes) > 0: - pid, n_files_processed = global_queue.get() - kill_child(processes, pid, n_files_processed) +def active_process_count(processes): + count = 0 + for p in processes: + if p[0] != None: + count += 1 + return count + - p_elapsed = dt.datetime.now() - p_start - print "*** All done. %s files in %s" % (file_count, p_elapsed) +if __name__ == '__main__': + global project_format + project_format = sys.argv[1] # 'zip' or 'leidos' + + if project_format not in ['zip', 'leidos']: + print("ERROR - Please insert archive format, 'zip', 'leidos'!") + sys.exit() + + read_config() + p_start = dt.datetime.now() + + prio_proj_paths = [] + + if FILE_priority_projects != None: + with open(FILE_priority_projects) as f: + for line in f: + line_split = line.strip('\n') # [:-1] to strip final character which is '\n' + prio_proj_paths.append(line_split) + prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths) + init_proj_id)), prio_proj_paths)) + + proj_paths = [] + if project_format == 'leidos': + print('\'', project_format, '\'', 'format') + with open(FILE_projects_list) as f: + + for line in f: + prio = False + line_split = line.strip('\n') # [:-1] to strip final character which is '\n' + for p in prio_proj_paths: + if p[1][0] == line_split: + prio = True + print("Project %s is in priority list" % line_split) + if not prio: + proj_paths.append(line_split) + proj_paths = list(zip(list(range(1, len(proj_paths) + 1)), proj_paths)) + + if project_format in ['zip']: + print('\'', project_format, '\'', 'format') + with open(FILE_projects_list) as f: + for line in f: + proj_paths.append(line[:-1]) + proj_paths = list(zip(list(range(1, len(proj_paths) + 1)), proj_paths)) + + if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists( + PATH_tokens_file_folder) or os.path.exists(PATH_logs): + print( + 'ERROR - Folder [' + PATH_stats_file_folder + '] or [' + PATH_bookkeeping_proj_folder + '] or [' + PATH_tokens_file_folder + '] or [' + PATH_logs + '] already exists!') + sys.exit(1) + else: + os.makedirs(PATH_stats_file_folder) + os.makedirs(PATH_bookkeeping_proj_folder) + os.makedirs(PATH_tokens_file_folder) + os.makedirs(PATH_logs) + + # Split list of projects into N_PROCESSES lists + # proj_paths_list = [ 
proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] + + # Multiprocessing with N_PROCESSES + # [process, file_count] + processes = [[None, init_file_id] for i in range(N_PROCESSES)] + # Multiprocessing shared variable instance for recording file_id + # file_id_global_var = Value('i', 1) + # The queue for processes to communicate back to the parent (this process) + # Initialize it with N_PROCESSES number of (process_id, n_files_processed) + global_queue = Queue() + for i in range(N_PROCESSES): + global_queue.put((i, 0)) + + # Start the priority projects + print("*** Starting priority projects...") + while len(prio_proj_paths) > 0: + start_child(processes, global_queue, prio_proj_paths, 1, project_format) + + # Start all other projects + print("*** Starting regular projects...") + while len(proj_paths) > 0: + start_child(processes, global_queue, proj_paths, PROJECTS_BATCH, project_format) + + print("*** No more projects to process. Waiting for children to finish...") + while active_process_count(processes) > 0: + pid, n_files_processed = global_queue.get() + kill_child(processes, pid, n_files_processed) + + p_elapsed = dt.datetime.now() - p_start + print("*** All done. %s files in %s" % (file_count, p_elapsed)) + + call('cat ./files_tokens/* > blocks.file', shell=True) + shutil.move('./blocks.file', + '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/') \ No newline at end of file From f53261e4ffcd20aa7f29f098c1aa8e23c6149d31 Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:26:09 -0700 Subject: [PATCH 2/3] more changes to upgrade python version --- .../utils/get_source_from_tokens.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tokenizers/file-level/utils/get_source_from_tokens.py b/tokenizers/file-level/utils/get_source_from_tokens.py index 62228504a..459100e0b 100644 --- a/tokenizers/file-level/utils/get_source_from_tokens.py +++ b/tokenizers/file-level/utils/get_source_from_tokens.py @@ -16,7 +16,7 @@ def grab_ids(folder_or_file): if os.path.isfile(folder_or_file): paths.add(folder_or_file) else: - print "ERROR: '",projects_from_blocks,"' not found!" + print("ERROR: '",projects_from_blocks,"' not found!") res = set() for p in paths: @@ -38,7 +38,7 @@ def copy_files(ids_set, folder_or_file, output_folder): if os.path.isfile(folder_or_file): paths.add(folder_or_file) else: - print "ERROR: '",projects_from_blocks,"' not found!" + print("ERROR: '",projects_from_blocks,"' not found!") for p in paths: with open(p,'r') as file: @@ -59,7 +59,7 @@ def copy_files(ids_set, folder_or_file, output_folder): with open(os.path.join(output_folder,file_path), 'w') as f: f.write(z.read(file_path)) except Exception as e: - print 'ERROR reading',zip_path,e + print('ERROR reading',zip_path,e) copy_count += 1 @@ -81,40 +81,40 @@ def copy_files(ids_set, folder_or_file, output_folder): (options, args) = parser.parse_args() if not len(sys.argv) > 1: - print "No arguments were passed. Try running with '--help'." + print("No arguments were passed. Try running with '--help'.") sys.exit(0) if (not options.tokensFiles) or (not options.statsFiles): - print "Arguments '-b' and '-s' are mandatory. Try running with '--help'." + print("Arguments '-b' and '-s' are mandatory. Try running with '--help'.") sys.exit(0) #### ARGUMENTS HANDLING MUST BE below output_folder = default_output_folder if options.outputDir: if os.path.isdir(options.outputDir): - print 'Folder',options.outputDir,'already exists.' 
+ print('Folder',options.outputDir,'already exists.') sys.exit(0) else: os.makedirs(options.outputDir) output_folder = options.outputDir - print 'Folder',options.outputDir,'created.' + print('Folder',options.outputDir,'created.') else: if os.path.isdir(default_output_folder): - print 'Folder',default_output_folder,'already exists.' + print('Folder',default_output_folder,'already exists.') sys.exit(0) else: os.makedirs(default_output_folder) - print 'Folder',default_output_folder,'created.' + print('Folder',default_output_folder,'created.') p_start = dt.datetime.now() - print 'Grabbing IDs...' + print('Grabbing IDs...') ids_set = set() ids_set = grab_ids(options.tokensFiles) - print '%s file ids in %s' % (len(ids_set), dt.datetime.now() - p_start) + print('%s file ids in %s' % (len(ids_set), dt.datetime.now() - p_start)) p_start = dt.datetime.now() - print 'Copying files...' + print('Copying files...') copy_count = copy_files(ids_set, options.statsFiles, default_output_folder) - print '%s files copied in %s' % (copy_count, dt.datetime.now() - p_start) + print('%s files copied in %s' % (copy_count, dt.datetime.now() - p_start)) From a515c0c9e2ce47874ec5ff80d167b9d5723af48e Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:36:26 -0700 Subject: [PATCH 3/3] more changes to upgrade python version --- tokenizers/file-level/config.ini | 2 + .../file-level/db-importer/clone_finder.py | 38 +++++++++---------- tokenizers/file-level/db-importer/db.py | 4 +- .../file-level/db-importer/mysql-import.py | 24 ++++++------ tokenizers/file-level/tokenizer-unit-test.py | 6 +-- tokenizers/file-level/tokenizer.py | 26 ++++++------- 6 files changed, 49 insertions(+), 51 deletions(-) diff --git a/tokenizers/file-level/config.ini b/tokenizers/file-level/config.ini index a34edf093..55ba359b8 100644 --- a/tokenizers/file-level/config.ini +++ b/tokenizers/file-level/config.ini @@ -18,6 +18,8 @@ separators = ; . [ ] ( ) ~ ! - + & * / %% < > & ^ | ? 
{ } = # , \ : $ " ' comment_inline = // comment_open_tag = /* comment_close_tag = */ +second_comment_open_tag = """ +second_comment_close_tag = """ ;.java File_extensions = .java ;.cpp .hpp .c .h .C .cc .CPP .c++ .cp diff --git a/tokenizers/file-level/db-importer/clone_finder.py b/tokenizers/file-level/db-importer/clone_finder.py index 1a6b642f7..b36501d20 100644 --- a/tokenizers/file-level/db-importer/clone_finder.py +++ b/tokenizers/file-level/db-importer/clone_finder.py @@ -15,7 +15,7 @@ log_path = 'LOG-db-clonefinder.log' if os.path.isfile(log_path): - print 'ERROR: Log file:',log_path,'already exists' + print('ERROR: Log file:',log_path,'already exists') sys.exit(1) FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' @@ -29,7 +29,7 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object): try: query = """SELECT fileId, projectId, f.fileHash, tokenHash FROM files as f JOIN stats as s ON f.fileHash=s.fileHash - WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(token_hashes.keys()) + "'", project_id) + WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(list(token_hashes.keys())) + "'", project_id) res = db_object.execute(query); logging.info(query) for (file_id, projectId, fileHash, tokenHash, ) in res: @@ -39,8 +39,8 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object): files_clones[f].add((str(file_id), projectId)) except Exception as e: - print 'Error on findAllTokenHashClones' - print e + print('Error on findAllTokenHashClones') + print(e) sys.exit(1) def find_clones_for_project(project_id, project_file_counts, db_object, debug): @@ -68,7 +68,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if debug == 'all': logging.debug('## After round 1') - for k, v in files_clones.iteritems(): + for k, v in files_clones.items(): if len(v) > 0: logging.debug('%s-%s', k, v) @@ -80,7 +80,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): project_file_set = {} clone_set = set() - for fid, clones in files_clones.iteritems(): + for fid, clones in files_clones.items(): project_counted = False for clone in clones: projectId = clone[1] @@ -90,14 +90,14 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): project_file_set[projectId].add(fid) # How many of this project's files are present in each of the other project? - for pid, file_list in project_file_set.iteritems(): + for pid, file_list in project_file_set.items(): percentage_clone_projects_counter[pid] = len(file_list) # How many of the other projects files are present in this project? 
for clone in clone_set: projectId = clone[1] - if percentage_host_projects_counter.has_key(projectId): + if projectId in percentage_host_projects_counter: percentage_host_projects_counter[projectId] += 1 else: percentage_host_projects_counter[projectId] = 1 @@ -105,7 +105,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if len(percentage_host_projects_counter) > 0: # The key k (projects) should be the same between # percentage_clone_projects_counter and percentage_host_projects_counter - for k, v in percentage_host_projects_counter.iteritems(): + for k, v in percentage_host_projects_counter.items(): percent_cloning = float(percentage_clone_projects_counter[k]*100)/total_files percent_host = float(v*100)/project_file_counts[k] @@ -116,7 +116,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if debug == 'all' or debug == 'final': if True:#(percent_cloning > 99) and (str(project_id) != k): - print 'Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]' + print('Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]') else: db_object.insert_projectClones(project_id, percentage_clone_projects_counter[k], total_files, float("{0:.2f}".format(percent_cloning)), @@ -124,8 +124,8 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): float("{0:.2f}".format(percent_host))) except Exception as e: - print 'Error on find_clones_for_project' - print e + print('Error on find_clones_for_project') + print(e) traceback.print_exc() sys.exit(1) @@ -157,8 +157,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c db_object.flush_projectClones() except Exception as e: - print 'Error in clone_finder.start_process' - print e + print('Error in clone_finder.start_process') + print(e) sys.exit(1) finally: @@ -216,14 +216,14 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c project_ids = [] - for projectId in project_file_counts.keys(): + for projectId in list(project_file_counts.keys()): project_ids.append(projectId) pair_number += 1 - project_ids = [ project_ids[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] + project_ids = [ project_ids[i::N_PROCESSES] for i in range(N_PROCESSES) ] processes = [] - for process_num in xrange(N_PROCESSES): + for process_num in range(N_PROCESSES): p = Process(name='Process '+str(process_num), target=start_process, args=(process_num, project_ids[process_num], DB_user, DB_name, DB_pass, project_file_counts, host, )) processes.append(p) @@ -232,8 +232,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c [p.join() for p in processes] except Exception as e: - print 'Error in clone_finder.__main__' - print e + print('Error in clone_finder.__main__') + print(e) sys.exit(1) finally: diff --git a/tokenizers/file-level/db-importer/db.py b/tokenizers/file-level/db-importer/db.py index f209d7f95..43950b123 100644 --- a/tokenizers/file-level/db-importer/db.py +++ b/tokenizers/file-level/db-importer/db.py @@ -295,7 +295,7 @@ def insert_file(self, file_id, proj_id, file_path, file_url, file_hash, flush = # Prepare the 
complete list if autoID: - self.files = map(lambda (a, b, c, d, e): (b, c, d, e), self.files) + self.files = [(a_b_c_d_e[1], a_b_c_d_e[2], a_b_c_d_e[3], a_b_c_d_e[4]) for a_b_c_d_e in self.files] flist = ','.join(self.files) self.check_connection() @@ -442,7 +442,7 @@ def project_exists(self, proj_path): def sanitize_string(self, string_input): # To clean non-ascii characters printable = set(string.printable) - string_res = filter(lambda x: x in printable, string_input) + string_res = [x for x in string_input if x in printable] return (string_res[:DB_MAX_STRING_SIZE]) def execute(self, query): diff --git a/tokenizers/file-level/db-importer/mysql-import.py b/tokenizers/file-level/db-importer/mysql-import.py index cd2c14986..6a696fa20 100644 --- a/tokenizers/file-level/db-importer/mysql-import.py +++ b/tokenizers/file-level/db-importer/mysql-import.py @@ -1,7 +1,7 @@ import sys, os, csv from db import DB import logging -import urllib +import urllib.request, urllib.parse, urllib.error pattern = r'\"(.+?)\"' flag = None @@ -94,8 +94,8 @@ def import_tokenizer_output_files_tokens(db, output_path, logging): logging.warning('String partitioned into:'+file_id+'|'+proj_id+path+'|'+url+'|'+file_hash+'|'+bytess+'|'+lines+'|'+loc+'|'+sloc) - path = urllib.quote(path.strip('"')) - url = urllib.quote(url.strip('"')) + path = urllib.parse.quote(path.strip('"')) + url = urllib.parse.quote(url.strip('"')) file_hash = file_hash.strip('"') if flag == 'files-autoID': @@ -218,8 +218,8 @@ def import_tokenizer_output_blocks_tokens(db, output_path, logging): logging.warning('String partitioned into:'+file_id+'|'+proj_id+path+'|'+url+'|'+file_hash+'|'+bytess+'|'+lines+'|'+loc+'|'+sloc) - path = urllib.quote(path.strip('"')) - url = urllib.quote(url.strip('"')) + path = urllib.parse.quote(path.strip('"')) + url = urllib.parse.quote(url.strip('"')) file_hash = file_hash.strip('"') db.insert_file(file_id, proj_id, path, url, file_hash) @@ -266,7 +266,7 @@ def import_pairs(db, pairs_path): commit_interval = 1000 pair_number = 0 - print '## Importing pairs from',pairs_path + print('## Importing pairs from',pairs_path) with open(pairs_path, 'r') as file: for line in file: pair_number += 1 @@ -275,11 +275,11 @@ def import_pairs(db, pairs_path): db.insert_CCPairs(line_split[0], line_split[1], line_split[2], line_split[3]) if pair_number%commit_interval == 0: - print ' ',pair_number,'pairs committed' + print(' ',pair_number,'pairs committed') except Exception as e: - print 'Error accessing Database' - print e + print('Error accessing Database') + print(e) sys.exit(1) if __name__ == "__main__": @@ -299,7 +299,7 @@ def import_pairs(db, pairs_path): log_path = 'LOG-db-importer.log' if os.path.isfile(log_path): - print 'ERROR: Log file:',log_path,'already exists' + print('ERROR: Log file:',log_path,'already exists') sys.exit(1) FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' @@ -333,6 +333,6 @@ def import_pairs(db, pairs_path): db_object.close() except Exception as e: - print 'Error on __main__' - print e + print('Error on __main__') + print(e) diff --git a/tokenizers/file-level/tokenizer-unit-test.py b/tokenizers/file-level/tokenizer-unit-test.py index 169cc0f19..3cfccca36 100644 --- a/tokenizers/file-level/tokenizer-unit-test.py +++ b/tokenizers/file-level/tokenizer-unit-test.py @@ -1,12 +1,11 @@ # -*- encoding: utf-8 -*- +import hashlib import re -import os -import collections import sys import unittest + import tokenizer as tokenizer -import hashlib try: from configparser import ConfigParser @@ -166,4 +165,3 @@ def 
test_simple_file(self): if __name__ == '__main__': unittest.main() - diff --git a/tokenizers/file-level/tokenizer.py b/tokenizers/file-level/tokenizer.py index 9ad0c7912..23e5e51e7 100644 --- a/tokenizers/file-level/tokenizer.py +++ b/tokenizers/file-level/tokenizer.py @@ -1,17 +1,15 @@ +import collections +import datetime as dt +import hashlib import logging -import multiprocessing as mp -from multiprocessing import Process, Value, Queue +import os +import platform import re -import os, platform -import collections -import tarfile +import shutil import sys -import hashlib -import datetime as dt +import tarfile import zipfile -import javalang -import pathlib -import shutil +from multiprocessing import Process, Queue from subprocess import call try: @@ -426,7 +424,7 @@ def process_one_project(process_num, proj_id, proj_path, base_file_id, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times else: zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( - -1, -1, -1, -1, -1, -1, -1) + -1, -1, -1, -1, -1, -1, -1) FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') @@ -446,7 +444,7 @@ def process_one_project(process_num, proj_id, proj_path, base_file_id, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times else: zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( - -1, -1, -1, -1, -1, -1, -1) + -1, -1, -1, -1, -1, -1, -1) FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') @@ -518,7 +516,7 @@ def kill_child(processes, pid, n_files_processed): processes[pid][1] += n_files_processed print("Process %s finished, %s files processed (%s). Current total: %s" % ( - pid, n_files_processed, processes[pid][1], file_count)) + pid, n_files_processed, processes[pid][1], file_count)) def active_process_count(processes): @@ -618,4 +616,4 @@ def active_process_count(processes): call('cat ./files_tokens/* > blocks.file', shell=True) shutil.move('./blocks.file', - '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/') \ No newline at end of file + '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/')
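
Note on the Python 3 idioms this series relies on: print() as a function, "except ... as e", dict.items() in place of iteritems(), range() in place of xrange(), list(zip(...)) wherever a concrete sequence is indexed or reused, urllib.parse.quote in place of urllib.quote, and encoding str to bytes before hashing. The snippet below is a minimal, self-contained sketch of those idioms for a quick sanity check under Python 3; it is not part of the patch, and the sample tokens, paths, and file names are made up for illustration.

import hashlib
import string
import urllib.parse

# dict.iteritems() is gone in Python 3; items() returns a view that join() can consume.
tokens = {'public': 3, 'class': 1, 'void': 2}
token_string = ','.join('{}@@::@@{}'.format(k, v) for k, v in tokens.items())

# zip() and range() are lazy in Python 3; wrap them in list() when the result is indexed or reused.
numbered = list(zip(range(10000, 10000 + len(tokens)), tokens))

# filter() and comprehensions over a str yield single characters, not a str;
# join them back into a string before slicing or storing the result.
printable = set(string.printable)
sanitized = ''.join(c for c in 'src/Main\x00.java' if c in printable)[:255]

# urllib.quote moved to urllib.parse.quote, and hashlib expects bytes rather than str.
quoted_path = urllib.parse.quote('/projects/demo project/src/Main.java')
digest = hashlib.md5(token_string.encode('utf-8')).hexdigest()

# print is a function, and the exception syntax is "except ... as e".
try:
    with open('project-list-that-does-not-exist.txt') as f:
        print(len(f.readlines()), 'projects listed')
except IOError as e:
    print('Error: {error}'.format(error=e))

print(token_string, numbered, sanitized, quoted_path, digest)

The ''.join(...) step is worth noting for db.py: in Python 2, filter() over a str returned a str, while the list comprehension that replaces it in sanitize_string returns a list of characters, so the result would still need to be joined back into a string before it is sliced and written to the database.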