
Python 3 upgrade and a new feature to eliminate three types of Python comments (1. #, 2. """comment""", and 3. '''comment''') #44

Open · wants to merge 3 commits into master
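The diffs below are mechanical 2to3-style conversions. As a condensed, runnable illustration of the Python 3 idioms the commits adopt (the data here is hypothetical; the real code operates on clone-group dicts and file paths):

    # print is now a function; dict views, zip and range are lazy in Python 3.
    d = {'a.java': 3, 'b.java': 1}
    print("rows processed: ", len(d))
    for k, v in d.items():                        # .items() replaces .iteritems()
        pass
    exists = 'a.java' in d                        # 'in' replaces .has_key()
    pairs = list(zip(range(1, len(d) + 1), d))    # wrap lazy zip in list() when a list is needed
    try:
        open('no-such-file.txt')
    except IOError as e:                          # 'except E as e' replaces 'except E, e'
        print("Error: {error}".format(error=e))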
8 changes: 4 additions & 4 deletions clone-detector/analyze.py
@@ -32,13 +32,13 @@ def populate_distinct_clone_groups_count(self):
                 self.clone_groups[rhsFile] = 1
             count += 1
             if (count % print_per_k) == 0:
-                print "rows processed: ", count
-                print "rows processed: ", count
+                print("rows processed: ", count)
+                print("rows processed: ", count)

     def print_dict(self, dict_to_print):
         print("clones of each file:")
         with open("results.txt", 'w') as resultfile:
-            for key, val in sorted(dict_to_print.items(), key=lambda x:-x[1]):
+            for key, val in sorted(list(dict_to_print.items()), key=lambda x:-x[1]):
                 resultfile.write("{key},{val}\n".format(key=key, val=val))

 if __name__ == '__main__':
@@ -47,4 +47,4 @@ def print_dict(self, dict_to_print):
     # analyzer.get_count_of_distinct_files_that_have_clones()
     analyzer.populate_distinct_clone_groups_count()
     analyzer.print_dict(analyzer.clone_groups)
-    print "count of distinct files that have clones", len(analyzer.clone_groups.keys())
+    print("count of distinct files that have clones", len(list(analyzer.clone_groups.keys())))
2 changes: 1 addition & 1 deletion clone-detector/controller.py
@@ -3,7 +3,7 @@

 @author: saini
 '''
-from __future__ import absolute_import, division, print_function, unicode_literals
+
 import subprocess
 import sys
 import os
20 changes: 10 additions & 10 deletions clone-detector/unevensplit.py
@@ -27,10 +27,10 @@ def split(self):
         """
         count=0
         line_limit = self.base_x
-        print "line_limit is ", line_limit
+        print("line_limit is ", line_limit)
         file_count=1
         try:
-            print "creating split ",file_count
+            print("creating split ",file_count)
             self.outfile = open("query_{part}.file".format(part=file_count),'w')
             with open(self.input_filename,'r') as inputfile:
                 for row in inputfile:
@@ -42,29 +42,29 @@
                         file_count+=1
                         count =0
                         line_limit =line_limit + math.ceil(0.5*self.base_x)
-                        print "line_limit is ", line_limit
-                        print "creating split ",file_count
+                        print("line_limit is ", line_limit)
+                        print("creating split ",file_count)
                         self.outfile = open("query_{part}.file".format(part=file_count),'w')
                     self.outfile.write(row)
                     count+=1
             self.outfile.flush()
             self.outfile.close()
-        except IOError, e:
-            print "Error: {error}".format(error=e)
+        except IOError as e:
+            print("Error: {error}".format(error=e))
             sys.exit(1)


     def get_num_lines_in_input_file(self):
         with open(self.input_filename) as f:
             for i, l in enumerate(f):
                 pass
-        print "total lines in the inputfile: {0} ".format(i+1)
+        print("total lines in the inputfile: {0} ".format(i+1))
         return i + 1

     def find_base_x(self):
         # formula for S = x + x+.5x + x+2*.5x...x + (N-1)*.5x
         self.base_x= math.ceil(float(2*self.total_lines)/(float((self.split_count+1)*(self.split_count+2)/2) - 1))
-        print "base_x is ", self.base_x
+        print("base_x is ", self.base_x)

 if __name__ == '__main__':

@@ -73,7 +73,7 @@ def find_base_x(self):
     split_count = int(sys.argv[2])
     params= {'split_count':split_count,
             'input_filename' : input_file}
-    print "spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count)
+    print("spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count))
    splitter = Spliter(params)
    splitter.split()
-    print "splitting done!"
+    print("splitting done!")
4 changes: 2 additions & 2 deletions tokenizers/block-level/separate-file-block-stats.py
@@ -11,6 +11,6 @@
     elif line.startswith('f'):
         file_info.write(line)
     else:
-        print "error", line
+        print("error", line)

-print "Done with: ", readfile
+print("Done with: ", readfile)
26 changes: 13 additions & 13 deletions tokenizers/block-level/tokenizer.py
@@ -16,7 +16,7 @@
 try:
     from configparser import ConfigParser
 except ImportError:
-    from ConfigParser import ConfigParser # ver. < 3.0
+    from configparser import ConfigParser # ver. < 3.0

 MULTIPLIER = 50000000

@@ -152,7 +152,7 @@ def tokenize_files(file_string, comment_inline_pattern, comment_open_close_patte

     t_time = dt.datetime.now()
     #SourcererCC formatting
-    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.iteritems()])
+    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.items()])
     t_time = (dt.datetime.now() - t_time).microseconds

     # MD5
@@ -278,7 +278,7 @@ def tokenize_blocks(file_string, comment_inline_pattern, comment_open_close_patt
     tokens_count_unique = len(block_string_for_tokenization)
     t_time = dt.datetime.now()
     #SourcererCC formatting
-    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.iteritems()])
+    tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.items()])
     token_time += (dt.datetime.now() - t_time).microseconds
     # MD5
     h_time = dt.datetime.now()
@@ -322,7 +322,7 @@ def process_file_contents(file_string, proj_id, file_id, container_path,

     # file stats start with a letter 'f'
     FILE_stats_file.write('f' + ','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n')
-    blocks_data = zip(range(10000,99999),blocks_data)
+    blocks_data = list(zip(list(range(10000,99999)),blocks_data))

     logging.warning('Finished step2 on process_file_contents');

@@ -678,7 +678,7 @@ def start_child(processes, global_queue, proj_paths, batch, project_format):
     paths_batch = proj_paths[:batch]
     del proj_paths[:batch]

-    print("Starting new process %s" % (pid))
+    print(("Starting new process %s" % (pid)))
     p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, ))
     processes[pid][0] = p
     p.start()
@@ -690,7 +690,7 @@ def kill_child(processes, pid, n_files_processed):
     processes[pid][0] = None
     processes[pid][1] += n_files_processed

-    print("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count))
+    print(("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count)))

 def active_process_count(processes):
     count = 0
@@ -717,26 +717,26 @@ def active_process_count(processes):
         for line in f:
             line_split = line[:-1].split(',') # [:-1] to strip final character which is '\n'
             prio_proj_paths.append((line_split[0],line_split[4]))
-    prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths)
+    prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths)+init_proj_id)), prio_proj_paths))

     proj_paths = []

     if project_format in ['zipblocks']: # zipblocks will diverge the process flow on process_file()
-        print('\''+project_format+'\''+'format')
+        print(('\''+project_format+'\''+'format'))
         with open(FILE_projects_list) as f:
             for line in f:
                 proj_paths.append(line[:-1])
-        proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths))
+        proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths))

     if project_format in ['folderblocks']: # folderblocks will diverge the process flow on process_file()
-        print('\''+project_format+'\''+'format')
+        print(('\''+project_format+'\''+'format'))
         with open(FILE_projects_list) as f:
             for line in f:
                 proj_paths.append(line[:-1])
-        proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths))
+        proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths))

     if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs):
-        print('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!')
+        print(('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!'))
         sys.exit(1)
     else:
         os.makedirs(PATH_stats_file_folder)
@@ -774,5 +774,5 @@ def active_process_count(processes):
             kill_child(processes, pid, n_files_processed)

     p_elapsed = dt.datetime.now() - p_start
-    print("*** All done. %s files in %s" % (file_count, p_elapsed))
+    print(("*** All done. %s files in %s" % (file_count, p_elapsed)))

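The iteritems-to-items changes above sit inside SourcererCC's token serialization, which joins each token with its frequency using the @@::@@ separator. A minimal sketch with hypothetical token counts (output order assumes Python 3.7+ insertion-ordered dicts):

    # file_string_for_tokenization maps token -> occurrence count (hypothetical values).
    file_string_for_tokenization = {'if': 3, 'return': 1, 'x': 7}
    tokens = ','.join(['{}@@::@@{}'.format(k, v)
                       for k, v in file_string_for_tokenization.items()])
    print(tokens)  # if@@::@@3,return@@::@@1,x@@::@@7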
2 changes: 2 additions & 0 deletions tokenizers/file-level/config.ini
@@ -18,6 +18,8 @@ separators = ; . [ ] ( ) ~ ! - + & * / %% < > & ^ | ? { } = # , \ : $ " '
 comment_inline = //
 comment_open_tag = /*
 comment_close_tag = */
+second_comment_open_tag = """
+second_comment_close_tag = """
 ;.java
 File_extensions = .java
 ;.cpp .hpp .c .h .C .cc .CPP .c++ .cp
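The two new keys give the tokenizer a second open/close pair so it can drop Python's triple-quoted comments as well as # lines, the feature named in the PR title. A minimal sketch of config-driven stripping for all three comment types (the regex construction is my illustration, not the PR's exact code, and it ignores tags that appear inside string literals):

    import re

    # Hypothetical values mirroring config.ini above.
    comment_inline = '#'
    open_close_pairs = [('"""', '"""'), ("'''", "'''")]

    def strip_python_comments(source):
        # Remove block comments first: """...""" and '''...''' across lines.
        for open_tag, close_tag in open_close_pairs:
            pattern = re.escape(open_tag) + r'.*?' + re.escape(close_tag)
            source = re.sub(pattern, '', source, flags=re.DOTALL)
        # Then inline comments: from '#' to end of line.
        return re.sub(re.escape(comment_inline) + r'.*', '', source)

    print(strip_python_comments('x = 1  # set x\n"""doc"""\ny = 2\n'))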
38 changes: 19 additions & 19 deletions tokenizers/file-level/db-importer/clone_finder.py
@@ -15,7 +15,7 @@
 log_path = 'LOG-db-clonefinder.log'

 if os.path.isfile(log_path):
-    print 'ERROR: Log file:',log_path,'already exists'
+    print('ERROR: Log file:',log_path,'already exists')
     sys.exit(1)

 FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
@@ -29,7 +29,7 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object):
     try:
         query = """SELECT fileId, projectId, f.fileHash, tokenHash FROM files as f
                    JOIN stats as s ON f.fileHash=s.fileHash
-                   WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(token_hashes.keys()) + "'", project_id)
+                   WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(list(token_hashes.keys())) + "'", project_id)
         res = db_object.execute(query);
         logging.info(query)
         for (file_id, projectId, fileHash, tokenHash, ) in res:
@@ -39,8 +39,8 @@
                 files_clones[f].add((str(file_id), projectId))

     except Exception as e:
-        print 'Error on findAllTokenHashClones'
-        print e
+        print('Error on findAllTokenHashClones')
+        print(e)
         sys.exit(1)

@@ -68,7 +68,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug):

         if debug == 'all':
             logging.debug('## After round 1')
-            for k, v in files_clones.iteritems():
+            for k, v in files_clones.items():
                 if len(v) > 0:
                     logging.debug('%s-%s', k, v)

@@ -80,7 +80,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug):

         project_file_set = {}
         clone_set = set()
-        for fid, clones in files_clones.iteritems():
+        for fid, clones in files_clones.items():
             project_counted = False
             for clone in clones:
                 projectId = clone[1]
@@ -90,22 +90,22 @@
                     project_file_set[projectId].add(fid)

         # How many of this project's files are present in each of the other project?
-        for pid, file_list in project_file_set.iteritems():
+        for pid, file_list in project_file_set.items():
             percentage_clone_projects_counter[pid] = len(file_list)


         # How many of the other projects files are present in this project?
         for clone in clone_set:
             projectId = clone[1]
-            if percentage_host_projects_counter.has_key(projectId):
+            if projectId in percentage_host_projects_counter:
                 percentage_host_projects_counter[projectId] += 1
             else:
                 percentage_host_projects_counter[projectId] = 1

         if len(percentage_host_projects_counter) > 0:
             # The key k (projects) should be the same between
             # percentage_clone_projects_counter and percentage_host_projects_counter
-            for k, v in percentage_host_projects_counter.iteritems():
+            for k, v in percentage_host_projects_counter.items():

                 percent_cloning = float(percentage_clone_projects_counter[k]*100)/total_files
                 percent_host = float(v*100)/project_file_counts[k]
@@ -116,16 +116,16 @@

                 if debug == 'all' or debug == 'final':
                     if True:#(percent_cloning > 99) and (str(project_id) != k):
-                        print 'Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]'
+                        print('Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]')

                 else:
                     db_object.insert_projectClones(project_id, percentage_clone_projects_counter[k], total_files, float("{0:.2f}".format(percent_cloning)),
                                                    k, v, project_file_counts[k],
                                                    float("{0:.2f}".format(percent_host)))

     except Exception as e:
-        print 'Error on find_clones_for_project'
-        print e
+        print('Error on find_clones_for_project')
+        print(e)
         traceback.print_exc()
         sys.exit(1)

@@ -157,8 +157,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c
             db_object.flush_projectClones()

     except Exception as e:
-        print 'Error in clone_finder.start_process'
-        print e
+        print('Error in clone_finder.start_process')
+        print(e)
         sys.exit(1)

     finally:
@@ -216,14 +216,14 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c

     project_ids = []

-    for projectId in project_file_counts.keys():
+    for projectId in list(project_file_counts.keys()):
         project_ids.append(projectId)
         pair_number += 1

-    project_ids = [ project_ids[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
+    project_ids = [ project_ids[i::N_PROCESSES] for i in range(N_PROCESSES) ]

     processes = []
-    for process_num in xrange(N_PROCESSES):
+    for process_num in range(N_PROCESSES):
         p = Process(name='Process '+str(process_num), target=start_process,
                     args=(process_num, project_ids[process_num], DB_user, DB_name, DB_pass, project_file_counts, host, ))
         processes.append(p)
@@ -232,8 +232,8 @@
     [p.join() for p in processes]

 except Exception as e:
-    print 'Error in clone_finder.__main__'
-    print e
+    print('Error in clone_finder.__main__')
+    print(e)
     sys.exit(1)

 finally:
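For orientation, find_clones_for_project measures overlap in both directions: percent_cloning is the share of this project's files found in project k, and percent_host is the share of k's files found here. A worked example with hypothetical counts:

    total_files = 40         # files in this project
    clone_counter_k = 10     # percentage_clone_projects_counter[k]: our files seen in k
    v = 25                   # percentage_host_projects_counter[k]: k's files seen here
    project_k_files = 200    # project_file_counts[k]

    percent_cloning = float(clone_counter_k * 100) / total_files   # 25.0
    percent_host = float(v * 100) / project_k_files                # 12.5
    print(percent_cloning, percent_host)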
4 changes: 2 additions & 2 deletions tokenizers/file-level/db-importer/db.py
@@ -295,7 +295,7 @@ def insert_file(self, file_id, proj_id, file_path, file_url, file_hash, flush =

         # Prepare the complete list
         if autoID:
-            self.files = map(lambda (a, b, c, d, e): (b, c, d, e), self.files)
+            self.files = [(a_b_c_d_e[1], a_b_c_d_e[2], a_b_c_d_e[3], a_b_c_d_e[4]) for a_b_c_d_e in self.files]
         flist = ','.join(self.files)

         self.check_connection()
@@ -442,7 +442,7 @@ def project_exists(self, proj_path):
     def sanitize_string(self, string_input):
         # To clean non-ascii characters
         printable = set(string.printable)
-        string_res = filter(lambda x: x in printable, string_input)
+        string_res = [x for x in string_input if x in printable]
         return (string_res[:DB_MAX_STRING_SIZE])
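One subtlety in the sanitize_string conversion: Python 2's filter() on a str returned a str, but the list comprehension in the new code yields a list of characters, so the function now returns a list slice rather than a string. A join-based variant (my sketch, not part of the PR) preserves the original return type:

    import string

    DB_MAX_STRING_SIZE = 4000  # hypothetical stand-in for the module constant

    def sanitize_string(string_input):
        # Keep only printable ASCII and return a str, as the Python 2 code did.
        printable = set(string.printable)
        string_res = ''.join(x for x in string_input if x in printable)
        return string_res[:DB_MAX_STRING_SIZE]

    print(sanitize_string('abc\x00déf'))  # -> 'abcdf'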