From 3aea332c75c45dc0866bfca5586e86ab3b7823c6 Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:20:42 -0700 Subject: [PATCH 1/3] upgrading python version from 2 to 3 --- clone-detector/analyze.py | 8 +- clone-detector/controller.py | 2 +- clone-detector/unevensplit.py | 20 +- .../block-level/separate-file-block-stats.py | 4 +- tokenizers/block-level/tokenizer.py | 26 +- tokenizers/file-level/tokenizer-unit-test.py | 162 +-- tokenizers/file-level/tokenizer.py | 1000 +++++++++-------- 7 files changed, 655 insertions(+), 567 deletions(-) diff --git a/clone-detector/analyze.py b/clone-detector/analyze.py index 2e2e7be68..2ef55d721 100644 --- a/clone-detector/analyze.py +++ b/clone-detector/analyze.py @@ -32,13 +32,13 @@ def populate_distinct_clone_groups_count(self): self.clone_groups[rhsFile] = 1 count += 1 if (count % print_per_k) == 0: - print "rows processed: ", count - print "rows processed: ", count + print("rows processed: ", count) + print("rows processed: ", count) def print_dict(self, dict_to_print): print("clones of each file:") with open("results.txt", 'w') as resultfile: - for key, val in sorted(dict_to_print.items(), key=lambda x:-x[1]): + for key, val in sorted(list(dict_to_print.items()), key=lambda x:-x[1]): resultfile.write("{key},{val}\n".format(key=key, val=val)) if __name__ == '__main__': @@ -47,4 +47,4 @@ def print_dict(self, dict_to_print): # analyzer.get_count_of_distinct_files_that_have_clones() analyzer.populate_distinct_clone_groups_count() analyzer.print_dict(analyzer.clone_groups) - print "count of distinct files that have clones", len(analyzer.clone_groups.keys()) + print("count of distinct files that have clones", len(list(analyzer.clone_groups.keys()))) diff --git a/clone-detector/controller.py b/clone-detector/controller.py index 7a424dfd8..527bdde2f 100644 --- a/clone-detector/controller.py +++ b/clone-detector/controller.py @@ -3,7 +3,7 @@ @author: saini ''' -from __future__ import absolute_import, division, print_function, unicode_literals + import subprocess import sys import os diff --git a/clone-detector/unevensplit.py b/clone-detector/unevensplit.py index cec985cab..ee630f9f2 100644 --- a/clone-detector/unevensplit.py +++ b/clone-detector/unevensplit.py @@ -27,10 +27,10 @@ def split(self): """ count=0 line_limit = self.base_x - print "line_limit is ", line_limit + print("line_limit is ", line_limit) file_count=1 try: - print "creating split ",file_count + print("creating split ",file_count) self.outfile = open("query_{part}.file".format(part=file_count),'w') with open(self.input_filename,'r') as inputfile: for row in inputfile: @@ -42,15 +42,15 @@ def split(self): file_count+=1 count =0 line_limit =line_limit + math.ceil(0.5*self.base_x) - print "line_limit is ", line_limit - print "creating split ",file_count + print("line_limit is ", line_limit) + print("creating split ",file_count) self.outfile = open("query_{part}.file".format(part=file_count),'w') self.outfile.write(row) count+=1 self.outfile.flush() self.outfile.close() - except IOError, e: - print "Error: {error}".format(error=e) + except IOError as e: + print("Error: {error}".format(error=e)) sys.exit(1) @@ -58,13 +58,13 @@ def get_num_lines_in_input_file(self): with open(self.input_filename) as f: for i, l in enumerate(f): pass - print "total lines in the inputfile: {0} ".format(i+1) + print("total lines in the inputfile: {0} ".format(i+1)) return i + 1 def find_base_x(self): # formula for S = x + x+.5x + x+2*.5x...x + (N-1)*.5x self.base_x= 
math.ceil(float(2*self.total_lines)/(float((self.split_count+1)*(self.split_count+2)/2) - 1)) - print "base_x is ", self.base_x + print("base_x is ", self.base_x) if __name__ == '__main__': @@ -73,7 +73,7 @@ def find_base_x(self): split_count = int(sys.argv[2]) params= {'split_count':split_count, 'input_filename' : input_file} - print "spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count) + print("spliting {inputfile} in {count} chunks".format(inputfile=input_file,count=split_count)) splitter = Spliter(params) splitter.split() - print "splitting done!" \ No newline at end of file + print("splitting done!") \ No newline at end of file diff --git a/tokenizers/block-level/separate-file-block-stats.py b/tokenizers/block-level/separate-file-block-stats.py index c61ee7aed..df32234c5 100644 --- a/tokenizers/block-level/separate-file-block-stats.py +++ b/tokenizers/block-level/separate-file-block-stats.py @@ -11,6 +11,6 @@ elif line.startswith('f'): file_info.write(line) else: - print "error", line + print("error", line) - print "Done with: ", readfile + print("Done with: ", readfile) diff --git a/tokenizers/block-level/tokenizer.py b/tokenizers/block-level/tokenizer.py index 45e52bf68..1c6e18645 100644 --- a/tokenizers/block-level/tokenizer.py +++ b/tokenizers/block-level/tokenizer.py @@ -16,7 +16,7 @@ try: from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from configparser import ConfigParser # ver. < 3.0 MULTIPLIER = 50000000 @@ -152,7 +152,7 @@ def tokenize_files(file_string, comment_inline_pattern, comment_open_close_patte t_time = dt.datetime.now() #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.iteritems()]) + tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in file_string_for_tokenization.items()]) t_time = (dt.datetime.now() - t_time).microseconds # MD5 @@ -278,7 +278,7 @@ def tokenize_blocks(file_string, comment_inline_pattern, comment_open_close_patt tokens_count_unique = len(block_string_for_tokenization) t_time = dt.datetime.now() #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.iteritems()]) + tokens = ','.join(['{}@@::@@{}'.format(k, v) for k,v in block_string_for_tokenization.items()]) token_time += (dt.datetime.now() - t_time).microseconds # MD5 h_time = dt.datetime.now() @@ -322,7 +322,7 @@ def process_file_contents(file_string, proj_id, file_id, container_path, # file stats start with a letter 'f' FILE_stats_file.write('f' + ','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n') - blocks_data = zip(range(10000,99999),blocks_data) + blocks_data = list(zip(list(range(10000,99999)),blocks_data)) logging.warning('Finished step2 on process_file_contents'); @@ -678,7 +678,7 @@ def start_child(processes, global_queue, proj_paths, batch, project_format): paths_batch = proj_paths[:batch] del proj_paths[:batch] - print("Starting new process %s" % (pid)) + print(("Starting new process %s" % (pid))) p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, )) processes[pid][0] = p p.start() @@ -690,7 +690,7 @@ def kill_child(processes, pid, n_files_processed): processes[pid][0] = None processes[pid][1] += n_files_processed - print("Process %s finished, %s files processed (%s). 
Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count)) + print(("Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count))) def active_process_count(processes): count = 0 @@ -717,26 +717,26 @@ def active_process_count(processes): for line in f: line_split = line[:-1].split(',') # [:-1] to strip final character which is '\n' prio_proj_paths.append((line_split[0],line_split[4])) - prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths) + prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths)+init_proj_id)), prio_proj_paths)) proj_paths = [] if project_format in ['zipblocks']: # zipblocks will diverge the process flow on process_file() - print('\''+project_format+'\''+'format') + print(('\''+project_format+'\''+'format')) with open(FILE_projects_list) as f: for line in f: proj_paths.append(line[:-1]) - proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths)) + proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths)) if project_format in ['folderblocks']: # folderblocks will diverge the process flow on process_file() - print('\''+project_format+'\''+'format') + print(('\''+project_format+'\''+'format')) with open(FILE_projects_list) as f: for line in f: proj_paths.append(line[:-1]) - proj_paths = list(zip(range(1, len(proj_paths)+1), proj_paths)) + proj_paths = list(zip(list(range(1, len(proj_paths)+1)), proj_paths)) if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs): - print('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!') + print(('ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!')) sys.exit(1) else: os.makedirs(PATH_stats_file_folder) @@ -774,5 +774,5 @@ def active_process_count(processes): kill_child(processes, pid, n_files_processed) p_elapsed = dt.datetime.now() - p_start - print("*** All done. %s files in %s" % (file_count, p_elapsed)) + print(("*** All done. %s files in %s" % (file_count, p_elapsed))) diff --git a/tokenizers/file-level/tokenizer-unit-test.py b/tokenizers/file-level/tokenizer-unit-test.py index 8f9485764..169cc0f19 100644 --- a/tokenizers/file-level/tokenizer-unit-test.py +++ b/tokenizers/file-level/tokenizer-unit-test.py @@ -5,20 +5,20 @@ import collections import sys import unittest -import tokenizer +import tokenizer as tokenizer import hashlib try: from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from ConfigParser import ConfigParser # ver. < 3.0 config = ConfigParser() # parse existing file try: config.read('config.ini') except IOError: - print 'ERROR - Config settings not found. Usage: $python this-script.py config.ini' + print('ERROR - Config settings not found. Usage: $python this-script.py config.ini') sys.exit() separators = config.get('Language', 'separators').strip('"').split(' ') @@ -27,12 +27,15 @@ comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) comment_open_close_pattern = comment_open_tag + '.*?' 
+ comment_close_tag - +second_comment_open_tag = re.escape(config.get('Language', 'second_comment_open_tag')) +second_comment_close_tag = re.escape(config.get('Language', 'second_comment_close_tag')) +second_comment_open_close_pattern = second_comment_open_tag + '.*?' + second_comment_close_tag REGEX = re.compile('.+?@@::@@+\d') + class TestParser(unittest.TestCase): - #Input is something like: @#@print@@::@@1,include@@::@@1,sys@@::@@1 + # Input is something like: @#@print@@::@@1,include@@::@@1,sys@@::@@1 def assert_common_properties(self, list_tokens_string): self.assertTrue(list_tokens_string.startswith('@#@')) @@ -42,103 +45,124 @@ def assert_common_properties(self, list_tokens_string): self.assertTrue(REGEX.match(pair)) def test_line_counts_1(self): - input = """ line 1 + input = b""" line 1 line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats - self.assertEqual(lines,3) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,3) + self.assertEqual(lines, 3) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 3) def test_line_counts_2(self): - input = """ line 1 + input = b""" line 1 line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats - self.assertEqual(lines,3) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,3) + self.assertEqual(lines, 3) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 3) def test_line_counts_3(self): - input = """ line 1 + input = b""" line 1 - // line 2 + # line 2 line 3 """ - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - - self.assertEqual(lines,5) - self.assertEqual(LOC,3) - self.assertEqual(SLOC,2) + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + print(lines, LOC, SLOC) + self.assertEqual(lines, 5) + self.assertEqual(LOC, 3) + self.assertEqual(SLOC, 2) def test_comments(self): - input = "// Hello\n // World" - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - - self.assertEqual(lines,2) - self.assertEqual(LOC,2) - self.assertEqual(SLOC,0) - - self.assertEqual(tokens_count_total,0) - self.assertEqual(tokens_count_unique,0) + input = b"# Hello\n # World" + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, 
tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 2) + self.assertEqual(LOC, 2) + self.assertEqual(SLOC, 1) + + self.assertEqual(tokens_count_total, 0) + self.assertEqual(tokens_count_unique, 0) self.assert_common_properties(tokens) def test_multiline_comment(self): - input = '/* this is a \n comment */ /* Last one */ ' - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - - self.assertEqual(lines,2) - self.assertEqual(LOC,2) - self.assertEqual(SLOC,0) - - self.assertEqual(tokens_count_total,0) - self.assertEqual(tokens_count_unique,0) + input = b'""" this is a \n comment """ """ Last one """' + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 2) + self.assertEqual(LOC, 2) + self.assertEqual(SLOC, 1) + + self.assertEqual(tokens_count_total, 0) + self.assertEqual(tokens_count_unique, 0) self.assert_common_properties(tokens) def test_simple_file(self): - input = u"""#include GLFW_INCLUDE_GLU - #include - #include - - /* Random function */ + input = u"""include GLFW_INCLUDE_GLU + include + include + + '''Random function''' static void glfw_key_callback(int key, int scancode, int action, int mod){ if(glfw_key_callback){ - // Comment here + # Comment here input_event_queue->push(inputaction); } printf("%s", "asciiじゃない文字"); }""".encode("utf-8") - (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators) - (file_hash,lines,LOC,SLOC) = final_stats - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - self.assertEqual(lines,12) - self.assertEqual(LOC,11) - self.assertEqual(SLOC,9) - - self.assertEqual(tokens_count_total,27) - self.assertEqual(tokens_count_unique,21) + (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, + separators) + (file_hash, lines, LOC, SLOC) = final_stats + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + self.assertEqual(lines, 12) + self.assertEqual(LOC, 11) + self.assertEqual(SLOC, 9) + + self.assertEqual(tokens_count_total, 27) + self.assertEqual(tokens_count_unique, 21) self.assert_common_properties(tokens) - hard_tokens = set(['int@@::@@4','void@@::@@1','cstdio@@::@@1','action@@::@@1','static@@::@@1','key@@::@@1','glfw_key_callback@@::@@1','mod@@::@@1','if@@::@@1','glfw3@@::@@1','scancode@@::@@1','h@@::@@1','GLFW_INCLUDE_GLU@@::@@1','input_event_queue@@::@@2','GLFW@@::@@1','push@@::@@1','inputaction@@::@@1','include@@::@@3']) + hard_tokens = set(['int@@::@@4', 'void@@::@@1', 'cstdio@@::@@1', 'action@@::@@1', 'static@@::@@1', 'key@@::@@1', + 'glfw_key_callback@@::@@1', 'mod@@::@@1', 'if@@::@@1', 'glfw3@@::@@1', 'scancode@@::@@1', + 'h@@::@@1', 'GLFW_INCLUDE_GLU@@::@@1', 'input_event_queue@@::@@2', 'GLFW@@::@@1', + 'push@@::@@1', 'inputaction@@::@@1', 'include@@::@@3']) this_tokens = set(tokens[3:].split(',')) - self.assertTrue(len(hard_tokens - this_tokens),0) + self.assertTrue(len(hard_tokens - this_tokens), 
0) m = hashlib.md5() - m.update(tokens[3:]) - self.assertEqual(m.hexdigest(),token_hash) + + m.update(tokens[3:].encode('utf-8')) + self.assertEqual(m.hexdigest(), token_hash) + if __name__ == '__main__': unittest.main() diff --git a/tokenizers/file-level/tokenizer.py b/tokenizers/file-level/tokenizer.py index b61be834b..9ad0c7912 100644 --- a/tokenizers/file-level/tokenizer.py +++ b/tokenizers/file-level/tokenizer.py @@ -10,11 +10,14 @@ import datetime as dt import zipfile import javalang +import pathlib +import shutil +from subprocess import call try: - from configparser import ConfigParser + from configparser import ConfigParser except ImportError: - from ConfigParser import ConfigParser # ver. < 3.0 + from configparser import ConfigParser # ver. < 3.0 MULTIPLIER = 50000000 @@ -33,395 +36,442 @@ comment_inline_pattern = comment_inline + '.*?$' comment_open_tag = '' comment_close_tag = '' +second_comment_open_tag = '' +second_comment_close_tag = '' comment_open_close_pattern = comment_open_tag + '.*?' + comment_close_tag file_extensions = '.none' file_count = 0 +shutil.rmtree('files_stats') +shutil.rmtree('bookkeeping_projs') +shutil.rmtree('files_tokens') +shutil.rmtree('logs') + + def read_config(): - global N_PROCESSES, PROJECTS_BATCH, FILE_projects_list, FILE_priority_projects - global PATH_stats_file_folder, PATH_bookkeeping_proj_folder, PATH_tokens_file_folder, PATH_logs - global separators, comment_inline, comment_inline_pattern, comment_open_tag, comment_close_tag, comment_open_close_pattern - global file_extensions - - global init_file_id - global init_proj_id - - # instantiate - config = ConfigParser() - - # parse existing file - try: - config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)) , 'config.ini')) - except IOError: - print 'ERROR - Config settings not found. Usage: $python this-script.py config-file.ini' - sys.exit() - - # Get info from config.ini into global variables - N_PROCESSES = config.getint('Main', 'N_PROCESSES') - PROJECTS_BATCH = config.getint('Main', 'PROJECTS_BATCH') - FILE_projects_list = config.get('Main', 'FILE_projects_list') - if config.has_option('Main', 'FILE_priority_projects'): - FILE_priority_projects = config.get('Main', 'FILE_priority_projects') - PATH_stats_file_folder = config.get('Folders/Files', 'PATH_stats_file_folder') - PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder') - PATH_tokens_file_folder = config.get('Folders/Files', 'PATH_tokens_file_folder') - PATH_logs = config.get('Folders/Files', 'PATH_logs') - - # Reading Language settings - separators = config.get('Language', 'separators').strip('"').split(' ') - comment_inline = re.escape(config.get('Language', 'comment_inline')) - comment_inline_pattern = comment_inline + '.*?$' - comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) - comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) - comment_open_close_pattern = comment_open_tag + '.*?' 
+ comment_close_tag - file_extensions = config.get('Language', 'File_extensions').split(' ') - - # Reading config settings - init_file_id = config.getint('Config', 'init_file_id') - init_proj_id = config.getint('Config', 'init_proj_id') - -def tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, separators): - - final_stats = 'ERROR' - final_tokens = 'ERROR' - - file_hash = 'ERROR' - lines = 'ERROR' - LOC = 'ERROR' - SLOC = 'ERROR' - - h_time = dt.datetime.now() - m = hashlib.md5() - m.update(file_string) - file_hash = m.hexdigest() - hash_time = (dt.datetime.now() - h_time).microseconds - - lines = file_string.count('\n') - if not file_string.endswith('\n'): - lines += 1 - file_string = "".join([s for s in file_string.splitlines(True) if s.strip()]) - - LOC = file_string.count('\n') - if not file_string.endswith('\n'): - LOC += 1 - - re_time = dt.datetime.now() - # Remove tagged comments - file_string = re.sub(comment_open_close_pattern, '', file_string, flags=re.DOTALL) - # Remove end of line comments - file_string = re.sub(comment_inline_pattern, '', file_string, flags=re.MULTILINE) - re_time = (dt.datetime.now() - re_time).microseconds - - file_string = "".join([s for s in file_string.splitlines(True) if s.strip()]).strip() - - SLOC = file_string.count('\n') - if file_string != '' and not file_string.endswith('\n'): - SLOC += 1 - - final_stats = (file_hash,lines,LOC,SLOC) - - # Rather a copy of the file string here for tokenization - file_string_for_tokenization = file_string.decode('utf-8') - - #Transform separators into spaces (remove them) - s_time = dt.datetime.now() - for x in separators: - file_string_for_tokenization = file_string_for_tokenization.replace(x,' ') - s_time = (dt.datetime.now() - s_time).microseconds - - ##Create a list of tokens - file_string_for_tokenization = file_string_for_tokenization.split() - ## Total number of tokens - tokens_count_total = len(file_string_for_tokenization) - ##Count occurrences - file_string_for_tokenization = collections.Counter(file_string_for_tokenization) - ##Converting Counter to dict because according to StackOverflow is better - file_string_for_tokenization=dict(file_string_for_tokenization) - ## Unique number of tokens - tokens_count_unique = len(file_string_for_tokenization) - - t_time = dt.datetime.now() - #SourcererCC formatting - tokens = ','.join(['{}@@::@@{}'.format(k.encode('utf-8'), v) - for k,v in file_string_for_tokenization.iteritems()]) - t_time = (dt.datetime.now() - t_time).microseconds - - # MD5 - h_time = dt.datetime.now() - m = hashlib.md5() - m.update(tokens) - hash_time += (dt.datetime.now() - h_time).microseconds - - final_tokens = (tokens_count_total,tokens_count_unique,m.hexdigest(),'@#@'+tokens) - - return (final_stats, final_tokens, [s_time, t_time, hash_time, re_time]) - -def process_file_contents(file_string, proj_id, file_id, container_path, - file_path, file_bytes, proj_url, FILE_tokens_file, FILE_stats_file, logging): - - logging.info('Attempting to process_file_contents '+os.path.join(container_path, file_path)) - - global file_count - file_count += 1 - - (final_stats, final_tokens, file_parsing_times) = tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, separators) - - (file_hash,lines,LOC,SLOC) = final_stats - - (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens - - file_url = proj_url + '/' + file_path[7:].replace(' ','%20') - file_path = os.path.join(container_path, file_path) + global N_PROCESSES, PROJECTS_BATCH, 
FILE_projects_list, FILE_priority_projects + global PATH_stats_file_folder, PATH_bookkeeping_proj_folder, PATH_tokens_file_folder, PATH_logs + global separators, comment_inline, comment_inline_pattern, comment_open_tag, comment_close_tag, \ + comment_open_close_pattern, second_comment_open_tag, second_comment_close_tag, second_comment_open_close_pattern + global file_extensions - ww_time = dt.datetime.now() - FILE_stats_file.write(','.join([proj_id,str(file_id),'\"'+file_path+'\"','\"'+file_url+'\"','\"'+file_hash+'\"',file_bytes,str(lines),str(LOC),str(SLOC)]) + '\n') - w_time = (dt.datetime.now() - ww_time).microseconds + global init_file_id + global init_proj_id + + # instantiate + config = ConfigParser() + + # parse existing file + try: + config.read(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) + except IOError: + print('ERROR - Config settings not found. Usage: $python this-script.py config-file.ini') + sys.exit() + + # Get info from config.ini into global variables + N_PROCESSES = config.getint('Main', 'N_PROCESSES') + + PROJECTS_BATCH = config.getint('Main', 'PROJECTS_BATCH') + FILE_projects_list = config.get('Main', 'FILE_projects_list') + + print(FILE_projects_list, 'FILE_projects_list') + if config.has_option('Main', 'FILE_priority_projects'): + FILE_priority_projects = config.get('Main', 'FILE_priority_projects') + + PATH_stats_file_folder = config.get('Folders/Files', 'PATH_stats_file_folder') + PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder') + PATH_tokens_file_folder = config.get('Folders/Files', 'PATH_tokens_file_folder') + PATH_logs = config.get('Folders/Files', 'PATH_logs') + + # Reading Language settings + separators = config.get('Language', 'separators').strip('"').split(' ') + comment_inline = re.escape(config.get('Language', 'comment_inline')) + comment_inline_pattern = comment_inline + '.*?$' + comment_open_tag = re.escape(config.get('Language', 'comment_open_tag')) + comment_close_tag = re.escape(config.get('Language', 'comment_close_tag')) + comment_open_close_pattern = comment_open_tag + '.*?' + comment_close_tag + + second_comment_open_tag = re.escape(config.get('Language', 'second_comment_open_tag')) + second_comment_close_tag = re.escape(config.get('Language', 'second_comment_close_tag')) + second_comment_open_close_pattern = second_comment_open_tag + '.*?' 
+ second_comment_close_tag + + file_extensions = config.get('Language', 'File_extensions').split(' ') + + # Reading config settings + init_file_id = config.getint('Config', 'init_file_id') + init_proj_id = config.getint('Config', 'init_proj_id') + + +def tokenize_files(file_string, comment_inline_pattern, comment_open_close_pattern, second_comment_open_close_pattern, + separators): + final_stats = 'ERROR' + final_tokens = 'ERROR' + + file_hash = 'ERROR' + lines = 'ERROR' + LOC = 'ERROR' + SLOC = 'ERROR' + + h_time = dt.datetime.now() + m = hashlib.md5() + + m.update(file_string) + + file_hash = m.hexdigest() + + hash_time = (dt.datetime.now() - h_time).microseconds + print(type(file_string), 'file_string') + lines = file_string.count(b"\n") + print(lines, 'file_string') + if not file_string.endswith(b'\n'): + lines += 1 + + file_string = b"".join([s for s in file_string.splitlines(True) if s.strip()]) + + LOC = file_string.count(b'\n') + if not file_string.endswith(b'\n'): + LOC += 1 + + re_time = dt.datetime.now() + + # Remove tagged comments + file_string = re.sub(bytes(comment_open_close_pattern, 'utf-8'), b'', file_string, flags=re.DOTALL) + # Remove second end of line comments + file_string = re.sub(bytes(second_comment_open_close_pattern, 'utf-8'), b'', file_string, flags=re.DOTALL) + # Remove end of line comments + file_string = re.sub(bytes(comment_inline_pattern, 'utf-8'), b'', file_string, flags=re.MULTILINE) + + re_time = (dt.datetime.now() - re_time).microseconds + + file_string = b"".join([s for s in file_string.splitlines(True) if s.strip()]).strip() + print(file_string) + SLOC = file_string.count(b'\n') + if file_string != '' and not file_string.endswith(b'\n'): + SLOC += 1 + + final_stats = (file_hash, lines, LOC, SLOC) + print(file_string, 'file_string file_string file_string') + # Rather a copy of the file string here for tokenization + file_string_for_tokenization = file_string.decode('utf-8') - ww_time = dt.datetime.now() - FILE_tokens_file.write(','.join([proj_id,str(file_id),str(tokens_count_total),str(tokens_count_unique),token_hash+tokens]) + '\n') - w_time += (dt.datetime.now() - ww_time).microseconds + # Transform separators into spaces (remove them) + s_time = dt.datetime.now() + for x in separators: + file_string_for_tokenization = file_string_for_tokenization.replace(x, ' ') + s_time = (dt.datetime.now() - s_time).microseconds - logging.info('Successfully ran process_file_contents '+os.path.join(container_path, file_path)) + ##Create a list of tokens + file_string_for_tokenization = file_string_for_tokenization.split() + ## Total number of tokens + tokens_count_total = len(file_string_for_tokenization) + ##Count occurrences + file_string_for_tokenization = collections.Counter(file_string_for_tokenization) + ##Converting Counter to dict because according to StackOverflow is better + file_string_for_tokenization = dict(file_string_for_tokenization) + ## Unique number of tokens - return file_parsing_times + [w_time] # [s_time, t_time, w_time, hash_time, re_time] + tokens_count_unique = len(file_string_for_tokenization) -def process_regular_folder(args, folder_path, files): - process_num, proj_id, proj_path, proj_url, base_file_id, \ - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times = args + t_time = dt.datetime.now() + # SourcererCC formatting + tokens = ','.join(['{}@@::@@{}'.format(k.encode('utf-8'), v) + for k, v in file_string_for_tokenization.items()]) + t_time = (dt.datetime.now() - t_time).microseconds - file_time = string_time = 
tokens_time = hash_time = write_time = regex_time = 0 - all_files = files - - # Filter them by the correct extension - aux = [] - for extension in file_extensions: - aux.extend([x for x in all_files if x.endswith(extension)]) - all_files = aux - - # This is very strange, but I did find some paths with newlines, - # so I am simply eliminates them - all_files = [x for x in all_files if '\n' not in x] - - for file_path in all_files: - file_id = process_num*MULTIPLIER + base_file_id + file_count - print "<%s, %s, %s>" %(file_id, folder_path, file_path) - file_path = os.path.join(folder_path, file_path) - - with open(file_path) as f: - f_time = dt.datetime.now() - file_string = f.read() - f_time = (dt.datetime.now() - f_time).microseconds - - times_c = process_file_contents(file_string, proj_id, file_id, "", file_path, str(os.path.getsize(file_path)), - proj_url, FILE_tokens_file, FILE_stats_file, logging) - times[0] += f_time - times[1] += times_c[0] - times[2] += times_c[1] - times[3] += times_c[4] - times[4] += times_c[2] - times[5] += times_c[3] - - -def process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): - zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 - - try: - with tarfile.open(tar_file,'r|*') as my_tar_file: - - for f in my_tar_file: - if not f.isfile(): - continue - - file_path = f.name - # Filter by the correct extension - if not os.path.splitext(f.name)[1] in file_extensions: - continue - - # This is very strange, but I did find some paths with newlines, - # so I am simply ignoring them - if '\n' in file_path: - continue - - file_id = process_num*MULTIPLIER + base_file_id + file_count - - file_bytes=str(f.size) - - z_time = dt.datetime.now() - try: - myfile = my_tar_file.extractfile(f) - except: - logging.warning('Unable to open file (1) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+ - '> (process '+str(process_num)+')') - break - zip_time += (dt.datetime.now() - z_time).microseconds - - if myfile is None: - logging.warning('Unable to open file (2) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+ - '> (process '+str(process_num)+')') - break - - f_time = dt.datetime.now() - file_string = myfile.read() - file_time += (dt.datetime.now() - f_time).microseconds - - times = process_file_contents(file_string, proj_id, file_id, tar_file, file_path, file_bytes, - proj_url, FILE_tokens_file, FILE_stats_file, logging) - string_time += times[0] - tokens_time += times[1] - write_time += times[4] - hash_time += times[2] - regex_time += times[3] - -# if (file_count % 50) == 0: -# logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', -# zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - - except Exception as e: - logging.warning('Unable to open tar on <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - logging.warning(e) - return - - return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - -def process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): - zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 - - logging.info('Attempting to process_zip_ball '+zip_file) - - try: - with zipfile.ZipFile(proj_path,'r') as my_file: - - for file in my_file.infolist(): - - if not 
os.path.splitext(file.filename)[1] in file_extensions: - continue - - file_path = file.filename - - # This is very strange, but I did find some paths with newlines, - # so I am simply ignoring them - if '\n' in file_path: - continue - - file_id = process_num*MULTIPLIER + base_file_id + file_count - - file_bytes=str(file.file_size) - - z_time = dt.datetime.now() - try: - my_zip_file = my_file.open(file.filename,'r') - except: - logging.warning('Unable to open file (1) <'+os.path.join(proj_path,file)+'> (process '+str(process_num)+')') - break - zip_time += (dt.datetime.now() - z_time).microseconds - - if my_zip_file is None: - logging.warning('Unable to open file (2) <'+os.path.join(proj_path,file)+'> (process '+str(process_num)+')') - break - - f_time = dt.datetime.now() - file_string = my_zip_file.read() - file_time += (dt.datetime.now() - f_time).microseconds - - times = process_file_contents(file_string, proj_id, file_id, zip_file, file_path, file_bytes, - proj_url, FILE_tokens_file, FILE_stats_file, logging) - - string_time += times[0] - tokens_time += times[1] - write_time += times[4] - hash_time += times[2] - regex_time += times[3] - -# if (file_count % 50) == 0: -# logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', -# zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - - except Exception as e: - logging.warning('Unable to open zip on <'+proj_path+'> (process '+str(process_num)+')') - logging.warning(e) - return - - logging.info('Successfully ran process_zip_ball '+zip_file) - return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - -def process_one_project(process_num, proj_id, proj_path, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format): - - p_start = dt.datetime.now() - - if project_format == 'leidos': - proj_path = proj_path - proj_url = 'None' - - logging.info('Starting leidos project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - - if not os.path.isdir(proj_path): - logging.warning('Unable to open project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - return - - # Search for tar files with _code in them - tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if os.path.isfile(os.path.join(proj_path, f))] - tar_files = [f for f in tar_files if '_code' in f] - if(len(tar_files) != 1): - logging.warning('Tar not found on <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - times = [0,0,0,0,0,0,0] - os.path.walk(proj_path, process_regular_folder, - (process_num, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times)) - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - zip_time = 0 - else: - tar_file = tar_files[0] - times = process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) - if times is not None: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - else: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = (-1,-1,-1,-1,-1,-1,-1) + # MD5 + h_time = dt.datetime.now() + m = hashlib.md5() + m.update(bytes(tokens, 'utf-8')) + hash_time += (dt.datetime.now() - h_time).microseconds - FILE_bookkeeping_proj.write(proj_id+',\"'+proj_path+'\",\"'+proj_url+'\"\n') + final_tokens = (tokens_count_total, 
tokens_count_unique, m.hexdigest(), '@#@' + tokens) - if project_format in ['zip']: - proj_url = 'NULL' + return (final_stats, final_tokens, [s_time, t_time, hash_time, re_time]) - logging.info('Starting zip project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - if not os.path.isfile(proj_path): - logging.warning('Unable to open project <'+proj_id+','+proj_path+'> (process '+str(process_num)+')') - return +def process_file_contents(file_string, proj_id, file_id, container_path, + file_path, file_bytes, proj_url, FILE_tokens_file, FILE_stats_file, logging): + logging.info('Attempting to process_file_contents ' + os.path.join(container_path, file_path)) - zip_file = proj_path - times = process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, - FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) - if times is not None: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times - else: - zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = (-1,-1,-1,-1,-1,-1,-1) + global file_count + file_count += 1 + + (final_stats, final_tokens, file_parsing_times) = tokenize_files(file_string, comment_inline_pattern, + comment_open_close_pattern, + second_comment_open_close_pattern, separators) - FILE_bookkeeping_proj.write(proj_id+',\"'+proj_path+'\",\"'+proj_url+'\"\n') + (file_hash, lines, LOC, SLOC) = final_stats + + (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens + + file_url = proj_url + '/' + file_path[7:].replace(' ', '%20') + file_path = os.path.join(container_path, file_path) + + ww_time = dt.datetime.now() + FILE_stats_file.write(','.join( + [proj_id, str(file_id), '\"' + file_path + '\"', '\"' + file_url + '\"', '\"' + file_hash + '\"', file_bytes, + str(lines), str(LOC), str(SLOC)]) + '\n') + w_time = (dt.datetime.now() - ww_time).microseconds + + ww_time = dt.datetime.now() + FILE_tokens_file.write(','.join( + [proj_id, str(file_id), str(tokens_count_total), str(tokens_count_unique), token_hash + tokens]) + '\n') + w_time += (dt.datetime.now() - ww_time).microseconds + + logging.info('Successfully ran process_file_contents ' + os.path.join(container_path, file_path)) + + return file_parsing_times + [w_time] # [s_time, t_time, w_time, hash_time, re_time] + + +def process_regular_folder(args, folder_path, files): + process_num, proj_id, proj_path, proj_url, base_file_id, \ + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times = args + + file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + all_files = files + + # Filter them by the correct extension + aux = [] + for extension in file_extensions: + aux.extend([x for x in all_files if x.endswith(extension)]) + all_files = aux + + # This is very strange, but I did find some paths with newlines, + # so I am simply eliminates them + all_files = [x for x in all_files if '\n' not in x] + + for file_path in all_files: + file_id = process_num * MULTIPLIER + base_file_id + file_count + print("<%s, %s, %s>" % (file_id, folder_path, file_path)) + file_path = os.path.join(folder_path, file_path) + + with open(file_path) as f: + f_time = dt.datetime.now() + file_string = f.read() + f_time = (dt.datetime.now() - f_time).microseconds + + times_c = process_file_contents(file_string, proj_id, file_id, "", file_path, + str(os.path.getsize(file_path)), + proj_url, FILE_tokens_file, FILE_stats_file, logging) + times[0] += f_time + times[1] += times_c[0] + times[2] += 
times_c[1] + times[3] += times_c[4] + times[4] += times_c[2] + times[5] += times_c[3] + + +def process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): + zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + + try: + with tarfile.open(tar_file, 'r|*') as my_tar_file: + + for f in my_tar_file: + if not f.isfile(): + continue + + file_path = f.name + # Filter by the correct extension + if not os.path.splitext(f.name)[1] in file_extensions: + continue + + # This is very strange, but I did find some paths with newlines, + # so I am simply ignoring them + if '\n' in file_path: + continue + + file_id = process_num * MULTIPLIER + base_file_id + file_count + + file_bytes = str(f.size) + + z_time = dt.datetime.now() + try: + myfile = my_tar_file.extractfile(f) + except: + logging.warning( + 'Unable to open file (1) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, + file_path) + + '> (process ' + str(process_num) + ')') + break + zip_time += (dt.datetime.now() - z_time).microseconds + + if myfile is None: + logging.warning( + 'Unable to open file (2) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, + file_path) + + '> (process ' + str(process_num) + ')') + break + + f_time = dt.datetime.now() + file_string = myfile.read() + file_time += (dt.datetime.now() - f_time).microseconds + + times = process_file_contents(file_string, proj_id, file_id, tar_file, file_path, file_bytes, + proj_url, FILE_tokens_file, FILE_stats_file, logging) + string_time += times[0] + tokens_time += times[1] + write_time += times[4] + hash_time += times[2] + regex_time += times[3] + + # if (file_count % 50) == 0: + # logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', + # zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + except Exception as e: + logging.warning('Unable to open tar on <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + logging.warning(e) + return + + return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + +def process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging): + zip_time = file_time = string_time = tokens_time = hash_time = write_time = regex_time = 0 + + logging.info('Attempting to process_zip_ball ' + zip_file) + + try: + with zipfile.ZipFile(proj_path, 'r') as my_file: + for file in my_file.infolist(): + if not os.path.splitext(file.filename)[1] in file_extensions: + continue + + file_path = file.filename + # This is very strange, but I did find some paths with newlines, + # so I am simply ignoring them + if '\n' in file_path: + continue + file_id = process_num * MULTIPLIER + base_file_id + file_count + file_bytes = str(file.file_size) + z_time = dt.datetime.now() + try: + my_zip_file = my_file.open(file.filename, 'r') + except: + logging.warning( + 'Unable to open file (1) <' + os.path.join(proj_path, file) + '> (process ' + str( + process_num) + ')') + break + zip_time += (dt.datetime.now() - z_time).microseconds + if my_zip_file is None: + logging.warning( + 'Unable to open file (2) <' + os.path.join(proj_path, file) + '> (process ' + str( + process_num) + ')') + break + + f_time = dt.datetime.now() + file_string = my_zip_file.read() + + file_time += (dt.datetime.now() - f_time).microseconds + + times = 
process_file_contents(file_string, proj_id, file_id, zip_file, file_path, file_bytes, + proj_url, FILE_tokens_file, FILE_stats_file, logging) + + string_time += times[0] + tokens_time += times[1] + write_time += times[4] + hash_time += times[2] + regex_time += times[3] + + # if (file_count % 50) == 0: + # logging.info('Zip: %s Read: %s Separators: %s Tokens: %s Write: %s Hash: %s regex: %s', + # zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + + except Exception as e: + logging.warning('Unable to open zip on <' + proj_path + '> (process ' + str(process_num) + ')') + logging.warning(e) + return + + logging.info('Successfully ran process_zip_ball ' + zip_file) + return (zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) + + +def process_one_project(process_num, proj_id, proj_path, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format): + p_start = dt.datetime.now() + + if project_format == 'leidos': + proj_path = proj_path + proj_url = 'None' + + logging.info('Starting leidos project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + + if not os.path.isdir(proj_path): + logging.warning( + 'Unable to open project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + return + + # Search for tar files with _code in them + tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if + os.path.isfile(os.path.join(proj_path, f))] + tar_files = [f for f in tar_files if '_code' in f] + if (len(tar_files) != 1): + logging.warning('Tar not found on <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + times = [0, 0, 0, 0, 0, 0, 0] + os.path.walk(proj_path, process_regular_folder, + (process_num, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, times)) + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + zip_time = 0 + else: + tar_file = tar_files[0] + times = process_tgz_ball(process_num, tar_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) + if times is not None: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + else: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( + -1, -1, -1, -1, -1, -1, -1) + + FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') + + if project_format in ['zip']: + proj_url = 'NULL' + logging.info('Starting zip project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + if not os.path.isfile(proj_path): + logging.warning( + 'Unable to open project <' + proj_id + ',' + proj_path + '> (process ' + str(process_num) + ')') + return + + zip_file = proj_path + + times = process_zip_ball(process_num, zip_file, proj_id, proj_path, proj_url, base_file_id, + FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging) + if times is not None: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times + else: + zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( + -1, -1, -1, -1, -1, -1, -1) + + FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') + + p_elapsed = dt.datetime.now() - p_start + logging.info('Project finished <%s,%s> (process %s)', proj_id, proj_path, process_num) + logging.info( 
+ ' (%s): Total: %smicros | Zip: %s Read: %s Separators: %smicros Tokens: %smicros Write: %smicros Hash: %s regex: %s', + process_num, p_elapsed, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) - p_elapsed = dt.datetime.now() - p_start - logging.info('Project finished <%s,%s> (process %s)', proj_id, proj_path, process_num) - logging.info(' (%s): Total: %smicros | Zip: %s Read: %s Separators: %smicros Tokens: %smicros Write: %smicros Hash: %s regex: %s', - process_num, p_elapsed, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time) def process_projects(process_num, list_projects, base_file_id, global_queue, project_format): - if platform.system() =='Windows': + if platform.system() == 'Windows': read_config() # Logging code FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' - logging.basicConfig(level=logging.DEBUG,format=FORMAT) - file_handler = logging.FileHandler(os.path.join(PATH_logs,'LOG-'+str(process_num)+'.log')) + logging.basicConfig(level=logging.DEBUG, format=FORMAT) + file_handler = logging.FileHandler(os.path.join(PATH_logs, 'LOG-' + str(process_num) + '.log')) file_handler.setFormatter(logging.Formatter(FORMAT)) logging.getLogger().addHandler(file_handler) - FILE_files_stats_file = os.path.join(PATH_stats_file_folder,'files-stats-'+str(process_num)+'.stats') - FILE_bookkeeping_proj_name = os.path.join(PATH_bookkeeping_proj_folder,'bookkeeping-proj-'+str(process_num)+'.projs') - FILE_files_tokens_file = os.path.join(PATH_tokens_file_folder,'files-tokens-'+str(process_num)+'.tokens') + FILE_files_stats_file = os.path.join(PATH_stats_file_folder, 'files-stats-' + str(process_num) + '.stats') + FILE_bookkeeping_proj_name = os.path.join(PATH_bookkeeping_proj_folder, + 'bookkeeping-proj-' + str(process_num) + '.projs') + FILE_files_tokens_file = os.path.join(PATH_tokens_file_folder, 'files-tokens-' + str(process_num) + '.tokens') global file_count file_count = 0 @@ -431,127 +481,141 @@ def process_projects(process_num, list_projects, base_file_id, global_queue, pro logging.info("Process %s starting", process_num) p_start = dt.datetime.now() for proj_id, proj_path in list_projects: - process_one_project(process_num, str(proj_id), proj_path, base_file_id, + process_one_project(process_num, str(proj_id), proj_path, base_file_id, FILE_tokens_file, FILE_bookkeeping_proj, FILE_stats_file, logging, project_format) p_elapsed = (dt.datetime.now() - p_start).seconds - logging.info('Process %s finished. %s files in %ss.', + logging.info('Process %s finished. %s files in %ss.', process_num, file_count, p_elapsed) # Let parent know global_queue.put((process_num, file_count)) sys.exit(0) + def start_child(processes, global_queue, proj_paths, batch, project_format): - # This is a blocking get. If the queue is empty, it waits - pid, n_files_processed = global_queue.get() - # OK, one of the processes finished. Let's get its data and kill it - kill_child(processes, pid, n_files_processed) + # This is a blocking get. If the queue is empty, it waits + pid, n_files_processed = global_queue.get() + # OK, one of the processes finished. 
Let's get its data and kill it + kill_child(processes, pid, n_files_processed) - # Get a new batch of project paths ready - paths_batch = proj_paths[:batch] - del proj_paths[:batch] + # Get a new batch of project paths ready + paths_batch = proj_paths[:batch] + del proj_paths[:batch] + + print("Starting new process %s" % (pid)) + p = Process(name='Process ' + str(pid), target=process_projects, + args=(pid, paths_batch, processes[pid][1], global_queue, project_format,)) + processes[pid][0] = p + p.start() - print "Starting new process %s" % (pid) - p = Process(name='Process '+str(pid), target=process_projects, args=(pid, paths_batch, processes[pid][1], global_queue, project_format, )) - processes[pid][0] = p - p.start() def kill_child(processes, pid, n_files_processed): - global file_count - file_count += n_files_processed - if processes[pid][0] != None: - processes[pid][0] = None - processes[pid][1] += n_files_processed - - print "Process %s finished, %s files processed (%s). Current total: %s" % (pid, n_files_processed, processes[pid][1], file_count) + global file_count + file_count += n_files_processed + if processes[pid][0] != None: + processes[pid][0] = None + processes[pid][1] += n_files_processed -def active_process_count(processes): - count = 0 - for p in processes: - if p[0] != None: - count +=1 - return count + print("Process %s finished, %s files processed (%s). Current total: %s" % ( + pid, n_files_processed, processes[pid][1], file_count)) -if __name__ == '__main__': - global project_format - project_format = sys.argv[1] # 'zip' or 'leidos' - - if project_format not in ['zip','leidos']: - print "ERROR - Please insert archive format, 'zip', 'leidos'!" - sys.exit() - - read_config() - p_start = dt.datetime.now() - - prio_proj_paths = [] - if FILE_priority_projects != None: - with open(FILE_priority_projects) as f: - for line in f: - line_split = line.strip('\n') # [:-1] to strip final character which is '\n' - prio_proj_paths.append(line_split) - prio_proj_paths = zip(range(init_proj_id, len(prio_proj_paths)+init_proj_id), prio_proj_paths) - - proj_paths = [] - if project_format == 'leidos': - print '\'',project_format,'\'','format' - with open(FILE_projects_list) as f: - for line in f: - prio = False - line_split = line.strip('\n') # [:-1] to strip final character which is '\n' - for p in prio_proj_paths: - if p[1][0] == line_split: - prio = True - print "Project %s is in priority list" % line_split - if not prio: - proj_paths.append(line_split) - proj_paths = zip(range(1, len(proj_paths)+1), proj_paths) - if project_format in ['zip']: - print '\'',project_format,'\'','format' - with open(FILE_projects_list) as f: - for line in f: - proj_paths.append(line[:-1]) - proj_paths = zip(range(1, len(proj_paths)+1), proj_paths) - - if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists(PATH_tokens_file_folder) or os.path.exists(PATH_logs): - print 'ERROR - Folder ['+PATH_stats_file_folder+'] or ['+PATH_bookkeeping_proj_folder+'] or ['+PATH_tokens_file_folder+'] or ['+PATH_logs+'] already exists!' 
- sys.exit(1) - else: - os.makedirs(PATH_stats_file_folder) - os.makedirs(PATH_bookkeeping_proj_folder) - os.makedirs(PATH_tokens_file_folder) - os.makedirs(PATH_logs) - - #Split list of projects into N_PROCESSES lists - #proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] - - # Multiprocessing with N_PROCESSES - # [process, file_count] - processes = [[None, init_file_id] for i in xrange(N_PROCESSES)] - # Multiprocessing shared variable instance for recording file_id - #file_id_global_var = Value('i', 1) - # The queue for processes to communicate back to the parent (this process) - # Initialize it with N_PROCESSES number of (process_id, n_files_processed) - global_queue = Queue() - for i in xrange(N_PROCESSES): - global_queue.put((i, 0)) - - # Start the priority projects - print "*** Starting priority projects..." - while len(prio_proj_paths) > 0: - start_child(processes, global_queue, prio_proj_paths, 1, project_format) - - # Start all other projects - print "*** Starting regular projects..." - while len(proj_paths) > 0: - start_child(processes, global_queue, proj_paths, PROJECTS_BATCH, project_format) - - print "*** No more projects to process. Waiting for children to finish..." - while active_process_count(processes) > 0: - pid, n_files_processed = global_queue.get() - kill_child(processes, pid, n_files_processed) +def active_process_count(processes): + count = 0 + for p in processes: + if p[0] != None: + count += 1 + return count + - p_elapsed = dt.datetime.now() - p_start - print "*** All done. %s files in %s" % (file_count, p_elapsed) +if __name__ == '__main__': + global project_format + project_format = sys.argv[1] # 'zip' or 'leidos' + + if project_format not in ['zip', 'leidos']: + print("ERROR - Please insert archive format, 'zip', 'leidos'!") + sys.exit() + + read_config() + p_start = dt.datetime.now() + + prio_proj_paths = [] + + if FILE_priority_projects != None: + with open(FILE_priority_projects) as f: + for line in f: + line_split = line.strip('\n') # [:-1] to strip final character which is '\n' + prio_proj_paths.append(line_split) + prio_proj_paths = list(zip(list(range(init_proj_id, len(prio_proj_paths) + init_proj_id)), prio_proj_paths)) + + proj_paths = [] + if project_format == 'leidos': + print('\'', project_format, '\'', 'format') + with open(FILE_projects_list) as f: + + for line in f: + prio = False + line_split = line.strip('\n') # [:-1] to strip final character which is '\n' + for p in prio_proj_paths: + if p[1][0] == line_split: + prio = True + print("Project %s is in priority list" % line_split) + if not prio: + proj_paths.append(line_split) + proj_paths = list(zip(list(range(1, len(proj_paths) + 1)), proj_paths)) + + if project_format in ['zip']: + print('\'', project_format, '\'', 'format') + with open(FILE_projects_list) as f: + for line in f: + proj_paths.append(line[:-1]) + proj_paths = list(zip(list(range(1, len(proj_paths) + 1)), proj_paths)) + + if os.path.exists(PATH_stats_file_folder) or os.path.exists(PATH_bookkeeping_proj_folder) or os.path.exists( + PATH_tokens_file_folder) or os.path.exists(PATH_logs): + print( + 'ERROR - Folder [' + PATH_stats_file_folder + '] or [' + PATH_bookkeeping_proj_folder + '] or [' + PATH_tokens_file_folder + '] or [' + PATH_logs + '] already exists!') + sys.exit(1) + else: + os.makedirs(PATH_stats_file_folder) + os.makedirs(PATH_bookkeeping_proj_folder) + os.makedirs(PATH_tokens_file_folder) + os.makedirs(PATH_logs) + + # Split list of projects into N_PROCESSES lists + # proj_paths_list = [ 
proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] + + # Multiprocessing with N_PROCESSES + # [process, file_count] + processes = [[None, init_file_id] for i in range(N_PROCESSES)] + # Multiprocessing shared variable instance for recording file_id + # file_id_global_var = Value('i', 1) + # The queue for processes to communicate back to the parent (this process) + # Initialize it with N_PROCESSES number of (process_id, n_files_processed) + global_queue = Queue() + for i in range(N_PROCESSES): + global_queue.put((i, 0)) + + # Start the priority projects + print("*** Starting priority projects...") + while len(prio_proj_paths) > 0: + start_child(processes, global_queue, prio_proj_paths, 1, project_format) + + # Start all other projects + print("*** Starting regular projects...") + while len(proj_paths) > 0: + start_child(processes, global_queue, proj_paths, PROJECTS_BATCH, project_format) + + print("*** No more projects to process. Waiting for children to finish...") + while active_process_count(processes) > 0: + pid, n_files_processed = global_queue.get() + kill_child(processes, pid, n_files_processed) + + p_elapsed = dt.datetime.now() - p_start + print("*** All done. %s files in %s" % (file_count, p_elapsed)) + + call('cat ./files_tokens/* > blocks.file', shell=True) + shutil.move('./blocks.file', + '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/') \ No newline at end of file From f53261e4ffcd20aa7f29f098c1aa8e23c6149d31 Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:26:09 -0700 Subject: [PATCH 2/3] more changes to upgrade python version --- .../utils/get_source_from_tokens.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tokenizers/file-level/utils/get_source_from_tokens.py b/tokenizers/file-level/utils/get_source_from_tokens.py index 62228504a..459100e0b 100644 --- a/tokenizers/file-level/utils/get_source_from_tokens.py +++ b/tokenizers/file-level/utils/get_source_from_tokens.py @@ -16,7 +16,7 @@ def grab_ids(folder_or_file): if os.path.isfile(folder_or_file): paths.add(folder_or_file) else: - print "ERROR: '",projects_from_blocks,"' not found!" + print("ERROR: '",projects_from_blocks,"' not found!") res = set() for p in paths: @@ -38,7 +38,7 @@ def copy_files(ids_set, folder_or_file, output_folder): if os.path.isfile(folder_or_file): paths.add(folder_or_file) else: - print "ERROR: '",projects_from_blocks,"' not found!" + print("ERROR: '",projects_from_blocks,"' not found!") for p in paths: with open(p,'r') as file: @@ -59,7 +59,7 @@ def copy_files(ids_set, folder_or_file, output_folder): with open(os.path.join(output_folder,file_path), 'w') as f: f.write(z.read(file_path)) except Exception as e: - print 'ERROR reading',zip_path,e + print('ERROR reading',zip_path,e) copy_count += 1 @@ -81,40 +81,40 @@ def copy_files(ids_set, folder_or_file, output_folder): (options, args) = parser.parse_args() if not len(sys.argv) > 1: - print "No arguments were passed. Try running with '--help'." + print("No arguments were passed. Try running with '--help'.") sys.exit(0) if (not options.tokensFiles) or (not options.statsFiles): - print "Arguments '-b' and '-s' are mandatory. Try running with '--help'." + print("Arguments '-b' and '-s' are mandatory. Try running with '--help'.") sys.exit(0) #### ARGUMENTS HANDLING MUST BE below output_folder = default_output_folder if options.outputDir: if os.path.isdir(options.outputDir): - print 'Folder',options.outputDir,'already exists.' 
+ print('Folder',options.outputDir,'already exists.') sys.exit(0) else: os.makedirs(options.outputDir) output_folder = options.outputDir - print 'Folder',options.outputDir,'created.' + print('Folder',options.outputDir,'created.') else: if os.path.isdir(default_output_folder): - print 'Folder',default_output_folder,'already exists.' + print('Folder',default_output_folder,'already exists.') sys.exit(0) else: os.makedirs(default_output_folder) - print 'Folder',default_output_folder,'created.' + print('Folder',default_output_folder,'created.') p_start = dt.datetime.now() - print 'Grabbing IDs...' + print('Grabbing IDs...') ids_set = set() ids_set = grab_ids(options.tokensFiles) - print '%s file ids in %s' % (len(ids_set), dt.datetime.now() - p_start) + print('%s file ids in %s' % (len(ids_set), dt.datetime.now() - p_start)) p_start = dt.datetime.now() - print 'Copying files...' + print('Copying files...') copy_count = copy_files(ids_set, options.statsFiles, default_output_folder) - print '%s files copied in %s' % (copy_count, dt.datetime.now() - p_start) + print('%s files copied in %s' % (copy_count, dt.datetime.now() - p_start)) From a515c0c9e2ce47874ec5ff80d167b9d5723af48e Mon Sep 17 00:00:00 2001 From: Malinda Date: Wed, 22 Jul 2020 17:36:26 -0700 Subject: [PATCH 3/3] more changes to upgrade python version --- tokenizers/file-level/config.ini | 2 + .../file-level/db-importer/clone_finder.py | 38 +++++++++---------- tokenizers/file-level/db-importer/db.py | 4 +- .../file-level/db-importer/mysql-import.py | 24 ++++++------ tokenizers/file-level/tokenizer-unit-test.py | 6 +-- tokenizers/file-level/tokenizer.py | 26 ++++++------- 6 files changed, 49 insertions(+), 51 deletions(-) diff --git a/tokenizers/file-level/config.ini b/tokenizers/file-level/config.ini index a34edf093..55ba359b8 100644 --- a/tokenizers/file-level/config.ini +++ b/tokenizers/file-level/config.ini @@ -18,6 +18,8 @@ separators = ; . [ ] ( ) ~ ! - + & * / %% < > & ^ | ? 
{ } = # , \ : $ " ' comment_inline = // comment_open_tag = /* comment_close_tag = */ +second_comment_open_tag = """ +second_comment_close_tag = """ ;.java File_extensions = .java ;.cpp .hpp .c .h .C .cc .CPP .c++ .cp diff --git a/tokenizers/file-level/db-importer/clone_finder.py b/tokenizers/file-level/db-importer/clone_finder.py index 1a6b642f7..b36501d20 100644 --- a/tokenizers/file-level/db-importer/clone_finder.py +++ b/tokenizers/file-level/db-importer/clone_finder.py @@ -15,7 +15,7 @@ log_path = 'LOG-db-clonefinder.log' if os.path.isfile(log_path): - print 'ERROR: Log file:',log_path,'already exists' + print('ERROR: Log file:',log_path,'already exists') sys.exit(1) FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' @@ -29,7 +29,7 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object): try: query = """SELECT fileId, projectId, f.fileHash, tokenHash FROM files as f JOIN stats as s ON f.fileHash=s.fileHash - WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(token_hashes.keys()) + "'", project_id) + WHERE tokenHash in (%s) AND projectId >= %s;""" % ("'" + "','".join(list(token_hashes.keys())) + "'", project_id) res = db_object.execute(query); logging.info(query) for (file_id, projectId, fileHash, tokenHash, ) in res: @@ -39,8 +39,8 @@ def findAllTokenHashClones(project_id, token_hashes, files_clones, db_object): files_clones[f].add((str(file_id), projectId)) except Exception as e: - print 'Error on findAllTokenHashClones' - print e + print('Error on findAllTokenHashClones') + print(e) sys.exit(1) def find_clones_for_project(project_id, project_file_counts, db_object, debug): @@ -68,7 +68,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if debug == 'all': logging.debug('## After round 1') - for k, v in files_clones.iteritems(): + for k, v in files_clones.items(): if len(v) > 0: logging.debug('%s-%s', k, v) @@ -80,7 +80,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): project_file_set = {} clone_set = set() - for fid, clones in files_clones.iteritems(): + for fid, clones in files_clones.items(): project_counted = False for clone in clones: projectId = clone[1] @@ -90,14 +90,14 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): project_file_set[projectId].add(fid) # How many of this project's files are present in each of the other project? - for pid, file_list in project_file_set.iteritems(): + for pid, file_list in project_file_set.items(): percentage_clone_projects_counter[pid] = len(file_list) # How many of the other projects files are present in this project? 
for clone in clone_set: projectId = clone[1] - if percentage_host_projects_counter.has_key(projectId): + if projectId in percentage_host_projects_counter: percentage_host_projects_counter[projectId] += 1 else: percentage_host_projects_counter[projectId] = 1 @@ -105,7 +105,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if len(percentage_host_projects_counter) > 0: # The key k (projects) should be the same between # percentage_clone_projects_counter and percentage_host_projects_counter - for k, v in percentage_host_projects_counter.iteritems(): + for k, v in percentage_host_projects_counter.items(): percent_cloning = float(percentage_clone_projects_counter[k]*100)/total_files percent_host = float(v*100)/project_file_counts[k] @@ -116,7 +116,7 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): if debug == 'all' or debug == 'final': if True:#(percent_cloning > 99) and (str(project_id) != k): - print 'Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]' + print('Proj',project_id,'in',k,'@',str( float("{0:.2f}".format(percent_cloning)) )+'% ('+str(v)+'/'+str(total_files),'files) affecting', str(float("{0:.2f}".format(percent_host)))+'%','['+str(percentage_cloning_counter[k])+'/'+str(total_files_host),'files]') else: db_object.insert_projectClones(project_id, percentage_clone_projects_counter[k], total_files, float("{0:.2f}".format(percent_cloning)), @@ -124,8 +124,8 @@ def find_clones_for_project(project_id, project_file_counts, db_object, debug): float("{0:.2f}".format(percent_host))) except Exception as e: - print 'Error on find_clones_for_project' - print e + print('Error on find_clones_for_project') + print(e) traceback.print_exc() sys.exit(1) @@ -157,8 +157,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c db_object.flush_projectClones() except Exception as e: - print 'Error in clone_finder.start_process' - print e + print('Error in clone_finder.start_process') + print(e) sys.exit(1) finally: @@ -216,14 +216,14 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c project_ids = [] - for projectId in project_file_counts.keys(): + for projectId in list(project_file_counts.keys()): project_ids.append(projectId) pair_number += 1 - project_ids = [ project_ids[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] + project_ids = [ project_ids[i::N_PROCESSES] for i in range(N_PROCESSES) ] processes = [] - for process_num in xrange(N_PROCESSES): + for process_num in range(N_PROCESSES): p = Process(name='Process '+str(process_num), target=start_process, args=(process_num, project_ids[process_num], DB_user, DB_name, DB_pass, project_file_counts, host, )) processes.append(p) @@ -232,8 +232,8 @@ def start_process(pnum, input_process, DB_user, DB_name, DB_pass, project_file_c [p.join() for p in processes] except Exception as e: - print 'Error in clone_finder.__main__' - print e + print('Error in clone_finder.__main__') + print(e) sys.exit(1) finally: diff --git a/tokenizers/file-level/db-importer/db.py b/tokenizers/file-level/db-importer/db.py index f209d7f95..43950b123 100644 --- a/tokenizers/file-level/db-importer/db.py +++ b/tokenizers/file-level/db-importer/db.py @@ -295,7 +295,7 @@ def insert_file(self, file_id, proj_id, file_path, file_url, file_hash, flush = # Prepare the 
complete list if autoID: - self.files = map(lambda (a, b, c, d, e): (b, c, d, e), self.files) + self.files = [(a_b_c_d_e[1], a_b_c_d_e[2], a_b_c_d_e[3], a_b_c_d_e[4]) for a_b_c_d_e in self.files] flist = ','.join(self.files) self.check_connection() @@ -442,7 +442,7 @@ def project_exists(self, proj_path): def sanitize_string(self, string_input): # To clean non-ascii characters printable = set(string.printable) - string_res = filter(lambda x: x in printable, string_input) + string_res = [x for x in string_input if x in printable] return (string_res[:DB_MAX_STRING_SIZE]) def execute(self, query): diff --git a/tokenizers/file-level/db-importer/mysql-import.py b/tokenizers/file-level/db-importer/mysql-import.py index cd2c14986..6a696fa20 100644 --- a/tokenizers/file-level/db-importer/mysql-import.py +++ b/tokenizers/file-level/db-importer/mysql-import.py @@ -1,7 +1,7 @@ import sys, os, csv from db import DB import logging -import urllib +import urllib.request, urllib.parse, urllib.error pattern = r'\"(.+?)\"' flag = None @@ -94,8 +94,8 @@ def import_tokenizer_output_files_tokens(db, output_path, logging): logging.warning('String partitioned into:'+file_id+'|'+proj_id+path+'|'+url+'|'+file_hash+'|'+bytess+'|'+lines+'|'+loc+'|'+sloc) - path = urllib.quote(path.strip('"')) - url = urllib.quote(url.strip('"')) + path = urllib.parse.quote(path.strip('"')) + url = urllib.parse.quote(url.strip('"')) file_hash = file_hash.strip('"') if flag == 'files-autoID': @@ -218,8 +218,8 @@ def import_tokenizer_output_blocks_tokens(db, output_path, logging): logging.warning('String partitioned into:'+file_id+'|'+proj_id+path+'|'+url+'|'+file_hash+'|'+bytess+'|'+lines+'|'+loc+'|'+sloc) - path = urllib.quote(path.strip('"')) - url = urllib.quote(url.strip('"')) + path = urllib.parse.quote(path.strip('"')) + url = urllib.parse.quote(url.strip('"')) file_hash = file_hash.strip('"') db.insert_file(file_id, proj_id, path, url, file_hash) @@ -266,7 +266,7 @@ def import_pairs(db, pairs_path): commit_interval = 1000 pair_number = 0 - print '## Importing pairs from',pairs_path + print('## Importing pairs from',pairs_path) with open(pairs_path, 'r') as file: for line in file: pair_number += 1 @@ -275,11 +275,11 @@ def import_pairs(db, pairs_path): db.insert_CCPairs(line_split[0], line_split[1], line_split[2], line_split[3]) if pair_number%commit_interval == 0: - print ' ',pair_number,'pairs committed' + print(' ',pair_number,'pairs committed') except Exception as e: - print 'Error accessing Database' - print e + print('Error accessing Database') + print(e) sys.exit(1) if __name__ == "__main__": @@ -299,7 +299,7 @@ def import_pairs(db, pairs_path): log_path = 'LOG-db-importer.log' if os.path.isfile(log_path): - print 'ERROR: Log file:',log_path,'already exists' + print('ERROR: Log file:',log_path,'already exists') sys.exit(1) FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' @@ -333,6 +333,6 @@ def import_pairs(db, pairs_path): db_object.close() except Exception as e: - print 'Error on __main__' - print e + print('Error on __main__') + print(e) diff --git a/tokenizers/file-level/tokenizer-unit-test.py b/tokenizers/file-level/tokenizer-unit-test.py index 169cc0f19..3cfccca36 100644 --- a/tokenizers/file-level/tokenizer-unit-test.py +++ b/tokenizers/file-level/tokenizer-unit-test.py @@ -1,12 +1,11 @@ # -*- encoding: utf-8 -*- +import hashlib import re -import os -import collections import sys import unittest + import tokenizer as tokenizer -import hashlib try: from configparser import ConfigParser @@ -166,4 +165,3 @@ def 
test_simple_file(self): if __name__ == '__main__': unittest.main() - diff --git a/tokenizers/file-level/tokenizer.py b/tokenizers/file-level/tokenizer.py index 9ad0c7912..23e5e51e7 100644 --- a/tokenizers/file-level/tokenizer.py +++ b/tokenizers/file-level/tokenizer.py @@ -1,17 +1,15 @@ +import collections +import datetime as dt +import hashlib import logging -import multiprocessing as mp -from multiprocessing import Process, Value, Queue +import os +import platform import re -import os, platform -import collections -import tarfile +import shutil import sys -import hashlib -import datetime as dt +import tarfile import zipfile -import javalang -import pathlib -import shutil +from multiprocessing import Process, Queue from subprocess import call try: @@ -426,7 +424,7 @@ def process_one_project(process_num, proj_id, proj_path, base_file_id, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times else: zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( - -1, -1, -1, -1, -1, -1, -1) + -1, -1, -1, -1, -1, -1, -1) FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') @@ -446,7 +444,7 @@ def process_one_project(process_num, proj_id, proj_path, base_file_id, zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = times else: zip_time, file_time, string_time, tokens_time, write_time, hash_time, regex_time = ( - -1, -1, -1, -1, -1, -1, -1) + -1, -1, -1, -1, -1, -1, -1) FILE_bookkeeping_proj.write(proj_id + ',\"' + proj_path + '\",\"' + proj_url + '\"\n') @@ -518,7 +516,7 @@ def kill_child(processes, pid, n_files_processed): processes[pid][1] += n_files_processed print("Process %s finished, %s files processed (%s). Current total: %s" % ( - pid, n_files_processed, processes[pid][1], file_count)) + pid, n_files_processed, processes[pid][1], file_count)) def active_process_count(processes): @@ -618,4 +616,4 @@ def active_process_count(processes): call('cat ./files_tokens/* > blocks.file', shell=True) shutil.move('./blocks.file', - '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/') \ No newline at end of file + '/Users/malinda/Documents/Research_Topic_2/SourcererCC-master/clone-detector/input/dataset/')
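
Note on the Python 3 idioms this series relies on: print() as a function, "except ... as e", dict.items() in place of iteritems(), range() in place of xrange(), list(zip(...)) wherever a concrete sequence is indexed or reused, urllib.parse.quote in place of urllib.quote, and encoding str to bytes before hashing. The snippet below is a minimal, self-contained sketch of those idioms for a quick sanity check under Python 3; it is not part of the patch, and the sample tokens, paths, and file names are made up for illustration.

import hashlib
import string
import urllib.parse

# dict.iteritems() is gone in Python 3; items() returns a view that join() can consume.
tokens = {'public': 3, 'class': 1, 'void': 2}
token_string = ','.join('{}@@::@@{}'.format(k, v) for k, v in tokens.items())

# zip() and range() are lazy in Python 3; wrap them in list() when the result is indexed or reused.
numbered = list(zip(range(10000, 10000 + len(tokens)), tokens))

# filter() and comprehensions over a str yield single characters, not a str;
# join them back into a string before slicing or storing the result.
printable = set(string.printable)
sanitized = ''.join(c for c in 'src/Main\x00.java' if c in printable)[:255]

# urllib.quote moved to urllib.parse.quote, and hashlib expects bytes rather than str.
quoted_path = urllib.parse.quote('/projects/demo project/src/Main.java')
digest = hashlib.md5(token_string.encode('utf-8')).hexdigest()

# print is a function, and the exception syntax is "except ... as e".
try:
    with open('project-list-that-does-not-exist.txt') as f:
        print(len(f.readlines()), 'projects listed')
except IOError as e:
    print('Error: {error}'.format(error=e))

print(token_string, numbered, sanitized, quoted_path, digest)

The ''.join(...) step is worth noting for db.py: in Python 2, filter() over a str returned a str, while the list comprehension that replaces it in sanitize_string returns a list of characters, so the result would still need to be joined back into a string before it is sliced and written to the database.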