omilab · raphihalff · Jul 8, 2019 · Jul 21, 2019 · Aug 18, 2019 · Aug 18, 2019
diff --git a/OCR_Pipeline/readme.md b/OCR_Pipeline/readme.md
@@ -1,10 +1,12 @@
 ## Legacy to PAGE Pipeline
-The folder "OCR_Pipeline" contains a "Runner" script (PipelineRunner.py) and a test Jupyter notebook: Pipeline.ipynb. 
+The folder "OCR_Pipeline" contains a "Runner" script (PipelineRunner.py) and a test Jupyter notebook: Pipeline.ipynb.
 
 The notebook describes the pipeline steps and includes usage samples. It includes a series of code snippets, and their expected output below.
 
-To use the runner script or the notebook samples, import additional two python files that are also included: TkbsApiClient.py and TkbsDocument.py. 
+To use the runner script or the notebook samples, import the two additional Python files that are also included: TkbsApiClient.py and TkbsDocument.py.
 
 The sample_data subfolder includes data that can be used when running the notebook and the runner script.
 
 The code was tested in Transkribus production environment.
+
+To run totkbs.py, use Python 2.7 and install the dependencies with `pip install -r requirements.txt`.
diff --git a/OCR_Pipeline/requirements.txt b/OCR_Pipeline/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2019.6.16
+chardet==3.0.4
+future==0.17.1
+idna==2.8
+lxml==4.3.4
+requests==2.22.0
+urllib3==1.25.3
diff --git a/OCR_Pipeline/totkbs.py b/OCR_Pipeline/totkbs.py
@@ -1,17 +1,20 @@
 #!/usr/bin/env python
 # coding: utf-8
+# To run totkbs.py, use Python 2.7 and install the dependencies with "pip install -r requirements.txt".
 
 import os
 from TkbsApiClient import TranskribusClient
 import xml.etree.ElementTree as ET
 from TkbsDocument import Document
 import json
-import os
-import xml.etree.cElementTree as ET
 import sys
 import pickle
 import requests.exceptions
 import time
+from lxml import etree
+import random
+import getpass
+import ast
 
 def log(s_or_e, msg):
     label = ["STARTING: ", "DONE WITH: "]
@@ -29,11 +32,85 @@ def log(s_or_e, msg):
 # 8. Download the document.
 # 9. Convert and combine the document data into TEI format.
 
+# in ar file get XMD-entity with id of ar and get box val
+# in pg file get Entity with id of ar and get box val
+# split return
+def get_coords(filename, article, ispagefile):
+    tree = etree.parse(filename)
+    if not ispagefile:
+        elem = tree.xpath('//XMD-entity')
+        if elem[0].get("ID") != article:
+            print "Getting coordinates: seems to be wrong article file"
+        return elem[0].get("BOX").split()
+    if ispagefile:
+        for entity in tree.xpath('//Entity'):
+            if entity.get("ID") == article:
+                resolution = 0
+                for res in tree.xpath('//Resolution'):
+                    res_text = res.text
+                    if int(res_text) > resolution:
+                        resolution = int(res_text)
+                return (resolution, entity.get("BOX").split())
+
+def get_files(issue):
+    pdir = "Document"
+    path = os.path.join(issue, pdir)
+    dirlist = os.listdir(path)
+    pagepath = dirlist[random.randint(0,len(dirlist)-1)]
+    while not os.path.isdir(os.path.join(path,pagepath)):
+        pagepath = dirlist[random.randint(0,len(dirlist)-1)]
+    path = os.path.join(path,pagepath)
+    # choose a Ar and Pg file
+    article_f = None
+    page_f = None
+    for f in os.listdir(path):
+        if (f.lower().startswith("ar") or f.lower().startswith("ad")) and f[len(f)-4:] == ".xml" and article_f is None:
+            article_f = f
+        elif f.lower().startswith("pg") and f[len(f)-4:] == ".xml" and page_f is None:
+            page_f = f
+        if article_f and page_f:
+            break
+    if not article_f and not page_f:
+        print "Required files in \"{}\" not found! Check if this directory belongs here. Trying again nonetheless.".format(path)
+        return get_files(issue)
+    article = article_f[:len(article_f)-4]
+    return [article, os.path.join(path, article_f), os.path.join(path, page_f)]
+
+def avg_coords(newcoords, oldcoords):
+    sum = 0.0
+    for n, o in zip(newcoords, oldcoords):
+        if float(o) != 0:
+            sum += (float(n)/float(o))
+    sum = sum/4.0
+    return sum
+
+def calc_factor(issue):
+    factor = 0.0
+    def_factor2 = 4.0
+    sample_cnt = 10
+    if config['sample_cnt'] > 0:
+        sample_cnt = config['sample_cnt']
+    for x in range(0,sample_cnt):
+        # choose a page directory
+        files = get_files(issue)
+        article = files[0]
+        article_f = files[1]
+        page_f = files[2]
+        # get coordinates
+        new_coord = get_coords(article_f, article, False)
+        og_coord = get_coords(page_f, article, True)
+        resolution = og_coord[0]
+        og_coord = og_coord[1]
+        # get average difference
+        avg = avg_coords(new_coord, og_coord)
+        factor += avg
+    return [int(resolution), factor/float(sample_cnt), def_factor2]
+
 # getting list of pxml filenames and their imagefile names
 def pxml_list(main_dir):
     pxml_dic = {}
     for x in sorted(os.listdir(os.path.join(main_dir,config['pxml_dir']))):
-        if x.endswith("png"):
+        if int(x[2:5]) not in pxml_dic:
             pxml_dic[int(x[2:5])] = [x]
             pxml_dic[int(x[2:5])].append(open(os.path.join(main_dir,config['pxml_dir'],x),'rb'))
         else:
@@ -48,11 +125,15 @@ def pxml_list(main_dir):
     return (pagelist, pxml_dic)
 
 # Convert Abbyy Olive document layout into PageXML format
-def make_pxml(res=None, f1=None, f2=None):
+def make_pxml(res=None, f1=None, f2=None, factors=None):
     log(0,"pxml")
     p = Document()
-    for f in factors:
-        p.set_factors(f[0], f[1], f[2])
+    if type(config['factors']) == list:
+        for f in config['factors']:
+            p.set_factors(f[0], f[1], f[2])
+    elif factors is not None:
+        for f in factors:
+            p.set_factors(f[0], f[1], f[2])
     if res is not None:
         p.set_factors(res,f1,f2)
     # directory containing TOC.xml
@@ -208,22 +289,22 @@ def set_config():
     #depth
     while True:
         try:
-            config['depth'] = int(raw_input("Enter depth of the toc.xml from the base.\nIn the example above it is 3: "))
+            config['depth'] = int(raw_input("Enter depth of the toc.xml from the base.\nIn the example above it is 4: "))
         except Exception:
             print error
             continue
         break
     #where pxml results are to be stored
-    config["pxml_dir"] = ".pxml_converted/"
+    config["pxml_dir"] = ".pxml_converted"
     #where the trankribus results are to be stored
-    config["tkribus_dir"] = ".transkribus/"
+    config["tkribus_dir"] = ".transkribus"
 
     #TRANSKRIBUS INFO
     #transkribus credentials
     while True:
         try:
             config['user'] = raw_input("Enter Transkribus username: ")
-            config['key'] = raw_input("Enter Transkribus password: ")
+            config['key'] = getpass.getpass("Enter Transkribus password: ")
         except Exception:
             print error
             continue
@@ -263,9 +344,21 @@ def set_config():
     # custom factors
     while True:
         try:
-            custom_factors = input("Enter a list of custom factors in the following format [[resolution, factor1, factor2], ...]\nOr enter 'None' for default values: ")
-            if custom_factors is not None:
-                config['factors'] = custom_factors
+            custom_factors = raw_input("Enter a list of custom factors in the following format [[resolution, factor1, factor2], ...]\nOr enter '1' to have them calculated for each document, or '0' to use defaults: ")
+            config['factors'] = custom_factors.strip()
+            if config['factors'] == "1":
+                config['factors'] = True
+                while True:
+                    try:
+                        config['sample_cnt'] = int(raw_input("Enter number of textblocks from which to compute the factors: "))
+                    except Exception:
+                        print error
+                        continue
+                    break
+            elif config['factors'] == "0":
+                config['factors'] = False
+            else:
+                config['factors'] == ast.literal_eval(config['factors'])
         except Exception:
             print error
             continue
@@ -303,7 +396,7 @@ def set_config():
                [210,2.46938191718,.413],
                [180,3.59747421296,.2889],
                [150,2.48414234145,.4]]
-help = "python " + sys.argv[0] + " [flags] [config_filename] [overideable_configs]\n\nflags:\n\t-p\tprint configurations to text file\n\t-s\tsave configurations if updated\n\t-h\thelp\n\noveridable_configs:\n\tbase=[base_directory]\n\tdepth=[1-3]\n\tcollection=[collection_id]\n\tHTRmodelid=[model_id]\n\tupload_only=[boolean]"
+help = "python " + sys.argv[0] + " [flags] [config_filename] [overideable_configs]\n\nflags:\n\t-p\tprint configurations to text file\n\t-s\tsave configurations if updated\n\t-h\thelp\n\noveridable_configs:\n\tbase=[base_directory]\n\tdepth=[1-4]\n\tcollection=[collection_id]\n\tHTRmodelid=[model_id]\n\tupload_only=[boolean]"
 save_changes = False
 print_config = False
 config_f = None
@@ -393,30 +486,36 @@ def set_config():
             ctf.write(config_text)
 tkbs = tkbs_login()
 paper = ""
-if config['depth'] == 3:
-    data_dirs = [(os.path.join(config['base'], x, y, w) + "/" ) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".") for w in os.listdir(os.path.join(config['base'],x,y)) if not w.startswith(".")]
+if config['depth'] == 4:
+    data_dirs = [(os.path.join(config['base'], x, y, w)) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".") for w in os.listdir(os.path.join(config['base'],x,y)) if not w.startswith(".")]
+    [(count, data_dirs[count]) for count in range(len(data_dirs))]
+elif config['depth'] == 3:
+    data_dirs = [(os.path.join(config['base'], x, y)) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".")]
     [(count, data_dirs[count]) for count in range(len(data_dirs))]
 elif config['depth'] == 2:
-    data_dirs = [(os.path.join(config['base'], x, y) + "/" ) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".")]
+    data_dirs = [(os.path.join(config['base'], x)) for x in os.listdir(config['base']) if not x.startswith(".")]
     [(count, data_dirs[count]) for count in range(len(data_dirs))]
 elif config['depth'] == 1:
     data_dirs = [config['base']]
 
 for issue in data_dirs:
     log(0,issue)
     paper = issue
-    config['upload_desc'] = config['upload_desc'] + ": " + issue
-    config['upload_title'] = config['upload_title'] + ": " + issue
-
     # magid 144 factors = [[2.00803520956,.5]]
     # la epoca 180 factors = [[1.66539490841,.6]]
     # todo: custom rules per paper
-    if issue.lower().find("magid") > -1:
-        make_pxml(res=144,f1=2.00803520956,f2=.5)
-    elif issue.lower().find("epoca") > -1:
-        make_pxml(res=180,f1=1.66539490841,f2=.6)
-    else:
+    if type(config['factors']) == list:
         make_pxml()
+    elif config['factors'] == True:
+        fkts = calc_factor(issue)
+        make_pxml(res=fkts[0], f1=fkts[1], f2=fkts[2])
+    else:
+        if issue.lower().find("magid") > -1:
+            make_pxml(res=144,f1=2.00803520956,f2=.5, factors=factors)
+        elif issue.lower().find("epoca") > -1:
+            make_pxml(res=180,f1=1.66539490841,f2=.6, factors=factors)
+        else:
+            make_pxml(factors=factors)
     uploadid = upload_pxml()
     if config['upload_only'] is False:
         page_list = get_pagelist(uploadid)
@@ -429,6 +528,3 @@ def set_config():
         edit_baseline()
         ocr()
     log(1,issue)
-
-
-# In[ ]: