Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions OCR_Pipeline/readme.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
## Legacy to PAGE Pipeline
The folder "OCR_Pipeline" contains a "Runner" script (PipelineRunner.py) and a test Jupyter notebook: Pipeline.ipynb.
The folder "OCR_Pipeline" contains a "Runner" script (PipelineRunner.py) and a test Jupyter notebook: Pipeline.ipynb.

The notebook describes the pipeline steps and includes usage samples. It includes a series of code snippets, and their expected output below.

To use the runner script or the notebook samples, import additional two python files that are also included: TkbsApiClient.py and TkbsDocument.py.
To use the runner script or the notebook samples, import the two additional Python files that are also included: TkbsApiClient.py and TkbsDocument.py.

The sample_data subfolder includes data that can be used when running the notebook and the runner script.

The code was tested in the Transkribus production environment.

To run totkbs, use Python 2.7 and install the dependencies with "pip install -r requirements.txt".
7 changes: 7 additions & 0 deletions OCR_Pipeline/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
certifi==2019.6.16
chardet==3.0.4
future==0.17.1
idna==2.8
lxml==4.3.4
requests==2.22.0
urllib3==1.25.3
152 changes: 124 additions & 28 deletions OCR_Pipeline/totkbs.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
#!/usr/bin/env python
# coding: utf-8
# To run totkbs, use Python 2.7 and install the dependencies with "pip install -r requirements.txt".

import os
from TkbsApiClient import TranskribusClient
import xml.etree.ElementTree as ET
from TkbsDocument import Document
import json
import os
import xml.etree.cElementTree as ET
import sys
import pickle
import requests.exceptions
import time
from lxml import etree
import random
import getpass
import ast

def log(s_or_e, msg):
label = ["STARTING: ", "DONE WITH: "]
Expand All @@ -29,11 +32,85 @@ def log(s_or_e, msg):
# 8. Download the document.
# 9. Convert and combine the document data into TEI format.

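# In the article (Ar) file: take the XMD-entity element, compare its ID with the article ID, and read its BOX value.
# In the page (Pg) file: find the Entity whose ID equals the article ID and read its BOX value.
# The BOX string is split on whitespace before it is returned.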
def get_coords(filename, article, ispagefile):
    """Read the BOX coordinates stored for an article in an XML file.

    filename   -- path of the XML file to parse.
    article    -- article ID the coordinates are wanted for.
    ispagefile -- false for an article file, true for a page file.

    Article file: the first ``XMD-entity`` element is used.  A warning is
    printed when its ID differs from ``article``; its BOX attribute, split
    on whitespace, is returned either way.

    Page file: the first ``Entity`` element whose ID equals ``article`` is
    used and the tuple ``(resolution, box)`` is returned, where
    ``resolution`` is the largest integer found in any ``Resolution``
    element (0 if there is none) and ``box`` is the whitespace-split BOX
    attribute.  If no ``Entity`` matches, the function falls off the end
    and returns None.
    """
    document = etree.parse(filename)

    if not ispagefile:
        first_entity = document.xpath('//XMD-entity')[0]
        if first_entity.get("ID") != article:
            # Single-argument parenthesised print is output-identical under
            # the Python 2 print statement.
            print("Getting coordinates: seems to be wrong article file")
        return first_entity.get("BOX").split()

    for candidate in document.xpath('//Entity'):
        if candidate.get("ID") != article:
            continue
        # Keep the highest resolution listed anywhere in the page file.
        best = 0
        for node in document.xpath('//Resolution'):
            best = max(best, int(node.text))
        return (best, candidate.get("BOX").split())

def get_files(issue):
    """Pick a random page directory of an issue and return an article/page file pair.

    issue -- path of the issue directory; it must contain a "Document"
             sub-directory whose sub-directories hold the XML files.

    Returns ``[article, article_path, page_path]`` where ``article`` is the
    article file name without its ".xml" suffix, ``article_path`` is the
    first file found whose name starts with "ar" or "ad" (case-insensitive)
    and ends in ".xml", and ``page_path`` is the first file found whose name
    starts with "pg" and ends in ".xml", both from the chosen directory.

    Raises RuntimeError when no sub-directory of "Document" contains both
    files (previously this ended in an endless loop or unbounded recursion).
    """
    doc_path = os.path.join(issue, "Document")

    # Only sub-directories are candidates.  Shuffling and walking the list
    # still yields a uniformly random valid directory, but every directory is
    # tried at most once, so the search always terminates.
    candidates = [d for d in os.listdir(doc_path)
                  if os.path.isdir(os.path.join(doc_path, d))]
    random.shuffle(candidates)

    for page_dir in candidates:
        path = os.path.join(doc_path, page_dir)
        article_f = None
        page_f = None
        for f in os.listdir(path):
            if not f.endswith(".xml"):
                continue
            lowered = f.lower()
            if lowered.startswith(("ar", "ad")):
                if article_f is None:
                    article_f = f
            elif lowered.startswith("pg"):
                if page_f is None:
                    page_f = f
            if article_f and page_f:
                break
        # Both files are required; a directory with only one of them must be
        # skipped as well (the old check used "and" and let it through).
        if article_f and page_f:
            return [article_f[:-4],
                    os.path.join(path, article_f),
                    os.path.join(path, page_f)]
        print("Required files in \"{}\" not found! Check if this directory belongs here. Trying again nonetheless.".format(path))

    raise RuntimeError(
        "No page directory with both an article and a page XML file found in \"{}\"".format(doc_path))

def avg_coords(newcoords, oldcoords):
    """Return the mean ratio new/old over paired coordinate values.

    newcoords, oldcoords -- sequences of numbers or numeric strings; they are
                            paired positionally with zip().

    Pairs whose old value is zero are skipped because the ratio is undefined.
    The mean is taken over the pairs that actually contributed (the previous
    code always divided by 4, which biased the result low whenever a zero
    coordinate was skipped).  Returns 0.0 when no pair contributes.
    """
    ratios = [float(n) / float(o)
              for n, o in zip(newcoords, oldcoords)
              if float(o) != 0]
    if not ratios:
        return 0.0
    return sum(ratios) / len(ratios)

def calc_factor(issue):
    """Estimate the scaling factors for an issue from randomly sampled articles.

    issue -- path of the issue directory handed on to get_files().

    The number of samples is config['sample_cnt'] when that is positive,
    otherwise 10.  For every sample a random article/page file pair is
    fetched, the article's BOX is read from both files, and the average
    coordinate ratio from avg_coords() is accumulated.

    Returns ``[resolution, factor, 4.0]``: ``resolution`` is the integer
    resolution read from the last sampled page file, ``factor`` is the mean
    of the per-sample averages, and 4.0 is the fixed second factor.
    """
    default_factor2 = 4.0
    samples = config['sample_cnt'] if config['sample_cnt'] > 0 else 10

    total = 0.0
    for _ in range(samples):
        # Random page directory with one article file and one page file.
        article, article_path, page_path = get_files(issue)
        article_box = get_coords(article_path, article, False)
        resolution, page_box = get_coords(page_path, article, True)
        total += avg_coords(article_box, page_box)

    return [int(resolution), total / float(samples), default_factor2]

# getting list of pxml filenames and their imagefile names
def pxml_list(main_dir):
pxml_dic = {}
for x in sorted(os.listdir(os.path.join(main_dir,config['pxml_dir']))):
if x.endswith("png"):
if int(x[2:5]) not in pxml_dic:
pxml_dic[int(x[2:5])] = [x]
pxml_dic[int(x[2:5])].append(open(os.path.join(main_dir,config['pxml_dir'],x),'rb'))
else:
Expand All @@ -48,11 +125,15 @@ def pxml_list(main_dir):
return (pagelist, pxml_dic)

# Convert Abbyy Olive document layout into PageXML format
def make_pxml(res=None, f1=None, f2=None):
def make_pxml(res=None, f1=None, f2=None, factors=None):
log(0,"pxml")
p = Document()
for f in factors:
p.set_factors(f[0], f[1], f[2])
if type(config['factors']) == list:
for f in config['factors']:
p.set_factors(f[0], f[1], f[2])
elif factors is not None:
for f in factors:
p.set_factors(f[0], f[1], f[2])
if res is not None:
p.set_factors(res,f1,f2)
# directory containing TOC.xml
Expand Down Expand Up @@ -208,22 +289,22 @@ def set_config():
#depth
while True:
try:
config['depth'] = int(raw_input("Enter depth of the toc.xml from the base.\nIn the example above it is 3: "))
config['depth'] = int(raw_input("Enter depth of the toc.xml from the base.\nIn the example above it is 4: "))
except Exception:
print error
continue
break
#where pxml results are to be stored
config["pxml_dir"] = ".pxml_converted/"
config["pxml_dir"] = ".pxml_converted"
#where the trankribus results are to be stored
config["tkribus_dir"] = ".transkribus/"
config["tkribus_dir"] = ".transkribus"

#TRANSKRIBUS INFO
#transkribus credentials
while True:
try:
config['user'] = raw_input("Enter Transkribus username: ")
config['key'] = raw_input("Enter Transkribus password: ")
config['key'] = getpass.getpass("Enter Transkribus password: ")
except Exception:
print error
continue
Expand Down Expand Up @@ -263,9 +344,21 @@ def set_config():
# custom factors
while True:
try:
custom_factors = input("Enter a list of custom factors in the following format [[resolution, factor1, factor2], ...]\nOr enter 'None' for default values: ")
if custom_factors is not None:
config['factors'] = custom_factors
custom_factors = raw_input("Enter a list of custom factors in the following format [[resolution, factor1, factor2], ...]\nOr enter '1' to have them calculated for each document, or '0' to use defaults: ")
config['factors'] = custom_factors.strip()
if config['factors'] == "1":
config['factors'] = True
while True:
try:
config['sample_cnt'] = int(raw_input("Enter number of textblocks from which to compute the factors: "))
except Exception:
print error
continue
break
elif config['factors'] == "0":
config['factors'] = False
else:
config['factors'] == ast.literal_eval(config['factors'])
except Exception:
print error
continue
Expand Down Expand Up @@ -303,7 +396,7 @@ def set_config():
[210,2.46938191718,.413],
[180,3.59747421296,.2889],
[150,2.48414234145,.4]]
help = "python " + sys.argv[0] + " [flags] [config_filename] [overideable_configs]\n\nflags:\n\t-p\tprint configurations to text file\n\t-s\tsave configurations if updated\n\t-h\thelp\n\noveridable_configs:\n\tbase=[base_directory]\n\tdepth=[1-3]\n\tcollection=[collection_id]\n\tHTRmodelid=[model_id]\n\tupload_only=[boolean]"
help = "python " + sys.argv[0] + " [flags] [config_filename] [overideable_configs]\n\nflags:\n\t-p\tprint configurations to text file\n\t-s\tsave configurations if updated\n\t-h\thelp\n\noveridable_configs:\n\tbase=[base_directory]\n\tdepth=[1-4]\n\tcollection=[collection_id]\n\tHTRmodelid=[model_id]\n\tupload_only=[boolean]"
save_changes = False
print_config = False
config_f = None
Expand Down Expand Up @@ -393,30 +486,36 @@ def set_config():
ctf.write(config_text)
tkbs = tkbs_login()
paper = ""
if config['depth'] == 3:
data_dirs = [(os.path.join(config['base'], x, y, w) + "/" ) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".") for w in os.listdir(os.path.join(config['base'],x,y)) if not w.startswith(".")]
if config['depth'] == 4:
data_dirs = [(os.path.join(config['base'], x, y, w)) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".") for w in os.listdir(os.path.join(config['base'],x,y)) if not w.startswith(".")]
[(count, data_dirs[count]) for count in range(len(data_dirs))]
elif config['depth'] == 3:
data_dirs = [(os.path.join(config['base'], x, y)) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".")]
[(count, data_dirs[count]) for count in range(len(data_dirs))]
elif config['depth'] == 2:
data_dirs = [(os.path.join(config['base'], x, y) + "/" ) for x in os.listdir(config['base']) if not x.startswith(".") for y in os.listdir(os.path.join(config['base'], x)) if not y.startswith(".")]
data_dirs = [(os.path.join(config['base'], x)) for x in os.listdir(config['base']) if not x.startswith(".")]
[(count, data_dirs[count]) for count in range(len(data_dirs))]
elif config['depth'] == 1:
data_dirs = [config['base']]

for issue in data_dirs:
log(0,issue)
paper = issue
config['upload_desc'] = config['upload_desc'] + ": " + issue
config['upload_title'] = config['upload_title'] + ": " + issue

# magid 144 factors = [[2.00803520956,.5]]
# la epoca 180 factors = [[1.66539490841,.6]]
# todo: custom rules per paper
if issue.lower().find("magid") > -1:
make_pxml(res=144,f1=2.00803520956,f2=.5)
elif issue.lower().find("epoca") > -1:
make_pxml(res=180,f1=1.66539490841,f2=.6)
else:
if type(config['factors']) == list:
make_pxml()
elif config['factors'] == True:
fkts = calc_factor(issue)
make_pxml(res=fkts[0], f1=fkts[1], f2=fkts[2])
else:
if issue.lower().find("magid") > -1:
make_pxml(res=144,f1=2.00803520956,f2=.5, factors=factors)
elif issue.lower().find("epoca") > -1:
make_pxml(res=180,f1=1.66539490841,f2=.6, factors=factors)
else:
make_pxml(factors=factors)
uploadid = upload_pxml()
if config['upload_only'] is False:
page_list = get_pagelist(uploadid)
Expand All @@ -429,6 +528,3 @@ def set_config():
edit_baseline()
ocr()
log(1,issue)


# In[ ]: