-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimage_ocr.sh
executable file
·116 lines (90 loc) · 2.85 KB
/
image_ocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
#
# OCR a scanned TIFF image with gocr, ocrad and/or tesseract.
#
# Arguments:
#   $1 - OCR application: gocr | ocrad | tesseract | all
#   $2 - path of the scanned .tif image
#   $3 - optional path of the intermediate TIFF (default: <tempbase>.tif)
#   $4 - optional path of the intermediate PNM  (default: <tempbase>.pnm)
ocrapp=$1
scan="$2"

# mktemp is only used to reserve a unique base name; the real intermediate
# files are "<base>.tif" and "<base>.pnm", so the placeholder is removed below.
tempbase=$(mktemp /tmp/ocr-XXXXXX) || {
  echo "Unable to create a temporary file name" >&2
  exit 1
}

# Caller-supplied paths win over the generated defaults. The PNM override
# comes from $4 (it used to be read from $3, clobbering it with the TIFF path).
ocr_temp="${3:-${tempbase}.tif}"
pnm_temp="${4:-${tempbase}.pnm}"

rm -f -- "${tempbase}"
#######################################
# Run one OCR application on the prepared intermediate image and store the
# recognised text in "<current directory>/<application>/<textbase>.txt".
# An existing output file is left untouched, so repeated calls are cheap.
# Globals:
#   textbase  (read) - base name of the output text file
#   scan      (read) - original image name, used in progress messages only
#   ocr_temp  (read) - TIFF input handed to tesseract
#   pnm_temp  (read) - PNM input handed to gocr and ocrad
# Arguments:
#   $1 - OCR application: tesseract | gocr | ocrad
# Outputs:
#   Progress message on stdout; error message on stderr.
# Returns:
#   1 for an unknown application or when the output folder cannot be
#   created, 0 when the output already exists, otherwise the exit status
#   of the OCR tool.
#######################################
function ocr {
  local scan_app=$1
  local ocr_folder
  local text_file

  # Reject unknown names up front so no stray output folder gets created.
  case "${scan_app}" in
    tesseract | gocr | ocrad) ;;
    *)
      echo "Unknown OCR application: ${scan_app}" >&2
      return 1
      ;;
  esac

  # Output goes into a per-application folder below the current directory
  # (the caller changes into the image folder before calling us).
  ocr_folder="${PWD}/${scan_app}"
  mkdir -p "${ocr_folder}" || return 1
  text_file="${ocr_folder}/${textbase}.txt"

  # Make this step optional for recursive calls
  if [ -e "${text_file}" ]; then
    return 0
  fi

  case "${scan_app}" in
    tesseract)
      echo -n "OCR-ing $scan using tesseract..."
      # tesseract appends ".txt" to the output base name itself.
      nice tesseract "${ocr_temp}" "${ocr_folder}/${textbase}" -l eng
      ;;
    gocr)
      echo "OCR-ing $scan using GOCR/JOCR..."
      nice gocr -i "${pnm_temp}" -f ASCII > "${text_file}"
      ;;
    ocrad)
      echo "OCR-ing $scan using OCRAD..."
      nice ocrad --charset=ascii -o "${text_file}" "${pnm_temp}"
      ;;
  esac
}
#######################################
# Validate the command line, prepare the intermediate images and run the
# requested OCR application(s) on the scan.
# Globals:
#   ocrapp, scan, ocr_temp, pnm_temp  (read)
#   image_folder, textbase            (written; textbase is read by ocr)
# Outputs:
#   Progress messages on stdout; usage and error messages on stderr.
# Returns:
#   1 on a usage error or when the scan is not a regular file, 0 once the
#   OCR step has been attempted.
#######################################
function main {
  if [ -z "${ocrapp}" ]; then
    {
      echo "You must specify an OCR application"
      echo "usage:"
      echo " $0 [gocr | ocrad | tesseract | all] <imagename.tif> "
      echo ""
    } >&2
    return 1
  fi

  if [ -z "${scan}" ]; then
    {
      echo "You must specify an file name"
      echo "usage:"
      echo " $0 [gocr | ocrad | tesseract | all] <imagename.tif> "
      echo ""
    } >&2
    return 1
  fi

  if [ ! -f "${scan}" ]; then
    echo "Invalid file name: $scan" >&2
    return 1
  fi

  image_folder=$(dirname -- "${scan}")
  textbase=$(basename -- "${scan}" .tif)

  # Resolve the image folder to an absolute path.
  pushd "${image_folder}" > /dev/null
  image_folder=$(pwd)
  popd > /dev/null

  echo "Resampling for better OCR accuracy"
  # Downsample image for faster, more accurate processing
  if [ ! -f "${ocr_temp}" ]; then
    echo " Converting to RGB..."
    tiff2rgba "${scan}" "${ocr_temp}"
    echo " Scaling to 400 DPI..."
    nice mogrify -adaptive-blur 4x4 -resample 400x400 -shave 100x100 -type Bilevel "${ocr_temp}"
  fi

  # Only gocr and ocrad read the PNM file, so skip the conversion for a
  # tesseract-only run. The selector is the global ocrapp (scan_app is a
  # local of ocr and is never set at this level).
  if [ "${ocrapp}" != "tesseract" ] && [ ! -f "${pnm_temp}" ]; then
    echo " Converting to PNM format..."
    #convert ${ocr_temp} -format "pnm" ${pnm_temp}
    nice gm convert "${ocr_temp}" -depth 8 "pnm:${pnm_temp}"
  fi

  # ocr writes its output below the current directory, so run it from the
  # folder that holds the scan.
  pushd "${image_folder}" > /dev/null
  if [ "${ocrapp}" == "all" ]; then
    echo "calling OCR, All"
    # Disabling tesseract for now... too buggy.
    #ocr tesseract
    ocr ocrad
    ocr gocr
  else
    echo "calling OCR, $ocrapp"
    ocr "${ocrapp}"
  fi
  popd > /dev/null

  # Cleanup
  if [ -f "${pnm_temp}" ]; then
    rm -f -- "${pnm_temp}"
  fi
  rm -f -- "${ocr_temp}"
  return 0
}

main