Skip to content

Commit 70d5d01

Browse files
committed
feat: recompile for Node 10.x runtime
1 parent 2374943 commit 70d5d01

File tree

10 files changed

+50
-48
lines changed

10 files changed

+50
-48
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ version: 2
33
jobs:
44
build:
55
docker:
6-
- image: circleci/node:8
6+
- image: circleci/node:10
77

88
working_directory: ~/repo
99

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ node_modules/
33
*.log
44
.DS_Store
55
yarn.lock
6-
*.tar.gz
6+
bin/tesseract-standalone/

bin/tt.tar.br

-10.6 MB
Binary file not shown.

bin/tt.tar.gz

5.62 MB
Binary file not shown.

compile-tesseract.sh

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,13 @@
1-
#!/usr/bin/env bash
1+
# Spin up and enter the docker container on your machine with the following command:
2+
# docker run -it lambci/lambda:build-nodejs10.x bash
23

3-
# install basic stuff required for compilation
4-
sudo yum-config-manager --enable epel
54

6-
sudo yum install -y aclocal autoconf automake cmakegcc freetype-devel gcc gcc-c++ \
5+
# Then run the rest of the commands inside
6+
7+
# install basic stuff required for compilation
8+
yum install -y aclocal autoconf automake cmakegcc freetype-devel gcc gcc-c++ \
79
git lcms2-devel libjpeg-devel libjpeg-turbo-devel autogen autoconf libtool \
810
libpng-devel libtiff-devel libtool libwebp-devel libzip-devel make zlib-devel
9-
sudo yum groupinstall "Development Tools" -y
10-
11-
# autoconf
12-
cd ~
13-
wget http://babyname.tips/mirrors/gnu/autoconf-archive/autoconf-archive-2017.09.28.tar.xz
14-
tar -xvf autoconf-archive-2017.09.28.tar.xz
15-
cd autoconf-archive-2017.09.28
16-
./configure && make && sudo make install
17-
sudo cp m4/* /usr/share/aclocal/cd ~ wget http://babynam
1811

1912
# leptonica
2013
cd ~
@@ -23,29 +16,31 @@ cd leptonica/
2316
./autogen.sh
2417
./configure
2518
make
26-
sudo make install
19+
make install
2720

2821
# tesseract
2922
cd ~
3023
git clone https://github.com/tesseract-ocr/tesseract.git
3124
cd tesseract
25+
git checkout 4.0.0
3226
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
3327
./autogen.sh
3428
./configure
3529
make
36-
sudo make install
30+
make install
3731

3832
cd ~
3933
mkdir tesseract-standalone
4034

41-
# trim unneeded ~ 15 MB
42-
strip ./tesseract-standalone/**/*
43-
4435
# copy files
4536
cd tesseract-standalone
4637
cp /usr/local/bin/tesseract .
4738
mkdir lib
4839
cp /usr/local/lib/libtesseract.so.4 lib/
40+
cp /lib64/libpng15.so.15 lib/
41+
cp /lib64/libtiff.so.5 lib/
42+
cp /lib64/libgomp.so.1 lib/
43+
cp /lib64/libjbig.so.2.0 lib/
4944
cp /usr/local/lib/liblept.so.5 lib/
5045
cp /usr/lib64/libjpeg.so.62 lib/
5146
cp /usr/lib64/libwebp.so.4 lib/
@@ -54,13 +49,16 @@ cp /usr/lib64/libstdc++.so.6 lib/
5449
# copy training data
5550
mkdir tessdata
5651
cd tessdata
57-
wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata
52+
curl -L https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata --output eng.traineddata
5853

5954
# archive
6055
cd ~
61-
tar -zcvf tesseract.tar.gz tesseract-standalone
6256

63-
# download from EC2 to local machine
64-
scp [email protected]:/home/ec2-user/tesseract.tar.gz $(pwd)
57+
# trim ~15 MB of unneeded data
58+
strip ./tesseract-standalone/**/*
59+
60+
tar -zcvf tesseract.tar.gz tesseract-standalone
6561

66-
# run compress-with-brotli.sh on local machine now
62+
# download from docker to local machine
63+
# 21c27dc1bf5d is the docker container id; you can look it up by running "docker ps"
64+
docker cp 21c27dc1bf5d:/root/tesseract.tar.gz tt.tar.gz

compress-with-brotli.sh

Lines changed: 0 additions & 4 deletions
This file was deleted.

package.json

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "@shelf/aws-lambda-tesseract",
33
"version": "1.3.2",
4-
"description": "11 MB Tesseract (with English training data) to fit inside AWS Lambda compressed with Brotli",
4+
"description": "6 MB Tesseract (with English training data) to fit inside AWS Lambda",
55
"license": "MIT",
66
"repository": "shelfio/aws-lambda-tesseract",
77
"author": {
@@ -10,7 +10,7 @@
1010
"url": "shelf.io"
1111
},
1212
"engines": {
13-
"node": ">=8.10"
13+
"node": ">=10"
1414
},
1515
"scripts": {
1616
"lint": "eslint . --fix",
@@ -24,12 +24,11 @@
2424
"keywords": [
2525
"lambda",
2626
"ocr",
27-
"tesseract",
28-
"brotli"
27+
"tesseract"
2928
],
3029
"dependencies": {
31-
"@shelf/aws-lambda-brotli-unpacker": "0.0.2",
32-
"is-image": "3.0.0"
30+
"is-image": "3.0.0",
31+
"tar": "4.4.10"
3332
},
3433
"devDependencies": {
3534
"@shelf/eslint-config": "0.3.5",

readme.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# aws-lambda-tesseract [![CircleCI](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![Tesseract](https://img.shields.io/badge/tesserract-11_MB-brightgreen.svg)](bin/)
1+
# aws-lambda-tesseract [![CircleCI](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![Tesseract](https://img.shields.io/badge/tesseract-6_MB-brightgreen.svg)](bin/)
22

3-
> 11 MB Tesseract (with English training data) to fit inside AWS Lambda compressed with Brotli
3+
> 6 MB Tesseract (with English training data) to fit inside AWS Lambda
44
55
Inspired by [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda) & [lambda-scanner-ocr](https://github.com/philippkeller/lambda-scanner-ocr)
66

@@ -10,9 +10,13 @@ Inspired by [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda) &
1010
$ yarn add @shelf/aws-lambda-tesseract
1111
```
1212

13+
`1.x` versions of this library were compiled for Node 8.10.
14+
15+
`2.x` was compiled for Node 10.x runtime.
16+
1317
## How does it work?
1418

15-
This package contains an archive with [Tesseract 4.0 beta](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment.
19+
This package contains an archive with [Tesseract 4.0](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment.
1620

1721
When a Lambda starts, it unpacks an archive with a binary to the `/tmp` folder and makes sure it's done only once per Lambda cold start.
1822

@@ -38,14 +42,13 @@ unsupported by Tesseract file extensions.
3842

3943
## Compile It Yourself
4044

41-
See [compile-tesseract.sh](compile-tesseract.sh) & [compress-with-brotli.sh](compress-with-brotli.sh) files
45+
See [compile-tesseract.sh](compile-tesseract.sh)
4246

4347
Smoke-test that it works by running the `test.sh` script
4448

4549
## See Also
4650

4751
- [aws-lambda-libreoffice](https://github.com/shelfio/aws-lambda-libreoffice)
48-
- [aws-lambda-brotli-unpacker](https://github.com/shelfio/aws-lambda-brotli-unpacker)
4952
- [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda)
5053

5154
## License

src/index.js

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
1-
const {unpack} = require('@shelf/aws-lambda-brotli-unpacker');
1+
const {extract} = require('tar');
22
const {execFileSync, execSync} = require('child_process');
33
const path = require('path');
44
const isImage = require('is-image');
55

66
const unsupportedExtensions = new Set(['ai', 'emf', 'eps', 'gif', 'ico', 'psd', 'svg']);
7-
const inputPath = path.join(__dirname, '..', 'bin', 'tt.tar.br');
8-
const outputPath = '/tmp/tesseract/tesseract';
7+
const inputPath = path.join(__dirname, '..', 'bin', 'tt.tar.gz');
8+
const outputPath = '/tmp/tesseract-standalone/tesseract';
99

1010
module.exports.getExecutablePath = async function() {
11-
return unpack({inputPath, outputPath});
11+
await extract({file: inputPath, cwd: '/tmp'});
12+
13+
return outputPath;
1214
};
1315

1416
module.exports.getTextFromImage = async function(filePath) {
15-
const ttBinary = await unpack({inputPath, outputPath});
17+
await extract({file: inputPath, cwd: '/tmp'});
1618

17-
const stdout = execFileSync(ttBinary, [filePath, 'stdout', '-l', 'eng'], {
18-
cwd: '/tmp/tesseract',
19+
const stdout = execFileSync(outputPath, [filePath, 'stdout', '-l', 'eng'], {
20+
cwd: '/tmp/tesseract-standalone',
1921
env: {
2022
LD_LIBRARY_PATH: './lib',
2123
TESSDATA_PREFIX: './tessdata'

test.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/usr/bin/env bash
22

3+
cd bin
4+
tar -xvzf tt.tar.gz
5+
cd ..
6+
37
docker run --rm \
48
-v "$PWD":/var/task \
5-
lambci/lambda:nodejs8.10 test.handler
9+
lambci/lambda:nodejs10.x test.handler

0 commit comments

Comments
 (0)