Skip to content

Commit 31b3954

Browse files
committed
Initial commit: PDF redline detection library
Elixir NIF wrapper around Rust/MuPDF for fast PDF redline extraction. Features: - Extract red text from PDFs with configurable thresholds - Multi-line redline merging - Precompiled binaries for macOS (x86_64/aarch64) and Linux (x86_64/aarch64) - Cross-platform CI with rustler_precompiled
0 parents  commit 31b3954

29 files changed

Lines changed: 3502 additions & 0 deletions

.credo.exs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
%{
2+
configs: [
3+
%{
4+
name: "default",
5+
files: %{
6+
included: ["lib/", "test/"],
7+
excluded: [~r"/_build/", ~r"/deps/", ~r"/node_modules/"]
8+
},
9+
strict: true
10+
}
11+
]
12+
}

.dialyzer_ignore.exs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[
2+
# Mix.Task callbacks aren't visible to Dialyzer at compile time.
3+
# Dots in the file name are escaped so they match a literal "." only.
~r/lib\/mix\/tasks\/pdf_redlines\.bench\.ex/,
4+
{:unknown_function, ~r/Mix\.Task\.run\/1/},
5+
{:unknown_function, ~r/Mix\.raise\/1/}
6+
]

.editorconfig

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
root = true
2+
3+
[*]
4+
charset = utf-8
5+
end_of_line = lf
6+
indent_style = space
7+
indent_size = 2
8+
insert_final_newline = true
9+
trim_trailing_whitespace = true

.formatter.exs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[
2+
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
3+
]

.github/workflows/ci.yml

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [main]
6+
pull_request:
7+
branches: ["**"]
8+
9+
jobs:
10+
check-formatting:
11+
name: Check formatting
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v4
15+
16+
- uses: dtolnay/rust-toolchain@stable
17+
with:
18+
components: rustfmt
19+
20+
- name: Set up Elixir
21+
uses: erlef/setup-beam@v1
22+
with:
23+
otp-version: "28"  # quoted: version inputs are strings, not YAML numbers
24+
elixir-version: "1.19"  # quoted: a bare 1.20 would be read as the float 1.2
25+
26+
- name: Install Elixir dependencies
27+
run: |
28+
mix local.rebar --force
29+
mix local.hex --force
30+
mix deps.get
31+
32+
- name: Check Rust formatting
33+
working-directory: native/pdf_redlines_nif
34+
run: cargo fmt --all -- --check
35+
36+
- name: Check Elixir formatting
37+
run: mix format --check-formatted
38+
39+
- name: Run Credo
40+
run: mix credo --strict
41+
42+
test-elixir:
43+
name: Test Elixir ${{ matrix.elixir }} / OTP ${{ matrix.otp }}
44+
45+
env:
46+
MIX_ENV: test
47+
PDF_REDLINES_BUILD: "1"
48+
49+
strategy:
50+
fail-fast: false
51+
matrix:
52+
include:
53+
- elixir: "1.19.0"
54+
otp: "26.0"  # quoted: bare 26.0 is a YAML float and collapses to 26
55+
- elixir: "1.19.5"
56+
otp: "27.3"
57+
- elixir: "1.19.5"
58+
otp: "28.3"  # still matches the `matrix.otp == '28.3'` step conditions
59+
60+
runs-on: ubuntu-latest
61+
62+
steps:
63+
- uses: actions/checkout@v4
64+
65+
- name: Set up Rust
66+
uses: dtolnay/rust-toolchain@stable
67+
68+
- name: Set up Elixir
69+
uses: erlef/setup-beam@v1
70+
with:
71+
otp-version: ${{ matrix.otp }}
72+
elixir-version: ${{ matrix.elixir }}
73+
74+
- name: Install system dependencies
75+
run: |
76+
sudo apt-get update
77+
sudo apt-get install -y pkg-config libfontconfig1-dev clang-14 libclang-14-dev llvm-14-dev
78+
79+
- name: Retrieve cached Rust dependencies
80+
uses: actions/cache@v4
81+
id: cargo-cache
82+
with:
83+
path: ~/.cargo
84+
key: ${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ hashFiles('**/Cargo.toml') }}
85+
restore-keys: |
86+
${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-cargo-
87+
88+
- name: Retrieve cached Elixir dependencies
89+
uses: actions/cache@v4
90+
id: mix-cache
91+
with:
92+
path: deps
93+
key: ${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-mix-${{ hashFiles('**/mix.lock') }}-${{ hashFiles('**/mix.exs') }}
94+
restore-keys: |
95+
${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-mix-
96+
97+
- name: Retrieve cached Elixir build
98+
uses: actions/cache@v4
99+
id: build-cache
100+
with:
101+
path: _build
102+
key: ${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-build-${{ github.sha }}
103+
restore-keys: |
104+
${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-build-
105+
106+
- name: Install Elixir dependencies
107+
if: steps.mix-cache.outputs.cache-hit != 'true'
108+
run: |
109+
mix local.rebar --force
110+
mix local.hex --force
111+
mix deps.get
112+
mix deps.compile
113+
114+
- name: Compile Elixir
115+
run: mix compile --warnings-as-errors
116+
117+
- name: Run tests
118+
run: mix test
119+
120+
- name: Run Credo
121+
if: matrix.elixir == '1.19.5' && matrix.otp == '28.3'
122+
run: mix credo --strict
123+
124+
- name: Run Dialyzer
125+
if: matrix.elixir == '1.19.5' && matrix.otp == '28.3'
126+
run: mix dialyzer

.github/workflows/release.yml

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
name: Release
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
tags:
8+
- "**"
9+
10+
jobs:
11+
precompile_nifs:
12+
name: Precompile NIF ${{ matrix.nif }} - ${{ matrix.job.target }} (${{ matrix.job.os }})
13+
14+
strategy:
15+
fail-fast: false
16+
matrix:
17+
nif: ["2.17", "2.16", "2.15"]
18+
job:
19+
- { target: aarch64-apple-darwin , os: macos-latest }
20+
- { target: aarch64-unknown-linux-gnu , os: ubuntu-22.04 , use-cross: true }
21+
- { target: x86_64-apple-darwin , os: macos-15-intel }
22+
- { target: x86_64-unknown-linux-gnu , os: ubuntu-22.04 }
23+
24+
runs-on: ${{ matrix.job.os }}
25+
26+
env:
27+
RUSTLER_NIF_VERSION: ${{ matrix.nif }}
28+
RUST_FONTCONFIG_DLOPEN: "1"
29+
CFLAGS: "-DTOFU -DTOFU_CJK_LANG -DTOFU_CJK_EXT"
30+
31+
steps:
32+
- name: Checkout source code
33+
uses: actions/checkout@v4
34+
35+
- name: Install system dependencies
36+
if: startsWith(matrix.job.os, 'ubuntu') && !matrix.job.use-cross
37+
run: |
38+
sudo apt-get update
39+
sudo apt-get install -y pkg-config libfontconfig1-dev clang-14 libclang-14-dev llvm-14-dev
40+
echo "LIBCLANG_PATH=/usr/lib/llvm-14/lib" >> $GITHUB_ENV
41+
echo "LLVM_CONFIG_PATH=/usr/bin/llvm-config-14" >> $GITHUB_ENV
42+
43+
- name: Install LLVM (macOS)
44+
if: startsWith(matrix.job.os, 'macos')
45+
run: |
46+
brew install llvm
47+
echo "LIBCLANG_PATH=$(brew --prefix llvm)/lib" >> $GITHUB_ENV
48+
echo "LLVM_CONFIG_PATH=$(brew --prefix llvm)/bin/llvm-config" >> $GITHUB_ENV
49+
50+
- name: Extract project version
51+
shell: bash
52+
run: |
53+
echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV
54+
55+
- name: Install Rust toolchain
56+
uses: dtolnay/rust-toolchain@stable
57+
with:
58+
toolchain: stable
59+
target: ${{ matrix.job.target }}
60+
61+
- name: Build the project
62+
id: build-crate
63+
uses: philss/rustler-precompiled-action@v1.1.4
64+
env:
65+
RUST_FONTCONFIG_DLOPEN: "1"
66+
with:
67+
project-name: pdf_redlines_nif
68+
project-version: ${{ env.PROJECT_VERSION }}
69+
target: ${{ matrix.job.target }}
70+
nif-version: ${{ matrix.nif }}
71+
use-cross: ${{ matrix.job.use-cross }}
72+
cross-version: v0.2.5
73+
project-dir: "native/pdf_redlines_nif"
74+
75+
- name: Artifact upload
76+
uses: actions/upload-artifact@v4
77+
with:
78+
name: ${{ steps.build-crate.outputs.file-name }}
79+
path: ${{ steps.build-crate.outputs.file-path }}
80+
81+
- name: Publish archives and packages
82+
uses: softprops/action-gh-release@v2
83+
with:
84+
files: |
85+
${{ steps.build-crate.outputs.file-path }}
86+
if: startsWith(github.ref, 'refs/tags/')

.gitignore

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# The directory Mix will write compiled artifacts to.
2+
/_build/
3+
4+
# If you run "mix test --cover", coverage assets end up here.
5+
/cover/
6+
7+
# The directory Mix downloads dependencies sources to.
8+
/deps/
9+
10+
# Documentation artifacts
11+
/doc/
12+
13+
# Crash dumps.
14+
erl_crash.dump
15+
16+
# Rust NIF compiled artifacts
17+
/native/*/target/
18+
19+
# Rustler precompiled downloads
20+
/priv/native/
21+
22+
# The checksum files for precompiled NIFs
23+
checksum-*.exs
24+
25+
# Ignore .fetch files in case you like to edit your project deps locally.
26+
/.fetch
27+
28+
# macOS
29+
.DS_Store
30+
31+
# Elixir tooling
32+
.elixir_ls/
33+
34+
# Direnv / env files
35+
.env
36+
.envrc
37+
.direnv/
38+
39+
# Local output
40+
/tmp/

.tool-versions

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
elixir 1.19.5-otp-28
2+
erlang 28.3

CHANGES.md

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Cross-Compilation Fix Summary
2+
3+
## Problem
4+
5+
Cross-compilation for aarch64-unknown-linux-gnu was failing because MuPDF's embedded font object files (.cff.o) were being compiled for x86_64 instead of aarch64, causing linker errors.
6+
7+
## Solution
8+
9+
Disabled embedded fonts entirely using MuPDF's TOFU build flags (named after "tofu", the placeholder boxes shown for glyphs that have no font):
10+
11+
- `CFLAGS="-DTOFU -DTOFU_CJK_LANG -DTOFU_CJK_EXT"`
12+
- Applied globally to all platforms for consistency
13+
14+
## Changes Made
15+
16+
### 1. Cross.toml (aarch64-linux only)
17+
18+
- Added TOFU CFLAGS to env section
19+
- Kept make wrapper (doesn't hurt, might help other builds)
20+
21+
### 2. release.yml (all platforms)
22+
23+
- Added TOFU CFLAGS as global env var
24+
- Applies to macOS, Linux x86_64, and Linux aarch64
25+
26+
### 3. Cargo.toml
27+
28+
- Stripped unnecessary features from mupdf dependency
29+
- Before: `["js", "xps", "svg", "cbz", "img", "html", "epub", "system-fonts"]`
30+
- After: `["system-fonts"]`
31+
- We only need PDF parsing, not rendering of other formats
32+
33+
## Benefits
34+
35+
### Binary Size
36+
37+
- **Before:** ~50MB per platform
38+
- **After:** Expected <10MB per platform
39+
- Embedded CJK fonts were the bulk of the size
40+
41+
### Build Time
42+
43+
- Fewer features to compile
44+
- Faster cross-compilation
45+
46+
### Consistency
47+
48+
- Same behavior across all platforms
49+
- No embedded fonts anywhere
50+
51+
### Security
52+
53+
- Smaller attack surface
54+
- Removed unnecessary format parsers (JS, XPS, etc.)
55+
56+
## Why This Works for Redline Detection
57+
58+
Embedded fonts are for **rendering** PDFs visually. Our use case only needs:
59+
60+
1. Parse PDF structure
61+
2. Read character positions (x, y, width, height)
62+
3. Read character colors (RGB values)
63+
4. Detect red text patterns
64+
65+
None of this requires font rendering or embedded fonts.
66+
67+
## Verification Needed
68+
69+
The Rust code has a fallback for character width:
70+
71+
```rust
72+
char_width = font_size * 0.6 * scale
73+
```
74+
75+
With system fonts, verify that:
76+
77+
1. Character width estimation remains accurate
78+
2. Paired redline detection (x_gap, y_diff) still works
79+
3. Multi-line merging doesn't break
80+
81+
Test with real PDFs on all platforms after CI passes.
82+
83+
## Next CI Run
84+
85+
All 12 jobs (4 targets × 3 NIF versions) should now:
86+
87+
1. ✅ Build successfully
88+
2. ✅ Create smaller binaries (<10MB each)
89+
3. ✅ Complete faster
90+
91+
Monitor: https://github.com/EnaiaInc/pdf_redlines/actions

0 commit comments

Comments
 (0)