-
Notifications
You must be signed in to change notification settings - Fork 539
Expand file tree
/
Copy pathDockerfile.evaluation
More file actions
145 lines (110 loc) · 4.65 KB
/
Dockerfile.evaluation
File metadata and controls
145 lines (110 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
## Grobid evaluation image
# ------
# https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/
# NOTE: To reproduce the exact evaluation results published in the Grobid documentation, a running
# Biblio-glutton instance is required.
#
# A project using this image can be found here: https://huggingface.co/spaces/lfoppiano/grobid-evaluation
# Please note that the evaluation is driven by a Python script that runs all the needed commands.
# TODO: upload the evaluation in Markdown somewhere
# -------------------
# build builder image
# -------------------
# Builder stage: JDK image in which the GROBID tree under /opt/grobid is
# assembled; the runtime stage later copies /opt/grobid and the JDK from it.
FROM eclipse-temurin:21.0.10_7-jdk AS builder

USER root

# unzip and git are the only extra tools installed in this stage.
# The apt caches and package index are removed in the same layer that created
# them, mirroring the cleanup done by the runtime stage.
RUN apt-get update && \
    apt-get -y upgrade && \
    apt-get -y --no-install-recommends install unzip git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid
# Gradle wrapper directory, wrapper script and build definitions
COPY gradle/ ./gradle/
COPY gradlew gradle.properties build.gradle settings.gradle ./

# GROBID modules and the grobid-home resources
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/

# Git metadata for revision embedding; copied as late as possible
COPY .git/ ./.git
# Remove unused native libraries before packaging.
# All paths are deleted by a single RUN, so the cleanup produces one layer
# instead of eight; `rm -rf` silently skips paths (and unmatched globs) that
# do not exist, exactly as the individual commands did.
RUN rm -rf \
        grobid-home/lib/lin-32 \
        grobid-home/lib/mac-64 \
        grobid-home/lib/win-* \
        grobid-home/pdf2xml \
        grobid-home/pdfalto/lin-32 \
        grobid-home/pdfalto/mac-64 \
        grobid-home/pdfalto/mac_arm-64 \
        grobid-home/pdfalto/win-*
# Activate the DL-powered configuration: the stock grobid.yaml is deleted and
# grobid-evaluation.yaml is renamed to take its place.
RUN cfg=grobid-home/config \
    && rm "${cfg}/grobid.yaml" \
    && mv "${cfg}/grobid-evaluation.yaml" "${cfg}/grobid.yaml"

# Evaluation data: for space reasons the data set is NOT downloaded into this
# image; the clone/chmod steps below are intentionally left commented out.
# See https://huggingface.co/spaces/lfoppiano/grobid-evaluation/blob/main/Dockerfile
WORKDIR /opt/grobid/evaluation
#RUN git lfs install && git clone --depth 1 https://huggingface.co/datasets/sciencialab/grobid-evaluation evaluation
#RUN chmod -R uog+rw /opt/grobid/evaluation
# -------------------
# build runtime image
# -------------------
# Runtime stage on the TensorFlow GPU image; use the NVIDIA Container Toolkit
# so that GPU drivers present on the host machine are recognized automatically.
FROM tensorflow/tensorflow:2.17.2-gpu

# Setting the locale is likely unnecessary, but done to be sure.
ENV LANG=C.UTF-8

# Python and the other system dependencies (one package per line, sorted).
# The apt caches and package index are removed in the same layer.
RUN apt-get update \
    && apt-get -y upgrade \
    && apt-get -y --no-install-recommends install \
        apt-utils \
        bash \
        build-essential \
        curl \
        gcc \
        gfortran \
        libfontconfig \
        libxml2 \
        musl \
        python3 \
        python3-dev \
        python3-pip \
        python3-setuptools \
        unzip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Bring the /opt/grobid tree prepared by the builder stage into the image.
WORKDIR /opt/grobid
COPY --from=builder /opt/grobid .

# Upgrade pip itself first; no pip download cache is kept in the layer.
RUN python3 -m pip install --no-cache-dir --upgrade pip

# DeLFT, pinned to an exact version.
RUN pip3 install --no-cache-dir delft==0.4.5
# Link the data directory: /opt/grobid/data becomes a symlink to /data.
# Only the absolute link is created. The working directory at this point is
# /opt/grobid, so an additional `ln -s /data ./data` would find ./data already
# existing as a symlink to a directory, follow it, and create a
# self-referencing /data/data -> /data link inside the data directory.
RUN mkdir -p /data \
    && ln -s /data /opt/grobid/data
# Runtime environment:
#   PYTHONWARNINGS - disable python warnings (and fix logging)
#   JAVA_HOME      - location the JDK is copied to below
#   JAVA_OPTS      - carries the -Xmx4g JVM option
ENV PYTHONWARNINGS="ignore" \
    JAVA_HOME=/opt/java/openjdk \
    JAVA_OPTS=-Xmx4g

# PATH is extended in a separate instruction so that JAVA_HOME is already
# defined when it is substituted here.
ENV PATH=${JAVA_HOME}/bin:${PATH}

# Take the Java JDK from the builder image. The JAVA_HOME to copy from depends
# on the builder image tag; see the "simple tags" section of
# https://github.com/docker-library/docs/blob/master/eclipse-temurin/README.md#simple-tags
COPY --from=builder /opt/java/openjdk ${JAVA_HOME}
# Install JEP, pinned to an exact version. --no-cache-dir matches the other
# pip invocations of this stage and keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir jep==4.3.1

# Python version used to build the dist-packages path below; it has to match
# the Python version of the TF base image.
ENV PYTHON_VERSION=3.11

# Prepend the JEP package directory and GROBID's lin-64 native libraries to
# the library search path, keeping whatever LD_LIBRARY_PATH was already set.
# NOTE(review): grobid-home/lib/lin-64 is a relative entry, so it presumably
# only resolves when the process runs from /opt/grobid - confirm.
ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/jep:grobid-home/lib/lin-64:${LD_LIBRARY_PATH}
# Add Tini, which is installed as /tini and used as the image entrypoint.
# TINI_VERSION selects the GitHub release the binary is downloaded from.
ENV TINI_VERSION=v0.19.0
# NOTE(review): the binary is fetched at build time without checksum
# verification - consider pinning a sha256 for the download.
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
# Make the downloaded binary executable.
RUN chmod +x /tini
# Exec-form entrypoint: every container command is launched through tini.
# NOTE(review): "-s" is presumably tini's subreaper option - confirm against
# the tini README that it is still required here.
ENTRYPOINT ["/tini", "-s", "--"]
# Restate the working directory for the remaining build steps.
WORKDIR /opt/grobid
# Preload the embeddings. For GROBID all the RNN models use glove-840B (the
# script's default); ELMo is currently not loaded.
# To be done: a mechanism to download GROBID fine-tuned models based on SciBERT
# if selected (but not good enough for the moment).
COPY --from=builder \
    /opt/grobid/grobid-home/scripts/preload_embeddings.py \
    /opt/grobid/grobid-home/config/resources-registry.json \
    ./

# Run the preload script against the registry, then make /opt/delft an alias
# of /opt/grobid.
RUN python3 preload_embeddings.py --registry ./resources-registry.json \
    && ln -s /opt/grobid /opt/delft

# Place a copy of the registry in a delft/ directory under the working directory.
RUN mkdir delft \
    && cp ./resources-registry.json delft/
# grobid-home/tmp is declared as a volume.
VOLUME ["/opt/grobid/grobid-home/tmp"]

# Build argument carrying the GROBID version; here it only feeds the version
# label (the label is empty when the argument is not supplied).
ARG GROBID_VERSION

# Image metadata
LABEL authors="The contributors" \
      org.label-schema.description="Image running the Grobid End 2 end evaluation" \
      org.label-schema.name="Grobid" \
      org.label-schema.url="https://github.com/kermitt2/Grobid" \
      org.label-schema.version=${GROBID_VERSION}