Skip to content

Commit d85b481

Browse files
committed
feat: Build to binary + CI/CD job + SBOM
1 parent 3828793 commit d85b481

12 files changed

+443
-23
lines changed

.github/workflows/pipeline.yaml

+171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
name: Pipeline
2+
3+
on:
4+
push:
5+
branches:
6+
- develop
7+
- feat/*
8+
- hotfix/*
9+
- main
10+
pull_request:
11+
branches:
12+
- develop
13+
- feat/*
14+
- hotfix/*
15+
- main
16+
17+
jobs:
18+
init:
19+
name: Init
20+
runs-on: ubuntu-22.04
21+
permissions:
22+
contents: read
23+
outputs:
24+
VERSION: ${{ steps.version.outputs.version }}
25+
VERSION_FULL: ${{ steps.version.outputs.version_full }}
26+
steps:
27+
- name: Checkout
28+
uses: actions/[email protected]
29+
with:
30+
# We need all Git history for "version.sh"
31+
fetch-depth: 0
32+
# Ensure the "version.sh" submodule is up-to-date
33+
submodules: recursive
34+
35+
- name: Generate versions
36+
id: version
37+
run: |
38+
echo "version=$(bash cicd/version/version.sh -g . -c)" >> $GITHUB_OUTPUT
39+
echo "version_full=$(bash cicd/version/version.sh -g . -c -m)" >> $GITHUB_OUTPUT
40+
41+
build-app:
42+
name: Build & publish app
43+
permissions:
44+
contents: write
45+
packages: write
46+
runs-on: ${{ matrix.os }}
47+
needs:
48+
- init
49+
strategy:
50+
fail-fast: false
51+
matrix:
52+
# TODO: Build for musl (Alpine Linux)
53+
os:
54+
# Last 2 versions of macOS
55+
- macos-13
56+
- macos-14
57+
# Last 2 versions of Ubuntu
58+
- ubuntu-22.04
59+
- ubuntu-24.04
60+
# Last 2 versions of Windows
61+
- windows-2019
62+
- windows-2022
63+
steps:
64+
- name: Checkout
65+
uses: actions/[email protected]
66+
67+
- name: Configure Git
68+
run: |
69+
git config user.name "${{ github.actor }}"
70+
git config user.email "${{ github.actor }}@users.noreply.github.com"
71+
72+
- name: Set up Python
73+
uses: actions/[email protected]
74+
with:
75+
cache: pip
76+
python-version: "3.12"
77+
78+
- name: Install make (Windows)
79+
if: runner.os == 'Windows'
80+
run: choco install make
81+
82+
- name: Install dependencies
83+
run: |
84+
python3 -m pip install --upgrade pip wheel setuptools
85+
make install-deps
86+
87+
- name: Install dependencies (Windows)
88+
if: runner.os == 'Windows'
89+
run: |
90+
python3 -m pip install pywin32-ctypes pefile
91+
92+
- name: Build to binary
93+
run: make build
94+
95+
# Non-Windows runners (Linux and macOS) produce an extension-less binary;
# suffix it with the version and the runner image so release assets are unique.
- name: Rename binary (Linux & macOS)
  if: runner.os == 'Linux' || runner.os == 'macOS'
  run: mv dist/scrape-it-now dist/scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}
98+
99+
- name: Rename binary (Windows)
100+
if: runner.os == 'Windows'
101+
run: mv dist\scrape-it-now.exe dist\scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}.exe
102+
103+
- name: Upload artifact
104+
uses: actions/[email protected]
105+
with:
106+
name: binary-${{ matrix.os }}
107+
path: dist/*
108+
109+
attest-dependencies:
110+
name: Attest - Dependencies
111+
permissions:
112+
contents: write
113+
runs-on: ubuntu-22.04
114+
steps:
115+
- name: Checkout
116+
uses: actions/[email protected]
117+
118+
- name: Run attestation
119+
uses: advanced-security/[email protected]
120+
with:
121+
directoryExclusionList: docs
122+
123+
attest-sbom:
124+
name: Attest - SBOM
125+
runs-on: ubuntu-22.04
126+
needs:
127+
- init
128+
steps:
129+
- name: Checkout
130+
uses: actions/[email protected]
131+
132+
- name: Init Syft
133+
uses: anchore/sbom-action/[email protected]
134+
135+
- name: Run attestation
136+
run: make sbom version_full=${{ needs.init.outputs.VERSION_FULL }}
137+
138+
- name: Upload results to release
139+
uses: actions/[email protected]
140+
with:
141+
name: sbom
142+
path: sbom-reports/*
143+
144+
publish-release:
145+
name: Release
146+
permissions:
147+
contents: write
148+
runs-on: ubuntu-22.04
149+
needs:
150+
- attest-dependencies
151+
- attest-sbom
152+
- build-app
153+
- init
154+
# Only publish on non-scheduled default branch
155+
if: (github.event_name != 'schedule') && (github.ref == 'refs/heads/main')
156+
steps:
157+
- name: Download artifacts
158+
id: download
159+
uses: actions/[email protected]
160+
with:
161+
merge-multiple: true
162+
path: artifacts
163+
164+
- name: Publish
165+
uses: softprops/[email protected]
166+
with:
167+
files: artifacts/*
168+
generate_release_notes: true
169+
make_latest: true
170+
name: scrape-it-now v${{ needs.init.outputs.VERSION }}
171+
tag_name: ${{ needs.init.outputs.VERSION }}

.syft.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ python:
55
guess-unpinned-requirements: true
66

77
source:
8-
name: Solution Architect AI
8+
name: scrape-it-now

Makefile

+31-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# These targets are commands, not files: declare them phony so that a
# same-named file or directory can never make them look "up to date".
.PHONY: version version-full brew install install-deps upgrade test dev build lint sbom
2+
13
# Versioning
24
version_full ?= $(shell $(MAKE) --silent version-full)
35
version_small ?= $(shell $(MAKE) --silent version)
@@ -19,22 +21,29 @@ version:
1921
version-full:
2022
@bash ./cicd/version/version.sh -g . -c -m
2123

24+
brew:
25+
@echo "➡️ Installing Syft..."
26+
brew install syft
27+
2228
install:
29+
$(MAKE) install-deps
30+
31+
@echo "➡️ Installing Playwright dependencies..."
32+
python3 -m playwright install chrome --with-deps
33+
34+
install-deps:
2335
@echo "➡️ Installing pip-tools..."
2436
python3 -m pip install pip-tools
2537

2638
@echo "➡️ Syncing dependencies..."
2739
pip-sync --pip-args "--no-deps" requirements-dev.txt
2840

29-
@echo "➡️ Installing Playwright dependencies..."
30-
python3 -m playwright install chrome --with-deps
31-
3241
upgrade:
3342
@echo "➡️ Updating Git submodules..."
3443
git submodule update --init --recursive
3544

3645
@echo "➡️ Upgrading pip..."
37-
python3 -m pip install --upgrade pip wheel
46+
python3 -m pip install --upgrade pip wheel setuptools
3847

3948
@echo "➡️ Upgrading pip-tools..."
4049
python3 -m pip install --upgrade pip-tools
@@ -73,9 +82,27 @@ dev:
7382
python3 -m pip install --editable .
7483
@echo "Now you can run 'scrape-it-now' CLI!"
7584

85+
build:
86+
@echo "➡️ Building app..."
87+
pyinstaller \
88+
--add-data resources:resources \
89+
--clean \
90+
--icon resources/logo.ico \
91+
--name scrape-it-now \
92+
--onefile \
93+
--optimize 2 \
94+
app/app.py
95+
7696
lint:
7797
@echo "➡️ Fix with generic formatter (Black)..."
7898
python3 -m black .
7999

80100
@echo "➡️ Fix with import formatter (isort)..."
81101
python3 -m isort --jobs -1 .
102+
103+
sbom:
104+
@echo "🔍 Generating SBOM..."
105+
syft scan \
106+
--source-version $(version_full) \
107+
--output spdx-json=./sbom-reports/$(version_full).json \
108+
.

README.md

+27
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
A website to scrape? There's a simple way.
44

5+
<!-- github.com badges -->
6+
[![Last release date](https://img.shields.io/github/release-date/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/releases)
7+
[![Project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
8+
59
## Features
610

711
Shared:
@@ -30,6 +34,29 @@ Indexer:
3034
- [x] Embed chunks with OpenAI embeddings
3135
- [x] Indexed content is semantically searchable with [Azure AI Search](https://learn.microsoft.com/en-us/azure/search)
3236

37+
## Installation
38+
39+
### From binary
40+
41+
[Download the latest release from the releases page](https://github.com/clemlesne/scrape-it-now/releases/latest). Binaries are available for Linux, macOS and Windows.
42+
43+
For configuring the CLI (including authentication to the backend services), use environment variables, a `.env` file or command line options.
44+
45+
### From sources
46+
47+
Application must be run with Python 3.12 or later. If this version is not installed, an easy way to install it is [pyenv](https://github.com/pyenv/pyenv).
48+
49+
```bash
50+
# Download the source code
51+
git clone https://github.com/clemlesne/scrape-it-now.git
52+
# Move to the directory
53+
cd scrape-it-now
54+
# Run install scripts
55+
make install dev
56+
# Run the CLI
57+
scrape-it-now --help
58+
```
59+
3360
## How to use
3461

3562
### Scrape a website

app/app.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import asyncio, functools, random, re, string
2-
from os import cpu_count
1+
import asyncio, functools, random, re, string, sys
2+
from os import cpu_count, environ as env
33
from platform import python_version
44

55
import click
@@ -369,3 +369,16 @@ def _job_name(job_name: str | None) -> str:
369369
return job_name or "".join(
370370
random.choices(string.ascii_lowercase + string.digits, k=7)
371371
)
372+
373+
374+
if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): # Running in PyInstaller
375+
import certifi
376+
377+
# Patch the bundle with certifi
378+
# See: https://github.com/Azure/azure-iot-sdk-python/issues/991#issuecomment-1118235694
379+
# See: https://github.com/pyinstaller/pyinstaller/issues/7229#issuecomment-1309406736
380+
# See: https://github.com/pyinstaller/pyinstaller/issues/6352#issuecomment-962499220
381+
env["SSL_CERT_FILE"] = certifi.where()
382+
383+
# Run the CLI
384+
cli(sys.argv[1:])

app/helpers/resources.py

+26-12
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,16 @@
1-
import hashlib, os
2-
from os import path
1+
import hashlib
2+
from os import makedirs, path
33
from pathlib import Path
44

5+
import click
56

6-
def resources_dir(folder: str) -> str:
7+
8+
def resources_dir(sub: str) -> str:
79
"""
810
Get the absolute path to the resources folder.
911
"""
1012
return str(
11-
Path(
12-
path.join(
13-
os.path.abspath(os.getcwd()),
14-
"resources",
15-
folder,
16-
)
17-
)
18-
.resolve()
19-
.absolute()
13+
Path(__file__).parent.parent.parent.joinpath("resources", sub).absolute()
2014
)
2115

2216

@@ -56,3 +50,23 @@ def hash_url(url: str) -> str:
5650
url.encode(),
5751
usedforsecurity=False,
5852
).hexdigest()
53+
54+
55+
def cache_dir() -> str:
    """
    Get the path to the cache directory, creating it if it does not exist.

    The location is resolved by click for the "scrape-it-now" application.

    See: https://click.palletsprojects.com/en/8.1.x/api/#click.get_app_dir
    """
    res = click.get_app_dir("scrape-it-now")
    # "exist_ok" replaces the former "check then create" sequence, which could
    # fail if another process created the directory between the two calls
    makedirs(res, exist_ok=True)
    return res
66+
67+
68+
def browsers_install_path() -> str:
    """
    Get the path of the "browsers" entry located inside the cache directory.
    """
    base_dir = cache_dir()
    return path.join(base_dir, "browsers")

0 commit comments

Comments
 (0)