Skip to content

Commit 3d684e2

Browse files
v0.2.0: image size filter, open folder, license (Seth Strickland)
- CLI: --min-image-size / --max-image-size (k/m suffixes); filter by HEAD Content-Length - Fetcher: head_metadata() for content-type and content-length - GUI: image size filter (Min KB / Max MB), Open folder button - README: author and license attribution - LICENSE: Copyright (c) 2025 Seth Strickland - Bump to 0.2.0; CHANGELOG updated Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 3ce423a commit 3d684e2

File tree

7 files changed

+136
-12
lines changed

7 files changed

+136
-12
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66

77
## [Unreleased]
88

9+
## [0.2.0] - 2025-02-04
10+
11+
- GUI (tkinter) with file-type selector, image size filter, and Open folder button.
12+
- CLI: `--types` (pdf/text/images), `--min-image-size` / `--max-image-size`, `--workers`, `--no-progress`.
13+
- Auto-install deps via `BASIC_SCRAPER_AUTO_INSTALL_DEPS=1`.
14+
- Hardware-autodetected parallel crawl; progress bar (optional tqdm).
15+
- Docker image (slim) and GitHub Actions (PyInstaller + Docker).
16+
- License: MIT, Copyright (c) 2025 Seth Strickland.
17+
918
## [0.1.0]
1019

1120
- Initial package layout, CLI stub, and semver setup.

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2025
3+
Copyright (c) 2025 Seth Strickland
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
Basic scraper: PDFs, text, and images from websites at high quality, stored locally.
44

5+
**Author:** Seth Strickland · **License:** [MIT](LICENSE)
6+
57
## Versioning
68

79
This project follows [Semantic Versioning 2.0.0](https://semver.org/): `MAJOR.MINOR.PATCH`.
@@ -14,7 +16,7 @@ For the **0.y.z** range, the public API is treated as unstable: MINOR may introd
1416

1517
## License
1618

17-
MIT. See [LICENSE](LICENSE).
19+
MIT License. Copyright (c) 2025 Seth Strickland. See [LICENSE](LICENSE).
1820

1921
## Install and run
2022

@@ -30,6 +32,8 @@ This installs the package in editable mode and registers the `scrape` and `scrap
3032
scrape --url https://example.com/page [--out-dir output] [--delay 1] [--crawl] [--max-depth 2] [--same-domain-only]
3133
```
3234

35+
Filter images by file size (uses HEAD `Content-Length`): `--min-image-size 50k` and/or `--max-image-size 5m` (suffixes `k`/`m` for KB/MB).
36+
3337
Or open the simple GUI:
3438

3539
```bash

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "basic-scraper"
7-
version = "0.1.0"
7+
version = "0.2.0"
88
description = "Basic scraper: PDFs, text, and images from websites at high quality, stored locally."
99
readme = "README.md"
1010
license = { text = "MIT" }

web_scraper/cli.py

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,18 @@
4343
VALID_TYPES = frozenset({"pdf", "text", "images"})
4444

4545

46+
def _parse_size(s: str) -> int:
47+
"""Parse size string to bytes: 100, 100k, 1m (case-insensitive)."""
48+
s = s.strip().lower()
49+
if not s:
50+
raise ValueError("empty size")
51+
if s.endswith("k"):
52+
return int(s[:-1]) * 1024
53+
if s.endswith("m"):
54+
return int(s[:-1]) * 1024 * 1024
55+
return int(s)
56+
57+
4658
def _scrape_page(
4759
url: str,
4860
out_dir: Path,
@@ -54,6 +66,8 @@ def _scrape_page(
5466
collect_links: bool,
5567
types: set[str] | None = None,
5668
progress_callback: Callable[[str], None] | None = None,
69+
min_image_size: int | None = None,
70+
max_image_size: int | None = None,
5771
) -> list[str]:
5872
"""
5973
Scrape a single page: PDFs, text, images (according to types).
@@ -122,10 +136,14 @@ def _scrape_page(
122136
if img_url in urls_map:
123137
continue
124138
best_url = get_best_image_url(img_url, None, try_high_res=True)
125-
ct = fetcher.head_content_type(best_url, delay=delay)
139+
ct, content_length = fetcher.head_metadata(best_url, delay=delay)
126140
if ct and not ct.startswith("image/"):
127141
best_url = img_url
128-
ct = fetcher.head_content_type(img_url, delay=delay)
142+
ct, content_length = fetcher.head_metadata(img_url, delay=delay)
143+
if min_image_size is not None and content_length is not None and content_length < min_image_size:
144+
continue
145+
if max_image_size is not None and content_length is not None and content_length > max_image_size:
146+
continue
129147
dest = path_for_image(out_dir, domain, best_url, ct)
130148
if dest.exists():
131149
urls_map[img_url] = str(dest)
@@ -195,11 +213,37 @@ def main() -> None:
195213
action="store_true",
196214
help="Disable progress bar (e.g. for scripting)",
197215
)
216+
parser.add_argument(
217+
"--min-image-size",
218+
type=str,
219+
default=None,
220+
metavar="SIZE",
221+
help="Skip images smaller than SIZE (e.g. 50k, 1m). Uses HEAD Content-Length.",
222+
)
223+
parser.add_argument(
224+
"--max-image-size",
225+
type=str,
226+
default=None,
227+
metavar="SIZE",
228+
help="Skip images larger than SIZE (e.g. 5m, 10m). Uses HEAD Content-Length.",
229+
)
198230
args = parser.parse_args()
199231

200232
out_dir = Path(args.out_dir)
201233
limit = args.limit
202234
types_set = set(args.types) if args.types else None
235+
min_image_size = None
236+
max_image_size = None
237+
if args.min_image_size:
238+
try:
239+
min_image_size = _parse_size(args.min_image_size)
240+
except ValueError as e:
241+
parser.error(f"--min-image-size: {e}")
242+
if args.max_image_size:
243+
try:
244+
max_image_size = _parse_size(args.max_image_size)
245+
except ValueError as e:
246+
parser.error(f"--max-image-size: {e}")
203247
workers = args.workers if args.workers is not None else default_workers()
204248
workers = max(1, min(workers, default_workers()))
205249

@@ -208,9 +252,13 @@ def main() -> None:
208252
_crawl_parallel(
209253
args.url, out_dir, args.delay, args.max_depth,
210254
args.same_domain_only, limit, types_set, workers, use_progress,
255+
min_image_size, max_image_size,
211256
)
212257
else:
213-
_run_single_or_sequential_crawl(args, out_dir, limit, types_set, workers, use_progress)
258+
_run_single_or_sequential_crawl(
259+
args, out_dir, limit, types_set, workers, use_progress,
260+
min_image_size, max_image_size,
261+
)
214262

215263
print("\nDone.", file=sys.stderr)
216264

@@ -222,6 +270,8 @@ def _run_single_or_sequential_crawl(
222270
types_set: set[str] | None,
223271
workers: int,
224272
use_progress: bool,
273+
min_image_size: int | None,
274+
max_image_size: int | None,
225275
) -> None:
226276
"""Single-page scrape or sequential crawl (workers=1)."""
227277
with Fetcher() as fetcher:
@@ -248,6 +298,8 @@ def _run_single_or_sequential_crawl(
248298
url, out_dir, args.delay, manifest, fetcher,
249299
limit, limit, collect_links=True, types=types_set,
250300
progress_callback=None,
301+
min_image_size=min_image_size,
302+
max_image_size=max_image_size,
251303
)
252304
if use_progress:
253305
pbar.update(1)
@@ -276,6 +328,8 @@ def _run_single_or_sequential_crawl(
276328
args.url, out_dir, args.delay, manifest, fetcher,
277329
limit, limit, collect_links=False, types=types_set,
278330
progress_callback=progress_cb,
331+
min_image_size=min_image_size,
332+
max_image_size=max_image_size,
279333
)
280334
finally:
281335
if use_progress:
@@ -292,6 +346,8 @@ def _crawl_parallel(
292346
types_set: set[str] | None,
293347
workers: int,
294348
use_progress: bool,
349+
min_image_size: int | None,
350+
max_image_size: int | None,
295351
) -> None:
296352
"""Crawl with a thread pool; each worker uses its own Fetcher, shared manifest lock."""
297353
start_domain = urlparse(start_url).netloc
@@ -314,6 +370,9 @@ def process_one(url: str, depth: int) -> list[str]:
314370
links = _scrape_page(
315371
url, out_dir, delay, manifest, fetcher,
316372
limit, limit, collect_links=True, types=types_set,
373+
progress_callback=None,
374+
min_image_size=min_image_size,
375+
max_image_size=max_image_size,
317376
)
318377
except Exception as e:
319378
print(f"Error {url}: {e}", file=sys.stderr)

web_scraper/fetcher.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,22 @@ def head_content_type(self, url: str, *, timeout: float = 10.0, delay: float = 0
9898
except Exception:
9999
return None
100100

101+
def head_metadata(self, url: str, *, timeout: float = 10.0, delay: float = 0) -> tuple[str | None, int | None]:
    """Issue a HEAD request and report (content_type, content_length).

    Best-effort: any request or parsing failure yields (None, None).
    content_length is None when the Content-Length header is missing
    or non-numeric; content_type is lower-cased with parameters
    (e.g. ``; charset=...``) stripped.
    """
    if delay > 0:
        time.sleep(delay)
    try:
        response = self._get_client().head(url, timeout=timeout)
        response.raise_for_status()
        raw_type = response.headers.get("content-type", "")
        raw_length = response.headers.get("content-length")
        media_type = raw_type.split(";")[0].strip().lower() if raw_type else None
        size = int(raw_length) if raw_length is not None and raw_length.isdigit() else None
        return media_type, size
    except Exception:
        # Mirrors head_content_type: metadata is advisory, never fatal.
        return None, None
116+
101117

102118
def fetch_html(url: str, *, timeout: float = DEFAULT_TIMEOUT, delay: float = 0) -> tuple[bytes, str]:
103119
"""Standalone fetch (creates temporary client). Prefer Fetcher for multiple requests."""

web_scraper/gui.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,23 @@
1111
from web_scraper._deps import check_required
1212

1313

14+
def _open_folder(path: str) -> None:
15+
"""Open path in the system file manager; create dir if missing."""
16+
if not path or not path.strip():
17+
return
18+
abs_path = os.path.abspath(path.strip())
19+
try:
20+
os.makedirs(abs_path, exist_ok=True)
21+
except OSError:
22+
pass
23+
if sys.platform == "darwin":
24+
subprocess.run(["open", abs_path], check=False)
25+
elif sys.platform == "win32":
26+
os.startfile(abs_path) # type: ignore[attr-defined]
27+
else:
28+
subprocess.run(["xdg-open", abs_path], check=False)
29+
30+
1431
def main() -> None:
1532
check_required()
1633
root = tk.Tk()
@@ -26,9 +43,13 @@ def main() -> None:
2643
url_entry.grid(row=1, column=0, columnspan=2, sticky=tk.EW, pady=(0, 8))
2744

2845
ttk.Label(main_frame, text="Output directory").grid(row=2, column=0, sticky=tk.W, pady=(0, 2))
46+
out_row = ttk.Frame(main_frame)
47+
out_row.grid(row=3, column=0, columnspan=2, sticky=tk.EW, pady=(0, 8))
48+
main_frame.columnconfigure(0, weight=1)
2949
out_var = tk.StringVar(value="output")
30-
out_entry = ttk.Entry(main_frame, textvariable=out_var, width=50)
31-
out_entry.grid(row=3, column=0, columnspan=2, sticky=tk.EW, pady=(0, 8))
50+
out_entry = ttk.Entry(out_row, textvariable=out_var, width=50)
51+
out_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 8))
52+
ttk.Button(out_row, text="Open folder", command=lambda: _open_folder(out_var.get())).pack(side=tk.LEFT)
3253

3354
types_frame = ttk.LabelFrame(main_frame, text="File types")
3455
types_frame.grid(row=4, column=0, columnspan=2, sticky=tk.W, pady=(0, 8))
@@ -39,8 +60,17 @@ def main() -> None:
3960
ttk.Checkbutton(types_frame, text="Text", variable=type_text_var).pack(side=tk.LEFT, padx=(0, 12))
4061
ttk.Checkbutton(types_frame, text="Images", variable=type_images_var).pack(side=tk.LEFT)
4162

63+
size_frame = ttk.LabelFrame(main_frame, text="Image size filter (optional)")
64+
size_frame.grid(row=5, column=0, columnspan=2, sticky=tk.W, pady=(0, 8))
65+
min_image_var = tk.StringVar(value="")
66+
max_image_var = tk.StringVar(value="")
67+
ttk.Label(size_frame, text="Min (KB):").pack(side=tk.LEFT, padx=(0, 4))
68+
ttk.Entry(size_frame, textvariable=min_image_var, width=8).pack(side=tk.LEFT, padx=(0, 12))
69+
ttk.Label(size_frame, text="Max (MB):").pack(side=tk.LEFT, padx=(8, 4))
70+
ttk.Entry(size_frame, textvariable=max_image_var, width=8).pack(side=tk.LEFT)
71+
4272
opts_frame = ttk.Frame(main_frame)
43-
opts_frame.grid(row=5, column=0, columnspan=2, sticky=tk.W, pady=(0, 8))
73+
opts_frame.grid(row=6, column=0, columnspan=2, sticky=tk.W, pady=(0, 8))
4474
delay_var = tk.DoubleVar(value=1.0)
4575
ttk.Label(opts_frame, text="Delay (s):").pack(side=tk.LEFT)
4676
delay_spin = ttk.Spinbox(opts_frame, from_=0.5, to=10, increment=0.5, width=5, textvariable=delay_var)
@@ -55,9 +85,9 @@ def main() -> None:
5585
ttk.Checkbutton(opts_frame, text="Same domain only", variable=same_domain_var).pack(side=tk.LEFT)
5686

5787
log_frame = ttk.LabelFrame(main_frame, text="Log")
58-
log_frame.grid(row=6, column=0, columnspan=2, sticky=tk.NSEW, pady=(0, 8))
88+
log_frame.grid(row=7, column=0, columnspan=2, sticky=tk.NSEW, pady=(0, 8))
5989
main_frame.columnconfigure(0, weight=1)
60-
main_frame.rowconfigure(6, weight=1)
90+
main_frame.rowconfigure(7, weight=1)
6191

6292
log_text = tk.Text(log_frame, height=8, wrap=tk.WORD, state=tk.DISABLED)
6393
log_scroll = ttk.Scrollbar(log_frame)
@@ -122,6 +152,12 @@ def run_scrape(scrape_btn_ref: tk.Widget) -> None:
122152
append_log("Error: Select at least one file type.\n")
123153
scrape_btn_ref.config(state=tk.NORMAL)
124154
return
155+
min_s = min_image_var.get().strip()
156+
if min_s:
157+
cmd.extend(["--min-image-size", f"{min_s}k"])
158+
max_s = max_image_var.get().strip()
159+
if max_s:
160+
cmd.extend(["--max-image-size", f"{max_s}m"])
125161
if crawl_var.get():
126162
cmd.extend(["--crawl", "--max-depth", str(depth)])
127163
if same_domain_var.get():
@@ -164,7 +200,7 @@ def poll_queue(btn: tk.Widget) -> None:
164200
poll_queue(scrape_btn_ref)
165201

166202
btn_frame = ttk.Frame(main_frame)
167-
btn_frame.grid(row=7, column=0, columnspan=2)
203+
btn_frame.grid(row=8, column=0, columnspan=2)
168204
scrape_btn = ttk.Button(btn_frame, text="Scrape", command=lambda: run_scrape(scrape_btn))
169205
scrape_btn.pack(side=tk.LEFT, padx=(0, 8))
170206
ttk.Button(btn_frame, text="Clear log", command=lambda: (log_text.config(state=tk.NORMAL), log_text.delete("1.0", tk.END), log_text.config(state=tk.DISABLED))).pack(side=tk.LEFT)

0 commit comments

Comments
 (0)