 VALID_TYPES = frozenset({"pdf", "text", "images"})


+def _parse_size(s: str) -> int:
+    """Parse size string to bytes: 100, 100k, 1m (case-insensitive)."""
+    s = s.strip().lower()
+    if not s:
+        raise ValueError("empty size")
+    if s.endswith("k"):
+        return int(s[:-1]) * 1024
+    if s.endswith("m"):
+        return int(s[:-1]) * 1024 * 1024
+    return int(s)
+
+
 def _scrape_page(
     url: str,
     out_dir: Path,
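The new _parse_size helper keeps size parsing deliberately simple: a bare integer is bytes, a trailing "k" multiplies by 1024, a trailing "m" by 1024 * 1024. A quick sketch of the resulting behavior (these values follow directly from the code above):

    assert _parse_size("100") == 100           # bare integer: bytes
    assert _parse_size("50k") == 50 * 1024     # 51200 bytes
    assert _parse_size("1M") == 1024 * 1024    # suffixes are case-insensitive
    # _parse_size("") and _parse_size("1.5m") both raise ValueError,
    # since int() rejects empty and fractional strings.

Note that negative values such as "-5k" parse successfully; a caller that needs strictly positive sizes would have to validate that separately.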
@@ -54,6 +66,8 @@ def _scrape_page(
     collect_links: bool,
     types: set[str] | None = None,
     progress_callback: Callable[[str], None] | None = None,
+    min_image_size: int | None = None,
+    max_image_size: int | None = None,
 ) -> list[str]:
     """
     Scrape a single page: PDFs, text, images (according to types).
@@ -122,10 +136,14 @@ def _scrape_page(
         if img_url in urls_map:
             continue
         best_url = get_best_image_url(img_url, None, try_high_res=True)
-        ct = fetcher.head_content_type(best_url, delay=delay)
+        ct, content_length = fetcher.head_metadata(best_url, delay=delay)
         if ct and not ct.startswith("image/"):
             best_url = img_url
-            ct = fetcher.head_content_type(img_url, delay=delay)
+            ct, content_length = fetcher.head_metadata(img_url, delay=delay)
+        if min_image_size is not None and content_length is not None and content_length < min_image_size:
+            continue
+        if max_image_size is not None and content_length is not None and content_length > max_image_size:
+            continue
         dest = path_for_image(out_dir, domain, best_url, ct)
         if dest.exists():
             urls_map[img_url] = str(dest)
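One subtlety in the filtering above: both bounds apply only when the server reports a Content-Length, so images of unknown size are never skipped. The added checks are equivalent to this standalone predicate (a hypothetical refactor for illustration, not part of the commit):

    def _passes_size_filter(content_length: int | None,
                            min_size: int | None,
                            max_size: int | None) -> bool:
        # Unknown size: keep the image rather than risk dropping a valid one.
        if content_length is None:
            return True
        if min_size is not None and content_length < min_size:
            return False
        if max_size is not None and content_length > max_size:
            return False
        return True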
@@ -195,11 +213,37 @@ def main() -> None:
         action="store_true",
         help="Disable progress bar (e.g. for scripting)",
     )
+    parser.add_argument(
+        "--min-image-size",
+        type=str,
+        default=None,
+        metavar="SIZE",
+        help="Skip images smaller than SIZE (e.g. 50k, 1m). Uses HEAD Content-Length.",
+    )
+    parser.add_argument(
+        "--max-image-size",
+        type=str,
+        default=None,
+        metavar="SIZE",
+        help="Skip images larger than SIZE (e.g. 5m, 10m). Uses HEAD Content-Length.",
+    )
     args = parser.parse_args()

     out_dir = Path(args.out_dir)
     limit = args.limit
     types_set = set(args.types) if args.types else None
+    min_image_size = None
+    max_image_size = None
+    if args.min_image_size:
+        try:
+            min_image_size = _parse_size(args.min_image_size)
+        except ValueError as e:
+            parser.error(f"--min-image-size: {e}")
+    if args.max_image_size:
+        try:
+            max_image_size = _parse_size(args.max_image_size)
+        except ValueError as e:
+            parser.error(f"--max-image-size: {e}")
     workers = args.workers if args.workers is not None else default_workers()
     workers = max(1, min(workers, default_workers()))

@@ -208,9 +252,13 @@ def main() -> None:
         _crawl_parallel(
             args.url, out_dir, args.delay, args.max_depth,
             args.same_domain_only, limit, types_set, workers, use_progress,
+            min_image_size, max_image_size,
         )
     else:
-        _run_single_or_sequential_crawl(args, out_dir, limit, types_set, workers, use_progress)
+        _run_single_or_sequential_crawl(
+            args, out_dir, limit, types_set, workers, use_progress,
+            min_image_size, max_image_size,
+        )

     print("\nDone.", file=sys.stderr)

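Assuming the URL is a positional argument and the usual entry point (the script name is not shown in this diff), a typical invocation with the new flags would look like:

    python scraper.py https://example.com --min-image-size 50k --max-image-size 5m

Invalid sizes are rejected up front via parser.error(), so a malformed value like --min-image-size 50kb fails before any network traffic starts.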
@@ -222,6 +270,8 @@ def _run_single_or_sequential_crawl(
     types_set: set[str] | None,
     workers: int,
     use_progress: bool,
+    min_image_size: int | None,
+    max_image_size: int | None,
 ) -> None:
     """Single-page scrape or sequential crawl (workers=1)."""
     with Fetcher() as fetcher:
@@ -248,6 +298,8 @@ def _run_single_or_sequential_crawl(
                 url, out_dir, args.delay, manifest, fetcher,
                 limit, limit, collect_links=True, types=types_set,
                 progress_callback=None,
+                min_image_size=min_image_size,
+                max_image_size=max_image_size,
             )
             if use_progress:
                 pbar.update(1)
@@ -276,6 +328,8 @@ def _run_single_or_sequential_crawl(
                 args.url, out_dir, args.delay, manifest, fetcher,
                 limit, limit, collect_links=False, types=types_set,
                 progress_callback=progress_cb,
+                min_image_size=min_image_size,
+                max_image_size=max_image_size,
             )
         finally:
             if use_progress:
@@ -292,6 +346,8 @@ def _crawl_parallel(
     types_set: set[str] | None,
     workers: int,
     use_progress: bool,
+    min_image_size: int | None,
+    max_image_size: int | None,
 ) -> None:
     """Crawl with a thread pool; each worker uses its own Fetcher, shared manifest lock."""
     start_domain = urlparse(start_url).netloc
@@ -314,6 +370,9 @@ def process_one(url: str, depth: int) -> list[str]:
             links = _scrape_page(
                 url, out_dir, delay, manifest, fetcher,
                 limit, limit, collect_links=True, types=types_set,
+                progress_callback=None,
+                min_image_size=min_image_size,
+                max_image_size=max_image_size,
             )
         except Exception as e:
             print(f"Error {url}: {e}", file=sys.stderr)
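The _crawl_parallel docstring promises one Fetcher per worker thread with a shared manifest lock. A minimal sketch of that pattern (assumed structure with illustrative names; the commit does not show the pool setup itself):

    import threading

    manifest_lock = threading.Lock()   # guards writes to the shared manifest
    _local = threading.local()

    def _get_fetcher() -> Fetcher:
        # Lazily create one Fetcher per worker thread; threads never share one.
        if not hasattr(_local, "fetcher"):
            _local.fetcher = Fetcher()
        return _local.fetcher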