From 751c8d48db4e572c208261f06ad5c8968ee39604 Mon Sep 17 00:00:00 2001
From: keaton-sublime
Date: Fri, 5 Dec 2025 16:02:32 -0500
Subject: [PATCH 1/2] adding support for pdf-object-hashing

adding:
- scan_pdf_obj_hash.py
- requirements.txt to pull in the main branch for the most up-to-date pdf object hashing library
- updating scanners.yaml to also run this scanner on PDFs
---
 build/configs/scanners.yaml                   |  6 ++
 build/python/backend/requirements.txt         |  3 +-
 .../strelka/scanners/scan_pdf_obj_hash.py     | 54 +++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 src/python/strelka/scanners/scan_pdf_obj_hash.py

diff --git a/build/configs/scanners.yaml b/build/configs/scanners.yaml
index 9f3996ff..9607f519 100644
--- a/build/configs/scanners.yaml
+++ b/build/configs/scanners.yaml
@@ -382,6 +382,12 @@ scanners:
         limit: 2000
         pdf_to_png: True
         no_object_extraction: True
+  'ScanPdfObjHash':
+    - positive:
+        flavors:
+          - 'application/pdf'
+          - 'pdf_file'
+      priority: 5
   'ScanOnenote':
     - positive:
         flavors:
diff --git a/build/python/backend/requirements.txt b/build/python/backend/requirements.txt
index da8926e8..debf16a0 100644
--- a/build/python/backend/requirements.txt
+++ b/build/python/backend/requirements.txt
@@ -30,6 +30,7 @@ olefile==0.46
 oletools==0.60.1
 opencv-python==4.8.1.78
 opencv-contrib-python==4.8.1.78
+pdf-object-hashing @ git+https://github.com/0xkyle/pdf_object_hashing.git
 Pillow>=11.2.1
 pi-heif>=0.16.0
 idna==3.10
@@ -54,4 +55,4 @@ signify==0.3.0
 ssdeep==3.4
 tldextract==5.1.3
 tnefparse==1.4.0
-xmltodict==0.12.0
\ No newline at end of file
+xmltodict==0.12.0
diff --git a/src/python/strelka/scanners/scan_pdf_obj_hash.py b/src/python/strelka/scanners/scan_pdf_obj_hash.py
new file mode 100644
index 00000000..912fa199
--- /dev/null
+++ b/src/python/strelka/scanners/scan_pdf_obj_hash.py
@@ -0,0 +1,54 @@
+"""Strelka scanner that fingerprints PDFs by the order of their object types."""
+
+from hashlib import md5
+
+from pdf_object_hashing import pdf_object as po
+from strelka import strelka
+
+
+class ScanPdfObjHash(strelka.Scanner):
+    """Compute a structural "object hash" for a PDF.
+
+    The in-use objects are listed in file order and their types are joined
+    into one string, each type followed by "|". The MD5 hex digest of that
+    string is the object hash; MD5 is a fingerprint here, not a security
+    control.
+
+    Event fields:
+        object_hash: hex digest, "error" if parsing raised, or False when
+            nothing could be hashed.
+        hash_string: the "|"-delimited type string, or False.
+    """
+
+    def scan(self, data, file, options, expire_at):
+        # Both fields are always present; False means nothing was hashed.
+        self.event["object_hash"] = False
+        self.event["hash_string"] = False
+
+        pdf_object = po(fdata=data)
+        if not pdf_object:
+            return
+
+        try:
+            pdf_object.check_pdf_header()
+            # NOTE(review): the back-to-back trailer_process() calls look
+            # accidental; confirm against pdf_object_hashing before dropping one.
+            pdf_object.trailer_process()
+            pdf_object.trailer_process()
+            pdf_object.start_object_parsing()
+            pdf_object.pull_objects_xref_aware()
+        except strelka.ScannerTimeout:
+            # The configured scanner_timeout must reach the framework.
+            raise
+        except Exception:
+            # Do not hash a partially parsed document.
+            self.event["object_hash"] = "error"
+            return
+
+        file_ordered_objects = pdf_object.get_objects_by_file_order(in_use_only=True)
+        obj_hash_str = "".join(
+            item["object_type"] + "|" for item in file_ordered_objects or ()
+        )
+        if obj_hash_str:
+            self.event["object_hash"] = md5(obj_hash_str.encode()).hexdigest()
+            self.event["hash_string"] = obj_hash_str

From a190442136009835f2f6f9a2b1a7c1b639a91525 Mon Sep 17 00:00:00 2001
From: keaton-sublime
Date: Fri, 5 Dec 2025 16:30:50 -0500
Subject: [PATCH 2/2] Update scanners.yaml

adding a time out
---
 build/configs/scanners.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/build/configs/scanners.yaml b/build/configs/scanners.yaml
index 9607f519..e72f1677 100644
--- a/build/configs/scanners.yaml
+++ b/build/configs/scanners.yaml
@@ -388,6 +388,8 @@ scanners:
           - 'application/pdf'
           - 'pdf_file'
       priority: 5
+      options:
+        scanner_timeout: 10 # in seconds
   'ScanOnenote':
     - positive:
         flavors: