Skip to content

Commit 9ba30cc

Browse files
authored
Merge pull request #338 from GeospatialPython/Writer_speed_tests
Add Writer speed tests
2 parents b2e513b + 94ef7dd commit 9ba30cc

File tree

2 files changed

+75
-39
lines changed

2 files changed

+75
-39
lines changed

run_benchmarks.py

Lines changed: 62 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22

33
from __future__ import annotations
44

5+
import collections
56
import functools
67
import os
78
import timeit
89
from collections.abc import Callable
910
from pathlib import Path
11+
from tempfile import TemporaryFile as TempF
1012
from typing import Union
1113

12-
import shapefile as shp
14+
import shapefile
1315

1416
# For shapefiles from https://github.com/JamesParrott/PyShp_test_shapefile
1517
DEFAULT_PYSHP_TEST_REPO = (
@@ -31,26 +33,41 @@ def benchmark(
3133
name: str,
3234
run_count: int,
3335
func: Callable,
34-
col_width: tuple,
36+
col_widths: tuple,
3537
compare_to: float | None = None,
3638
) -> float:
3739
placeholder = "Running..."
38-
print(f"{name:>{col_width[0]}} | {placeholder}", end="", flush=True)
40+
print(f"{name:>{col_widths[0]}} | {placeholder}", end="", flush=True)
3941
time_taken = timeit.timeit(func, number=run_count)
4042
print("\b" * len(placeholder), end="")
4143
time_suffix = " s"
42-
print(f"{time_taken:{col_width[1]-len(time_suffix)}.3g}{time_suffix}", end="")
44+
print(f"{time_taken:{col_widths[1]-len(time_suffix)}.3g}{time_suffix}", end="")
4345
print()
4446
return time_taken
4547

4648

49+
fields = {}
50+
shapeRecords = collections.defaultdict(list)
51+
52+
4753
def open_shapefile_with_PyShp(target: Union[str, os.PathLike]):
48-
with shp.Reader(target) as r:
54+
with shapefile.Reader(target) as r:
55+
fields[target] = r.fields
4956
for shapeRecord in r.iterShapeRecords():
50-
pass
57+
shapeRecords[target].append(shapeRecord)
58+
59+
60+
def write_shapefile_with_PyShp(target: Union[str, os.PathLike]):
61+
with TempF("wb") as shp, TempF("wb") as dbf, TempF("wb") as shx:
62+
with shapefile.Writer(shp=shp, dbf=dbf, shx=shx) as w: # type: ignore [arg-type]
63+
for field_info_tuple in fields[target]:
64+
w.field(*field_info_tuple)
65+
for shapeRecord in shapeRecords[target]:
66+
w.shape(shapeRecord.shape)
67+
w.record(*shapeRecord.record)
5168

5269

53-
READER_TESTS = {
70+
SHAPEFILES = {
5471
"Blockgroups": blockgroups_file,
5572
"Edit": edit_file,
5673
"Merge": merge_file,
@@ -60,24 +77,47 @@ def open_shapefile_with_PyShp(target: Union[str, os.PathLike]):
6077
}
6178

6279

63-
def run(run_count: int) -> None:
64-
col_width = (21, 10)
80+
# Load files to avoid one-off delays that only affect first disk seek
81+
for file_path in SHAPEFILES.values():
82+
file_path.read_bytes()
83+
84+
reader_benchmarks = [
85+
functools.partial(
86+
benchmark,
87+
name=f"Read {test_name}",
88+
func=functools.partial(open_shapefile_with_PyShp, target=target),
89+
)
90+
for test_name, target in SHAPEFILES.items()
91+
]
92+
93+
# Require fields and shapeRecords to first have been populated
94+
# from data from previously running the reader_benchmarks
95+
writer_benchmarks = [
96+
functools.partial(
97+
benchmark,
98+
name=f"Write {test_name}",
99+
func=functools.partial(write_shapefile_with_PyShp, target=target),
100+
)
101+
for test_name, target in SHAPEFILES.items()
102+
]
103+
104+
105+
def run(run_count: int, benchmarks: list[Callable[[], None]]) -> None:
106+
col_widths = (22, 10)
65107
col_head = ("parser", "exec time", "performance (more is better)")
66-
# Load files to avoid one off delays that only affect first disk seek
67-
for file_path in READER_TESTS.values():
68-
file_path.read_bytes()
69108
print(f"Running benchmarks {run_count} times:")
70-
print("-" * col_width[0] + "---" + "-" * col_width[1])
71-
print(f"{col_head[0]:>{col_width[0]}} | {col_head[1]:>{col_width[1]}}")
72-
print("-" * col_width[0] + "-+-" + "-" * col_width[1])
73-
for test_name, target in READER_TESTS.items():
74-
benchmark(
75-
f"Read {test_name}",
76-
run_count,
77-
functools.partial(open_shapefile_with_PyShp, target=target),
78-
col_width,
109+
print("-" * col_widths[0] + "---" + "-" * col_widths[1])
110+
print(f"{col_head[0]:>{col_widths[0]}} | {col_head[1]:>{col_widths[1]}}")
111+
print("-" * col_widths[0] + "-+-" + "-" * col_widths[1])
112+
for benchmark in benchmarks:
113+
benchmark( # type: ignore [call-arg]
114+
run_count=run_count,
115+
col_widths=col_widths,
79116
)
80117

81118

82119
if __name__ == "__main__":
83-
run(1)
120+
print("Reader tests:")
121+
run(1, reader_benchmarks) # type: ignore [arg-type]
122+
print("\n\nWriter tests:")
123+
run(1, writer_benchmarks) # type: ignore [arg-type]

test_shapefile.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,7 @@ def test_record_oid():
987987
assert shaperec.record.oid == i
988988

989989

990+
@pytest.mark.slow
990991
def test_iterRecords_start_stop():
991992
"""
992993
Assert that Reader.iterRecords(start, stop)
@@ -999,36 +1000,31 @@ def test_iterRecords_start_stop():
9991000

10001001
# Arbitrary selection of record indices
10011002
# (there are 663 records in blockgroups.dbf).
1002-
for i in [
1003+
indices = [
10031004
0,
10041005
1,
10051006
2,
1006-
3,
10071007
5,
10081008
11,
1009-
17,
1010-
33,
1011-
51,
1012-
103,
1013-
170,
1014-
234,
1015-
435,
1016-
543,
1009+
41,
1010+
310,
1011+
513,
10171012
N - 3,
1018-
N - 2,
10191013
N - 1,
1020-
]:
1021-
for record in sf.iterRecords(start=i):
1014+
]
1015+
for i, index in enumerate(indices):
1016+
for record in sf.iterRecords(start=index):
10221017
assert record == sf.record(record.oid)
10231018

1024-
for record in sf.iterRecords(stop=i):
1019+
for record in sf.iterRecords(stop=index):
10251020
assert record == sf.record(record.oid)
10261021

1027-
for stop in range(i, len(sf)):
1022+
for j in range(i + 1, len(indices)):
1023+
stop = indices[j]
10281024
# test negative indexing from end, as well as
10291025
# positive values of stop, and its default
1030-
for stop_arg in (stop, stop - len(sf)):
1031-
for record in sf.iterRecords(start=i, stop=stop_arg):
1026+
for stop_arg in (stop, stop - N):
1027+
for record in sf.iterRecords(start=index, stop=stop_arg):
10321028
assert record == sf.record(record.oid)
10331029

10341030

0 commit comments

Comments
 (0)