diff --git a/CHANGELOG.md b/CHANGELOG.md index efa6d33a9c..50c354a84e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,25 @@ +## 0.18.15-dev2 + +### Enhancements +- Speed up function _assign_hash_ids by 34% (codeflash) + +### Features + +### Fixes + +## 0.18.15-dev1 + +### Enhancements +- Speed up function group_broken_paragraphs by 30% (codeflash) + +### Features + +### Fixes + ## 0.18.15-dev0 ### Enhancements +- Optimized the runtime of `ElementHtml._get_children_html` ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9d8d327217..d4a801ee5e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.15-dev0" # pragma: no cover +__version__ = "0.18.15-dev2" # pragma: no cover diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py index 29704d97d0..08b0fe9597 100644 --- a/unstructured/partition/common/metadata.py +++ b/unstructured/partition/common/metadata.py @@ -5,7 +5,6 @@ import copy import datetime as dt import functools -import itertools import os from typing import Any, Callable, Iterator, Sequence @@ -252,15 +251,12 @@ def _assign_hash_ids(elements: list[Element]) -> list[Element]: or more fragments for parallel processing. """ # -- generate sequence number for each element on a page -- - page_numbers = [e.metadata.page_number for e in elements] - page_seq_numbers = [ - seq_on_page - for _, group in itertools.groupby(page_numbers) - for seq_on_page, _ in enumerate(group) - ] - - for element, seq_on_page_counter in zip(elements, page_seq_numbers): + page_seq_counts = {} + for element in elements: + page_number = element.metadata.page_number + seq_on_page_counter = page_seq_counts.get(page_number, 0) element.id_to_hash(seq_on_page_counter) + page_seq_counts[page_number] = seq_on_page_counter + 1 return elements