Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
## 0.18.15-dev0
## 0.18.15-dev1

### Enhancements
- Optimized the runtime of `ElementHtml._get_children_html`
- Speed up function `group_broken_paragraphs` by 30% (codeflash)

### Features

Expand All @@ -10,7 +12,6 @@

### Enhancements
- Speed up function `sentence_count` by 59% (codeflash)

- Speed up function `check_for_nltk_package` by 111% (codeflash)
- Speed up function `under_non_alpha_ratio` by 76% (codeflash)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.15-dev0" # pragma: no cover
__version__ = "0.18.15-dev1" # pragma: no cover
32 changes: 21 additions & 11 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,18 @@ def group_bullet_paragraph(paragraph: str) -> list:
'''○ The big red fox is walking down the lane.
○ At the end of the land the fox met a bear.'''
"""
clean_paragraphs = []
paragraph_pattern_re = re.compile(PARAGRAPH_PATTERN)

# pytesseract converts some bullet points to standalone "e" characters.
# Substitute "e" with bullets since they are later used in partition_text
# to determine list element type.
paragraph = (re.sub(E_BULLET_PATTERN, "·", paragraph)).strip()
paragraph = E_BULLET_PATTERN.sub("·", paragraph).strip()

bullet_paras = re.split(UNICODE_BULLETS_RE_0W, paragraph)
bullet_paras = UNICODE_BULLETS_RE_0W.split(paragraph)
clean_paragraphs = []
for bullet in bullet_paras:
if bullet:
clean_paragraphs.append(re.sub(PARAGRAPH_PATTERN, " ", bullet))
clean_paragraphs.append(paragraph_pattern_re.sub(" ", bullet))
return clean_paragraphs


Expand All @@ -151,10 +153,21 @@ def group_broken_paragraphs(
'''The big red fox is walking down the lane.
At the end of the land the fox met a bear.'''
"""
paragraph_pattern_re = (
PARAGRAPH_PATTERN
if isinstance(PARAGRAPH_PATTERN, re.Pattern)
else re.compile(PARAGRAPH_PATTERN)
)

paragraphs = paragraph_split.split(text)
clean_paragraphs = []
for paragraph in paragraphs:
if not paragraph.strip():
stripped_par = paragraph.strip()
if not stripped_par:
continue

if UNICODE_BULLETS_RE.match(stripped_par) or E_BULLET_PATTERN.match(stripped_par):
clean_paragraphs.extend(group_bullet_paragraph(paragraph))
continue
# NOTE(robinson) - This block is to account for lines like the following that shouldn't be
# grouped together, but aren't separated by a double line break.
Expand All @@ -163,13 +176,10 @@ def group_broken_paragraphs(
# http://www.apache.org/licenses/
para_split = line_split.split(paragraph)
all_lines_short = all(len(line.strip().split(" ")) < 5 for line in para_split)
# pytesseract converts some bullet points to standalone "e" characters
if UNICODE_BULLETS_RE.match(paragraph.strip()) or E_BULLET_PATTERN.match(paragraph.strip()):
clean_paragraphs.extend(group_bullet_paragraph(paragraph))
elif all_lines_short:
clean_paragraphs.extend([line for line in para_split if line.strip()])
if all_lines_short:
clean_paragraphs.extend(line for line in para_split if line.strip())
else:
clean_paragraphs.append(re.sub(PARAGRAPH_PATTERN, " ", paragraph))
clean_paragraphs.append(paragraph_pattern_re.sub(" ", paragraph))

return "\n\n".join(clean_paragraphs)

Expand Down