|
46 | 46 | + list(map(chr, range(0x25A0, 0x2600))) |
47 | 47 | ) |
48 | 48 |
|
# Text extraction flags handed to page.get_text() by analyze_page().
# TEXT_MEDIABOX_CLIP is currently not part of the combination.
FLAGS = (
    pymupdf.TEXT_COLLECT_STYLES
    | pymupdf.TEXT_COLLECT_VECTORS
    | pymupdf.TEXT_PRESERVE_IMAGES
    | pymupdf.TEXT_ACCURATE_BBOXES
)

# Unicode replacement character U+FFFD.
REPLACEMENT_CHARACTER = "\ufffd"
| 59 | + |
| 60 | + |
def is_white(text):
    """Tell whether 'text' consists of white characters only.

    Returns True when every character of 'text' is a member of
    WHITE_CHARS. An empty 'text' therefore also counts as white.
    """
    return set(text) <= WHITE_CHARS
| 64 | + |
| 65 | + |
def analyze_page(page, blocks=None) -> dict:
    """Analyze the page for the OCR decision.

    Args:
        page: the page to analyze. Its 'rect' is used to clip every block
            and span bbox, and its 'get_text' method is called when
            'blocks' is not supplied.
        blocks: the "blocks" list of page.get_text("dict") if already
            available. If None, it is extracted here using FLAGS and an
            infinite clip rectangle. Text spans are read for the keys
            "text", "bbox", "font" and "alpha"; vector blocks (type 3)
            for "stroked" and "isrect".

    Returns:
        A dict with analysis results. The area-related float values are
        computed as fractions of the total covered area. They are 0 if
        the covered area is zero.

        "covered": pymupdf.Rect, page area covered by content
        "img_joins": float, fraction of area of the joined images
        "img_area": float, fraction of sum of image area sizes
        "txt_joins": float, fraction of area of the joined text spans
        "txt_area": float, fraction of sum of text span bbox area sizes
        "vec_joins": float, fraction of area of the joined vector characters
        "vec_area": float, fraction of sum of vector character area sizes
        "chars_total": int, count of visible characters
        "chars_bad": int, count of Replacement Unicode characters
        "ocr_spans": int, count of text spans with 'GlyphLessFont'
    """
    if blocks is None:
        blocks = page.get_text(
            "dict",
            flags=FLAGS,
            clip=pymupdf.INFINITE_RECT(),
        )["blocks"]

    # Evaluate the page rectangle once; it is the clip for all bboxes.
    page_rect = page.rect

    # Width / height limit for a vector block to count as character-like.
    max_char_size = 20

    # Joined rectangles (hulls) per content type, each starting out empty.
    img_rect = pymupdf.EMPTY_RECT()
    txt_rect = pymupdf.EMPTY_RECT()
    vec_rect = pymupdf.EMPTY_RECT()

    # Sums of the individual bbox areas per content type.
    img_area = 0
    txt_area = 0
    vec_area = 0

    chars_total = 0
    chars_bad = 0
    ocr_spans = 0

    for block in blocks:
        bbox = page_rect & block["bbox"]
        area = bbox.width * bbox.height
        if not area:  # nothing of this block lies on the page
            continue

        block_type = block["type"]
        if block_type == 1:  # image block
            img_rect |= bbox
            img_area += area
        elif block_type == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    if is_white(text):
                        continue
                    span_rect = page_rect & span["bbox"]
                    if span_rect.is_empty or span_rect.is_infinite:
                        continue
                    if span["font"] == "GlyphLessFont":
                        # Counted (and included below) regardless of alpha.
                        ocr_spans += 1
                    elif span["alpha"] == 0:
                        continue  # skip invisible text
                    chars_total += len(text.strip())
                    chars_bad += text.count(REPLACEMENT_CHARACTER)
                    txt_rect |= span_rect
                    txt_area += span_rect.width * span_rect.height
        elif (
            block_type == 3  # vector block
            and block["stroked"]  # has been stroked
            and bbox.width <= max_char_size  # width limit for typical characters
            and bbox.height <= max_char_size  # height limit for typical characters
            and not block["isrect"]  # contains curves
        ):
            # potential character-like vector block
            vec_rect |= bbox
            vec_area += area

    # the rectangle on page covered by some content
    covered = img_rect | txt_rect | vec_rect
    cover_area = abs(covered)

    def fraction(value):
        """Return 'value' relative to the covered area, 0 if that is zero."""
        return value / cover_area if cover_area else 0

    return {
        "covered": covered,
        "img_joins": fraction(abs(img_rect)),
        "img_area": fraction(img_area),
        "txt_joins": fraction(abs(txt_rect)),
        "txt_area": fraction(txt_area),
        "vec_area": fraction(vec_area),
        "vec_joins": fraction(abs(vec_rect)),
        "chars_total": chars_total,
        "chars_bad": chars_bad,
        "ocr_spans": ocr_spans,
    }
| 154 | + |
49 | 155 |
|
50 | 156 | def table_cleaner(page, blocks, tbbox): |
51 | 157 | """Clean the table bbox 'tbbox'. |
|
0 commit comments