@@ -144,11 +144,12 @@ def __init__(
144144 [f for f in fontsizes .keys () if f > self .body_limit ],
145145 reverse = True ,
146146 )[:max_levels ]
147- self .body_limit = min (self .body_limit , sizes [- 1 ] - 1 if sizes else body_limit )
148147
149148 # make the header tag dictionary
150149 for i , size in enumerate (sizes , start = 1 ):
151150 self .header_id [size ] = "#" * i + " "
151+ if self .header_id .keys ():
152+ self .body_limit = min (self .header_id .keys ()) - 1
152153
153154 def get_header_id (self , span : dict , page = None ) -> str :
154155 """Return appropriate markdown header prefix.
@@ -163,6 +164,54 @@ def get_header_id(self, span: dict, page=None) -> str:
163164 return hdr_id
164165
165166
class TocHeaders:
    """Compute data for identifying header text.

    This is an alternative to IdentifyHeaders. Instead of running through the
    full document to identify font sizes, it uses the document's Table Of
    Contents (TOC) to identify headers on pages.
    Like IdentifyHeaders, this is no guarantee to find headers, but it is a
    good chance for appropriately built documents. In such cases, this
    method can be very much faster and more accurate, because we can use the
    hierarchy level of TOC items directly to identify the header level.
    Examples where this approach works very well are the Adobe PDF documents.
    """

    def __init__(self, doc: "pymupdf.Document | str"):
        """Read and store the TOC of the document.

        Args:
            doc: an already opened pymupdf.Document, or any value accepted
                by pymupdf.open() (for example a file name).
        """
        if isinstance(doc, pymupdf.Document):
            mydoc = doc
        else:
            mydoc = pymupdf.open(doc)

        try:
            # Always query the opened document object: "doc" itself may be
            # just a file name.
            self.TOC = mydoc.get_toc()
        finally:
            if mydoc is not doc:
                # if opened here, close it now
                mydoc.close()

    def get_header_id(self, span: dict, page=None) -> str:
        """Return appropriate markdown header prefix.

        Given a text span from a "dict"/"rawdict" extraction, determine the
        markdown header prefix string of 0 to n concatenated '#' characters.

        Args:
            span: text span dictionary; only its "text" item is used.
            page: page containing the span; only its "number" attribute is
                read. Without a page no header can be determined.

        Returns:
            "#" repeated once per TOC hierarchy level, followed by a space,
            if the span text matches a TOC title on this page; else "".
        """
        if page is None:
            return ""
        # check if this page has TOC entries with an actual title
        # (the last item of a TOC entry is compared to page.number + 1)
        page_no = page.number + 1
        my_toc = [t for t in self.TOC if t[1] and t[-1] == page_no]
        if not my_toc:
            return ""
        # check if the span matches a TOC entry
        text = span["text"].strip()
        if not text:
            # an empty string is a prefix of every title: never a header
            return ""
        for t in my_toc:
            title = t[1].strip()  # title of TOC entry
            if not title:
                # whitespace-only title would match any text: skip it
                continue
            lvl = t[0]  # level of TOC entry
            # Be forgiving: accept if either string is a prefix of the other.
            if text.startswith(title) or title.startswith(text):
                # found a match: return the header tag
                return "#" * lvl + " "
        return ""
213+
214+
166215# store relevant parameters here
167216@dataclass
168217class Parameters :
@@ -216,19 +265,33 @@ def is_significant(box, paths):
216265 else :
217266 d = box .height * 0.025
218267 nbox = box + (d , d , - d , - d ) # nbox covers 90% of box interior
219- # paths contained in box:
268+ # paths contained in, but not equal to box:
220269 my_paths = [p for p in paths if p ["rect" ] in box and p ["rect" ] != box ]
221270 for p in my_paths :
222271 rect = p ["rect" ]
223- if not (rect & nbox ).is_empty : # intersects interior: significant!
272+ if (
273+ not (rect & nbox ).is_empty and not p ["rect" ].is_empty
274+ ): # intersects interior: significant!
224275 return True
225276 # Remaining case: a horizontal or vertical line
226277 # horizontal line:
227- if rect .y0 == rect .y1 and rect .x0 < nbox .x1 and rect .x1 > nbox .x0 :
228- return True
278+ if (
279+ 1
280+ and rect .y0 == rect .y1
281+ and nbox .y0 <= rect .y0 <= nbox .y1
282+ and rect .x0 < nbox .x1
283+ and rect .x1 > nbox .x0
284+ ):
285+ pass # return True
229286 # vertical line
230- if rect .x0 == rect .x1 and rect .y0 < nbox .y1 and rect .y1 > nbox .y0 :
231- return True
287+ if (
288+ 1
289+ and rect .x0 == rect .x1
290+ and nbox .x0 <= rect .x0 <= nbox .x1
291+ and rect .y0 < nbox .y1
292+ and rect .y1 > nbox .y0
293+ ):
294+ pass # return True
232295 return False
233296
234297
@@ -654,8 +717,10 @@ def is_in_rects(rect, rect_list):
def intersects_rects(rect, rect_list):
    """Return the 1-based index of the first list member overlapping rect.

    "rect" is first extended by (-1, -1, 1, 1). A member of "rect_list"
    counts as a hit if abs() of its intersection with the extended rect
    exceeds half of abs() of the extended rect. Returns 0 if no member
    qualifies.
    """
    grown = rect + (-1, -1, 1, 1)  # extend rect somewhat on every side
    threshold = abs(grown) * 0.5
    return next(
        (
            idx
            for idx, candidate in enumerate(rect_list, start=1)
            if abs(grown & candidate) > threshold
        ),
        0,
    )
661726
@@ -764,31 +829,32 @@ def get_bg_color(page):
764829 page. If they are unicolor and of the same color, we assume this to
765830 be the background color.
766831 """
767- pix = page .get_pixmap (clip = (0 , 0 , 10 , 10 ))
768- if not pix .is_unicolor :
832+ pix = page .get_pixmap (
833+ clip = (page .rect .x0 , page .rect .y0 , page .rect .x0 + 10 , page .rect .y0 + 10 )
834+ )
835+ if not pix .samples or not pix .is_unicolor :
769836 return None
770837 pixel_ul = pix .pixel (0 , 0 ) # upper left color
771- pix = page .get_pixmap (clip = (page .rect .width - 10 , 0 , page .rect .width , 10 ))
772- if not pix .is_unicolor :
838+ pix = page .get_pixmap (
839+ clip = (page .rect .x1 - 10 , page .rect .y0 , page .rect .x1 , page .rect .y0 + 10 )
840+ )
841+ if not pix .samples or not pix .is_unicolor :
773842 return None
774843 pixel_ur = pix .pixel (0 , 0 ) # upper right color
775844 if not pixel_ul == pixel_ur :
776845 return None
777- pix = page .get_pixmap (clip = (0 , page .rect .height - 10 , 10 , page .rect .height ))
778- if not pix .is_unicolor :
846+ pix = page .get_pixmap (
847+ clip = (page .rect .x0 , page .rect .y1 - 10 , page .rect .x0 + 10 , page .rect .y1 )
848+ )
849+ if not pix .samples or not pix .is_unicolor :
779850 return None
780851 pixel_ll = pix .pixel (0 , 0 ) # lower left color
781852 if not pixel_ul == pixel_ll :
782853 return None
783854 pix = page .get_pixmap (
784- clip = (
785- page .rect .width - 10 ,
786- page .rect .height - 10 ,
787- page .rect .width ,
788- page .rect .height ,
789- )
855+ clip = (page .rect .x1 - 10 , page .rect .y1 - 10 , page .rect .x1 , page .rect .y1 )
790856 )
791- if not pix .is_unicolor :
857+ if not pix .samples or not pix . is_unicolor :
792858 return None
793859 pixel_lr = pix .pixel (0 , 0 ) # lower right color
794860 if not pixel_ul == pixel_lr :
@@ -881,7 +947,7 @@ def get_page_output(
881947 for i in img_info
882948 if i ["bbox" ].width >= image_size_limit * parms .clip .width
883949 and i ["bbox" ].height >= image_size_limit * parms .clip .height
884- and i ["bbox" ] in parms .clip
950+ and i ["bbox" ]. intersects ( parms .clip )
885951 and i ["bbox" ].width > 3
886952 and i ["bbox" ].height > 3
887953 ]
@@ -904,23 +970,23 @@ def get_page_output(
904970
905971 # Locate all tables on page
906972 parms .written_tables = [] # stores already written tables
973+ omitted_table_rects = []
907974 if table_strategy is None :
908975 parms .tabs = []
909976 else :
910977 parms .tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
911- del_this = []
912- for i , t in enumerate (parms .tabs ):
978+ # remove tables with too few rows or columns
979+ for i in range (len (parms .tabs .tables ) - 1 , - 1 , - 1 ):
980+ t = parms .tabs .tables [i ]
913981 if t .row_count < 2 or t .col_count < 2 :
914- # ignore tables with too few rows or columns
915- del_this .append (i )
916- for i in sorted (del_this , reverse = True ):
917- del parms .tabs .tables [i ]
982+ omitted_table_rects .append (pymupdf .Rect (t .bbox ))
983+ del parms .tabs .tables [i ]
918984 parms .tabs .tables .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
919985
920986 # Make a list of table boundary boxes.
921987 # Must include the header bbox (which may exist outside tab.bbox)
922988 tab_rects = {}
923- for i , t in enumerate (parms .tabs ):
989+ for i , t in enumerate (parms .tabs . tables ):
924990 tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
925991 tab_dict = {
926992 "bbox" : tuple (tab_rects [i ]),
@@ -944,7 +1010,9 @@ def get_page_output(
9441010 and p ["rect" ].height < parms .clip .height
9451011 and (p ["rect" ].width > 3 or p ["rect" ].height > 3 )
9461012 and not (p ["fill" ] == parms .bg_color and p ["fill" ] != None )
947- and not intersects_rects (p ["rect" ], parms .tab_rects0 )
1013+ and not intersects_rects (
1014+ p ["rect" ], parms .tab_rects0 + omitted_table_rects
1015+ )
9481016 and not intersects_rects (p ["rect" ], parms .annot_rects )
9491017 ]
9501018 else :
@@ -977,7 +1045,6 @@ def get_page_output(
9771045 parms .vg_clusters0 = refine_boxes (vg_clusters0 )
9781046
9791047 parms .vg_clusters = dict ((i , r ) for i , r in enumerate (parms .vg_clusters0 ))
980-
9811048 # identify text bboxes on page, avoiding tables, images and graphics
9821049 text_rects = column_boxes (
9831050 parms .page ,
0 commit comments