Skip to content

Commit 924c626

Browse files
committed
also scan all verticals for local PDF links
1 parent 158f896 commit 924c626

File tree

1 file changed

+34
-0
lines changed

1 file changed

+34
-0
lines changed

ocw2edx/ocw2xbundle.py

+34
Original file line numberDiff line numberDiff line change
@@ -553,13 +553,46 @@ def process_html(self, title, display_name, ocw_xml, seq, handle_broken_xml=Fals
553553
vert = etree.SubElement(seq,'vertical')
554554
vert.set('display_name',dn)
555555
self.add_contents_to_vert(a, vert)
556+
self.process_edx_xml_for_local_pdf_links(vert, seq)
556557
else:
557558
self.do_href(p)
558559
intro.append(p)
559560
if len(intro)==0:
560561
intro.getparent().remove(intro) # remove intro if empty
561562
else:
562563
self.process_html_intro_for_table_of_pdf_files(intro, seq)
564+
self.process_edx_xml_for_local_pdf_links(intro, seq)
565+
566+
def process_edx_xml_for_local_pdf_links(self, xml, seq):
    '''
    Process an edX XML block, and see if any links are local PDF files
    which haven't yet been processed.  For each such link, generate
    a PDF vertical with an embedded PDF viewer.

    xml = element whose descendant <a> elements are scanned
    seq = element handed on to self.add_pdf_vertical for every newly found PDF link

    A link is treated as a local PDF when its href ends with ".pdf"
    (case-insensitive) and does not start with "http".  Links which already
    carry pdf_processed="1" are counted but not added a second time; every
    link added here is marked with pdf_processed="1".
    '''
    n_found = 0
    n_added = 0
    n_links = 0
    dn = xml.get('display_name')
    for aelem in xml.findall('.//a'):
        n_links += 1
        href = aelem.get('href')
        print(" link: %s (%s)" % (aelem.text, href))
        if not href:
            continue
        if not href.lower().endswith(".pdf"):
            continue
        if href.startswith("http"):
            continue        # remote link, not a local file
        n_found += 1
        # XML attribute values are strings, so the marker must be compared
        # with "1" (the value written by aelem.set below), never the int 1
        if aelem.get('pdf_processed') == "1":
            continue
        title = aelem.text or ("File %s" % os.path.basename(href))
        self.add_pdf_vertical(title, href, aelem, seq)
        aelem.set("pdf_processed", "1")		# so it isn't done again
        n_added += 1
    # summary is always printed, even when no PDF links were found
    print(" [%s] Found %s links, %s are local PDF, %d new ones added as vertical pages" % (dn, n_links, n_found, n_added))
595+
563596

564597
def process_html_intro_for_table_of_pdf_files(self, intro_xml, seq):
565598
'''
@@ -601,6 +634,7 @@ def process_html_intro_for_table_of_pdf_files(self, intro_xml, seq):
601634
href = aelem.get('href')
602635
if href and href.lower().endswith("pdf"):
603636
self.add_pdf_vertical(rowtext, href, aelem, seq)
637+
aelem.set("pdf_processed", "1") # so it isn't done again
604638
nadded += 1
605639
summary = table.get('summary')
606640
print " Found table '%s' of PDFs, with %d rows: added %d pdf vertical pages" % (summary, nrows, nadded)

0 commit comments

Comments
 (0)