From 4822441f826201af6327a91311b5a5519b2a3a24 Mon Sep 17 00:00:00 2001 From: Franco Cedillo <franco.cedillo@gmail.com> Date: Wed, 31 Oct 2018 11:16:07 -0500 Subject: [PATCH 1/2] add lxml to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fa225d3..fa87acc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ beautifulsoup4==4.6.3 +lxml==4.2.5 \ No newline at end of file From 84136a0ddeaebdd1627f762c744735d096bcaa7d Mon Sep 17 00:00:00 2001 From: Franco Cedillo <franco.cedillo@gmail.com> Date: Thu, 1 Nov 2018 00:44:49 -0500 Subject: [PATCH 2/2] refactor --- app.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/app.py b/app.py index 6b6c8b7..e8fd785 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,20 @@ from bs4 import BeautifulSoup from pprint import pprint + +def buildContentObject(titleText, childText): + contentObject = { + 'title': titleText.replace('[editar]',''), + 'anchor': childText + } + return contentObject + + +def spanWithId(tag): + # Si el tag tiene un id entonces nos servirá para añadirlo a la lista toc + return tag.name == 'span' and tag.has_attr('id') + + # Primero debemos definir la función para luego llamarla def extract_toc(filepath, body_class='#mw-content-text h3'): """ @@ -12,19 +26,30 @@ def extract_toc(filepath, body_class='#mw-content-text h3'): except FileNotFoundError: print("El fichero no existe") - soup = BeautifulSoup(file.read(), "lxml") + fileRead = file.read() + + soup = BeautifulSoup(fileRead, features="lxml") # TOC es una lista toc = [] # Los titulos del contenido estan en este selector titles = soup.select(body_class) - for i in range(0, len(titles)): + + #breakpoint() + + for title in titles: # Leemos los datos hijos de titles (el HTML contiene el ID del anchor) - for child in titles[i].children: - # Si el hijo tiene un id entonces añadimos a la lista toc - # TODO: El replace de [edit] debería ser un parser posterior - toc.append({'title': titles[i].getText().replace('[editar]',''), 'anchor': child.get('id')}) if child.get('id') is not None else None + # BeautifulSoup nos permite buscar dentro de una selección + spans = title.findAll(spanWithId) + + for span in spans: + + childText = span.get('id') + titleText = title.getText() + contentObject = buildContentObject(titleText, childText=childText) + toc.append(contentObject) return toc + pprint(extract_toc("sandbox/content/paella.html"))