Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add lxml to requirements.txt #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
from bs4 import BeautifulSoup
from pprint import pprint


def buildContentObject(titleText, childText, editMarker='[editar]'):
    """Build one table-of-contents entry.

    Args:
        titleText: Heading text. Every occurrence of ``editMarker`` is
            removed from it; nothing else is altered.
        childText: Value stored unchanged under the ``'anchor'`` key.
        editMarker: Substring to strip from ``titleText``. Defaults to
            ``'[editar]'``, the value that used to be hard-coded, so
            existing callers behave exactly as before.

    Returns:
        A dict with the keys ``'title'`` and ``'anchor'``.
    """
    return {
        'title': titleText.replace(editMarker, ''),
        'anchor': childText,
    }


def spanWithId(tag):
    """Return whether ``tag`` is a ``<span>`` element carrying an ``id`` attribute.

    Meant to be passed as a filter callable to an element search: spans that
    have an id are the ones used to add entries to the table of contents.
    """
    return tag.name == 'span' and tag.has_attr('id')


# Primero debemos definir la función para luego llamarla
def extract_toc(filepath, body_class='#mw-content-text h3'):
"""
Expand All @@ -12,19 +26,30 @@ def extract_toc(filepath, body_class='#mw-content-text h3'):
except FileNotFoundError:
print("El fichero no existe")

soup = BeautifulSoup(file.read(), "lxml")
fileRead = file.read()

soup = BeautifulSoup(fileRead, features="lxml")

# TOC es una lista
toc = []
# Los titulos del contenido estan en este selector
titles = soup.select(body_class)
for i in range(0, len(titles)):

#breakpoint()

for title in titles:
# Leemos los datos hijos de titles (el HTML contiene el ID del anchor)
for child in titles[i].children:
# Si el hijo tiene un id entonces añadimos a la lista toc
# TODO: El replace de [edit] debería ser un parser posterior
toc.append({'title': titles[i].getText().replace('[editar]',''), 'anchor': child.get('id')}) if child.get('id') is not None else None
# BeautifulSoup nos permite buscar dentro de una selección
spans = title.findAll(spanWithId)

for span in spans:

childText = span.get('id')
titleText = title.getText()
contentObject = buildContentObject(titleText, childText=childText)
toc.append(contentObject)

return toc


# Script-level call: extract the table of contents from
# sandbox/content/paella.html and pretty-print the resulting list.
pprint(extract_toc("sandbox/content/paella.html"))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
beautifulsoup4==4.6.3
lxml==4.2.5