Skip to content

Commit 5cc7d54

Browse files
committed
feat(cte): Add parser for "Datos del Contribuyente"
- Implemented `parse_taxpayer_data` to parse taxpayer data from CTE HTML. - Added `TaxpayerData` and `LastFiledDocument` data models. - Created tests to validate parser functionality with sample HTML input. Ref: https://app.shortcut.com/cordada/story/16534/
1 parent c5e0015 commit 5cc7d54

File tree

4 files changed

+1076
-1
lines changed

4 files changed

+1076
-1
lines changed

src/cl_sii/cte/data_models.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

33
from collections.abc import Sequence
4+
from datetime import date
5+
from typing import Optional
46

57
import pydantic
68

@@ -38,3 +40,49 @@ class LegalRepresentative:
3840
"""
3941
Fecha de incorporación.
4042
"""
43+
44+
45+
@pydantic.dataclasses.dataclass(
    config=pydantic.ConfigDict(
        arbitrary_types_allowed=True,
        extra='forbid',
    ),
    frozen=True,
)
class TaxpayerData:
    """
    Content of the "Datos del Contribuyente" (taxpayer data) section of a CTE.
    """

    start_of_activities_date: Optional[date]
    """
    Start-of-activities date ("Fecha de inicio de actividades").
    """
    economic_activities: str
    """
    Economic activities ("Actividades Económicas").
    """
    tax_category: str
    """
    Tax category ("Categoría Tributaria").
    """
    # NOTE(review): unlike its sibling fields this one is not snake_case. It is
    # kept as-is because renaming it would change the constructor keyword.
    Address: str
    """
    Address ("Domicilio").
    """
    branches: Sequence[str]
    """
    Branches ("Sucursales").
    """
    last_filed_documents: Sequence[LastFiledDocument]
    """
    Most recently stamped documents ("Últimos documentos timbrados").
    """
    tax_observations: Optional[str] = None
    """
    Tax observations ("Observaciones tributarias").
    """
81+
82+
83+
@pydantic.dataclasses.dataclass(frozen=True)
class LastFiledDocument:
    """
    One entry of the last filed documents listed for a taxpayer.
    """

    name: str
    """
    Name of the document.
    """
    date: date
    """
    Date associated with the document.
    """

src/cl_sii/cte/parsers.py

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
22

3+
from datetime import datetime
4+
35
from bs4 import BeautifulSoup
46

5-
from .data_models import LegalRepresentative, TaxpayerProvidedInfo
7+
from .data_models import LastFiledDocument, LegalRepresentative, TaxpayerData, TaxpayerProvidedInfo
68

79

810
def parse_taxpayer_provided_info(html_content: str) -> TaxpayerProvidedInfo:
@@ -89,3 +91,88 @@ def parse_taxpayer_provided_info(html_content: str) -> TaxpayerProvidedInfo:
8991
company_formation=company_formation,
9092
participation_in_existing_companies=participation_in_companies,
9193
)
94+
95+
96+
def parse_taxpayer_data(html_content: str) -> TaxpayerData:
    """
    Parse the CTE HTML content to extract the content of the section:
    "Datos del Contribuyente"

    Cells that are missing from the table fall back to a neutral value
    (``None``, ``""`` or an empty list, depending on the field).

    Args:
        html_content: HTML string containing the taxpayer information table

    Returns:
        TaxpayerData instance with the parsed data

    Raises:
        ValueError: If the table is not found, if a non-empty date cell is not
            formatted as ``DD-MM-YYYY``, or if the names and dates of the last
            filed documents cannot be paired one-to-one.
    """
    date_format = "%d-%m-%Y"

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', id='tbl_dbcontribuyente')
    if not table:
        raise ValueError("Could not find 'Datos del Contribuyente' table in HTML")

    # Start-of-activities date. The field is optional, so a cell that is present
    # but empty yields `None` instead of failing inside `strptime`.
    start_of_activities_date = None
    fecha_inicio_elem = table.find(id='td_fecha_inicio')  # type: ignore[attr-defined]
    if fecha_inicio_elem:
        fecha_inicio_text = fecha_inicio_elem.get_text(strip=True)
        if fecha_inicio_text:
            start_of_activities_date = datetime.strptime(fecha_inicio_text, date_format).date()

    # Economic activities: one activity per line.
    actividades_elem = table.find(id='td_actividades')  # type: ignore[attr-defined]
    if actividades_elem:
        economic_activities = actividades_elem.get_text(separator="\n", strip=True)
    else:
        economic_activities = ""

    # Tax category
    categoria_elem = table.find(id='td_categoria')  # type: ignore[attr-defined]
    if categoria_elem:
        tax_category = categoria_elem.get_text(strip=True)
    else:
        tax_category = ""

    # Address
    domicilio_elem = table.find(id='td_domicilio')  # type: ignore[attr-defined]
    if domicilio_elem:
        address = domicilio_elem.get_text(strip=True)
    else:
        address = ""

    # Branches: this row has no id, so it is located through its label cell and
    # the values are read from the cell that follows it.
    branches = []
    sucursales_row = table.find(  # type: ignore[attr-defined]
        'td',
        string=lambda s: s and 'Sucursales:' in s,
    )
    if sucursales_row:
        sucursales_td = sucursales_row.find_next_sibling('td')
        if sucursales_td:
            branches_text = sucursales_td.get_text(separator="\n", strip=True)
            branches = [b for b in branches_text.split("\n") if b]

    # Last filed (stamped) documents: names and dates live in two parallel cells
    # and are matched by position.
    last_filed_documents = []
    tim_nombre_elem = table.find(id='td_tim_nombre')  # type: ignore[attr-defined]
    tim_fecha_elem = table.find(id='td_tim_fecha')  # type: ignore[attr-defined]
    if tim_nombre_elem and tim_fecha_elem:
        names_text = tim_nombre_elem.get_text(separator="\n", strip=True)
        dates_text = tim_fecha_elem.get_text(separator="\n", strip=True)
        names = [name for name in names_text.split("\n") if name]
        dates = [date_str for date_str in dates_text.split("\n") if date_str]
        # Pairing by position is only meaningful when both cells hold the same
        # number of entries; fail loudly rather than silently dropping or
        # mispairing documents.
        if len(names) != len(dates):
            raise ValueError(
                "Could not pair names and dates of last filed documents: "
                f"found {len(names)} names and {len(dates)} dates"
            )
        last_filed_documents = [
            LastFiledDocument(
                name=name,
                date=datetime.strptime(date_str, date_format).date(),
            )
            for name, date_str in zip(names, dates)
        ]

    # Tax observations: only the text inside the `span.textof` element is used.
    tax_observations = None
    observaciones_elem = table.find(id='td_observaciones')  # type: ignore[attr-defined]
    if observaciones_elem:
        obs_span = observaciones_elem.find('span', class_='textof')
        if obs_span:
            tax_observations = obs_span.get_text(strip=True)

    return TaxpayerData(
        start_of_activities_date=start_of_activities_date,
        economic_activities=economic_activities,
        tax_category=tax_category,
        Address=address,
        branches=branches,
        last_filed_documents=last_filed_documents,
        tax_observations=tax_observations,
    )

src/tests/test_cte_parsers.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,46 @@ def test_parse_taxpayer_provided_info(self) -> None:
4949
assert_raises_cm.exception.args,
5050
("Could not find taxpayer information table in HTML",),
5151
)
52+
53+
def test_parse_taxpayer_data(self) -> None:
    """
    Check `parse_taxpayer_data` against a sample CTE file and against empty input.
    """
    from datetime import date

    sample_html = read_test_file_str_utf8('test_data/sii-cte/cte_empty_f29.html')

    with self.subTest("Parsing ok"):
        # (name, date) of each document expected in the sample file, in order.
        expected_documents = [
            data_models.LastFiledDocument(name=doc_name, date=doc_date)
            for doc_name, doc_date in (
                ("FACTURA ELECTRONICA", date(2025, 7, 24)),
                ("FACTURA NO AFECTA O EXENTA ELECTRONICA", date(2025, 7, 17)),
                ("GUIA DESPACHO ELECTRONICA", date(2025, 5, 14)),
                ("NOTA CREDITO ELECTRONICA", date(2025, 7, 18)),
            )
        ]
        expected_activities = (
            "SERVICIOS DE ASESORIA Y CONSULTORIA EN MATERIA DE ADMINISTRACION DE EMPRESAS "
            "Y OTROS SERVICIOS DE ASESORIA ADMINISTRATIVA Y DE NEGOCIOS N.C.P.\n"
            "ACTIVIDADES DE OTRAS ORGANIZACIONES EMPRESARIALES N.C.P.\n"
            "OTRAS ACTIVIDADES DE SERVICIOS PERSONALES N.C.P."
        )
        expected = data_models.TaxpayerData(
            start_of_activities_date=date(2023, 11, 15),
            economic_activities=expected_activities,
            tax_category="Primera categoría",
            Address="AV REAL, LAS CONDES",
            branches=[],
            last_filed_documents=expected_documents,
            tax_observations="No tiene observaciones.",
        )

        actual = parsers.parse_taxpayer_data(sample_html)

        self.assertEqual(actual, expected)

    with self.subTest("Parsing empty content"):
        # Empty input contains no table, so the parser must refuse it.
        with self.assertRaises(ValueError) as raised:
            parsers.parse_taxpayer_data("")
        self.assertEqual(
            raised.exception.args,
            ("Could not find 'Datos del Contribuyente' table in HTML",),
        )

0 commit comments

Comments
 (0)