From 21e0ca16b6a41176950e2f1adffd44712f2e8fad Mon Sep 17 00:00:00 2001 From: Carlos Alberto Costa Beppler Date: Wed, 10 May 2023 21:18:27 -0300 Subject: [PATCH 1/3] Support virtualenv creation using .venv directory. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c062fea..c34fe44 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ build/ dist/ .cache +.venv/ \ No newline at end of file From 59d1a386f2b8d50e0b57142089aa5d5b32263d71 Mon Sep 17 00:00:00 2001 From: Carlos Alberto Costa Beppler Date: Wed, 10 May 2023 21:19:08 -0300 Subject: [PATCH 2/3] Support loading html from bytes. --- html_similarity/structural_similarity.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html_similarity/structural_similarity.py b/html_similarity/structural_similarity.py index 1be5c88..3b80c29 100644 --- a/html_similarity/structural_similarity.py +++ b/html_similarity/structural_similarity.py @@ -1,5 +1,5 @@ import difflib -from io import StringIO +from io import BytesIO, StringIO import lxml.html @@ -32,8 +32,8 @@ def structural_similarity(document_1, document_2): :return: int """ try: - document_1 = lxml.html.parse(StringIO(document_1)) - document_2 = lxml.html.parse(StringIO(document_2)) + document_1 = lxml.html.parse(StringIO(document_1) if isinstance(document_1, str) else BytesIO(document_1)) + document_2 = lxml.html.parse(StringIO(document_2) if isinstance(document_2, str) else BytesIO(document_2)) except Exception as e: print(e) return 0 From 6b3b690a3e8dc2a2cc5293c1df295f2e9bdca250 Mon Sep 17 00:00:00 2001 From: Carlos Alberto Costa Beppler Date: Thu, 11 May 2023 10:01:37 -0300 Subject: [PATCH 3/3] Add unit tests. --- tests/test_similarity.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/test_similarity.py b/tests/test_similarity.py index e642c80..3ca31c9 100644 --- a/tests/test_similarity.py +++ b/tests/test_similarity.py @@ -1,9 +1,9 @@ from html_similarity import style_similarity from html_similarity.style_similarity import jaccard_similarity +from html_similarity import structural_similarity from .utils import almost_equal - html1 = '''' @@ -44,3 +44,36 @@ def test_jaccard_similarity(): assert almost_equal(0.6666, jaccard_similarity(['a', 'b'], ['a', 'b', 'c'])) assert 0 == jaccard_similarity(['d', 'e'], ['a', 'b', 'c']) assert almost_equal(jaccard_similarity(list(range(1, 1000000)), list(range(1000000 - 10, 2 * 1000000))), 0) + +xhtml_1 = ''' + + + + +

This a title

+
    +
  • item 1
  • +
  • item 2
  • +
  • item 3
  • +
      + + +''' + +xhtml_2 = ''' + + + + +

      This a different title

      +
        +
      • item 1
      • +
      • item 2
      • +
      • item 3
      • +
          + + +''' + +def test_bytes_similarity(): + assert 1 == structural_similarity(xhtml_1.encode('utf-8'), xhtml_2.encode('utf-8'))