From ec7129040069f4b0a64685a3c3cbc282b29650f2 Mon Sep 17 00:00:00 2001 From: Andy Jack Date: Wed, 23 Jun 2021 14:48:08 -0400 Subject: [PATCH 1/5] not ideal fix - no value for attribute sets value to undef instead of attribute name - but see https://github.com/libwww-perl/HTML-Parser/issues/17 discussion --- hparser.c | 2 +- t/cases.t | 12 ++++++------ t/msie-compat.t | 2 +- t/parser.t | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hparser.c b/hparser.c index 769e6e0..fef261f 100644 --- a/hparser.c +++ b/hparser.c @@ -468,7 +468,7 @@ report_event(PSTATE* p_state, if (p_state->bool_attr_val) attrval = newSVsv(p_state->bool_attr_val); else - attrval = newSVsv(attrname); + attrval = newSV(0); } if (!CASE_SENSITIVE(p_state)) diff --git a/t/cases.t b/t/cases.t index 4331a27..3dd1bf7 100644 --- a/t/cases.t +++ b/t/cases.t @@ -24,7 +24,7 @@ my @result; my ($self, $tag, $attr) = @_; push @result, "START[$tag]"; for (sort keys %$attr) { - push @result, "\t$_: " . $attr->{$_}; + push @result, "\t$_: " . ( defined $attr->{$_} ? $attr->{$_} : '' ); } $start++; } @@ -57,10 +57,10 @@ my @result; } my @tests = ( - '' => ['START[a]', "\t\": \""], + '' => ['START[a]', "\t\": "], '' => ['START[a/]',], - '' => ['START[a]', "\t/: /"], - '' => ['START[a]', "\ta/: a/"], + '' => ['START[a]', "\t/: "], + '' => ['START[a]', "\ta/: "], '' => ['START[a]', "\ta/: /"], '' => ['START[a]', "\tx: foo\xA0bar"], '' => ['START[a]', "\tx: foo bar"], @@ -73,7 +73,7 @@ my @tests = ( "2 2" => ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], '2 2' => - ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'], + ['TEXT[2 ]', 'START[a]', "\tbar: ", "\thref: foo", 'TEXT[ 2]'], '2 2' => ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], '2 2' => @@ -84,7 +84,7 @@ my @tests = ( ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], '2 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'], '2 2' => - ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'], + ['TEXT[2 ]', 'START[a.b-12]', "\ta: ", "\ta.b: 2", 'TEXT[ 2]'], '2 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'], '' => ['DECLARATION[ENTITY nbsp CDATA " " -- no-break space --]'], diff --git a/t/msie-compat.t b/t/msie-compat.t index 3c170c5..8ba6279 100644 --- a/t/msie-compat.t +++ b/t/msie-compat.t @@ -62,7 +62,7 @@ $p->eof; is($TEXT, <<'EOT'); [start_document,,,] -[start,a,,name:`foo:bar`:bar`] +[start,a,,name:`foo:bar`:] [end_document,,,] EOT diff --git a/t/parser.t b/t/parser.t index ea5a3a4..a947dc0 100644 --- a/t/parser.t +++ b/t/parser.t @@ -71,7 +71,7 @@ HTML sub start { my ($self, $tag, $attr) = @_; - $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); + $attr = join("/", map { "$_=" . ( defined $attr->{$_} ? $attr->{$_} : '' ) } sort keys %$attr); $attr = "/$attr" if length $attr; $OUT .= "<<$tag$attr>>|"; } From 021660dcaf618f95725820582b54fed8ed109798 Mon Sep 17 00:00:00 2001 From: Andy Jack Date: Mon, 28 Jun 2021 12:15:24 -0400 Subject: [PATCH 2/5] Only set attrname to attrval for specific attributes --- hparser.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/hparser.c b/hparser.c index fef261f..a141ce3 100644 --- a/hparser.c +++ b/hparser.c @@ -85,6 +85,40 @@ static const char * const argname[] = { /* ARG_FLAG_FLAT_ARRAY */ }; +/* https://meiert.com/en/blog/boolean-attributes-of-html/ */ +const static struct boolean_attribute { + int len; + char* str; +} +boolean_attributes[] = { + {15, "allowfullscreen"}, + {19, "allowpaymentrequest"}, + {5, "async"}, + {9, "autofocus"}, + {8, "autoplay"}, + {6, "checked"}, + {8, "controls"}, + {7, "default"}, + {8, "disabled"}, + {14, "formnovalidate"}, + {6, "hidden"}, + {5, "ismap"}, + {9, "itemscope"}, + {4, "loop"}, + {8, "multiple"}, + {5, "muted"}, + {8, "nomodule"}, + {10, "novalidate"}, + {4, "open"}, + {11, "playsinline"}, + {8, "readonly"}, + {8, "required"}, + {8, "reversed"}, + {8, "selected"}, + {9, "truespeed"}, + {0, 0} +}; + #define CASE_SENSITIVE(p_state) \ ((p_state)->xml_mode || (p_state)->case_sensitive) #define STRICT_NAMES(p_state) \ @@ -438,8 +472,8 @@ report_event(PSTATE* p_state, } for (i = 1; i < num_tokens; i += 2) { - SV* attrname = newSVpvn(tokens[i].beg, - tokens[i].end-tokens[i].beg); + int attrlen = tokens[i].end-tokens[i].beg; + SV* attrname = newSVpvn(tokens[i].beg, attrlen); SV* attrval; if (utf8) @@ -465,11 +499,33 @@ report_event(PSTATE* p_state, } } else { /* boolean */ - if (p_state->bool_attr_val) - attrval = newSVsv(p_state->bool_attr_val); - else - attrval = newSV(0); - } + int i; + for ( i = 0; boolean_attributes[i].len; i++ ) { + if( attrlen == boolean_attributes[i].len ) { + char *attrname_s = tokens[i].beg; + char *t = boolean_attributes[i].str; + int len = attrlen; + while(len) { + if(toLOWER(*attrname_s) != *t) + break; + attrname_s++; + t++; + if(!--len) { + /* this is a boolean attribute */ + if (p_state->bool_attr_val) + attrval = newSVsv(p_state->bool_attr_val); + else + attrval = newSVsv(attrname); + } + goto BOOLEAN_ATTR_MATCH_DONE; + } + } + } + /* no matches were found, so set attr to undef */ + attrval = newSV(0); + BOOLEAN_ATTR_MATCH_DONE: + ; + } if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); From edfff24da1d9af59cff247277e0b982f236cf46a Mon Sep 17 00:00:00 2001 From: Andy Jack Date: Wed, 22 Sep 2021 11:28:36 -0400 Subject: [PATCH 3/5] ongoing work --- .gitignore | 2 +- hparser.c | 19 ++++++++++--------- t/cases.t | 1 + 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 053e57a..2d80996 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ Makefile MYMETA.* .build/ HTML-Parser-*/ - +local/ diff --git a/hparser.c b/hparser.c index a141ce3..a906c51 100644 --- a/hparser.c +++ b/hparser.c @@ -88,7 +88,7 @@ static const char * const argname[] = { /* https://meiert.com/en/blog/boolean-attributes-of-html/ */ const static struct boolean_attribute { int len; - char* str; + const char* str; } boolean_attributes[] = { {15, "allowfullscreen"}, @@ -96,7 +96,7 @@ boolean_attributes[] = { {5, "async"}, {9, "autofocus"}, {8, "autoplay"}, - {6, "checked"}, + {7, "checked"}, {8, "controls"}, {7, "default"}, {8, "disabled"}, @@ -500,10 +500,11 @@ report_event(PSTATE* p_state, } else { /* boolean */ int i; + int found = 0; for ( i = 0; boolean_attributes[i].len; i++ ) { if( attrlen == boolean_attributes[i].len ) { - char *attrname_s = tokens[i].beg; - char *t = boolean_attributes[i].str; + char *attrname_s = SvPVbyte_nolen(attrname); + const char *t = boolean_attributes[i].str; int len = attrlen; while(len) { if(toLOWER(*attrname_s) != *t) @@ -517,14 +518,14 @@ report_event(PSTATE* p_state, else attrval = newSVsv(attrname); } - goto BOOLEAN_ATTR_MATCH_DONE; - } + found = 1; + } } } /* no matches were found, so set attr to undef */ - attrval = newSV(0); - BOOLEAN_ATTR_MATCH_DONE: - ; + if (!found) + attrval = newSV(0); + } if (!CASE_SENSITIVE(p_state)) diff --git a/t/cases.t b/t/cases.t index 3dd1bf7..8833dc1 100644 --- a/t/cases.t +++ b/t/cases.t @@ -94,6 +94,7 @@ my @tests = ( ' comment -->' => ['COMMENT[ comment comment ]'], '' => ['COMMENT[ ]'], + '' => ['START[input]', "\tchecked: checked", "\tdisabled: disabled", "\tfoo: bar", "\ttype: checkbox" ], ); plan tests => @tests / 2; From edd4e49f27caddffb1f0af0547bc8c66aba48c3f Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 19 Jul 2023 12:52:13 -0400 Subject: [PATCH 4/5] Fix test case This was likely set to fail for debugging purposes --- t/cases.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/cases.t b/t/cases.t index 8833dc1..7d8a66b 100644 --- a/t/cases.t +++ b/t/cases.t @@ -94,7 +94,7 @@ my @tests = ( ' comment -->' => ['COMMENT[ comment comment ]'], '' => ['COMMENT[ ]'], - '' => ['START[input]', "\tchecked: checked", "\tdisabled: disabled", "\tfoo: bar", "\ttype: checkbox" ], + '' => ['START[input]', "\tchecked: checked", "\tdisabled: disabled", "\tfoo: ", "\ttype: checkbox" ], ); plan tests => @tests / 2; From 864523f0a74271fa1b3472ff957a14048dbe5013 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Fri, 28 Jul 2023 13:23:41 -0400 Subject: [PATCH 5/5] Test more Perl versions in CI --- .github/workflows/linux.yml | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index bcd7c26..1a250ab 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -6,6 +6,7 @@ on: tags-ignore: - '*' pull_request: + workflow_dispatch: jobs: perl: runs-on: ubuntu-latest @@ -14,18 +15,19 @@ jobs: matrix: perl-version: - '5.38' - # - '5.34' - # - '5.32' - # - '5.30' - # - '5.28' - # - '5.26' - # - '5.24' - # - '5.22' - # - '5.20' + - '5.36' + - '5.34' + - '5.32' + - '5.30' + - '5.28' + - '5.26' + - '5.24' + - '5.22' + - '5.20' - '5.18' - # - '5.16' - # - '5.14' - # - '5.12' + - '5.16' + - '5.14' + - '5.12' - '5.10' container: image: perl:${{ matrix.perl-version }}