Date: | Wed, 4 Aug 2004 11:29:32 -0700 |
From: | Aaron Patterson <aaronp [...] classmates.com> |
To: | bug-html-tidy [...] rt.cpan.org |
Subject: | tidy->clean bug with patch |
Here's a snippet of code to reproduce the bug; it produces an 'Illegal instruction'
error (a proposed patch is below):
#!/usr/bin/perl -w
# Minimal reproduction: calling clean() on the HTML fragment stored after
# __DATA__ is reported to abort with an 'Illegal instruction' error.
use strict;
use HTML::Tidy;

# Slurp the entire __DATA__ section into a single string.
my $html = do { local $/; <DATA> };

# Direct method call instead of the ambiguous indirect-object "new HTML::Tidy".
my $tidy = HTML::Tidy->new;
$tidy->clean($html);

# Stringify the messages returned by messages(); false entries become undef.
my @mess = map { $_ ? $_->as_string() : undef } $tidy->messages();
__DATA__
<form action="http://www.alternation.net/cobra/index.pl">
<td><input name="random" type="image" value="random creature" src="http://www.creaturesinmyhead.com/images/random.gif"></td>
</form>
__END__
Here's a patch I made to get rid of the illegal instruction and to capture
the messages produced by the 'clean()' method:
diff -cNr --exclude=Makefile old_html/HTML-Tidy-1.04/lib/HTML/Tidy.pm new_html/HTML-Tidy-1.04/lib/HTML/Tidy.pm
*** old_html/HTML-Tidy-1.04/lib/HTML/Tidy.pm 2004-05-12 13:18:54.000000000 -0700
--- new_html/HTML-Tidy-1.04/lib/HTML/Tidy.pm 2004-07-27 12:13:08.101379320 -0700
***************
*** 167,201 ****
my $self = shift;
my $filename = shift;
- my $parse_errors;
my $html = join( "", @_ );
my $errorblock = _tidy_messages( $html );
return unless defined $errorblock;
- my @lines = split( /\012/, $errorblock );
- for my $line ( @lines ) {
- chomp $line;
-
- my $message;
- if ( $line =~ /^line (\d+) column (\d+) - (Warning|Error): (.+)$/ ) {
- my $type = ($3 eq "Warning") ? TIDY_WARNING : TIDY_ERROR;
- $message = HTML::Tidy::Message->new( $filename, $type, $1, $2, $4 );
-
- } elsif ( $line =~ /^\d+ warnings?, \d+ errors? were found!/ ) {
- # Summary line we don't want
-
- } elsif ( $line eq "No warnings or errors were found." ) {
- # Summary line we don't want
-
- } else {
- warn "Unknown error type: $line";
- ++$parse_errors;
- }
- push( @{$self->{messages}}, $message ) if $self->_is_keeper( $message );
- }
! return !$parse_errors;
}
=head2 clean( $str [, $str...] )
--- 167,222 ----
my $self = shift;
my $filename = shift;
my $html = join( "", @_ );
my $errorblock = _tidy_messages( $html );
return unless defined $errorblock;
! return !$self->_parse_errors($filename, $errorblock);
! }
!
! sub _parse_errors {
! my $self = shift;
! my $filename = shift;
! my $errs = shift;
!
! my $parse_errors;
!
! my @lines = split( /\012/, $errs );
! for my $line ( @lines ) {
! chomp $line;
!
! my $message;
! if ( $line =~ /^line (\d+) column (\d+) - (Warning|Error): (.+)$/ ) {
! my $type = ($3 eq "Warning") ? TIDY_WARNING : TIDY_ERROR;
! $message = HTML::Tidy::Message->new( $filename, $type, $1, $2, $4 );
!
! } elsif ( $line =~ /^\d+ warnings?, \d+ errors? were found!/ ) {
! # Summary line we don't want
!
! } elsif ( $line eq "No warnings or errors were found." ) {
! # Summary line we don't want
!
! } elsif ( $line eq "This document has errors that must be fixed before" ){
! # Summary line we don't want
!
! } elsif ( $line eq "using HTML Tidy to generate a tidied up version." ){
! # Summary line we don't want
!
! } elsif ( $line =~ m/^Info:/ ) {
! # Info line we don't want
!
! } elsif ( $line =~ m/^\s*$/ ) {
! # Blank line we don't want
!
! } else {
! warn "Unknown error type: $line";
! ++$parse_errors;
! }
! push( @{$self->{messages}}, $message ) if $self->_is_keeper( $message );
! }
! return $parse_errors;
}
=head2 clean( $str [, $str...] )
***************
*** 209,215 ****
sub clean {
my $self = shift;
! return _tidy_clean(join( "", @_ ));
}
--- 230,239 ----
sub clean {
my $self = shift;
! my ($cleaned, $errbuf) = _tidy_clean(join( "", @_ ));
!
! $self->_parse_errors('', $errbuf);
! return $cleaned;
}
diff -cNr --exclude=Makefile old_html/HTML-Tidy-1.04/Tidy.xs new_html/HTML-Tidy-1.04/Tidy.xs
*** old_html/HTML-Tidy-1.04/Tidy.xs 2004-04-01 21:36:19.000000000 -0800
--- new_html/HTML-Tidy-1.04/Tidy.xs 2004-07-27 13:47:07.415073328 -0700
***************
*** 38,76 ****
RETVAL
! SV *
_tidy_clean(input)
INPUT:
char *input
! CODE:
TidyBuffer errbuf = {0};
- TidyDoc tdoc = tidyCreate(); // Initialize "document"
TidyBuffer output = {0};
! int rc;
! rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics
if ( rc >= 0 )
rc = tidyParseString( tdoc, input ); // Parse the input
if ( rc >= 0 )
rc = tidyCleanAndRepair(tdoc);
if ( rc >= 0)
rc = tidySaveBuffer( tdoc, &output );
! if ( rc > 1 ) // If error, force output.
! rc = tidyOptSetBool( tdoc, TidyForceOutput, yes ) ? rc : -1;
if ( rc >= 0 ) {
char *str;
str = (char *)output.bp;
if ( str )
! RETVAL = newSVpvn( str, strlen(str) );
! tidyBufFree( &output );
} else {
XSRETURN_UNDEF;
}
tidyBufFree( &errbuf );
tidyRelease( tdoc );
- OUTPUT:
- RETVAL
-
--- 38,80 ----
RETVAL
! void
_tidy_clean(input)
INPUT:
char *input
! PPCODE:
TidyBuffer errbuf = {0};
TidyBuffer output = {0};
! TidyDoc tdoc = tidyCreate(); // Initialize "document"
! int rc = -1;
! rc = tidyOptSetInt( tdoc, TidyWrapLen, 0 );
! if(rc >= 0)
! rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics
if ( rc >= 0 )
rc = tidyParseString( tdoc, input ); // Parse the input
if ( rc >= 0 )
rc = tidyCleanAndRepair(tdoc);
+ if ( rc > 1 )
+ rc = tidyOptSetBool( tdoc, TidyForceOutput, yes ) ? rc : -1;
if ( rc >= 0)
rc = tidySaveBuffer( tdoc, &output );
! if ( rc >= 0)
! rc = tidyRunDiagnostics( tdoc);
if ( rc >= 0 ) {
char *str;
str = (char *)output.bp;
if ( str )
! XPUSHs(sv_2mortal(newSVpvn(str, strlen(str))));
!
! if(errbuf.bp)
! XPUSHs(sv_2mortal(newSVpvn(errbuf.bp, strlen(errbuf.bp))));
} else {
XSRETURN_UNDEF;
}
+ tidyBufFree( &output );
tidyBufFree( &errbuf );
tidyRelease( tdoc );