Subject: | [PATCH] a filter attribute generalizing decode_entities |
Date: | Fri, 8 Feb 2008 18:55:33 -0200 |
To: | "Alex Bowley" <kilinrax [...] cpan.org>, bug-html-strip <bug-html-strip [...] rt.cpan.org> |
From: | "Adriano Ferreira" <a.r.ferreira [...] gmail.com> |
The attached patch makes it possible to apply filter operations
other than the currently supported decoding of HTML entities.
That is done by introducing
->set_filter()
which accepts a method name or a code ref. By default, it contains
'filter_entities' (which is where the HTML entities decoding code went
to).
With it, other operations can be applied after the tags are stripped, such as:
sub {
# Decode HTML entities first, then transliterate the decoded text to ASCII.
require Text::Unidecode;
require HTML::Entities;
return Text::Unidecode::unidecode( HTML::Entities::decode( shift ) );
}
which applies ASCII transliteration after decoding the entities, or
sub {
# Copy the argument, then squeeze every whitespace run to one space.
(my $text = shift) =~ s/\s+/ /g;
return $text;
}
which normalizes each run of whitespace (\s+) into a single ' '.
Best,
Adriano Ferreira
diff -ru HTML-Strip-1.06/Strip.pm HTML-Strip/Strip.pm
--- HTML-Strip-1.06/Strip.pm 2006-02-10 09:18:32.000000000 -0200
+++ HTML-Strip/Strip.pm 2008-02-08 18:43:00.000000000 -0200
@@ -31,7 +31,7 @@
# Preloaded methods go here.
-my $_html_entities_p = eval 'require HTML::Entities';
+my $_html_entities_p = eval { require HTML::Entities; 1 };
my %defaults = (
striptags => [qw( title
@@ -40,6 +40,7 @@
applet )],
emit_spaces => 1,
decode_entities => 1,
+ filter => $_html_entities_p ?
'filter_entities' : undef,
);
sub new {
@@ -68,13 +69,53 @@
}
}
+{
+ # an inside-out object approach
+ # for the 'filter' attribute
+ my %filter_of;
+
+ sub set_filter {
+ my ($self, $filter) = @_;
+ $filter_of{0+$self} = $filter;
+ }
+
+ sub filter {
+ my $self = shift;
+ return $filter_of{0+$self}
+ }
+
+ sub DESTROY {
+ my $self = shift;
+ delete $filter_of{0+$self};
+ }
+}
+
+# $decoded_string = $self->filter_entities( $string )
+sub filter_entities {
+ my $self = shift;
+ if( $self->decode_entities ) {
+ return HTML::Entities::decode($_[0]);
+ }
+ return $_[0];
+}
+
+sub _do_filter {
+ my $self = shift;
+ my $filter = $self->filter;
+ # no filter: return immediately
+ return $_[0] unless defined $filter;
+
+ if ( !ref $filter ) { # method name
+ return $self->$filter( @_ );
+ } else { # code ref
+ return $filter->( @_ );
+ }
+}
+
sub parse {
my ($self, $text) = @_;
my $stripped = $self->strip_html( $text );
- if( $self->decode_entities && $_html_entities_p ) {
- $stripped = HTML::Entities::decode($stripped);
- }
- return $stripped;
+ return $self->_do_filter( $stripped );
}
sub eof {
@@ -201,6 +242,19 @@
Takes a boolean value. If set to false, HTML::Strip will decode HTML
entities. Set to true by default.
+=item filter_entities()
+
+If HTML::Entities is available, this method behaves just
+like invoking HTML::Entities::decode_entities, except that
+it respects the current setting of 'decode_entities'.
+
+=item set_filter()
+
+Sets a filter to be applied after tags were stripped.
+It may accept the name of a method (like 'filter_entities')
+or a code ref. By default, its value is 'filter_entities'
+if HTML::Entities is available or C<undef> otherwise.
+
=head2 LIMITATIONS
=over 4
diff -ru HTML-Strip-1.06/test.pl HTML-Strip/test.pl
--- HTML-Strip-1.06/test.pl 2006-02-10 09:14:25.000000000 -0200
+++ HTML-Strip/test.pl 2008-02-08 18:46:26.000000000 -0200
@@ -89,3 +89,21 @@
ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ' );
$hs->eof;
+
+{
+ my $hs = HTML::Strip->new( filter => undef );
+ ok( $hs->parse( '<html> </html>' ), ' ' );
+ $hs->eof;
+
+}
+
+{
+ my $filter = sub { my $s = shift; $s =~ s/\s/ /g;; $s };
+ my $hs = HTML::Strip->new( filter => $filter );
+ ok( $hs->parse( "<html>title\ntext\ntext</html>" ), 'title text text' );
+ $hs->eof;
+
+}
+
+
+
}
Message body is not shown because sender requested not to inline it.