Author: Nicholas Bamber <nicholas@periapt.co.uk>
Subject: Fix issues with example scripts
- Make sure all the scripts have some explanatory comment
- Add a README file in the eg directory
- Make sure the scripts run cleanly
Bug: http://rt.cpan.org/Public/Bug/Display.html?id=58016
Last-Update: 2010-09-26
--- a/eg/hanchors
+++ b/eg/hanchors
@@ -6,6 +6,7 @@
# See also HTML::LinkExtor
use HTML::Parser;
+use Encode;
my $p = HTML::Parser->new(api_version => 3,
start_h => [\&a_start_handler, "self,tagname,attr"],
@@ -35,7 +36,7 @@
sub a_end_handler
{
my($self, $tag) = @_;
- my $text = join("", @{$self->handler("text")});
+ my $text = encode('utf8',join("", @{$self->handler("text")}));
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\s+/ /g;
--- /dev/null
+++ b/eg/README
@@ -0,0 +1,23 @@
+For most of these scripts if you run them with a file argument, where the file
+contains some HTML, you should get some output. The 'h*sub' scripts take two
+arguments, the first of which is a perl expression and the second an HTML file.
+In any case all of the files have an explanatory comment.
+
+For example try running:
+
+lynx -dump -source -raw http://www.debian.org > /tmp/a.txt
+./hanchors /tmp/a.txt
+
+Of course if http://www.debian.org is not your favourite web site you can
+make the appropriate substitution.
+
+hanchors - List all anchors in the HTML
+hlc - Correct any upper case tags to lower case
+hstrip - Removes deprecated scripting and styling tags and attributes
+htextsub - Apply arbitrary perl expression to all text within HTML
+hrefsub - Apply arbitrary perl expression to all hrefs within HTML
+htitle - Print title of the HTML document
+hdump - Output event information whilst parsing HTML document
+hform - Print analysis of form controls present in HTML
+htext - Print all the text from the HTML
+
--- a/eg/hlc
+++ b/eg/hlc
@@ -1,5 +1,9 @@
#!/usr/bin/perl -w
+# This script will assume that the first command line argument
+# is a file containing HTML, and return a version
+# where all the tags are converted to lowercase.
+
use strict;
use HTML::Parser ();
--- a/eg/hstrip
+++ b/eg/hstrip
@@ -1,6 +1,7 @@
#!/usr/bin/perl -w
-# This script cleans up an HTML document
+# This script cleans up an HTML document.
+# Specifically it removes deprecated styling and scripting tags.
use strict;
use HTML::Parser ();
--- a/eg/hdump
+++ b/eg/hdump
@@ -1,5 +1,8 @@
#!/usr/bin/perl -w
+# This script will output event information as it parses the HTML document.
+# This gives the user a "Parser's eye view" of an HTML document.
+
use HTML::Parser ();
use Data::Dump ();
--- a/eg/hform
+++ b/eg/hform
@@ -1,5 +1,6 @@
#!/usr/bin/perl -w
+# Print information about forms and their controls present in the HTML.
# See also HTML::Form module
use HTML::PullParser ();
--- a/eg/htext
+++ b/eg/htext
@@ -4,6 +4,7 @@
use strict;
use HTML::Parser 3.00 ();
+use Encode;
my %inside;
@@ -17,7 +18,7 @@
sub text
{
return if $inside{script} || $inside{style};
- print $_[0];
+ print encode('utf8',$_[0]);
}
HTML::Parser->new(api_version => 3,
--- a/eg/htextsub
+++ b/eg/htextsub
@@ -3,6 +3,9 @@
# Shows how to mangle all plain text in an HTML document, using an arbitrary
# Perl expression. Plain text is all text not within a tag declaration, i.e.
# not in <p ...>, but possibly between <p> and </p>
+# Example (Reverse 'Debian' in all text) :
+# lynx -dump -source -raw http://www.debian.org > /tmp/a.txt
+# ./htextsub '$_ =~ s/Debian/Naibed/gi' /tmp/a.txt
use strict;
my $code = shift || usage();