Subject: | Add unidecode option to allow decoding of HTML entities without transliterating to ASCII |
Date: | Tue, 12 Jan 2010 19:57:48 +0000 |
To: | bug-IMDB-Film [...] rt.cpan.org |
From: | Christopher Key <cjk32 [...] cam.ac.uk> |
The attached patch adds a unidecode option, which allows HTML entities
to be decoded without the result being transliterated to ASCII.
e.g.
use Data::Dumper qw(Dumper);
use IMDB::Film;
$t1 = IMDB::Film->new("crit" => "0910970")->title();
$t2 = IMDB::Film->new("crit" => "0910970", "unidecode" => 0)->title();
utf8::upgrade($t1); utf8::upgrade($t2);
print Dumper($t1, $t2);
giving
$VAR1 = 'WALL*E';
$VAR2 = "WALL\x{b7}E";
It also switches to using LWP's decoded_content. IMDB currently returns
content encoded in ISO-8859-1, so this change has no effect for now, but if they
start to use a different encoding, then this change will be required.
To correctly identify the charset requires the HTTP headers, and
decoding must be performed whilst they are still available.
HTML::TokeParser explicitly states that it will not work with undecoded
data.
diff -ur IMDB-Film-0.43.orig/lib/IMDB/BaseClass.pm IMDB-Film-0.43/lib/IMDB/BaseClass.pm
--- IMDB-Film-0.43.orig/lib/IMDB/BaseClass.pm 2010-01-12 18:42:12.000000000 +0000
+++ IMDB-Film-0.43/lib/IMDB/BaseClass.pm 2010-01-12 18:45:57.000000000 +0000
@@ -69,6 +69,7 @@
timeout
user_agent
decode_html
+ unidecode
_code
);
@@ -218,6 +219,12 @@
return $self->{decode_html};
}
+sub _unidecode {
+ my CLASS_NAME $self = shift;
+ if(@_) { $self->{unidecode} = shift }
+ return $self->{unidecode};
+}
+
=item _cache()
Store cache flag. Indicate use file cache to store content page or not:
@@ -424,14 +431,15 @@
$self->_show_message("URL is [$url]...", 'DEBUG');
- my $page = get($url);
+ my $r = $ua->get($url);
- unless($page) {
+ unless($r->code eq "200") {
$self->error("Cannot retieve an url: [$url]!");
$self->_show_message("Cannot retrieve url [$url]", 'CRITICAL');
}
-
- return $page;
+ my $content = $r->decoded_content();
+
+ return $content;
}
=item _parser()
@@ -594,7 +602,10 @@
sub _decode_special_symbols {
my($self, $text) = @_;
if($self->_decode_html) {
- $text = unidecode(decode_entities($text));
+ $text = decode_entities($text);
+ if($self->_unidecode) {
+ $text = unidecode($text);
+ }
}
return $text;
}
diff -ur IMDB-Film-0.43.orig/lib/IMDB/Film.pm IMDB-Film-0.43/lib/IMDB/Film.pm
--- IMDB-Film-0.43.orig/lib/IMDB/Film.pm 2010-01-12 18:42:12.000000000 +0000
+++ IMDB-Film-0.43/lib/IMDB/Film.pm 2010-01-12 18:46:19.000000000 +0000
@@ -134,6 +134,7 @@
timeout => 10,
user_agent => 'Mozilla/5.0',
decode_html => 1,
+ unidecode => 1,
full_plot_url => 'http://www.imdb.com/rg/title-tease/plotsummary/title/tt',
_also_known_as => [],
_official_sites => [],