Bug #125968 for IMDB-Film: [PATCH] add https support

Mon Jul 30 16:36:49 2018 CFABER [...] cpan.org - Ticket created

Subject:

[PATCH] add https support

Add HTTPS support

Subject:

https.diff

--- Film.pm 2013-01-02 10:28:08.000000000 -0700 +++ Film.pm 2018-07-30 12:14:47.014524133 -0600 @@ -138,7 +138,7 @@ timeout => 10, user_agent => 'Mozilla/5.0', decode_html => 1, - full_plot_url => 'http://www.imdb.com/rg/title-tease/plotsummary/title/tt', + full_plot_url => 'https://www.imdb.com/rg/title-tease/plotsummary/title/tt', _also_known_as => [], _official_sites => [], _release_dates => [], @@ -219,7 +219,7 @@ defines proxy server name and port: - proxy => 'http://proxy.myhost.com:80' + proxy => 'https://proxy.myhost.com:80' By default object tries to get proxy from environment @@ -366,7 +366,7 @@ } if($title) { - $self->retrieve_code($parser, 'http://www.imdb.com/title/tt(\d+)') unless $self->code; + $self->retrieve_code($parser, 'https://www.imdb.com/title/tt(\d+)') unless $self->code; $title =~ s/\*/\\*/g; $title = $self->_decode_special_symbols($title); @@ -378,13 +378,17 @@ unless($self->{_title}) { ($self->{_title}, $self->{_kind}, $self->{_year}) = $title =~ m!(.*?)\s+$(.*?)?\s?([0-9\-]*\s?)$!; } + + # Another whack at the year. + $self->{_year} = $1 if(!$self->{_year} && $title =~ /$(\d{4})$/); + $self->{_kind} = 'Movie' unless $self->{_kind}; # Default kind should be movie - # "The Series" An Episode (2005) + # "The Series" An Episode (2005) # "The Series" (2005) - if( $self->{_title} =~ /\"[^\"]+\"(\s+.+\s+)?/ ) { - $self->{_kind} = $1 ? 'E' : 'S'; - } + if( $self->{_title} =~ /\"[^\"]+\"(\s+.+\s+)?/ ) { + $self->{_kind} = $1 ? 'E' : 'S'; + } } } @@ -444,7 +448,7 @@ $page = $self->_cacheObj->get($self->code . '_connections') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/trivia?tab=mc"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/trivia?tab=mc"; $self->_show_message("URL for movie connections is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -522,7 +526,7 @@ $page = $self->_cacheObj->get($self->code . '_full_companies') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/companycredits"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/companycredits"; $self->_show_message("URL for company credits is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -607,7 +611,7 @@ $page = $self->_cacheObj->get($self->code . '_episodes') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/epcast"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/epcast"; $self->_show_message("URL for episodes is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -1029,16 +1033,16 @@ } while($tag = $parser->get_tag()) { last if $tag->[0] eq 'a' && $tag->[1]{href} && $tag->[1]{href} =~ /fullcredits/i; - if($tag->[0] eq 'td' && $tag->[1]{class} && $tag->[1]{class} eq 'name') { + # if($tag->[0] eq 'td' && $tag->[1]{class} && $tag->[1]{class} eq 'name') { $tag = $parser->get_tag('a'); if($tag->[1]{href} && $tag->[1]{href} =~ m#name/nm(\d+?)/#) { $person = $parser->get_text; $id = $1; my $text = $parser->get_trimmed_text('/tr'); - ($role) = $text =~ /.*?\s+(.*)$/; - push @cast, {id => $id, name => $person, role => $role}; + ($role) = $text =~ /\.\.\. (.*)$/; + push @cast, {id => $id, name => $person, role => $role} if $person; } - } + # } } $self->{_cast} = \@cast; @@ -1399,7 +1403,7 @@ } } if($self->{'_big_cover_page'}) { - my $page = $self->_get_page_from_internet('http://' . $self->{'host'} . $self->{'_big_cover_page'}); + my $page = $self->_get_page_from_internet('https://' . $self->{'host'} . $self->{'_big_cover_page'}); return unless $page; my $parser = $self->_parser(FORCED, \$page); @@ -1435,7 +1439,7 @@ $page = $self->_cacheObj->get($self->code . '_sites') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/officialsites"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/officialsites"; $self->_show_message("URL for sites is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -1483,7 +1487,7 @@ $page = $self->_cacheObj->get($self->code . '_dates') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/releaseinfo"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/releaseinfo"; $self->_show_message("URL for sites is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -1538,7 +1542,7 @@ $page = $self->_cacheObj->get($self->code . '_keywords') if $self->_cache; unless($page) { - my $url = "http://". $self->{host} . "/" . $self->{query} . $self->code . "/keywords"; + my $url = "https://". $self->{host} . "/" . $self->{query} . $self->code . "/keywords"; $self->_show_message("URL for sites is $url ...", 'DEBUG'); $page = $self->_get_page_from_internet($url); @@ -1616,7 +1620,7 @@ =head1 BUGS Please, send me any found bugs by email: stepanov.michael@gmail.com or create -a bug report: http://rt.cpan.org/NoAuth/Bugs.html?Dist=IMDB-Film +a bug report: https://rt.cpan.org/NoAuth/Bugs.html?Dist=IMDB-Film =head1 SEE ALSO @@ -1626,7 +1630,7 @@ IMDB::Movie HTML::TokeParser -http://videoguide.sf.net +https://videoguide.sf.net =head1 AUTHOR --- BaseClass.pm 2013-01-02 10:39:53.000000000 -0700 +++ BaseClass.pm 2018-07-30 12:26:28.798403737 -0600 @@ -24,6 +24,7 @@ use Text::Unidecode qw(unidecode); use HTML::Entities; use Carp; +use IO::Socket::SSL; use Data::Dumper; @@ -406,7 +407,7 @@ } else { $self->_show_message("Retrieving page from internet ...", 'DEBUG'); - my $url = 'http://'.$self->_host().'/'. + my $url = 'https://'.$self->_host().'/'. ($crit =~ /^\d+$/ && length($crit) >= ID_LENGTH ? $self->_query() : $self->_search()) . $crit; $page = $self->_get_page_from_internet($url); @@ -431,7 +432,7 @@ $self->_show_message("URL is [$url]...", 'DEBUG'); - my $page = get($url); + my $page = _get($url); unless($page) { $self->error("Cannot retieve an url: [$url]!"); @@ -441,6 +442,23 @@ return $page; } +sub _get { + my ($URL) = @_; + + $ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0; + + my $ua = LWP::UserAgent->new( ssl_opts => { verify_hostname => 0, + SSL_verify_mode => IO::Socket::SSL::SSL_VERIFY_NONE, + }); + + $ua->agent('Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)'); #'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'); + + my $req = HTTP::Request->new( GET => $URL); + + my $response = $ua->request($req); + return $response->content; +} + =item _parser() Setup HTML::TokeParser and store. To have possibility to inherite that class

Sun Aug 19 15:57:40 2018 CFABER [...] cpan.org - Correspondence added

On Mon Jul 30 16:36:49 2018, CFABER wrote: Show quoted text

> Add HTTPS support

I just realized that I based this off of my series of patches + the https patch set. No harm if you want to pick up all of my existing fixes from earlier patches provided here, if you want a clean patch let me know.

Tue Nov 06 11:40:22 2018 lukas [...] nietykalni.org - Correspondence added

Subject:	[rt.cpan.org #125968]
Date:	Tue, 6 Nov 2018 17:40:01 +0100
To:	"bug-IMDB-Film [...] rt.cpan.org" <bug-IMDB-Film [...] rt.cpan.org>
From:	Łukasz Nowik <lukas [...] nietykalni.org>

Your patch does not help, you wrote that you did it on earlier patches, which I probably do not have. Can you send working files BaseClass.pm, Film.pm and Persons.pm Thank you in advance for your help.

Tue Nov 06 11:40:23 2018 The RT System itself - Status changed from 'new' to 'open'