Subject: | HTML discovery fails on old-style HTML, CDATA, comments |
HTML discovery / _find_semantic_info:
fails on old-style HTML where the <LINK> or <META> tags are capitalized
(the search for link/meta tags should be case insensitive)
fails to ignore commented-out link/meta tags and <[CDATA .. ]]> sections.
fails to deal with unquoted (pre-xhtml) attribute values.
E.g., a match like
$val =~ m!title=.foaf.!i
will falsepositive on
title=bfoafc
which is legal prior to xhtml. (If we *could* assume xhtml then there's
no reason not to be using a real XML parser, but I don't think it's a
good idea to assume that anyway...)
I also see lots of ad hoc pasted regexps in _find_semantic_info(),...
So I wrote a patch to deal with this and make the guts of
_find_semantic_info be table-driven so that we're not having
to keep writing the same regexps over and over for each new case.
Subject: | patch.txt |
--- Net/OpenID/Consumer-0-.pm 2009-06-05 09:04:46.000000000 -0700
+++ Net/OpenID/Consumer.pm 2010-02-05 06:18:17.520169417 -0800
@@ -216,104 +216,121 @@
return $res ? $res->content : undef;
}
+sub _element_attributes {
+ local $_ = shift;
+ my %a = ();
+ while (m!\G[[:space:]]+([^[:space:]=]+)(?:=(?:([-a-zA-Z0-9._:]+)|'([^\']+)'|"([^\"]+)")([^[:space:]]*))?!g) {
+ next if $5; # skip malformed attributes
+ my $v = (defined $2 ? $2 : defined $3 ? $3 : $4);
+ $a{lc($1)} = $v if defined $v;
+ }
+ return \%a;
+}
+
+sub _extract_head_markup_only {
+ my $htmlref = shift;
+ # kill all CDATA sections
+ $$htmlref =~ s/<!\[CDATA\[.*?\]\]>//sg;
+
+ # kill all comments
+ $$htmlref =~ s/<!--[^-].*?-->//sg;
+
+ # trim everything past the body. this is in case the user doesn't
+ # have a head document and somebody was able to inject their own
+ # head. -- brad choate
+ $$htmlref =~ s/<body\b.*//is;
+}
+
+
+# List of head elements that matter for HTTP discovery.
+# Each entry defines a key for the _find_semantic_info hash
+# [
+# ELEMENT -- html element name, must be 'link' or 'meta'
+# KEY -- key name
+# ATTR -- html attribute name
+# MATCH_VALUE -- string (default = KEY)
+# MATCH_ATTRS -- list-ref of html attribute names
+# default = ['rel'] if ELEMENT is <link...>
+# default = ['name'] if ELEMENT is <meta...>
+# ]
+# _find_semantic_info->{KEY} =
+# ATTR value of the ELEMENT for which
+# MATCH_VALUE == ;-join of MATCH_ATTRS values
+#
+our @HTTP_discovery_link_meta_tags =
+ (
+ # OpenID servers / delegated identities
+ # <link rel="openid.server"
+ # href="http://www.livejournal.com/misc/openid.bml" />
+ # <link rel="openid.delegate"
+ # href="whatever" />
+ #
+ [qw(link openid.server href)], # 'openid.server'==.rel (<link> default)
+ [qw(link openid.delegate href)],
+
+ # OpenID2 providers / local identifiers
+ # <link rel="openid2.provider"
+ # href="http://www.livejournal.com/misc/openid.bml" />
+ # <link rel="openid2.local_id" href="whatever" />
+ #
+ [qw(link openid2.provider href)],
+ [qw(link openid2.local_id href)],
+
+ # FOAF maker info
+ # <meta name="foaf:maker"
+ # content="foaf:mbox_sha1sum '4caa1d6f6203d21705a00a7aca86203e82a9cf7a'"/>
+ #
+ [qw(meta foaf.maker content foaf:maker)], # == .name (<meta> default)
+
+ # FOAF documents
+ # <link rel="meta" type="application/rdf+xml" title="FOAF"
+ # href="http://brad.livejournal.com/data/foaf" />
+ #
+ [qw(link foaf href meta;foaf;application/rdf+xml), [qw(rel title type)]],
+
+ # RSS
+ # <link rel="alternate" type="application/rss+xml" title="RSS"
+ # href="http://www.livejournal.com/~brad/data/rss" />
+ #
+ [qw(link rss href alternate;application/rss+xml), [qw(rel type)]],
+
+ # Atom
+ # <link rel="alternate" type="application/atom+xml" title="Atom"
+ # href="http://www.livejournal.com/~brad/data/rss" />
+ #
+ [qw(link atom href alternate;application/atom+xml), [qw(rel type)]],
+ );
+
sub _find_semantic_info {
my Net::OpenID::Consumer $self = shift;
my $url = shift;
my $final_url_ref = shift;
- my $trim_hook = sub {
- my $htmlref = shift;
- # trim everything past the body. this is in case the user doesn't
- # have a head document and somebody was able to inject their own
- # head. -- brad choate
- $$htmlref =~ s/<body\b.*//is;
- };
-
- my $doc = $self->_get_url_contents($url, $final_url_ref, $trim_hook) || '';
+ my $doc = $self->_get_url_contents($url, $final_url_ref, \&_extract_head_markup_only) || '';
- # find <head> content of document (notably: the first head, if an attacker
- # has added others somehow)
+ # find <head> content of document
+ # (notably: the first head, if an attacker has added others somehow)
return $self->_fail("no_head_tag", "Couldn't find OpenID servers due to no head tag")
unless $doc =~ m!<head[^>]*>(.*?)</head>!is;
my $head = $1;
- my $ret = {
- 'openid.server' => undef,
- 'openid.delegate' => undef,
- 'foaf' => undef,
- 'foaf.maker' => undef,
- 'rss' => undef,
- 'atom' => undef,
- };
+ my $ret = {};
# analyze link/meta tags
- while ($head =~ m!<(link|meta)\b([^>]+)>!g) {
- my ($type, $val) = ($1, $2);
- my $temp;
-
- # OpenID servers / delegated identities
- # <link rel="openid.server" href="http://www.livejournal.com/misc/openid.bml" />
- if ($type eq "link" &&
- $val =~ /\brel=.openid\.(server|delegate)./i && ($temp = $1) &&
- $val =~ m!\bhref=[\"\']([^\"\']+)[\"\']!i) {
- $ret->{"openid.$temp"} = $1;
- next;
- }
-
- # OpenID2 providers / local identifiers
- # <link rel="openid2.provider" href="http://www.livejournal.com/misc/openid.bml" />
- if ($type eq "link" &&
- $val =~ /\brel=.openid2\.(provider|local_id)./i && ($temp = $1) &&
- $val =~ m!\bhref=[\"\']([^\"\']+)[\"\']!i) {
- $ret->{"openid2.$temp"} = $1;
- next;
- }
-
- # FOAF documents
- #<link rel="meta" type="application/rdf+xml" title="FOAF" href="http://brad.livejournal.com/data/foaf" />
- if ($type eq "link" &&
- $val =~ m!title=.foaf.!i &&
- $val =~ m!rel=.meta.!i &&
- $val =~ m!type=.application/rdf\+xml.!i &&
- $val =~ m!href=[\"\']([^\"\']+)[\"\']!i) {
- $ret->{"foaf"} = $1;
- next;
- }
-
- # FOAF maker info
- # <meta name="foaf:maker" content="foaf:mbox_sha1sum '4caa1d6f6203d21705a00a7aca86203e82a9cf7a'" />
- if ($type eq "meta" &&
- $val =~ m!name=.foaf:maker.!i &&
- $val =~ m!content=([\'\"])(.*?)\1!i) {
- $ret->{"foaf.maker"} = $2;
- next;
- }
-
- if ($type eq "meta" &&
- $val =~ m!name=.foaf:maker.!i &&
- $val =~ m!content=([\'\"])(.*?)\1!i) {
- $ret->{"foaf.maker"} = $2;
- next;
- }
-
- # RSS
- # <link rel="alternate" type="application/rss+xml" title="RSS" href="http://www.livejournal.com/~brad/data/rss" />
- if ($type eq "link" &&
- $val =~ m!rel=.alternate.!i &&
- $val =~ m!type=.application/rss\+xml.!i &&
- $val =~ m!href=[\"\']([^\"\']+)[\"\']!i) {
- $ret->{"rss"} = $1;
- next;
- }
-
- # Atom
- # <link rel="alternate" type="application/atom+xml" title="Atom" href="http://www.livejournal.com/~brad/data/rss" />
- if ($type eq "link" &&
- $val =~ m!rel=.alternate.!i &&
- $val =~ m!type=.application/atom\+xml.!i &&
- $val =~ m!href=[\"\']([^\"\']+)[\"\']!i) {
- $ret->{"atom"} = $1;
- next;
+ while ($head =~ m!<(link|meta)([[:space:]][^>]*?)/?>!ig) {
+ my $linkmeta = lc($1);
+ my $a = _element_attributes($2);
+
+ for (@HTTP_discovery_link_meta_tags) {
+ my ($elt, $target, $tattrib, $string, $attribs) = @$_;
+ next if $elt ne $linkmeta;
+
+ $string ||= $target;
+ $attribs ||= [$elt eq 'meta' ? 'name' : 'rel'];
+ next if $string ne join ';', map {lc($a->{$_})} @$attribs;
+
+ $ret->{$target} = $a->{$tattrib};
+ last;
}
}