Subject: URL parser is too strict
The parser has two problems. First, it requires that URLs end in one of
0-9A-Za-z#/, which I don't believe is necessary.
Second, it is too strict when dealing with URLs wrapped in <>, and it
prevents people from using unescaped spaces inside them.
<http://www.google.com/?q=some string> should match the entire
bracketed string, whereas "http://www.google.com?q=some string" should
match only up to the space.
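
To illustrate the second problem, here is a minimal standalone sketch of
the matching behaviour I'm after, using the same two-pattern approach as
the attached patch. The $uric set below is just a rough stand-in for the
parser's real URI character class, and $url_scheme mirrors the one the
patch adds:

use strict;
use warnings;

# Rough stand-in for the parser's $uric character set (assumption, not
# the real definition from Parser.pm).
my $uric       = q{A-Za-z0-9\-_.!~*';/?:@&=+$,%#};
my $url_scheme = qr{(?:http|https|ftp|irc|file):(?://)?};

for my $text (
    '<http://www.google.com/?q=some string>',
    'http://www.google.com?q=some string',
) {
    # Bracketed form first: spaces are allowed inside the <...>.
    if ($text =~ m{ < ( $url_scheme [$uric ]+ ) > }x) {
        print "bracketed: $1\n";
    }
    # Bare form: the match stops at the first space.
    elsif ($text =~ m{ ( $url_scheme [$uric]+ ) }x) {
        print "bare:      $1\n";
    }
}

Trying the bracketed pattern first matters; otherwise the bare pattern
would grab the URL up to the space and never see the closing >. The empty
() in the patch's second alternative keeps the URL in capture group 2, so
the filter's $node->{1} || $node->{2} works for both alternatives.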
Subject: url_parsing.patch
--- /usr/share/perl5/WikiText/Socialtext/Parser.pm 2008-07-23 03:07:04.000000000 -0700
+++ lib/WikiText/Socialtext/Parser.pm 2009-11-09 11:27:30.000000000 -0800
@@ -22,6 +22,7 @@
asap => 'asap',
);
my $im_re = join '|', keys %im_types;
+my $url_scheme = qr{(?:http|https|ftp|irc|file):(?://)?};
sub create_grammar {
my $all_phrases = [
@@ -269,17 +270,10 @@
},
a => {
- match => qr{
- (?:"([^"]*)"\s*)?
- <?
- (
- (?:http|https|ftp|irc|file):
- (?://)?
- [$uric]+
- [A-Za-z0-9/#]
- )
- >?
- }x,
+ match => [
+ qr{(?:"([^"]*)"\s*)? < ( $url_scheme [$uric ]+ ) >}x,
+ qr{()( $url_scheme [$uric]+ )}x,
+ ],
filter => sub {
my $node = shift;
$_ = $node->{1} || $node->{2};