Subject: | RFC2231 strings decoded in MIME header string can corrupt it |
Attached patch re-MIME::Words encode RFC2231 headers in order to
preserve the structure of the header, e.g. if 8bit chars, spaces,
quotes or quotes are embedded in the token.
Before re-encoding all parts are joined together, thus, if a
multi-byte character had been split on two parts, it is returned in
one string from decode_mimewords().
This would lead to a behaviour as if called decode() on the MIME::Head
object.
Furthermore, added " and ; to encode_mimeword() Q-mode in order to
protect the token against MIME structure processing.
Subject: | patch_1_rfc2231.diff |
# HG changeset patch
# User Steffen Kaiser
# Date 1228920577 -3600
# Node ID d6223803d86fb00cde6b054c1dd1992d1a97dffb
# Parent 07211c59ce82f5ee9cccd51a19c4a991c30255d1
add: RFC2231 encoding is re-encoded into MIME::Words
RFC2231 decoding breaks reconstruction of header and differs from the
fact that MIME::Words are left in the string
This patch re-encodes the strings into MIME::Words using the same
character set and the 'Q'-encoding, but as _one_ MIME::Word. This might
break the headers, too, because words longer than 64 characters may
happen and they are not separated by CRLF SPACE.
Before re-encoding all parts are joined together, thus, multi-byte
characters split over different parts are joined for easier MIME decoding
later on.
Though, embedded " and ; characters won't effect parsing.
diff -r 07211c59ce82 -r d6223803d86f Field/ParamVal.pm
--- a/Field/ParamVal.pm Mon Dec 01 11:42:32 2008 +0100
+++ b/Field/ParamVal.pm Wed Dec 10 15:49:37 2008 +0100
@@ -69,6 +69,7 @@
# Kit modules:
use MIME::Tools qw(:config :msgs);
+use MIME::Words;
@ISA = qw(Mail::Field);
@@ -146,6 +147,9 @@
Supplying undef for a hashref, or an empty set of values, effectively
clears the object.
+RFC2231 encoding is removed from the parameter values and replaced by
+MIME::Word encoding.
+
The self object is returned.
=cut
@@ -187,36 +191,11 @@
=cut
-sub rfc2231decode {
- my($val) = @_;
- my($enc, $lang, $rest);
-
- if ($val =~ m/^([^\']*)\'([^\']*)\'(.*)$/) {
- # SHOULD REALLY DO SOMETHING MORE INTELLIGENT WITH ENCODING!!!
- $enc = $1;
- $lang = $2;
- $rest = $3;
- $rest = rfc2231percent($rest);
- } elsif ($val =~ m/^([^\']*)\'([^\']*)$/) {
- $enc = $1;
- $rest = $2;
- $rest = rfc2231percent($rest);
- } else {
- $rest = rfc2231percent($val);
- }
- return $rest;
-}
-
-sub rfc2231percent {
- # Do percent-subsitution
- my($str) = @_;
- $str =~ s/%([0-9a-fA-F]{2})/pack("C", hex($1))/ge;
- return $str;
-}
-
sub parse_params {
my ($self, $raw) = @_;
my %params = ();
+ # next hash contains "charset" for the encoding to apply
+ # "decode" a HASH ref of params to apply the charset to
my %rfc2231params = ();
my $param;
my $val;
@@ -234,51 +213,94 @@
# Extract subsequent parameters.
# No, we can't just "split" on semicolons: they're legal in quoted strings!
while (1) { # keep chopping away until done...
- $raw =~ m/\G$SPCZ\;$SPCZ/og or last; # skip leading separator
- $raw =~ m/\G($PARAMNAME)\s*=\s*/og or last; # give up if not a param
- $param = lc($1);
- $raw =~ m/\G(\"([^\"]*)\")|\G($ENCTOKEN)|\G($BADTOKEN)|\G($TOKEN)/g or last; # give up if no value"
- my ($qstr, $str, $enctoken, $badtoken, $token) = ($1, $2, $3, $4, $5);
- if (defined($badtoken)) {
- # Strip leading/trailing whitespace from badtoken
- $badtoken =~ s/^\s*//;
- $badtoken =~ s/\s*$//;
- }
- $val = defined($qstr) ? $str :
- (defined($enctoken) ? $enctoken :
- (defined($badtoken) ? $badtoken : $token));
+ $raw =~ m/\G$SPCZ\;$SPCZ/og or last; # skip leading separator
+ $raw =~ m/\G($PARAMNAME)\s*=\s*/og or last; # give up if not a param
+ $param = lc($1);
+ $raw =~ m/\G(\"([^\"]*)\")|\G($ENCTOKEN)|\G($BADTOKEN)|\G($TOKEN)/g or last; # give up if no value"
+ my ($qstr, $str, $enctoken, $badtoken, $token) = ($1, $2, $3, $4, $5);
+ if (defined($badtoken)) {
+ # Strip leading/trailing whitespace from badtoken
+ $badtoken =~ s/^\s*//;
+ $badtoken =~ s/\s*$//;
+ }
+ $val = defined($qstr) ? $str :
+ (defined($enctoken) ? $enctoken :
+ (defined($badtoken) ? $badtoken : $token));
- # Do RFC 2231 processing
- if ($param =~ /\*/) {
- my($name, $num);
- # Pick out the parts of the parameter
- if ($param =~ m/^([^*]+)\*([^*]+)\*?$/) {
- # We have param*number* or param*number
- $name = $1;
- $num = $2;
- } else {
- # Fake a part of zero... not sure how to handle this properly
- $param =~ s/\*//g;
- $name = $param;
- $num = 0;
- }
- # Decode the value unless it was a quoted string
- if (!defined($qstr)) {
- $val = rfc2231decode($val);
- }
- $rfc2231params{$name}{$num} .= $val;
- } else {
- # Make a fake "part zero" for non-RFC2231 params
- $rfc2231params{$param}{"0"} = $val;
- }
+ # Do RFC 2231 processing
+ # Variants:
+ # 1) name="value value"
+ # 2) name=value
+ # 3) name*=percent_encoded
+ # 4) name*#="continuable value"
+ # 5) name*#*=continuable_value_percent_encoded
+ my $num;
+ if($param =~ s/(\*+(\d+))(\**)$//) { # param := name of field
+ $num = defined($1)? 0+$2: 0;
+ if(length($3)) { # decode this part
+ if($num == 0 && $val =~ s/^([^\']*)\'([^\']*)\'//) {
+ # the first field may contain encoding
+ $rfc2231params{$param}{charset} = $1;
+#not used! $rfc2231params{$param}{language} = $2;
+ }
+ # percent decode the value
+ # per standard no $qstr (quoted string) should be here
+ # we just ignore it, previous implementation skipped
+ # decoding in this situation
+ $val =~ s/%([0-9a-fA-F]{2})/pack("C", hex($1))/ge;
+ $rfc2231params{$param}{decode}{$num} = 1;
+ }
+ } else {
+ $num = 0; # neither \*\d+ nor \*$ => fake first part
+ }
+ $rfc2231params{$param}{$num} = $val;
}
+
+ # RFC2231-encoded and non-encoded parts may follow
+ # each other in any order (except that part number 0 MUST
+ # be encoded).
+
+ # Note: RFC2231 decoding breaks the further processing such
+ # that it is not reversable by MIME::Tools. Hence, we re-encode
+ # the string as proper MIME::Word and keep the character set
+ # that way. Also, because MIME::Word encoding is not removed
+ # in this function, there would be a different decoding of
+ # both encoding styles.
# Extract reconstructed parameters
foreach $param (keys %rfc2231params) {
- foreach $part (sort { $a <=> $b } keys %{$rfc2231params{$param}}) {
- $params{$param} .= $rfc2231params{$param}{$part};
- }
- debug " field param <$param> = <$params{$param}>";
+ $params{$param} = '';
+ my $charset = delete $rfc2231params{$param}{charset};
+ my $decode = delete $rfc2231params{$param}{decode};
+ # use empty %decode if no charset is to be applied
+ $decode = { } if !defined($charset) || !defined($decode);
+ for(sort { $a <=> $b } keys %{$rfc2231params{$param}}) {
+ my $part = delete $rfc2231params{$param}{$_};
+ unless(exists $decode->{$_}) {
+ $params{$param} .= $part;
+ } else { # re-encode
+ # the resulting string should not exceed the 74 char limit
+ # Also encode ; and " in order to keep the MIME header
+ # structure
+ $params{$param} .= MIME::Words::encode_mimeword($part, 'q', $charset);
+ }
+ }
+ my @parts = sort { $b <=> $a } keys %{$rfc2231params{$param}}; # others are parts
+ while(defined(my $first = pop @parts)) {
+ my $part = delete $rfc2231params{$param}{$first};
+ unless(exists $decode->{$first}) {
+ $params{$param} .= $part;
+ next;
+ }
+ # join successive parts
+ my $num = $first;
+ while(scalar(@parts) && $parts[$#parts] == ++$num) {
+ pop @parts;
+ $part .= delete $rfc2231params{$param}{$num};
+ }
+ $params{$param} .= encode_mimewords($part, Charset => $charset);
+ }
+ debug " field param <$param> = <$params{$param}>";
}
# Done:
diff -r 07211c59ce82 -r d6223803d86f Words.pm
--- a/Words.pm Mon Dec 01 11:42:32 2008 +0100
+++ b/Words.pm Wed Dec 10 15:49:37 2008 +0100
@@ -114,9 +114,11 @@
# _encode_Q STRING
# Private: used by _encode_header() to decode "Q" encoding, which is
# almost, but not exactly, quoted-printable. :-P
+# Also encode ; and " to drop their MIME header meaning.
sub _encode_Q {
my $str = shift;
- $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
+ $str =~ s{([;"_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog; # rule 1
+ $str =~ tr/ /_/; # save 2 bytes per space # rule 2
$str;
}