Bug #41655 for MIME-tools: RFC2231 strings decoded in MIME header string can corrupt it

Fri Dec 12 10:44:19 2008 SKA [...] cpan.org - Ticket created

Subject:

RFC2231 strings decoded in MIME header string can corrupt it

Attached patch re-MIME::Words encode RFC2231 headers in order to preserve the structure of the header, e.g. if 8bit chars, spaces, quotes or quotes are embedded in the token. Before re-encoding all parts are joined together, thus, if a multi-byte character had been split on two parts, it is returned in one string from decode_mimewords(). This would lead to a behaviour as if called decode() on the MIME::Head object. Furthermore, added " and ; to encode_mimeword() Q-mode in order to protect the token against MIME structure processing.

Subject:

patch_1_rfc2231.diff

# HG changeset patch # User Steffen Kaiser # Date 1228920577 -3600 # Node ID d6223803d86fb00cde6b054c1dd1992d1a97dffb # Parent 07211c59ce82f5ee9cccd51a19c4a991c30255d1 add: RFC2231 encoding is re-encoded into MIME::Words RFC2231 decoding breaks reconstruction of header and differs from the fact that MIME::Words are left in the string This patch re-encodes the strings into MIME::Words using the same character set and the 'Q'-encoding, but as _one_ MIME::Word. This might break the headers, too, because words longer than 64 characters may happen and they are not separated by CRLF SPACE. Before re-encoding all parts are joined together, thus, multi-byte characters split over different parts are joined for easier MIME decoding later on. Though, embedded " and ; characters won't effect parsing. diff -r 07211c59ce82 -r d6223803d86f Field/ParamVal.pm --- a/Field/ParamVal.pm Mon Dec 01 11:42:32 2008 +0100 +++ b/Field/ParamVal.pm Wed Dec 10 15:49:37 2008 +0100 @@ -69,6 +69,7 @@ # Kit modules: use MIME::Tools qw(:config :msgs); +use MIME::Words; @ISA = qw(Mail::Field); @@ -146,6 +147,9 @@ Supplying undef for a hashref, or an empty set of values, effectively clears the object. +RFC2231 encoding is removed from the parameter values and replaced by +MIME::Word encoding. + The self object is returned. =cut @@ -187,36 +191,11 @@ =cut -sub rfc2231decode { - my($val) = @_; - my($enc, $lang, $rest); - - if ($val =~ m/^([^\']*)\'([^\']*)\'(.*)$/) { - # SHOULD REALLY DO SOMETHING MORE INTELLIGENT WITH ENCODING!!! - $enc = $1; - $lang = $2; - $rest = $3; - $rest = rfc2231percent($rest); - } elsif ($val =~ m/^([^\']*)\'([^\']*)$/) { - $enc = $1; - $rest = $2; - $rest = rfc2231percent($rest); - } else { - $rest = rfc2231percent($val); - } - return $rest; -} - -sub rfc2231percent { - # Do percent-subsitution - my($str) = @_; - $str =~ s/%([0-9a-fA-F]{2})/pack("C", hex($1))/ge; - return $str; -} - sub parse_params { my ($self, $raw) = @_; my %params = (); + # next hash contains "charset" for the encoding to apply + # "decode" a HASH ref of params to apply the charset to my %rfc2231params = (); my $param; my $val; @@ -234,51 +213,94 @@ # Extract subsequent parameters. # No, we can't just "split" on semicolons: they're legal in quoted strings! while (1) { # keep chopping away until done... - $raw =~ m/\G$SPCZ\;$SPCZ/og or last; # skip leading separator - $raw =~ m/\G($PARAMNAME)\s*=\s*/og or last; # give up if not a param - $param = lc($1); - $raw =~ m/\G(\"([^\"]*)\")|\G($ENCTOKEN)|\G($BADTOKEN)|\G($TOKEN)/g or last; # give up if no value" - my ($qstr, $str, $enctoken, $badtoken, $token) = ($1, $2, $3, $4, $5); - if (defined($badtoken)) { - # Strip leading/trailing whitespace from badtoken - $badtoken =~ s/^\s*//; - $badtoken =~ s/\s*$//; - } - $val = defined($qstr) ? $str : - (defined($enctoken) ? $enctoken : - (defined($badtoken) ? $badtoken : $token)); + $raw =~ m/\G$SPCZ\;$SPCZ/og or last; # skip leading separator + $raw =~ m/\G($PARAMNAME)\s*=\s*/og or last; # give up if not a param + $param = lc($1); + $raw =~ m/\G(\"([^\"]*)\")|\G($ENCTOKEN)|\G($BADTOKEN)|\G($TOKEN)/g or last; # give up if no value" + my ($qstr, $str, $enctoken, $badtoken, $token) = ($1, $2, $3, $4, $5); + if (defined($badtoken)) { + # Strip leading/trailing whitespace from badtoken + $badtoken =~ s/^\s*//; + $badtoken =~ s/\s*$//; + } + $val = defined($qstr) ? $str : + (defined($enctoken) ? $enctoken : + (defined($badtoken) ? $badtoken : $token)); - # Do RFC 2231 processing - if ($param =~ /\*/) { - my($name, $num); - # Pick out the parts of the parameter - if ($param =~ m/^([^*]+)\*([^*]+)\*?$/) { - # We have param*number* or param*number - $name = $1; - $num = $2; - } else { - # Fake a part of zero... not sure how to handle this properly - $param =~ s/\*//g; - $name = $param; - $num = 0; - } - # Decode the value unless it was a quoted string - if (!defined($qstr)) { - $val = rfc2231decode($val); - } - $rfc2231params{$name}{$num} .= $val; - } else { - # Make a fake "part zero" for non-RFC2231 params - $rfc2231params{$param}{"0"} = $val; - } + # Do RFC 2231 processing + # Variants: + # 1) name="value value" + # 2) name=value + # 3) name*=percent_encoded + # 4) name*#="continuable value" + # 5) name*#*=continuable_value_percent_encoded + my $num; + if($param =~ s/(\*+(\d+))(\**)$//) { # param := name of field + $num = defined($1)? 0+$2: 0; + if(length($3)) { # decode this part + if($num == 0 && $val =~ s/^([^\']*)\'([^\']*)\'//) { + # the first field may contain encoding + $rfc2231params{$param}{charset} = $1; +#not used! $rfc2231params{$param}{language} = $2; + } + # percent decode the value + # per standard no $qstr (quoted string) should be here + # we just ignore it, previous implementation skipped + # decoding in this situation + $val =~ s/%([0-9a-fA-F]{2})/pack("C", hex($1))/ge; + $rfc2231params{$param}{decode}{$num} = 1; + } + } else { + $num = 0; # neither \*\d+ nor \*$ => fake first part + } + $rfc2231params{$param}{$num} = $val; } + + # RFC2231-encoded and non-encoded parts may follow + # each other in any order (except that part number 0 MUST + # be encoded). + + # Note: RFC2231 decoding breaks the further processing such + # that it is not reversable by MIME::Tools. Hence, we re-encode + # the string as proper MIME::Word and keep the character set + # that way. Also, because MIME::Word encoding is not removed + # in this function, there would be a different decoding of + # both encoding styles. # Extract reconstructed parameters foreach $param (keys %rfc2231params) { - foreach $part (sort { $a <=> $b } keys %{$rfc2231params{$param}}) { - $params{$param} .= $rfc2231params{$param}{$part}; - } - debug " field param <$param> = <$params{$param}>"; + $params{$param} = ''; + my $charset = delete $rfc2231params{$param}{charset}; + my $decode = delete $rfc2231params{$param}{decode}; + # use empty %decode if no charset is to be applied + $decode = { } if !defined($charset) || !defined($decode); + for(sort { $a <=> $b } keys %{$rfc2231params{$param}}) { + my $part = delete $rfc2231params{$param}{$_}; + unless(exists $decode->{$_}) { + $params{$param} .= $part; + } else { # re-encode + # the resulting string should not exceed the 74 char limit + # Also encode ; and " in order to keep the MIME header + # structure + $params{$param} .= MIME::Words::encode_mimeword($part, 'q', $charset); + } + } + my @parts = sort { $b <=> $a } keys %{$rfc2231params{$param}}; # others are parts + while(defined(my $first = pop @parts)) { + my $part = delete $rfc2231params{$param}{$first}; + unless(exists $decode->{$first}) { + $params{$param} .= $part; + next; + } + # join successive parts + my $num = $first; + while(scalar(@parts) && $parts[$#parts] == ++$num) { + pop @parts; + $part .= delete $rfc2231params{$param}{$num}; + } + $params{$param} .= encode_mimewords($part, Charset => $charset); + } + debug " field param <$param> = <$params{$param}>"; } # Done: diff -r 07211c59ce82 -r d6223803d86f Words.pm --- a/Words.pm Mon Dec 01 11:42:32 2008 +0100 +++ b/Words.pm Wed Dec 10 15:49:37 2008 +0100 @@ -114,9 +114,11 @@ # _encode_Q STRING # Private: used by _encode_header() to decode "Q" encoding, which is # almost, but not exactly, quoted-printable. :-P +# Also encode ; and " to drop their MIME header meaning. sub _encode_Q { my $str = shift; - $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog; + $str =~ s{([;"_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog; # rule 1 + $str =~ tr/ /_/; # save 2 bytes per space # rule 2 $str; }

Thu Apr 22 13:29:46 2010 dmo+pause [...] dmo.ca - Correspondence added

Any chance you've got time to respin this against 5.428, and maybe add a testcase or two?

Thu Apr 22 13:29:53 2010 The RT System itself - Status changed from 'new' to 'open'