Subject: | [PATCH] td attributes with embedded newlines, and false tag/text values |
Hi,
While converting some HTML pages produced by MS Word I came across a
few bugs in HTML::WikiConverter::MediaWiki. First of all, 0's in text
and tags are treated as false and removed, as demonstrated by this code:
use HTML::WikiConverter;
my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
print $wc->html2wiki("<b>1</b><b>0</b>");
Output in 0.52: '''1''' ''''''
Expected (fixed by patch): '''1''' '''0'''
Also, MS Word produces table cells like this:
<td style="xxx;
yyy">zzz
which HTML::WikiConverter translates to:
|xxx
yyy|zzz
This is easily fixed by stripping newlines. Patches follow.
Hope this helps,
-Jeff Connelly
--- /tmp/HTML-WikiConverter-0.52/lib/HTML/WikiConverter/MediaWiki.pm 2006-05-02 23:55:56.000000000 -0700
+++ MediaWiki.pm 2006-05-02 23:52:29.000000000 -0700
@@ -223,6 +223,7 @@
my @tr_attrs = ( @common_attrs, 'bgcolor', @tablealign_attrs );
my $attrs = $self->get_attr_str( $node, @tr_attrs );
+ $attrs =~ tr/\n//d;
$prefix .= ' '.$attrs if $attrs;
return '' unless $node->left or $attrs;
@@ -241,6 +242,10 @@
my @td_attrs = ( @common_attrs, @tablecell_attrs, @tablealign_attrs );
my $attrs = $self->get_attr_str( $node, @td_attrs );
+
+ # Handle multi-line 'style' attrs from MSWord -Jeff Connelly
+ $attrs =~ tr/\n//d;
+
$prefix .= ' '.$attrs.' |' if $attrs;
# If there are any non-text elements inside the cell, then the
@@ -264,7 +269,8 @@
sub preprocess_node {
my( $self, $node ) = @_;
- my $tag = $node->tag || '';
+ my $tag = $node->tag;
+ $tag = '' if !defined($tag);
$self->strip_aname($node) if $tag eq 'a';
$self->_strip_extra($node);
$self->_nowiki_text($node) if $tag eq '~text';
@@ -287,7 +293,8 @@
sub _nowiki_text {
my( $self, $node ) = @_;
- my $text = $node->attr('text') || '';
+ my $text = $node->attr('text');
+ $text = '' if !defined($text);
my $found_wikitext = 0;
foreach my $pat ( @wikitext_patterns ) {
--- /tmp/HTML-WikiConverter-0.52/lib/HTML/WikiConverter.pm 2006-03-03 20:38:56.000000000 -0800
+++ WikiConverter.pm 2006-05-02 23:57:06.000000000 -0700
@@ -306,7 +306,8 @@
# Encodes high-bit and control chars in node's text to HTML entities.
sub __encode_entities {
my( $self, $node ) = @_;
- my $text = $node->attr('text') || '';
+ my $text = $node->attr('text');
+ $text = '' if !defined($text);
encode_entities( $text, '<>&' );
$node->attr( text => $text );
}