Subject: | Bug report: segmentation fault |
Date: | Wed, 11 May 2011 09:39:22 +0200 |
To: | bug-HTML-Strip [...] rt.cpan.org |
From: | Mikołaj Hnatiuk <mikolaj.hnatiuk [...] gmail.com> |
Hi,
I'm sending a bug report for HTML::Strip. It has everything to do with my
using UTF-8 and processing large chunks of text (blog posts). This problem
doesn't occur when using HTML::Parse.
uname -a
Linux mhnatiuk-laptop 2.6.32-30-generic #59-Ubuntu SMP Tue Mar 1 21:30:21
UTC 2011 i686 GNU/Linux
PERL:
v5.10.1 (*) built for i486-linux-gnu-thread-multi
Best,
Mikołaj Hnatiuk
My program:
#!/usr/bin/perl
# Reproduction script from the reporter: reads tab-separated rows from
# posts.csv, strips HTML from the eighth field with HTML::Strip, and writes
# the remaining fields plus the stripped text to clean_text.csv.
#
# HTML::Parse and HTML::FormatText are loaded but not used below.
use HTML::Parse;
use HTML::FormatText;
use strict;
use utf8;
use HTML::Strip;
use Encode;
# Both handles are opened with the :utf8 layer, so lines read from FILE
# arrive as already-decoded character strings.
open FILE, "<:utf8", "posts.csv" or die ('error -input file');
open OUT, ">:utf8", "clean_text.csv" or die('error no df/perm to write');
# One HTML::Strip object is created here and reused for every input row.
# NOTE(review): eof() is never called on $hs between rows; HTML::Strip's
# documentation recommends it between separate documents -- confirm whether
# that is relevant to the crash.
my $hs = HTML::Strip->new();
while(<FILE>){
chomp;
my @t = split /\t/;
# Field index 7 (the eighth tab-separated column) holds the raw HTML.
my $raw = $t[7];
# Rows whose eighth field is missing, empty, or "0" are skipped entirely;
# nothing is written to OUT for them.
if($t[7])
{
# NOTE(review): $raw was already decoded by the :utf8 input layer, yet it
# is passed through decode_utf8() again before parse() -- verify what
# string (flagged or not) actually reaches the XS code on this Encode
# version. The backtrace in this report points at
# XS_HTML__Strip_strip_html, which is reached from this call.
my $clean = $hs->parse( decode_utf8($raw) );
# Remove the raw HTML field, then emit the remaining fields followed by
# the stripped text as the final column.
splice(@t,7,1);
# Each record is preceded (not followed) by a newline.
print OUT "\n".join("\t",@t)."\t".$clean ;
}
}
close FILE;
close OUT;
*BUG REPORT:*
*** glibc detected *** perl: free(): invalid next size (fast): 0x0838f2d0 ***
======= Backtrace: =========
/lib/tls/i686/cmov/libc.so.6(+0x6b591)[0x17b591]
/lib/tls/i686/cmov/libc.so.6(+0x6cde8)[0x17cde8]
/lib/tls/i686/cmov/libc.so.6(cfree+0x6d)[0x17fecd]
/usr/local/lib/perl/5.10.1/auto/HTML/Strip/Strip.so(XS_HTML__Strip_strip_html+0x403)[0xacf8c3]
perl(Perl_pp_entersub+0x533)[0x80d5af3]
perl(Perl_runops_standard+0x18)[0x80d3ee8]
perl(perl_run+0x342)[0x807c8e2]
perl(main+0xed)[0x806437d]
/lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe6)[0x126bd6]
perl[0x80641f1]
======= Memory map: ========
00110000-00263000 r-xp 00000000 08:01 9175412 /lib/tls/i686/cmov/
libc-2.11.1.so
00263000-00264000 ---p 00153000 08:01 9175412 /lib/tls/i686/cmov/
libc-2.11.1.so
00264000-00266000 r--p 00153000 08:01 9175412 /lib/tls/i686/cmov/
libc-2.11.1.so
00266000-00267000 rw-p 00155000 08:01 9175412 /lib/tls/i686/cmov/
libc-2.11.1.so
00267000-0026a000 rw-p 00000000 00:00 0
0028e000-00297000 r-xp 00000000 08:01 9175416 /lib/tls/i686/cmov/
libcrypt-2.11.1.so
00297000-00298000 r--p 00008000 08:01 9175416 /lib/tls/i686/cmov/
libcrypt-2.11.1.so
00298000-00299000 rw-p 00009000 08:01 9175416 /lib/tls/i686/cmov/
libcrypt-2.11.1.so
00299000-002c0000 rw-p 00000000 00:00 0
004dc000-004e4000 r-xp 00000000 08:01 697918
/usr/lib/perl/5.10.1/auto/Encode/Encode.so
004e4000-004e5000 r--p 00007000 08:01 697918
/usr/lib/perl/5.10.1/auto/Encode/Encode.so
004e5000-004e6000 rw-p 00008000 08:01 697918
/usr/lib/perl/5.10.1/auto/Encode/Encode.so
[... memory map truncated here in the original report ...]
00805000-00806000 rw-p 00015000 08:01
9175581 /lib/tls/i686/cmov/libpthread-2.11.1.so
00806000-00808000 rw-p 00000000 00:00 0
00899000-008b6000 r-xp 00000000 08:01 9175123 /lib/libgcc_s.so.1
008b6000-008b7000 r--p 0001c000 08:01 9175123 /lib/libgcc_s.so.1
008b7000-008b8000 rw-p 0001d000 08:01 9175123 /lib/libgcc_s.so.1
008f3000-008fe000 r-xp 00000000 08:01 1710917
/usr/local/lib/perl/5.10.1/auto/HTML/Parser/Parser.so
008fe000-008ff000 r--p 0000a000 08:01 1710917
/usr/local/lib/perl/5.10.1/auto/HTML/Parser/Parser.so
008ff000-00900000 rw-p 0000b000 08:01 1710917
/usr/local/lib/perl/5.10.1/auto/HTML/Parser/Parser.so
00962000-00963000 r-xp 00000000 00:00 0 [vdso]
00acc000-00ad1000 r-xp 00000000 08:01 1581062
/usr/local/lib/perl/5.10.1/auto/HTML/Strip/Strip.so
00ad1000-00ad2000 r--p 00004000 08:01 1581062
/usr/local/lib/perl/5.10.1/auto/HTML/Strip/Strip.so
00ad2000-00ad3000 rw-p 00005000 08:01 1581062
/usr/local/lib/perl/5.10.1/auto/HTML/Strip/Strip.so
00b5e000-00b82000 r-xp 00000000 08:01 9175571 /lib/tls/i686/cmov/
libm-2.11.1.so
00b82000-00b83000 r--p 00023000 08:01 9175571 /lib/tls/i686/cmov/
libm-2.11.1.so
00b83000-00b84000 rw-p 00024000 08:01 9175571 /lib/tls/i686/cmov/
libm-2.11.1.so
00ddf000-00dfa000 r-xp 00000000 08:01 9175088 /lib/ld-2.11.1.so
00dfa000-00dfb000 r--p 0001a000 08:01 9175088 /lib/ld-2.11.1.so
00dfb000-00dfc000 rw-p 0001b000 08:01 9175088 /lib/ld-2.11.1.so
00f31000-00f33000 r-xp 00000000 08:01 9175570 /lib/tls/i686/cmov/
libdl-2.11.1.so
00f33000-00f34000 r--p 00001000 08:01 9175570 /lib/tls/i686/cmov/
libdl-2.11.1.so
00f34000-00f35000 rw-p 00002000 08:01 9175570 /lib/tls/i686/cmov/
libdl-2.11.1.so
08048000-08174000 r-xp 00000000 08:01 524472 /usr/bin/perl
08174000-08175000 r--p 0012b000 08:01 524472 /usr/bin/perl
08175000-08177000 rw-p 0012c000 08:01 524472 /usr/bin/perl
08252000-08462000 rw-p 00000000 00:00 0 [heap]
b7500000-b7521000 rw-p 00000000 00:00 0
b7521000-b7600000 ---p 00000000 00:00 0
b76a2000-b76e1000 r--p 00000000 08:01 532173
/usr/lib/locale/pl_PL.utf8/LC_CTYPE
b76e1000-b77ff000 r--p 00000000 08:01 921385
/usr/lib/locale/pl_PL.utf8/LC_COLLATE
b77ff000-b7801000 rw-p 00000000 00:00 0
b7806000-b7807000 r--p 00000000 08:01 921383
/usr/lib/locale/pl_PL.utf8/LC_NUMERIC
b7807000-b7808000 r--p 00000000 08:01 921384
/usr/lib/locale/pl_PL.utf8/LC_TIME
b7808000-b7809000 r--p 00000000 08:01 921386
/usr/lib/locale/pl_PL.utf8/LC_MONETARY
b7809000-b780a000 r--p 00000000 08:01 921387
/usr/lib/locale/pl_PL.utf8/LC_MESSAGES/SYS_LC_MESSAGES
b780a000-b780b000 r--p 00000000 08:01 532179
/usr/lib/locale/pl_PL.utf8/LC_PAPER
b780b000-b780c000 r--p 00000000 08:01 532177
/usr/lib/locale/pl_PL.utf8/LC_NAME
b780c000-b780d000 r--p 00000000 08:01 921388
/usr/lib/locale/pl_PL.utf8/LC_ADDRESS
b780d000-b780e000 r--p 00000000 08:01 921389
/usr/lib/locale/pl_PL.utf8/LC_TELEPHONE
b780e000-b780f000 r--p 00000000 08:01 532175
/usr/lib/locale/pl_PL.utf8/LC_MEASUREMENT
b780f000-b7816000 r--s 00000000 08:01 536710
/usr/lib/gconv/gconv-modules.cache
b7816000-b7817000 r--p 00000000 08:01 921390
/usr/lib/locale/pl_PL.utf8/LC_IDENTIFICATION
b7817000-b7819000 rw-p 00000000 00:00 0
bfe09000-bfe1e000 rw-p 00000000 00:00 0 [stack]