*** ../HTML-TableExtract-2.10/lib/HTML/TableExtract.pm Sat Jul 15 19:52:34 2006
--- lib/HTML/TableExtract.pm Sat Jan 12 19:07:17 2008
***************
*** 52,57 ****
--- 52,58 ----
depth => undef,
count => undef,
attribs => undef,
+ columns => undef,
subtables => undef,
gridmap => 1,
decode => 1,
***************
*** 317,328 ****
keep_html => $self->{keep_html},
strip_html_on_match => $self->{strip_html_on_match},
parent_table => $pts,
);
# Target constraints. There is no point in passing any of these along
# if we are under an umbrella. Notice that with table states, "depth"
# and "count" are absolute coordinates recording where this table was
! # created, whereas "tdepth" and "tcount" are the target constraints.
# Headers have "absolute" meaning, therefore are passed by the
# same name.
if (!$umbrella) {
--- 318,330 ----
keep_html => $self->{keep_html},
strip_html_on_match => $self->{strip_html_on_match},
parent_table => $pts,
+ tcolumns => $self->{columns},
);
# Target constraints. There is no point in passing any of these along
# if we are under an umbrella. Notice that with table states, "depth"
# and "count" are absolute coordinates recording where this table was
! # created, whereas "tdepth", "tcount", and "tcolumns" are the target constraints.
# Headers have "absolute" meaning, therefore are passed by the
# same name.
if (!$umbrella) {
***************
*** 356,361 ****
--- 358,365 ----
$ts->_exit_row;
}
+ $ts->{columns} = scalar @{ $ts->{grid}->[0] || [] }; # a table with no rows has 0 columns (avoids deref of undef)
+
# transform from tree to grid using our rasterized template
$ts->_grid_map();
***************
*** 438,444 ****
my $class = ref($that) || $that;
# Note:
# - 'depth' and 'count' are where this table were found.
! # - 'tdepth' and 'tcount' are target constraints on which to trigger.
# - 'headers' represent a target constraint, location independent.
# - 'attribs' represent target table tag constraints
my $self = {
--- 442,449 ----
my $class = ref($that) || $that;
# Note:
# - 'depth' and 'count' are where this table were found.
! # - 'columns' is the number of columns in this table.
! # - 'tdepth', 'tcount', and 'tcolumns' are target constraints on which to trigger.
# - 'headers' represent a target constraint, location independent.
# - 'attribs' represent target table tag constraints
my $self = {
***************
*** 447,452 ****
--- 452,458 ----
in_cell => 0,
rc => -1,
cc => -1,
+ columns => 0,
grid => [],
translation => [],
hrow => [],
***************
*** 569,581 ****
--- 575,596 ----
sub _check_dtrigger {
# depth
my $self = shift;
+ return 1 if $self->{umbrella}; # tables under an umbrella bypass the depth constraint
return 1 unless defined $self->{tdepth};
$self->{tdepth} == $self->{depth} ? 1 : 0;
}
+ sub _check_columns_trigger {
+ # columns: true when no column target is set, or when it equals this table's column count
+ my $self = shift;
+ return 1 unless defined $self->{tcolumns}; # no column constraint requested
+ $self->{tcolumns} == $self->{columns} ? 1 : 0; # NOTE(review): {columns} starts at 0 and is filled in from grid row 0 elsewhere in this patch -- confirm that happens before this check runs
+ }
+
sub _check_ctrigger {
# count
my $self = shift;
+ return 1 if $self->{umbrella};
return 1 unless defined $self->{tcount};
return 1 if (exists $self->{counts}[$self->{depth}] &&
$self->{tcount} == $self->{counts}[$self->{depth}]);
***************
*** 585,590 ****
--- 600,606 ----
sub _check_atrigger {
# attributes
my $self = shift;
+ return 1 if $self->{umbrella};
return 1 unless scalar keys %{$self->{tattribs}};
return 0 unless scalar keys %{$self->{attribs}};
my $a_hit = 1;
***************
*** 690,700 ****
sub _check_triggers {
my $self = shift;
! return 1 if $self->{umbrella};
! $self->_check_dtrigger &&
! $self->_check_ctrigger &&
! $self->_check_atrigger &&
! $self->_check_htrigger;
}
### Maintain table context
--- 706,718 ----
sub _check_triggers {
my $self = shift;
! return # umbrella bypass now lives in the depth/count/attribute checks; the columns check has none
! $self->_check_dtrigger
! && $self->_check_ctrigger
! && $self->_check_atrigger
! && $self->_check_columns_trigger
! && $self->_check_htrigger
! ;
}
### Maintain table context
***************
*** 1327,1335 ****
objects. Tables can be extracted as text, HTML, or HTML::ElementTable
structures (for in-place editing or manipulation).
! There are currently four constraints available to specify which tables
you would like to extract from a document: I<Headers>, I<Depth>,
! I<Count>, and I<Attributes>.
I<Headers>, the most flexible and adaptive of the techniques, involves
specifying text in an array that you expect to appear above the data in
--- 1345,1353 ----
objects. Tables can be extracted as text, HTML, or HTML::ElementTable
structures (for in-place editing or manipulation).
! There are currently five constraints available to specify which tables
you would like to extract from a document: I<Headers>, I<Depth>,
! I<Count>, I<Columns>, and I<Attributes>.
I<Headers>, the most flexible and adaptive of the techniques, involves
specifying text in an array that you expect to appear above the data in
***************
*** 1357,1366 ****
starting with 0. Providing both a I<depth> and a I<count> will
uniquely specify a table within a document.
I<Attributes> match based on the attributes of the html E<lt>tableE<gt>
tag, for example, boder widths or background color.
! Each of the I<Headers>, I<Depth>, I<Count>, and I<Attributes>
specifications are cumulative in their effect on the overall extraction.
For instance, if you specify only a I<Depth>, then you get all tables at
that depth (note that these could very well reside in separate higher-
--- 1375,1386 ----
starting with 0. Providing both a I<depth> and a I<count> will
uniquely specify a table within a document.
+ I<Columns> matches only tables that have exactly the number of columns given in the C<columns> parameter.
+
I<Attributes> match based on the attributes of the html E<lt>tableE<gt>
tag, for example, boder widths or background color.
! Each of the I<Headers>, I<Depth>, I<Count>, I<Columns>, and I<Attributes>
specifications are cumulative in their effect on the overall extraction.
For instance, if you specify only a I<Depth>, then you get all tables at
that depth (note that these could very well reside in separate higher-
***************
*** 1369,1379 ****
all depths are returned (i.e., the I<n>th occurrence of a table at each
depth). If you only specify I<Headers>, then you get all tables in the
document containing those column headers. If you have specified multiple
! constraints of I<Headers>, I<Depth>, I<Count>, and I<Attributes>, then
each constraint has veto power over whether a particular table is
extracted.
! If no I<Headers>, I<Depth>, I<Count>, or I<Attributes> are specified,
then all tables match.
When extracting only text from tables, the text is decoded with
--- 1389,1399 ----
all depths are returned (i.e., the I<n>th occurrence of a table at each
depth). If you only specify I<Headers>, then you get all tables in the
document containing those column headers. If you have specified multiple
! constraints of I<Headers>, I<Depth>, I<Count>, I<Columns>, and I<Attributes>, then
each constraint has veto power over whether a particular table is
extracted.
! If no I<Headers>, I<Depth>, I<Count>, I<Columns>, or I<Attributes> are specified,
then all tables match.
When extracting only text from tables, the text is decoded with