From d38a12eb78321133e7a14aa0e72a5c22c17a271b Mon Sep 17 00:00:00 2001 From: Meredith Howard Date: Wed, 19 Mar 2014 00:01:06 -0400 Subject: Parse worksheets in Twig mode. --- lib/Spreadsheet/ParseXLSX.pm | 317 +++++++++++++++++++++++++------------------ 1 file changed, 182 insertions(+), 135 deletions(-) diff --git a/lib/Spreadsheet/ParseXLSX.pm b/lib/Spreadsheet/ParseXLSX.pm index 07b83a2..e15d3e7 100644 --- a/lib/Spreadsheet/ParseXLSX.pm +++ b/lib/Spreadsheet/ParseXLSX.pm @@ -126,133 +126,197 @@ sub _parse_workbook { sub _parse_sheet { my $self = shift; - my ($sheet, $sheet_xml) = @_; + my ($sheet, $sheet_file) = @_; - my @cells = $sheet_xml->find_nodes('//sheetData/row/c'); + $sheet->{MinRow} = 0; + $sheet->{MinCol} = 0; + $sheet->{MaxRow} = -1; + $sheet->{MaxCol} = -1; + $sheet->{Selection} = [ 0, 0 ]; - if (@cells) { - # XXX need a fallback here, the dimension tag is optional - my ($dimension) = $sheet_xml->find_nodes('//dimension'); - my ($rmin, $cmin, $rmax, $cmax) = $self->_dimensions( - $dimension->att('ref') - ); + my @merged_cells; - $sheet->{MinRow} = $rmin; - $sheet->{MinCol} = $cmin; - $sheet->{MaxRow} = $rmax; - $sheet->{MaxCol} = $cmax; - } - else { - $sheet->{MinRow} = 0; - $sheet->{MinCol} = 0; - $sheet->{MaxRow} = -1; - $sheet->{MaxCol} = -1; - } + my @column_widths; + my @row_heights; - my @merged_cells; - for my $merge_area ($sheet_xml->find_nodes('//mergeCells/mergeCell')) { - if (my $ref = $merge_area->att('ref')) { - my ($topleft, $bottomright) = $ref =~ /([^:]+):([^:]+)/; - - my ($toprow, $leftcol) = $self->_cell_to_row_col($topleft); - my ($bottomrow, $rightcol) = $self->_cell_to_row_col($bottomright); - - push @{ $sheet->{MergedArea} }, [ - $toprow, $leftcol, - $bottomrow, $rightcol, - ]; - for my $row ($toprow .. $bottomrow) { - for my $col ($leftcol .. $rightcol) { - push(@merged_cells, [$row, $col]); + my $default_row_height = 15; + my $default_column_width = 10; + + my $sheet_xml = XML::Twig->new( + twig_roots => { + 'dimension' => sub { + my ($twig, $dimension) = @_; + + my ($rmin, $cmin, $rmax, $cmax) = $self->_dimensions( + $dimension->att('ref') + ); + + $sheet->{MinRow} = $rmin; + $sheet->{MinCol} = $cmin; + $sheet->{MaxRow} = $rmax; + $sheet->{MaxCol} = $cmax; + + $twig->purge; + }, + + 'mergeCells/mergeCell' => sub { + my ( $twig, $merge_area ) = @_; + + if (my $ref = $merge_area->att('ref')) { + my ($topleft, $bottomright) = $ref =~ /([^:]+):([^:]+)/; + + my ($toprow, $leftcol) = $self->_cell_to_row_col($topleft); + my ($bottomrow, $rightcol) = $self->_cell_to_row_col($bottomright); + + push @{ $sheet->{MergedArea} }, [ + $toprow, $leftcol, + $bottomrow, $rightcol, + ]; + for my $row ($toprow .. $bottomrow) { + for my $col ($leftcol .. $rightcol) { + push(@merged_cells, [$row, $col]); + } + } } - } - } - } - for my $cell (@cells) { - my ($row, $col) = $self->_cell_to_row_col($cell->att('r')); - my $type = $cell->att('t') || 'n'; - my $val_xml = $type eq 'inlineStr' - ? $cell->first_child('is')->first_child('t') - : $cell->first_child('v'); - my $val = $val_xml ? $val_xml->text : undef; - - my $long_type; - if (!defined($val)) { - $long_type = 'Text'; - $val = ''; - } - elsif ($type eq 's') { - $long_type = 'Text'; - $val = $sheet->{_Book}{PkgStr}[$val]{Text}; - } - elsif ($type eq 'n') { - $long_type = 'Numeric'; - $val = defined($val) ? 0+$val : undef; - } - elsif ($type eq 'd') { - $long_type = 'Date'; - } - elsif ($type eq 'b') { - $long_type = 'Text'; - $val = $val ? "TRUE" : "FALSE"; - } - elsif ($type eq 'e') { - $long_type = 'Text'; - } - elsif ($type eq 'str' || $type eq 'inlineStr') { - $long_type = 'Text'; - } - else { - die "unimplemented type $type"; # XXX - } + $twig->purge; + }, + + 'sheetFormatPr' => sub { + my ( $twig, $format ) = @_; + + $default_row_height //= $format->att('defaultRowHeight'); + $default_column_width //= $format->att('baseColWidth'); + + $twig->purge; + }, + + 'col' => sub { + my ( $twig, $col ) = @_; + + $column_widths[ $_ - 1 ] = $col->att('width') + for ( $col->att('min') .. $col->att('max') ); + + $twig->purge; + }, + + 'row' => sub { + my ( $twig, $row ) = @_; + + $row_heights[ $row->att('r') - 1 ] = $row->att('ht'); + + $twig->purge; + }, + + 'selection' => sub { + my ( $twig, $selection ) = @_; + + if (my $cell = $selection->att('activeCell')) { + $sheet->{Selection} = [ $self->_cell_to_row_col($cell) ]; + } + elsif (my $range = $selection->att('sqref')) { + my ($topleft, $bottomright) = $range =~ /([^:]+):([^:]+)/; + $sheet->{Selection} = [ + $self->_cell_to_row_col($topleft), + $self->_cell_to_row_col($bottomright), + ]; + } + + $twig->purge; + }, - my $format_idx = $cell->att('s') || 0; - my $format = $sheet->{_Book}{Format}[$format_idx]; - $format->{Merged} = !!grep { - $row == $_->[0] && $col == $_->[1] - } @merged_cells; - - # see the list of built-in formats below in _parse_styles - # XXX probably should figure this out from the actual format string, - # but that's not entirely trivial - if (grep { $format->{FmtIdx} == $_ } 14..22, 45..47) { - $long_type = 'Date'; } + ); - my $cell = Spreadsheet::ParseExcel::Cell->new( - Val => $val, - Type => $long_type, - Merged => $format->{Merged}, - Format => $format, - FormatNo => $format_idx, - ($cell->first_child('f') - ? (Formula => $cell->first_child('f')->text) - : ()), - ); - $cell->{_Value} = $sheet->{_Book}{FmtClass}->ValFmt( - $cell, $sheet->{_Book} - ); - $sheet->{Cells}[$row][$col] = $cell; - } + $sheet_xml->parse( $sheet_file ); + + # 2nd pass: cell/row building is dependent on having parsed the merge definitions + # beforehand. + + $sheet_xml = XML::Twig->new( + twig_roots => { + 'sheetData/row' => sub { + my ( $twig, $row_elt ) = @_; + + for my $cell ( $row_elt->children('c') ){ + my ($row, $col) = $self->_cell_to_row_col($cell->att('r')); + my $type = $cell->att('t') || 'n'; + my $val_xml = $type eq 'inlineStr' + ? $cell->first_child('is')->first_child('t') + : $cell->first_child('v'); + my $val = $val_xml ? $val_xml->text : undef; + + my $long_type; + if (!defined($val)) { + $long_type = 'Text'; + $val = ''; + } + elsif ($type eq 's') { + $long_type = 'Text'; + $val = $sheet->{_Book}{PkgStr}[$val]; + } + elsif ($type eq 'n') { + $long_type = 'Numeric'; + $val = defined($val) ? 0+$val : undef; + } + elsif ($type eq 'd') { + $long_type = 'Date'; + } + elsif ($type eq 'b') { + $long_type = 'Text'; + $val = $val ? "TRUE" : "FALSE"; + } + elsif ($type eq 'e') { + $long_type = 'Text'; + } + elsif ($type eq 'str' || $type eq 'inlineStr') { + $long_type = 'Text'; + } + else { + die "unimplemented type $type"; # XXX + } + + my $format_idx = $cell->att('s') || 0; + my $format = $sheet->{_Book}{Format}[$format_idx]; + $format->{Merged} = !!grep { + $row == $_->[0] && $col == $_->[1] + } @merged_cells; + + # see the list of built-in formats below in _parse_styles + # XXX probably should figure this out from the actual format string, + # but that's not entirely trivial + if (grep { $format->{FmtIdx} == $_ } 14..22, 45..47) { + $long_type = 'Date'; + } + + my $cell = Spreadsheet::ParseExcel::Cell->new( + Val => $val, + Type => $long_type, + Merged => $format->{Merged}, + Format => $format, + FormatNo => $format_idx, + ($cell->first_child('f') + ? (Formula => $cell->first_child('f')->text) + : ()), + ); + $cell->{_Value} = $sheet->{_Book}{FmtClass}->ValFmt( + $cell, $sheet->{_Book} + ); + $sheet->{Cells}[$row][$col] = $cell; + } - my @column_widths; - my @row_heights; + $twig->purge; + }, - my ($format) = $sheet_xml->find_nodes('//sheetFormatPr'); - my $default_row_height = $format->att('defaultRowHeight') || 15; - my $default_column_width = $format->att('baseColWidth') || 10; + } + ); - for my $col ($sheet_xml->find_nodes('//col')) { - my $width = $col->att('width'); - $column_widths[$_ - 1] = $width - for $col->att('min')..$col->att('max'); - } + $sheet_xml->parse( $sheet_file ); - for my $row ($sheet_xml->find_nodes('//row')) { - $row_heights[$row->att('r') - 1] = $row->att('ht'); + if ( ! $sheet->{Cells} ){ + $sheet->{MaxRow} = $sheet->{MaxCol} = -1; } - + $sheet->{DefRowHeight} = 0+$default_row_height; $sheet->{DefColWidth} = 0+$default_column_width; $sheet->{RowHeight} = [ @@ -262,22 +326,6 @@ sub _parse_sheet { map { defined $_ ? 0+$_ : 0+$default_column_width } @column_widths ]; - my ($selection) = $sheet_xml->find_nodes('//selection'); - if ($selection) { - if (my $cell = $selection->att('activeCell')) { - $sheet->{Selection} = [ $self->_cell_to_row_col($cell) ]; - } - elsif (my $range = $selection->att('sqref')) { - my ($topleft, $bottomright) = $range =~ /([^:]+):([^:]+)/; - $sheet->{Selection} = [ - $self->_cell_to_row_col($topleft), - $self->_cell_to_row_col($bottomright), - ]; - } - } - else { - $sheet->{Selection} = [ 0, 0 ]; - } } sub _parse_shared_strings { @@ -292,9 +340,8 @@ sub _parse_shared_strings { 'si' => sub { my ( $twig, $si ) = @_; - push @$PkgStr, { - Text => join( '', map { $_->text } $si->find_nodes('.//t') ) - }; + push @$PkgStr, + join( '', map { $_->text } $si->find_nodes('.//t') ); $twig->purge; }, } @@ -589,8 +636,6 @@ sub _extract_files { .( $wb_rels->find_nodes(qq) )[0]->att('Target') )->contents; }; - warn "got strings: " . length $strings_xml; - warn '$@: '. $@; my $styles_xml = $self->_parse_xml( $zip, @@ -600,7 +645,9 @@ sub _extract_files { ); my %worksheet_xml = map { - $_->att('Id') => $self->_parse_xml($zip, $path_base . $_->att('Target')) + if ( my $sheetfile = $zip->memberNamed($path_base . $_->att('Target'))->contents ) { + ( $_->att('Id') => $sheetfile ); + } } $wb_rels->find_nodes(qq); my %themes_xml = map { -- cgit v1.2.3-54-g00ecf