diff options
-rw-r--r-- | Changes | 4 | ||||
-rw-r--r-- | lib/Spreadsheet/ParseXLSX.pm | 271 | ||||
-rw-r--r-- | lib/Spreadsheet/ParseXLSX/Decryptor.pm | 2 | ||||
-rw-r--r-- | t/bug-32.t | 25 | ||||
-rw-r--r-- | t/bug-41.t | 14 | ||||
-rw-r--r-- | t/data/bug-32-2.xlsx | bin | 0 -> 533859 bytes | |||
-rw-r--r-- | t/data/bug-32.xlsx | bin | 0 -> 46729 bytes | |||
-rw-r--r-- | t/data/bug-41.xlsx | bin | 0 -> 54720 bytes |
8 files changed, 212 insertions, 104 deletions
@@ -1,6 +1,10 @@ Revision history for Spreadsheet-ParseXLSX {{$NEXT}} + - Give a better error message when opening xls files (mchendriks, #52) + - Fix parsing files with non-default XML namespace names + - Handle files with no style information + - Handle fills that don't specify a pattern type (vfilatov, #41) 0.20 2015-12-05 - Fix the test suite on perls compiled with -Duselongdouble (Slaven diff --git a/lib/Spreadsheet/ParseXLSX.pm b/lib/Spreadsheet/ParseXLSX.pm index a9b501f..fda4742 100644 --- a/lib/Spreadsheet/ParseXLSX.pm +++ b/lib/Spreadsheet/ParseXLSX.pm @@ -71,10 +71,11 @@ sub parse { my $workbook = Spreadsheet::ParseExcel::Workbook->new; if ($self->_check_signature($file)) { - $file = Spreadsheet::ParseXLSX::Decryptor->open( + my $decrypted_file = Spreadsheet::ParseXLSX::Decryptor->open( $file, $self->{Password} ); + $file = $decrypted_file if $decrypted_file; } if (openhandle($file)) { @@ -124,8 +125,8 @@ sub _parse_workbook { my $files = $self->_extract_files($zip); - my ($version) = $files->{workbook}->find_nodes('//fileVersion'); - my ($properties) = $files->{workbook}->find_nodes('//workbookPr'); + my ($version) = $files->{workbook}->find_nodes('//s:fileVersion'); + my ($properties) = $files->{workbook}->find_nodes('//s:workbookPr'); if ($version) { $workbook->{Version} = $version->att('appName') @@ -160,7 +161,7 @@ sub _parse_workbook { # $workbook->{PrintTitle} = ...; my @sheets = map { - my $idx = $_->att('r:id'); + my $idx = $_->att('rels:id'); if ($files->{sheets}{$idx}) { my $sheet = Spreadsheet::ParseExcel::Worksheet->new( Name => $_->att('name'), @@ -173,12 +174,12 @@ sub _parse_workbook { } else { () } - } $files->{workbook}->find_nodes('//sheets/sheet'); + } $files->{workbook}->find_nodes('//s:sheets/s:sheet'); $workbook->{Worksheet} = \@sheets; $workbook->{SheetCount} = scalar(@sheets); - my ($node) = $files->{workbook}->find_nodes('//workbookView'); + my ($node) = $files->{workbook}->find_nodes('//s:workbookView'); my $selected = $node ? $node->att('activeTab') : undef; $workbook->{SelectedSheet} = defined($selected) ? 0+$selected : 0; @@ -206,10 +207,10 @@ sub _parse_sheet { my $default_row_height = 15; my $default_column_width = 10; - my $sheet_xml = XML::Twig->new( + my $sheet_xml = $self->_new_twig( twig_roots => { #XXX need a fallback here, the dimension tag is optional - 'dimension' => sub { + 's:dimension' => sub { my ($twig, $dimension) = @_; my ($rmin, $cmin, $rmax, $cmax) = $self->_dimensions( @@ -224,11 +225,11 @@ sub _parse_sheet { $twig->purge; }, - 'headerFooter' => sub { + 's:headerFooter' => sub { my ($twig, $hf) = @_; my ($helem, $felem) = map { - $hf->first_child($_) + $hf->first_child("s:$_") } qw(oddHeader oddFooter); $sheet->{Header} = $helem->text if $helem; @@ -238,7 +239,7 @@ sub _parse_sheet { $twig->purge; }, - 'pageMargins' => sub { + 's:pageMargins' => sub { my ($twig, $margin) = @_; map { my $key = "\u${_}Margin"; @@ -249,7 +250,7 @@ sub _parse_sheet { $twig->purge; }, - 'pageSetup' => sub { + 's:pageSetup' => sub { my ($twig, $setup) = @_; $sheet->{Scale} = defined $setup->att('scale') ? $setup->att('scale') @@ -266,7 +267,7 @@ sub _parse_sheet { $twig->purge; }, - 'mergeCells/mergeCell' => sub { + 's:mergeCells/s:mergeCell' => sub { my ( $twig, $merge_area ) = @_; if (my $ref = $merge_area->att('ref')) { @@ -289,7 +290,7 @@ sub _parse_sheet { $twig->purge; }, - 'sheetFormatPr' => sub { + 's:sheetFormatPr' => sub { my ( $twig, $format ) = @_; $default_row_height = $format->att('defaultRowHeight') @@ -300,7 +301,7 @@ sub _parse_sheet { $twig->purge; }, - 'col' => sub { + 's:col' => sub { my ( $twig, $col ) = @_; for my $colnum ($col->att('min')..$col->att('max')) { @@ -312,7 +313,7 @@ sub _parse_sheet { $twig->purge; }, - 'row' => sub { + 's:row' => sub { my ( $twig, $row ) = @_; $row_heights[ $row->att('r') - 1 ] = $row->att('ht'); @@ -321,7 +322,7 @@ sub _parse_sheet { $twig->purge; }, - 'selection' => sub { + 's:selection' => sub { my ( $twig, $selection ) = @_; if (my $cell = $selection->att('activeCell')) { @@ -338,7 +339,7 @@ sub _parse_sheet { $twig->purge; }, - 'sheetPr/tabColor' => sub { + 's:sheetPr/s:tabColor' => sub { my ( $twig, $tab_color ) = @_; $sheet->{TabColor} = $self->_color($sheet->{_Book}{Color}, $tab_color); @@ -354,12 +355,12 @@ sub _parse_sheet { # 2nd pass: cell/row building is dependent on having parsed the merge definitions # beforehand. - $sheet_xml = XML::Twig->new( + $sheet_xml = $self->_new_twig( twig_roots => { - 'sheetData/row' => sub { + 's:sheetData/s:row' => sub { my ( $twig, $row_elt ) = @_; - for my $cell ( $row_elt->children('c') ){ + for my $cell ( $row_elt->children('s:c') ){ my ($row, $col) = $self->_cell_to_row_col($cell->att('r')); $sheet->{MaxRow} = $row if $sheet->{MaxRow} < $row; @@ -368,10 +369,10 @@ sub _parse_sheet { my $type = $cell->att('t') || 'n'; my $val_xml; if ($type ne 'inlineStr') { - $val_xml = $cell->first_child('v'); + $val_xml = $cell->first_child('s:v'); } - elsif (defined $cell->first_child('is')) { - $val_xml = ($cell->find_nodes('.//t'))[0]; + elsif (defined $cell->first_child('s:is')) { + $val_xml = ($cell->find_nodes('.//s:t'))[0]; } my $val = $val_xml ? $val_xml->text : undef; @@ -407,6 +408,7 @@ sub _parse_sheet { my $format_idx = $cell->att('s') || 0; my $format = $sheet->{_Book}{Format}[$format_idx]; + die "unknown format $format_idx" unless $format; $format->{Merged} = !!grep { $row == $_->[0] && $col == $_->[1] } @merged_cells; @@ -424,8 +426,8 @@ sub _parse_sheet { Merged => $format->{Merged}, Format => $format, FormatNo => $format_idx, - ($cell->first_child('f') - ? (Formula => $cell->first_child('f')->text) + ($cell->first_child('s:f') + ? (Formula => $cell->first_child('s:f')->text) : ()), ); $cell->{_Value} = $sheet->{_Book}{FmtClass}->ValFmt( @@ -467,15 +469,15 @@ sub _parse_shared_strings { my $PkgStr = []; if ($strings) { - my $xml = XML::Twig->new( + my $xml = $self->_new_twig( twig_handlers => { - 'si' => sub { + 's:si' => sub { my ( $twig, $si ) = @_; # XXX this discards information about formatting within cells # not sure how to represent that push @$PkgStr, - join( '', map { $_->text } $si->find_nodes('.//t') ); + join( '', map { $_->text } $si->find_nodes('.//s:t') ); $twig->purge; }, } @@ -492,8 +494,8 @@ sub _parse_themes { return {} unless $themes; my @color = map { - $_->name eq 'a:sysClr' ? $_->att('lastClr') : $_->att('val') - } $themes->find_nodes('//a:clrScheme/*/*'); + $_->name eq 'drawmain:sysClr' ? $_->att('lastClr') : $_->att('val') + } $themes->find_nodes('//drawmain:clrScheme/*/*'); # this shouldn't be necessary, but the documentation is wrong here # see http://stackoverflow.com/questions/2760976/theme-confusion-in-spreadsheetml @@ -509,6 +511,81 @@ sub _parse_styles { my $self = shift; my ($workbook, $styles) = @_; + # these defaults are from + # http://social.msdn.microsoft.com/Forums/en-US/oxmlsdk/thread/e27aaf16-b900-4654-8210-83c5774a179c + my %default_format_str = ( + 0 => 'GENERAL', + 1 => '0', + 2 => '0.00', + 3 => '#,##0', + 4 => '#,##0.00', + 5 => '$#,##0_);($#,##0)', + 6 => '$#,##0_);[Red]($#,##0)', + 7 => '$#,##0.00_);($#,##0.00)', + 8 => '$#,##0.00_);[Red]($#,##0.00)', + 9 => '0%', + 10 => '0.00%', + 11 => '0.00E+00', + 12 => '# ?/?', + 13 => '# ??/??', + 14 => 'm/d/yyyy', + 15 => 'd-mmm-yy', + 16 => 'd-mmm', + 17 => 'mmm-yy', + 18 => 'h:mm AM/PM', + 19 => 'h:mm:ss AM/PM', + 20 => 'h:mm', + 21 => 'h:mm:ss', + 22 => 'm/d/yyyy h:mm', + 37 => '#,##0_);(#,##0)', + 38 => '#,##0_);[Red](#,##0)', + 39 => '#,##0.00_);(#,##0.00)', + 40 => '#,##0.00_);[Red](#,##0.00)', + 45 => 'mm:ss', + 46 => '[h]:mm:ss', + 47 => 'mm:ss.0', + 48 => '##0.0E+0', + 49 => '@', + ); + + if (!$styles) { + # XXX i guess? + my $font = Spreadsheet::ParseExcel::Font->new( + Height => 12, + Color => '#000000', + Name => '', + ); + my $format = Spreadsheet::ParseExcel::Format->new( + IgnoreFont => 1, + IgnoreFill => 1, + IgnoreBorder => 1, + IgnoreAlignment => 1, + IgnoreNumberFormat => 1, + IgnoreProtection => 1, + FontNo => 0, + Font => $font, + FmtIdx => 0, + Lock => 1, + Hidden => 0, + AlignH => 0, + Wrap => 0, + AlignV => 2, + Rotate => 0, + Indent => 0, + Shrink => 0, + BdrStyle => [0, 0, 0, 0], + BdrColor => [undef, undef, undef, undef], + BdrDiag => [0, 0, undef], + Fill => [0, undef, undef], + ); + + return { + FormatStr => \%default_format_str, + Font => [ $font ], + Format => [ $format ], + }; + } + my %halign = ( center => 2, centerContinuous => 6, @@ -568,12 +645,13 @@ sub _parse_styles { ); my @fills = map { + my $pattern_type = $_->att('patternType'); [ - $fill{$_->att('patternType')}, - $self->_color($workbook->{Color}, $_->first_child('fgColor'), 1), - $self->_color($workbook->{Color}, $_->first_child('bgColor'), 1), + ($pattern_type ? $fill{$pattern_type} : 0), + $self->_color($workbook->{Color}, $_->first_child('s:fgColor'), 1), + $self->_color($workbook->{Color}, $_->first_child('s:bgColor'), 1), ] - } $styles->find_nodes('//fills/fill/patternFill'); + } $styles->find_nodes('//s:fills/s:fill/s:patternFill'); my @borders = map { my $border = $_; @@ -581,12 +659,12 @@ sub _parse_styles { $self->_xml_boolean($border->att($_)) } qw(diagonalDown diagonalUp); my %borderstyles = map { - my $e = $border->first_child($_); + my $e = $border->first_child("s:$_"); $_ => ($e ? $e->att('style') || 'none' : 'none') } qw(left right top bottom diagonal); my %bordercolors = map { - my $e = $border->first_child($_); - $_ => ($e ? $e->first_child('color') : undef) + my $e = $border->first_child("s:$_"); + $_ => ($e ? $e->first_child('s:color') : undef) } qw(left right top bottom diagonal); # XXX specs say "begin" and "end" rather than "left" and "right", # but... that's not what seems to be in the file itself (sigh) @@ -610,55 +688,22 @@ sub _parse_styles { $self->_color($workbook->{Color}, $bordercolors{diagonal}), ], } - } $styles->find_nodes('//borders/border'); + } $styles->find_nodes('//s:borders/s:border'); - # these defaults are from - # http://social.msdn.microsoft.com/Forums/en-US/oxmlsdk/thread/e27aaf16-b900-4654-8210-83c5774a179c my %format_str = ( - 0 => 'GENERAL', - 1 => '0', - 2 => '0.00', - 3 => '#,##0', - 4 => '#,##0.00', - 5 => '$#,##0_);($#,##0)', - 6 => '$#,##0_);[Red]($#,##0)', - 7 => '$#,##0.00_);($#,##0.00)', - 8 => '$#,##0.00_);[Red]($#,##0.00)', - 9 => '0%', - 10 => '0.00%', - 11 => '0.00E+00', - 12 => '# ?/?', - 13 => '# ??/??', - 14 => 'm/d/yyyy', - 15 => 'd-mmm-yy', - 16 => 'd-mmm', - 17 => 'mmm-yy', - 18 => 'h:mm AM/PM', - 19 => 'h:mm:ss AM/PM', - 20 => 'h:mm', - 21 => 'h:mm:ss', - 22 => 'm/d/yyyy h:mm', - 37 => '#,##0_);(#,##0)', - 38 => '#,##0_);[Red](#,##0)', - 39 => '#,##0.00_);(#,##0.00)', - 40 => '#,##0.00_);[Red](#,##0.00)', - 45 => 'mm:ss', - 46 => '[h]:mm:ss', - 47 => 'mm:ss.0', - 48 => '##0.0E+0', - 49 => '@', + %default_format_str, (map { $_->att('numFmtId') => $_->att('formatCode') - } $styles->find_nodes('//numFmts/numFmt')), + } $styles->find_nodes('//s:numFmts/s:numFmt')), ); my @font = map { - my $vert = $_->first_child('vertAlign'); - my $under = $_->first_child('u'); - my $heightelem = $_->first_child('sz'); + my $vert = $_->first_child('s:vertAlign'); + my $under = $_->first_child('s:u'); + my $heightelem = $_->first_child('s:sz'); # XXX i guess 12 is okay? my $height = 0+($heightelem ? $heightelem->att('val') : 12); - my $nameelem = $_->first_child('name'); + my $nameelem = $_->first_child('s:name'); my $name = $nameelem ? $nameelem->att('val') : ''; Spreadsheet::ParseExcel::Font->new( Height => $height, @@ -667,10 +712,10 @@ sub _parse_styles { # intact rather than just going straight to #xxxxxx # XXX also not sure what it means for the color tag to be missing, # just assuming black for now - Color => ($_->first_child('color') + Color => ($_->first_child('s:color') ? $self->_color( $workbook->{Color}, - $_->first_child('color') + $_->first_child('s:color') ) : '#000000' ), @@ -695,16 +740,16 @@ sub _parse_styles { ), Name => $name, - Bold => $_->has_child('b') ? 1 : 0, - Italic => $_->has_child('i') ? 1 : 0, - Underline => $_->has_child('u') ? 1 : 0, - Strikeout => $_->has_child('strike') ? 1 : 0, + Bold => $_->has_child('s:b') ? 1 : 0, + Italic => $_->has_child('s:i') ? 1 : 0, + Underline => $_->has_child('s:u') ? 1 : 0, + Strikeout => $_->has_child('s:strike') ? 1 : 0, ) - } $styles->find_nodes('//fonts/font'); + } $styles->find_nodes('//s:fonts/s:font'); my @format = map { - my $alignment = $_->first_child('alignment'); - my $protection = $_->first_child('protection'); + my $alignment = $_->first_child('s:alignment'); + my $protection = $_->first_child('s:protection'); Spreadsheet::ParseExcel::Format->new( IgnoreFont => !$self->_xml_boolean($_->att('applyFont')), IgnoreFill => !$self->_xml_boolean($_->att('applyFill')), @@ -749,7 +794,7 @@ sub _parse_styles { BdrDiag => $borders[$_->att('borderId')]{diagonal}, Fill => $fills[$_->att('fillId')], ) - } $styles->find_nodes('//cellXfs/xf'); + } $styles->find_nodes('//s:cellXfs/s:xf'); return { FormatStr => \%format_str, @@ -767,10 +812,10 @@ sub _extract_files { my $rels = $self->_parse_xml( $zip, - $self->_rels_for('') + $self->_rels_for(''), ); my $wb_name = ($rels->find_nodes( - qq<//Relationship[\@Type="$type_base/officeDocument"]> + qq<//packagerels:Relationship[\@Type="$type_base/officeDocument"]> ))[0]->att('Target'); $wb_name =~ s{^/}{}; my $wb_xml = $self->_parse_xml($zip, $wb_name); @@ -778,7 +823,7 @@ sub _extract_files { my $path_base = $self->_base_path_for($wb_name); my $wb_rels = $self->_parse_xml( $zip, - $self->_rels_for($wb_name) + $self->_rels_for($wb_name), ); my $get_path = sub { @@ -791,30 +836,32 @@ sub _extract_files { my ($strings_xml) = map { $zip->memberNamed($get_path->($_->att('Target')))->contents - } $wb_rels->find_nodes(qq<//Relationship[\@Type="$type_base/sharedStrings"]>); + } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/sharedStrings"]>); - my $styles_xml = $self->_parse_xml( - $zip, - $get_path->(($wb_rels->find_nodes( - qq<//Relationship[\@Type="$type_base/styles"]> - ))[0]->att('Target')) - ); + my ($styles_xml) = map { + $self->_parse_xml( + $zip, + $get_path->($_->att('Target')) + ) + } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/styles"]>); my %worksheet_xml = map { if ( my $sheetfile = $zip->memberNamed($get_path->($_->att('Target')))->contents ) { ( $_->att('Id') => $sheetfile ); } - } $wb_rels->find_nodes(qq<//Relationship[\@Type="$type_base/worksheet"]>); + } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/worksheet"]>); my %themes_xml = map { $_->att('Id') => $self->_parse_xml($zip, $get_path->($_->att('Target'))) - } $wb_rels->find_nodes(qq<//Relationship[\@Type="$type_base/theme"]>); + } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/theme"]>); return { workbook => $wb_xml, - styles => $styles_xml, sheets => \%worksheet_xml, themes => \%themes_xml, + ($styles_xml + ? (styles => $styles_xml) + : ()), ($strings_xml ? (strings => $strings_xml) : ()), @@ -823,12 +870,12 @@ sub _extract_files { sub _parse_xml { my $self = shift; - my ($zip, $subfile) = @_; + my ($zip, $subfile, $map_xmlns) = @_; my $member = $zip->memberNamed($subfile); die "no subfile named $subfile" unless $member; - my $xml = XML::Twig->new; + my $xml = $self->_new_twig; $xml->parse(scalar $member->contents); return $xml; @@ -920,7 +967,7 @@ sub _color { $color = "#$theme"; } else { - return; + return undef; } } @@ -948,6 +995,22 @@ sub _apply_tint { return scalar hls2rgb($h, $l, $s); } +sub _new_twig { + my $self = shift; + my %opts = @_; + + return XML::Twig->new( + map_xmlns => { + 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' => 's', + 'http://schemas.openxmlformats.org/package/2006/relationships' => 'packagerels', + 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' => 'rels', + 'http://schemas.openxmlformats.org/drawingml/2006/main' => 'drawmain', + }, + keep_original_prefix => 1, + %opts, + ); +} + =head1 INCOMPATIBILITIES This module returns data using classes from L<Spreadsheet::ParseExcel>, so for diff --git a/lib/Spreadsheet/ParseXLSX/Decryptor.pm b/lib/Spreadsheet/ParseXLSX/Decryptor.pm index 6d09800..4968a7d 100644 --- a/lib/Spreadsheet/ParseXLSX/Decryptor.pm +++ b/lib/Spreadsheet/ParseXLSX/Decryptor.pm @@ -25,6 +25,8 @@ sub open { ['EncryptionInfo', 'EncryptedPackage'] ); + return unless $infoFH; + my $buffer; $infoFH->read($buffer, 8); my ($majorVers, $minorVers) = unpack('s<s<', $buffer); diff --git a/t/bug-32.t b/t/bug-32.t new file mode 100644 index 0000000..57346f7 --- /dev/null +++ b/t/bug-32.t @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Test::More; + +use Spreadsheet::ParseXLSX; + +{ + my $wb = Spreadsheet::ParseXLSX->new->parse('t/data/bug-32.xlsx'); + + my $ws1 = $wb->worksheet(0); + like($ws1->get_cell(0, 0)->value, qr/^PURSUANT/); + + my $ws2 = $wb->worksheet(1); + like($ws2->get_cell(0, 0)->value, qr/^QMS/); +} + +{ + my $wb = Spreadsheet::ParseXLSX->new->parse('t/data/bug-32-2.xlsx'); + + my $ws = $wb->worksheet(0); + is($ws->get_cell(1, 1)->value, 93); +} + +done_testing; diff --git a/t/bug-41.t b/t/bug-41.t new file mode 100644 index 0000000..7bcb52b --- /dev/null +++ b/t/bug-41.t @@ -0,0 +1,14 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Test::More; + +use Spreadsheet::ParseXLSX; + +{ + local $SIG{__WARN__} = sub { fail("unexpected warning: $_[0]") }; + my $wb = Spreadsheet::ParseXLSX->new->parse('t/data/bug-41.xlsx'); + pass('it parses successfully'); +} + +done_testing; diff --git a/t/data/bug-32-2.xlsx b/t/data/bug-32-2.xlsx Binary files differnew file mode 100644 index 0000000..2aa0e0e --- /dev/null +++ b/t/data/bug-32-2.xlsx diff --git a/t/data/bug-32.xlsx b/t/data/bug-32.xlsx Binary files differnew file mode 100644 index 0000000..dd759a8 --- /dev/null +++ b/t/data/bug-32.xlsx diff --git a/t/data/bug-41.xlsx b/t/data/bug-41.xlsx Binary files differnew file mode 100644 index 0000000..d80db9e --- /dev/null +++ b/t/data/bug-41.xlsx |