lib/Text/CSV.pm
changeset 244 2251fde91223
equal deleted inserted replaced
243:db98b6a443ad 244:2251fde91223
       
     1 package Text::CSV;
       
     2 
       
     3 # Copyright (c) 1997 Alan Citterman. All rights reserved.
       
     4 # This program is free software; you can redistribute it and/or
       
     5 # modify it under the same terms as Perl itself.
       
     6 
       
     7 ################################################################################
       
     8 # HISTORY
       
     9 #
       
    10 # Written by:
       
    11 #    Alan Citterman <alan@mfgrtl.com>
       
    12 #
       
    13 # Version 0.01  06/05/1997
       
    14 #    original version
       
    15 # Version 0.02x  13/05/2010
       
    16 #    Hacked at symbian.org to permit top bit set characters in CSV
       
    17 ################################################################################
       
    18 
       
    19 require 5.002;
       
    20 
       
    21 use strict;
       
    22 
       
    23 BEGIN {
       
    24 #  use Exporter   ();
       
    25 #  use AutoLoader qw(AUTOLOAD);
       
    26   use vars       qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
       
    27   $VERSION =     '0.02x';
       
    28 #  @ISA =         qw(Exporter AutoLoader);
       
    29   @EXPORT =      qw();
       
    30   @EXPORT_OK =   qw();
       
    31   %EXPORT_TAGS = qw();
       
    32 }
       
    33 
       
    34 1;
       
    35 
       
    36 ################################################################################
       
    37 # version
       
    38 #
       
    39 #    class/object method expecting no arguments and returning the version number
       
    40 #    of Text::CSV.  there are no side-effects.
       
    41 ################################################################################
       
    42 sub version {
       
    43   return $VERSION;
       
    44 }
       
    45 
       
    46 ################################################################################
       
    47 # new
       
    48 #
       
    49 #    class/object method expecting no arguments and returning a reference to a
       
    50 #    newly created Text::CSV object.
       
    51 ################################################################################
       
    52 sub new {
       
    53   my $proto = shift;
       
    54   my $class = ref($proto) || $proto;
       
    55   my $self = {};
       
    56   $self->{'_STATUS'} = undef;
       
    57   $self->{'_ERROR_INPUT'} = undef;
       
    58   $self->{'_STRING'} = undef;
       
    59   $self->{'_FIELDS'} = undef;
       
    60   bless $self, $class;
       
    61   return $self;
       
    62 }
       
    63 
       
    64 ################################################################################
       
    65 # status
       
    66 #
       
    67 #    object method returning the success or failure of the most recent combine()
       
    68 #    or parse().  there are no side-effects.
       
    69 ################################################################################
       
    70 sub status {
       
    71   my $self = shift;
       
    72   return $self->{'_STATUS'};
       
    73 }
       
    74 
       
    75 ################################################################################
       
    76 # error_input
       
    77 #
       
    78 #    object method returning the first invalid argument to the most recent
       
    79 #    combine() or parse().  there are no side-effects.
       
    80 ################################################################################
       
    81 sub error_input {
       
    82   my $self = shift;
       
    83   return $self->{'_ERROR_INPUT'};
       
    84 }
       
    85 
       
    86 ################################################################################
       
    87 # string
       
    88 #
       
    89 #    object method returning the result of the most recent combine() or the
       
    90 #    input to the most recent parse(), whichever is more recent.  there are no
       
    91 #    side-effects.
       
    92 ################################################################################
       
    93 sub string {
       
    94   my $self = shift;
       
    95   return $self->{'_STRING'};
       
    96 }
       
    97 
       
    98 ################################################################################
       
    99 # fields
       
   100 #
       
   101 #    object method returning the result of the most recent parse() or the input
       
   102 #    to the most recent combine(), whichever is more recent.  there are no
       
   103 #    side-effects.
       
   104 ################################################################################
       
   105 sub fields {
       
   106   my $self = shift;
       
   107   if (ref($self->{'_FIELDS'})) {
       
   108     return @{$self->{'_FIELDS'}};
       
   109   }
       
   110   return undef;
       
   111 }
       
   112 
       
   113 ################################################################################
       
   114 # combine
       
   115 #
       
   116 #    object method returning success or failure.  the given arguments are
       
   117 #    combined into a single comma-separated value.  failure can be the result of
       
   118 #    no arguments or an argument containing an invalid character.   side-effects
       
   119 #    include:
       
   120 #      setting status()
       
   121 #      setting fields()
       
   122 #      setting string()
       
   123 #      setting error_input()
       
   124 ################################################################################
       
   125 sub combine {
       
   126   my $self = shift;
       
   127   my @part = @_;
       
   128   $self->{'_FIELDS'} = \@part;
       
   129   $self->{'_ERROR_INPUT'} = undef;
       
   130   $self->{'_STATUS'} = 0;
       
   131   $self->{'_STRING'} = '';
       
   132   my $column = '';
       
   133   my $combination = '';
       
   134   my $skip_comma = 1;
       
   135   if ($#part >= 0) {
       
   136 
       
   137     # at least one argument was given for "combining"...
       
   138     for $column (@part) {
       
   139 #      if ($column =~ /[^\t\040-\176]/) {
       
   140 #
       
   141 #	# an argument contained an invalid character...
       
   142 #	$self->{'_ERROR_INPUT'} = $column;
       
   143 #	return $self->{'_STATUS'};
       
   144 #      }
       
   145       if ($skip_comma) {
       
   146 
       
   147 	# do not put a comma before the first argument...
       
   148 	$skip_comma = 0;
       
   149       } else {
       
   150 
       
   151 	# do put a comma before all arguments except the first argument...
       
   152 	$combination .= ',';
       
   153       }
       
   154       $column =~ s/\042/\042\042/go;
       
   155       $combination .= "\042";
       
   156       $combination .= $column;
       
   157       $combination .= "\042";
       
   158     }
       
   159     $self->{'_STRING'} = $combination;
       
   160     $self->{'_STATUS'} = 1;
       
   161   }
       
   162   return $self->{'_STATUS'};
       
   163 }
       
   164 
       
   165 ################################################################################
       
   166 # parse
       
   167 #
       
   168 #    object method returning success or failure.  the given argument is expected
       
   169 #    to be a valid comma-separated value.  failure can be the result of
       
   170 #    no arguments or an argument containing an invalid sequence of characters.
       
   171 #    side-effects include:
       
   172 #      setting status()
       
   173 #      setting fields()
       
   174 #      setting string()
       
   175 #      setting error_input()
       
   176 ################################################################################
       
   177 sub parse {
       
   178   my $self = shift;
       
   179   $self->{'_STRING'} = shift;
       
   180   $self->{'_FIELDS'} = undef;
       
   181   $self->{'_ERROR_INPUT'} = $self->{'_STRING'};
       
   182   $self->{'_STATUS'} = 0;
       
   183   if (!defined($self->{'_STRING'})) {
       
   184     return $self->{'_STATUS'};
       
   185   }
       
   186   my $keep_biting = 1;
       
   187   my $palatable = 0;
       
   188   my $line = $self->{'_STRING'};
       
   189   if ($line =~ /\n$/) {
       
   190     chop($line);
       
   191     if ($line =~ /\r$/) {
       
   192       chop($line);
       
   193     }
       
   194   }
       
   195   my $mouthful = '';
       
   196   my @part = ();
       
   197   while ($keep_biting and ($palatable = $self->_bite(\$line, \$mouthful, \$keep_biting))) {
       
   198     push(@part, $mouthful);
       
   199   }
       
   200   if ($palatable) {
       
   201     $self->{'_ERROR_INPUT'} = undef;
       
   202     $self->{'_FIELDS'} = \@part;
       
   203   }
       
   204   return $self->{'_STATUS'} = $palatable;
       
   205 }
       
   206 
       
   207 ################################################################################
       
   208 # _bite
       
   209 #
       
   210 #    *private* class/object method returning success or failure.  the arguments
       
   211 #    are:
       
   212 #      - a reference to a comma-separated value string
       
   213 #      - a reference to a return string
       
   214 #      - a reference to a return boolean
       
   215 #    upon success the first comma-separated value of the csv string is
       
   216 #    transferred to the return string and the boolean is set to true if a comma
       
   217 #    followed that value.  in other words, "bite" one value off of csv
       
   218 #    returning the remaining string, the "piece" bitten, and if there's any
       
   219 #    more.  failure can be the result of the csv string containing an invalid
       
   220 #    sequence of characters.
       
   221 #   
       
   222 #    from the csv string and 
       
   223 #    to be a valid comma-separated value.  failure can be the result of
       
   224 #    no arguments or an argument containing an invalid sequence of characters.
       
   225 #    side-effects include:
       
   226 #      setting status()
       
   227 #      setting fields()
       
   228 #      setting string()
       
   229 #      setting error_input()
       
   230 ################################################################################
       
   231 sub _bite {
       
   232   my ($self, $line_ref, $piece_ref, $bite_again_ref) = @_;
       
   233   my $in_quotes = 0;
       
   234   my $ok = 0;
       
   235   $$piece_ref = '';
       
   236   $$bite_again_ref = 0;
       
   237   while (1) {
       
   238     if (length($$line_ref) < 1) {
       
   239 
       
   240       # end of string...
       
   241       if ($in_quotes) {
       
   242 
       
   243 	# end of string, missing closing double-quote...
       
   244 	last;
       
   245       } else {
       
   246 
       
   247 	# proper end of string...
       
   248 	$ok = 1;
       
   249 	last;
       
   250       }
       
   251     } elsif ($$line_ref =~ /^\042/) {
       
   252 
       
   253       # double-quote...
       
   254       if ($in_quotes) {
       
   255 	if (length($$line_ref) == 1) {
       
   256 
       
   257 	  # closing double-quote at end of string...
       
   258 	  substr($$line_ref, 0, 1) = '';
       
   259 	  $ok = 1;
       
   260 	  last;
       
   261 	} elsif ($$line_ref =~ /^\042\042/) {
       
   262 
       
   263 	  # an embedded double-quote...
       
   264 	  $$piece_ref .= "\042";
       
   265 	  substr($$line_ref, 0, 2) = '';
       
   266 	} elsif ($$line_ref =~ /^\042,/) {
       
   267 
       
   268 	  # closing double-quote followed by a comma...
       
   269 	  substr($$line_ref, 0, 2) = '';
       
   270 	  $$bite_again_ref = 1;
       
   271 	  $ok = 1;
       
   272 	  last;
       
   273 	} else {
       
   274 
       
   275 	  # double-quote, followed by undesirable character (bad character sequence)...
       
   276 	  last;
       
   277 	}
       
   278       } else {
       
   279 	if (length($$piece_ref) < 1) {
       
   280 
       
   281 	  # starting double-quote at beginning of string
       
   282 	  $in_quotes = 1;
       
   283 	  substr($$line_ref, 0, 1) = '';
       
   284 	} else {
       
   285 
       
   286 	  # double-quote, outside of double-quotes (bad character sequence)...
       
   287 	  last;
       
   288 	}
       
   289       }
       
   290     } elsif ($$line_ref =~ /^,/) {
       
   291 
       
   292       # comma...
       
   293       if ($in_quotes) {
       
   294 
       
   295 	# a comma, inside double-quotes...
       
   296 	$$piece_ref .= substr($$line_ref, 0 ,1);
       
   297 	substr($$line_ref, 0, 1) = '';
       
   298       } else {
       
   299 
       
   300 	# a comma, which separates values...
       
   301 	substr($$line_ref, 0, 1) = '';
       
   302 	$$bite_again_ref = 1;
       
   303 	$ok = 1;
       
   304 	last;
       
   305       }
       
   306     } elsif ($$line_ref =~ /^[\t\040-\176]/) {
       
   307 
       
   308       # a tab, space, or printable...
       
   309       $$piece_ref .= substr($$line_ref, 0 ,1);
       
   310       substr($$line_ref, 0, 1) = '';
       
   311     } else {
       
   312 
       
   313       # an undesirable character... But use it anyway
       
   314       $$piece_ref .= substr($$line_ref, 0 ,1);
       
   315       substr($$line_ref, 0, 1) = '';
       
   316     }
       
   317   }
       
   318   return $ok;
       
   319 }
       
   320 
       
   321 =head1 NAME
       
   322 
       
   323 Text::CSV - comma-separated values manipulation routines
       
   324 
       
   325 =head1 SYNOPSIS
       
   326 
       
   327  use Text::CSV;
       
   328 
       
   329  $version = Text::CSV->version();      # get the module version
       
   330 
       
   331  $csv = Text::CSV->new();              # create a new object
       
   332 
       
   333  $status = $csv->combine(@columns);    # combine columns into a string
       
   334  $line = $csv->string();               # get the combined string
       
   335 
       
   336  $status = $csv->parse($line);         # parse a CSV string into fields
       
   337  @columns = $csv->fields();            # get the parsed fields
       
   338 
       
   339  $status = $csv->status();             # get the most recent status
       
   340  $bad_argument = $csv->error_input();  # get the most recent bad argument
       
   341 
       
   342 =head1 DESCRIPTION
       
   343 
       
   344 Text::CSV provides facilities for the composition and decomposition of
       
   345 comma-separated values.  An instance of the Text::CSV class can combine
       
   346 fields into a CSV string and parse a CSV string into fields.
       
   347 
       
   348 =head1 FUNCTIONS
       
   349 
       
   350 =over 4
       
   351 
       
   352 =item version
       
   353 
       
   354  $version = Text::CSV->version();
       
   355 
       
   356 This function may be called as a class or an object method.  It returns the current
       
   357 module version.
       
   358 
       
   359 =item new
       
   360 
       
   361  $csv = Text::CSV->new();
       
   362 
       
   363 This function may be called as a class or an object method.  It returns a reference to a
       
   364 newly created Text::CSV object.
       
   365 
       
   366 =item combine
       
   367 
       
   368  $status = $csv->combine(@columns);
       
   369 
       
   370 This object function constructs a CSV string from the arguments, returning
       
   371 success or failure.  Failure can result from lack of arguments or an argument
       
   372 containing an invalid character.  Upon success, C<string()> can be called to
       
   373 retrieve the resultant CSV string.  Upon failure, the value returned by
       
   374 C<string()> is undefined and C<error_input()> can be called to retrieve an
       
   375 invalid argument.
       
   376 
       
   377 =item string
       
   378 
       
   379  $line = $csv->string();
       
   380 
       
   381 This object function returns the input to C<parse()> or the resultant CSV string of
       
   382 C<combine()>, whichever was called more recently.
       
   383 
       
   384 =item parse
       
   385 
       
   386  $status = $csv->parse($line);
       
   387 
       
   388 This object function decomposes a CSV string into fields, returning
       
   389 success or failure.  Failure can result from a lack of argument or the given CSV
       
   390 string is improperly formatted.  Upon success, C<fields()> can be called to
       
   391 retrieve the decomposed fields .  Upon failure, the value returned by
       
   392 C<fields()> is undefined and C<error_input()> can be called to retrieve the
       
   393 invalid argument.
       
   394 
       
   395 =item fields
       
   396 
       
   397  @columns = $csv->fields();
       
   398 
       
   399 This object function returns the input to C<combine()> or the resultant decomposed
       
   400 fields of C<parse()>, whichever was called more recently.
       
   401 
       
   402 =item status
       
   403 
       
   404  $status = $csv->status();
       
   405 
       
   406 This object function returns success (or failure) of C<combine()> or C<parse()>,
       
   407 whichever was called more recently.
       
   408 
       
   409 =item error_input
       
   410 
       
   411  $bad_argument = $csv->error_input();
       
   412 
       
   413 This object function returns the erroneous argument (if it exists) of C<combine()>
       
   414 or C<parse()>, whichever was called more recently.
       
   415 
       
   416 =back
       
   417 
       
   418 =head1 EXAMPLE
       
   419 
       
   420   require Text::CSV;
       
   421 
       
   422   my $csv = Text::CSV->new;
       
   423 
       
   424   my $column = '';
       
   425   my $sample_input_string = '"I said, ""Hi!""",Yes,"",2.34,,"1.09"';
       
   426   if ($csv->parse($sample_input_string)) {
       
   427     my @field = $csv->fields;
       
   428     my $count = 0;
       
   429     for $column (@field) {
       
   430       print ++$count, " => ", $column, "\n";
       
   431     }
       
   432     print "\n";
       
   433   } else {
       
   434     my $err = $csv->error_input;
       
   435     print "parse() failed on argument: ", $err, "\n";
       
   436   }
       
   437 
       
   438   my @sample_input_fields = ('You said, "Hello!"',
       
   439 			     5.67,
       
   440 			     'Surely',
       
   441 			     '',
       
   442 			     '3.14159');
       
   443   if ($csv->combine(@sample_input_fields)) {
       
   444     my $string = $csv->string;
       
   445     print $string, "\n";
       
   446   } else {
       
   447     my $err = $csv->error_input;
       
   448     print "combine() failed on argument: ", $err, "\n";
       
   449   }
       
   450 
       
   451 =head1 CAVEATS
       
   452 
       
   453 This module is based upon a working definition of CSV format which may not be
       
   454 the most general.
       
   455 
       
   456 =over 4
       
   457 
       
   458 =item 1 
       
   459 
       
   460 Allowable characters within a CSV field include 0x09 (tab) and the inclusive
       
   461 range of 0x20 (space) through 0x7E (tilde).
       
   462 
       
   463 =item 2
       
   464 
       
   465 A field within CSV may be surrounded by double-quotes.
       
   466 
       
   467 =item 3
       
   468 
       
   469 A field within CSV must be surrounded by double-quotes to contain a comma.
       
   470 
       
   471 =item 4
       
   472 
       
   473 A field within CSV must be surrounded by double-quotes to contain an embedded
       
   474 double-quote, represented by a pair of consecutive double-quotes.
       
   475 
       
   476 =item 5
       
   477 
       
   478 A CSV string may be terminated by 0x0A (line feed) or by 0x0D,0x0A
       
   479 (carriage return, line feed).
       
   480 
       
   481 =head1 AUTHOR
       
   482 
       
   483 Alan Citterman F<E<lt>alan@mfgrtl.comE<gt>>
       
   484 
       
   485 =head1 SEE ALSO
       
   486 
       
   487 perl(1)
       
   488 
       
   489 =cut