lib/Text/CSV.pm
author Shabe Razvi <shaber@symbian.org>
Thu, 03 Jun 2010 12:22:16 +0100
changeset 273 859eb1addb02
parent 267 2251fde91223
permissions -rw-r--r--
Remove duplicate script, use uh_parser/truclean.pl instead

package Text::CSV;

# Copyright (c) 1997 Alan Citterman. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.

################################################################################
# HISTORY
#
# Written by:
#    Alan Citterman <alan@mfgrtl.com>
#
# Version 0.01  06/05/1997
#    original version
# Version 0.02x  13/05/2010
#    Hacked at symbian.org to permit top bit set characters in CSV
################################################################################

require 5.002;

use strict;

BEGIN {
#  use Exporter   ();
#  use AutoLoader qw(AUTOLOAD);
  use vars       qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  $VERSION =     '0.02x';
#  @ISA =         qw(Exporter AutoLoader);
  @EXPORT =      qw();
  @EXPORT_OK =   qw();
  %EXPORT_TAGS = qw();
}

1;

################################################################################
# version
#
#    class/object method expecting no arguments and returning the version number
#    of Text::CSV.  there are no side-effects.
################################################################################
sub version {
  return $VERSION;
}

################################################################################
# new
#
#    class/object method expecting no arguments and returning a reference to a
#    newly created Text::CSV object.
################################################################################
sub new {
  my $proto = shift;
  my $class = ref($proto) || $proto;
  my $self = {};
  $self->{'_STATUS'} = undef;
  $self->{'_ERROR_INPUT'} = undef;
  $self->{'_STRING'} = undef;
  $self->{'_FIELDS'} = undef;
  bless $self, $class;
  return $self;
}

################################################################################
# status
#
#    object method returning the success or failure of the most recent combine()
#    or parse().  there are no side-effects.
################################################################################
sub status {
  my $self = shift;
  return $self->{'_STATUS'};
}

################################################################################
# error_input
#
#    object method returning the first invalid argument to the most recent
#    combine() or parse().  there are no side-effects.
################################################################################
sub error_input {
  my $self = shift;
  return $self->{'_ERROR_INPUT'};
}

################################################################################
# string
#
#    object method returning the result of the most recent combine() or the
#    input to the most recent parse(), whichever is more recent.  there are no
#    side-effects.
################################################################################
sub string {
  my $self = shift;
  return $self->{'_STRING'};
}

################################################################################
# fields
#
#    object method returning the result of the most recent parse() or the input
#    to the most recent combine(), whichever is more recent.  there are no
#    side-effects.
################################################################################
sub fields {
  my $self = shift;
  if (ref($self->{'_FIELDS'})) {
    return @{$self->{'_FIELDS'}};
  }
  return undef;
}

################################################################################
# combine
#
#    object method returning success or failure.  the given arguments are
#    combined into a single comma-separated value.  failure can be the result of
#    no arguments or an argument containing an invalid character.   side-effects
#    include:
#      setting status()
#      setting fields()
#      setting string()
#      setting error_input()
################################################################################
sub combine {
  my $self = shift;
  my @part = @_;
  $self->{'_FIELDS'} = \@part;
  $self->{'_ERROR_INPUT'} = undef;
  $self->{'_STATUS'} = 0;
  $self->{'_STRING'} = '';
  my $column = '';
  my $combination = '';
  my $skip_comma = 1;
  if ($#part >= 0) {

    # at least one argument was given for "combining"...
    for $column (@part) {
#      if ($column =~ /[^\t\040-\176]/) {
#
#	# an argument contained an invalid character...
#	$self->{'_ERROR_INPUT'} = $column;
#	return $self->{'_STATUS'};
#      }
      if ($skip_comma) {

	# do not put a comma before the first argument...
	$skip_comma = 0;
      } else {

	# do put a comma before all arguments except the first argument...
	$combination .= ',';
      }
      $column =~ s/\042/\042\042/go;
      $combination .= "\042";
      $combination .= $column;
      $combination .= "\042";
    }
    $self->{'_STRING'} = $combination;
    $self->{'_STATUS'} = 1;
  }
  return $self->{'_STATUS'};
}

################################################################################
# parse
#
#    object method returning success or failure.  the given argument is expected
#    to be a valid comma-separated value.  failure can be the result of
#    no arguments or an argument containing an invalid sequence of characters.
#    side-effects include:
#      setting status()
#      setting fields()
#      setting string()
#      setting error_input()
################################################################################
sub parse {
  my $self = shift;
  $self->{'_STRING'} = shift;
  $self->{'_FIELDS'} = undef;
  $self->{'_ERROR_INPUT'} = $self->{'_STRING'};
  $self->{'_STATUS'} = 0;
  if (!defined($self->{'_STRING'})) {
    return $self->{'_STATUS'};
  }
  my $keep_biting = 1;
  my $palatable = 0;
  my $line = $self->{'_STRING'};
  if ($line =~ /\n$/) {
    chop($line);
    if ($line =~ /\r$/) {
      chop($line);
    }
  }
  my $mouthful = '';
  my @part = ();
  while ($keep_biting and ($palatable = $self->_bite(\$line, \$mouthful, \$keep_biting))) {
    push(@part, $mouthful);
  }
  if ($palatable) {
    $self->{'_ERROR_INPUT'} = undef;
    $self->{'_FIELDS'} = \@part;
  }
  return $self->{'_STATUS'} = $palatable;
}

################################################################################
# _bite
#
#    *private* class/object method returning success or failure.  the arguments
#    are:
#      - a reference to a comma-separated value string
#      - a reference to a return string
#      - a reference to a return boolean
#    upon success the first comma-separated value of the csv string is
#    transferred to the return string and the boolean is set to true if a comma
#    followed that value.  in other words, "bite" one value off of csv
#    returning the remaining string, the "piece" bitten, and if there's any
#    more.  failure can be the result of the csv string containing an invalid
#    sequence of characters.
#   
#    from the csv string and 
#    to be a valid comma-separated value.  failure can be the result of
#    no arguments or an argument containing an invalid sequence of characters.
#    side-effects include:
#      setting status()
#      setting fields()
#      setting string()
#      setting error_input()
################################################################################
sub _bite {
  my ($self, $line_ref, $piece_ref, $bite_again_ref) = @_;
  my $in_quotes = 0;
  my $ok = 0;
  $$piece_ref = '';
  $$bite_again_ref = 0;
  while (1) {
    if (length($$line_ref) < 1) {

      # end of string...
      if ($in_quotes) {

	# end of string, missing closing double-quote...
	last;
      } else {

	# proper end of string...
	$ok = 1;
	last;
      }
    } elsif ($$line_ref =~ /^\042/) {

      # double-quote...
      if ($in_quotes) {
	if (length($$line_ref) == 1) {

	  # closing double-quote at end of string...
	  substr($$line_ref, 0, 1) = '';
	  $ok = 1;
	  last;
	} elsif ($$line_ref =~ /^\042\042/) {

	  # an embedded double-quote...
	  $$piece_ref .= "\042";
	  substr($$line_ref, 0, 2) = '';
	} elsif ($$line_ref =~ /^\042,/) {

	  # closing double-quote followed by a comma...
	  substr($$line_ref, 0, 2) = '';
	  $$bite_again_ref = 1;
	  $ok = 1;
	  last;
	} else {

	  # double-quote, followed by undesirable character (bad character sequence)...
	  last;
	}
      } else {
	if (length($$piece_ref) < 1) {

	  # starting double-quote at beginning of string
	  $in_quotes = 1;
	  substr($$line_ref, 0, 1) = '';
	} else {

	  # double-quote, outside of double-quotes (bad character sequence)...
	  last;
	}
      }
    } elsif ($$line_ref =~ /^,/) {

      # comma...
      if ($in_quotes) {

	# a comma, inside double-quotes...
	$$piece_ref .= substr($$line_ref, 0 ,1);
	substr($$line_ref, 0, 1) = '';
      } else {

	# a comma, which separates values...
	substr($$line_ref, 0, 1) = '';
	$$bite_again_ref = 1;
	$ok = 1;
	last;
      }
    } elsif ($$line_ref =~ /^[\t\040-\176]/) {

      # a tab, space, or printable...
      $$piece_ref .= substr($$line_ref, 0 ,1);
      substr($$line_ref, 0, 1) = '';
    } else {

      # an undesirable character... But use it anyway
      $$piece_ref .= substr($$line_ref, 0 ,1);
      substr($$line_ref, 0, 1) = '';
    }
  }
  return $ok;
}

=head1 NAME

Text::CSV - comma-separated values manipulation routines

=head1 SYNOPSIS

 use Text::CSV;

 $version = Text::CSV->version();      # get the module version

 $csv = Text::CSV->new();              # create a new object

 $status = $csv->combine(@columns);    # combine columns into a string
 $line = $csv->string();               # get the combined string

 $status = $csv->parse($line);         # parse a CSV string into fields
 @columns = $csv->fields();            # get the parsed fields

 $status = $csv->status();             # get the most recent status
 $bad_argument = $csv->error_input();  # get the most recent bad argument

=head1 DESCRIPTION

Text::CSV provides facilities for the composition and decomposition of
comma-separated values.  An instance of the Text::CSV class can combine
fields into a CSV string and parse a CSV string into fields.

=head1 FUNCTIONS

=over 4

=item version

 $version = Text::CSV->version();

This function may be called as a class or an object method.  It returns the current
module version.

=item new

 $csv = Text::CSV->new();

This function may be called as a class or an object method.  It returns a reference to a
newly created Text::CSV object.

=item combine

 $status = $csv->combine(@columns);

This object function constructs a CSV string from the arguments, returning
success or failure.  Failure can result from lack of arguments or an argument
containing an invalid character.  Upon success, C<string()> can be called to
retrieve the resultant CSV string.  Upon failure, the value returned by
C<string()> is undefined and C<error_input()> can be called to retrieve an
invalid argument.

=item string

 $line = $csv->string();

This object function returns the input to C<parse()> or the resultant CSV string of
C<combine()>, whichever was called more recently.

=item parse

 $status = $csv->parse($line);

This object function decomposes a CSV string into fields, returning
success or failure.  Failure can result from a lack of argument or the given CSV
string is improperly formatted.  Upon success, C<fields()> can be called to
retrieve the decomposed fields .  Upon failure, the value returned by
C<fields()> is undefined and C<error_input()> can be called to retrieve the
invalid argument.

=item fields

 @columns = $csv->fields();

This object function returns the input to C<combine()> or the resultant decomposed
fields of C<parse()>, whichever was called more recently.

=item status

 $status = $csv->status();

This object function returns success (or failure) of C<combine()> or C<parse()>,
whichever was called more recently.

=item error_input

 $bad_argument = $csv->error_input();

This object function returns the erroneous argument (if it exists) of C<combine()>
or C<parse()>, whichever was called more recently.

=back

=head1 EXAMPLE

  require Text::CSV;

  my $csv = Text::CSV->new;

  my $column = '';
  my $sample_input_string = '"I said, ""Hi!""",Yes,"",2.34,,"1.09"';
  if ($csv->parse($sample_input_string)) {
    my @field = $csv->fields;
    my $count = 0;
    for $column (@field) {
      print ++$count, " => ", $column, "\n";
    }
    print "\n";
  } else {
    my $err = $csv->error_input;
    print "parse() failed on argument: ", $err, "\n";
  }

  my @sample_input_fields = ('You said, "Hello!"',
			     5.67,
			     'Surely',
			     '',
			     '3.14159');
  if ($csv->combine(@sample_input_fields)) {
    my $string = $csv->string;
    print $string, "\n";
  } else {
    my $err = $csv->error_input;
    print "combine() failed on argument: ", $err, "\n";
  }

=head1 CAVEATS

This module is based upon a working definition of CSV format which may not be
the most general.

=over 4

=item 1 

Allowable characters within a CSV field include 0x09 (tab) and the inclusive
range of 0x20 (space) through 0x7E (tilde).

=item 2

A field within CSV may be surrounded by double-quotes.

=item 3

A field within CSV must be surrounded by double-quotes to contain a comma.

=item 4

A field within CSV must be surrounded by double-quotes to contain an embedded
double-quote, represented by a pair of consecutive double-quotes.

=item 5

A CSV string may be terminated by 0x0A (line feed) or by 0x0D,0x0A
(carriage return, line feed).

=head1 AUTHOR

Alan Citterman F<E<lt>alan@mfgrtl.comE<gt>>

=head1 SEE ALSO

perl(1)

=cut