Changed script to use CSV formatted input, rather than TSV.
authorSimon Howkins <simonh@symbian.org>
Thu, 13 May 2010 16:27:37 +0100
changeset 267 2251fde91223
parent 266 db98b6a443ad
child 268 478794dc0351
Changed script to use CSV formatted input, rather than TSV. This means that the script can directly process the CSV downloaded from Bugzilla, without any need to use Excel to convert it.
lib/Text/CSV.pm
williamr/buglist_to_mediawiki.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/Text/CSV.pm	Thu May 13 16:27:37 2010 +0100
@@ -0,0 +1,489 @@
+package Text::CSV;
+
+# Copyright (c) 1997 Alan Citterman. All rights reserved.
+# This program is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+
+################################################################################
+# HISTORY
+#
+# Written by:
+#    Alan Citterman <alan@mfgrtl.com>
+#
+# Version 0.01  06/05/1997
+#    original version
+# Version 0.02x  13/05/2010
+#    Hacked at symbian.org to permit top bit set characters in CSV
+################################################################################
+
+require 5.002;
+
+use strict;
+
+BEGIN {
+#  use Exporter   ();
+#  use AutoLoader qw(AUTOLOAD);
+  use vars       qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+  $VERSION =     '0.02x';
+#  @ISA =         qw(Exporter AutoLoader);
+  @EXPORT =      qw();
+  @EXPORT_OK =   qw();
+  %EXPORT_TAGS = qw();
+}
+
+1;
+
+################################################################################
+# version
+#
+#    class/object method expecting no arguments and returning the version number
+#    of Text::CSV.  there are no side-effects.
+################################################################################
+sub version {
+  return $VERSION;
+}
+
+################################################################################
+# new
+#
+#    class/object method expecting no arguments and returning a reference to a
+#    newly created Text::CSV object.
+################################################################################
+sub new {
+  my $proto = shift;
+  my $class = ref($proto) || $proto;
+  my $self = {};
+  $self->{'_STATUS'} = undef;
+  $self->{'_ERROR_INPUT'} = undef;
+  $self->{'_STRING'} = undef;
+  $self->{'_FIELDS'} = undef;
+  bless $self, $class;
+  return $self;
+}
+
+################################################################################
+# status
+#
+#    object method returning the success or failure of the most recent combine()
+#    or parse().  there are no side-effects.
+################################################################################
+sub status {
+  my $self = shift;
+  return $self->{'_STATUS'};
+}
+
+################################################################################
+# error_input
+#
+#    object method returning the first invalid argument to the most recent
+#    combine() or parse().  there are no side-effects.
+################################################################################
+sub error_input {
+  my $self = shift;
+  return $self->{'_ERROR_INPUT'};
+}
+
+################################################################################
+# string
+#
+#    object method returning the result of the most recent combine() or the
+#    input to the most recent parse(), whichever is more recent.  there are no
+#    side-effects.
+################################################################################
+sub string {
+  my $self = shift;
+  return $self->{'_STRING'};
+}
+
+################################################################################
+# fields
+#
+#    object method returning the result of the most recent parse() or the input
+#    to the most recent combine(), whichever is more recent.  there are no
+#    side-effects.
+################################################################################
+sub fields {
+  my $self = shift;
+  if (ref($self->{'_FIELDS'})) {
+    return @{$self->{'_FIELDS'}};
+  }
+  return undef;
+}
+
+################################################################################
+# combine
+#
+#    object method returning success or failure.  the given arguments are
+#    combined into a single comma-separated value.  failure can be the result of
+#    no arguments or an argument containing an invalid character.   side-effects
+#    include:
+#      setting status()
+#      setting fields()
+#      setting string()
+#      setting error_input()
+################################################################################
+sub combine {
+  my $self = shift;
+  my @part = @_;
+  $self->{'_FIELDS'} = \@part;
+  $self->{'_ERROR_INPUT'} = undef;
+  $self->{'_STATUS'} = 0;
+  $self->{'_STRING'} = '';
+  my $column = '';
+  my $combination = '';
+  my $skip_comma = 1;
+  if ($#part >= 0) {
+
+    # at least one argument was given for "combining"...
+    for $column (@part) {
+#      if ($column =~ /[^\t\040-\176]/) {
+#
+#	# an argument contained an invalid character...
+#	$self->{'_ERROR_INPUT'} = $column;
+#	return $self->{'_STATUS'};
+#      }
+      if ($skip_comma) {
+
+	# do not put a comma before the first argument...
+	$skip_comma = 0;
+      } else {
+
+	# do put a comma before all arguments except the first argument...
+	$combination .= ',';
+      }
+      $column =~ s/\042/\042\042/go;
+      $combination .= "\042";
+      $combination .= $column;
+      $combination .= "\042";
+    }
+    $self->{'_STRING'} = $combination;
+    $self->{'_STATUS'} = 1;
+  }
+  return $self->{'_STATUS'};
+}
+
+################################################################################
+# parse
+#
+#    object method returning success or failure.  the given argument is expected
+#    to be a valid comma-separated value.  failure can be the result of
+#    no arguments or an argument containing an invalid sequence of characters.
+#    side-effects include:
+#      setting status()
+#      setting fields()
+#      setting string()
+#      setting error_input()
+################################################################################
+sub parse {
+  my $self = shift;
+  $self->{'_STRING'} = shift;
+  $self->{'_FIELDS'} = undef;
+  $self->{'_ERROR_INPUT'} = $self->{'_STRING'};
+  $self->{'_STATUS'} = 0;
+  if (!defined($self->{'_STRING'})) {
+    return $self->{'_STATUS'};
+  }
+  my $keep_biting = 1;
+  my $palatable = 0;
+  my $line = $self->{'_STRING'};
+  if ($line =~ /\n$/) {
+    chop($line);
+    if ($line =~ /\r$/) {
+      chop($line);
+    }
+  }
+  my $mouthful = '';
+  my @part = ();
+  while ($keep_biting and ($palatable = $self->_bite(\$line, \$mouthful, \$keep_biting))) {
+    push(@part, $mouthful);
+  }
+  if ($palatable) {
+    $self->{'_ERROR_INPUT'} = undef;
+    $self->{'_FIELDS'} = \@part;
+  }
+  return $self->{'_STATUS'} = $palatable;
+}
+
+################################################################################
+# _bite
+#
+#    *private* class/object method returning success or failure.  the arguments
+#    are:
+#      - a reference to a comma-separated value string
+#      - a reference to a return string
+#      - a reference to a return boolean
+#    upon success the first comma-separated value of the csv string is
+#    transferred to the return string and the boolean is set to true if a comma
+#    followed that value.  in other words, "bite" one value off of csv
+#    returning the remaining string, the "piece" bitten, and if there's any
+#    more.  failure can be the result of the csv string containing an invalid
+#    sequence of characters.
+#   
+#    from the csv string and 
+#    to be a valid comma-separated value.  failure can be the result of
+#    no arguments or an argument containing an invalid sequence of characters.
+#    side-effects include:
+#      setting status()
+#      setting fields()
+#      setting string()
+#      setting error_input()
+################################################################################
+sub _bite {
+  my ($self, $line_ref, $piece_ref, $bite_again_ref) = @_;
+  my $in_quotes = 0;
+  my $ok = 0;
+  $$piece_ref = '';
+  $$bite_again_ref = 0;
+  while (1) {
+    if (length($$line_ref) < 1) {
+
+      # end of string...
+      if ($in_quotes) {
+
+	# end of string, missing closing double-quote...
+	last;
+      } else {
+
+	# proper end of string...
+	$ok = 1;
+	last;
+      }
+    } elsif ($$line_ref =~ /^\042/) {
+
+      # double-quote...
+      if ($in_quotes) {
+	if (length($$line_ref) == 1) {
+
+	  # closing double-quote at end of string...
+	  substr($$line_ref, 0, 1) = '';
+	  $ok = 1;
+	  last;
+	} elsif ($$line_ref =~ /^\042\042/) {
+
+	  # an embedded double-quote...
+	  $$piece_ref .= "\042";
+	  substr($$line_ref, 0, 2) = '';
+	} elsif ($$line_ref =~ /^\042,/) {
+
+	  # closing double-quote followed by a comma...
+	  substr($$line_ref, 0, 2) = '';
+	  $$bite_again_ref = 1;
+	  $ok = 1;
+	  last;
+	} else {
+
+	  # double-quote, followed by undesirable character (bad character sequence)...
+	  last;
+	}
+      } else {
+	if (length($$piece_ref) < 1) {
+
+	  # starting double-quote at beginning of string
+	  $in_quotes = 1;
+	  substr($$line_ref, 0, 1) = '';
+	} else {
+
+	  # double-quote, outside of double-quotes (bad character sequence)...
+	  last;
+	}
+      }
+    } elsif ($$line_ref =~ /^,/) {
+
+      # comma...
+      if ($in_quotes) {
+
+	# a comma, inside double-quotes...
+	$$piece_ref .= substr($$line_ref, 0 ,1);
+	substr($$line_ref, 0, 1) = '';
+      } else {
+
+	# a comma, which separates values...
+	substr($$line_ref, 0, 1) = '';
+	$$bite_again_ref = 1;
+	$ok = 1;
+	last;
+      }
+    } elsif ($$line_ref =~ /^[\t\040-\176]/) {
+
+      # a tab, space, or printable...
+      $$piece_ref .= substr($$line_ref, 0 ,1);
+      substr($$line_ref, 0, 1) = '';
+    } else {
+
+      # an undesirable character... But use it anyway
+      $$piece_ref .= substr($$line_ref, 0 ,1);
+      substr($$line_ref, 0, 1) = '';
+    }
+  }
+  return $ok;
+}
+
+=head1 NAME
+
+Text::CSV - comma-separated values manipulation routines
+
+=head1 SYNOPSIS
+
+ use Text::CSV;
+
+ $version = Text::CSV->version();      # get the module version
+
+ $csv = Text::CSV->new();              # create a new object
+
+ $status = $csv->combine(@columns);    # combine columns into a string
+ $line = $csv->string();               # get the combined string
+
+ $status = $csv->parse($line);         # parse a CSV string into fields
+ @columns = $csv->fields();            # get the parsed fields
+
+ $status = $csv->status();             # get the most recent status
+ $bad_argument = $csv->error_input();  # get the most recent bad argument
+
+=head1 DESCRIPTION
+
+Text::CSV provides facilities for the composition and decomposition of
+comma-separated values.  An instance of the Text::CSV class can combine
+fields into a CSV string and parse a CSV string into fields.
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item version
+
+ $version = Text::CSV->version();
+
+This function may be called as a class or an object method.  It returns the current
+module version.
+
+=item new
+
+ $csv = Text::CSV->new();
+
+This function may be called as a class or an object method.  It returns a reference to a
+newly created Text::CSV object.
+
+=item combine
+
+ $status = $csv->combine(@columns);
+
+This object function constructs a CSV string from the arguments, returning
+success or failure.  Failure can result from lack of arguments or an argument
+containing an invalid character.  Upon success, C<string()> can be called to
+retrieve the resultant CSV string.  Upon failure, the value returned by
+C<string()> is undefined and C<error_input()> can be called to retrieve an
+invalid argument.
+
+=item string
+
+ $line = $csv->string();
+
+This object function returns the input to C<parse()> or the resultant CSV string of
+C<combine()>, whichever was called more recently.
+
+=item parse
+
+ $status = $csv->parse($line);
+
+This object function decomposes a CSV string into fields, returning
+success or failure.  Failure can result from a lack of argument or the given CSV
+string is improperly formatted.  Upon success, C<fields()> can be called to
+retrieve the decomposed fields .  Upon failure, the value returned by
+C<fields()> is undefined and C<error_input()> can be called to retrieve the
+invalid argument.
+
+=item fields
+
+ @columns = $csv->fields();
+
+This object function returns the input to C<combine()> or the resultant decomposed
+fields of C<parse()>, whichever was called more recently.
+
+=item status
+
+ $status = $csv->status();
+
+This object function returns success (or failure) of C<combine()> or C<parse()>,
+whichever was called more recently.
+
+=item error_input
+
+ $bad_argument = $csv->error_input();
+
+This object function returns the erroneous argument (if it exists) of C<combine()>
+or C<parse()>, whichever was called more recently.
+
+=back
+
+=head1 EXAMPLE
+
+  require Text::CSV;
+
+  my $csv = Text::CSV->new;
+
+  my $column = '';
+  my $sample_input_string = '"I said, ""Hi!""",Yes,"",2.34,,"1.09"';
+  if ($csv->parse($sample_input_string)) {
+    my @field = $csv->fields;
+    my $count = 0;
+    for $column (@field) {
+      print ++$count, " => ", $column, "\n";
+    }
+    print "\n";
+  } else {
+    my $err = $csv->error_input;
+    print "parse() failed on argument: ", $err, "\n";
+  }
+
+  my @sample_input_fields = ('You said, "Hello!"',
+			     5.67,
+			     'Surely',
+			     '',
+			     '3.14159');
+  if ($csv->combine(@sample_input_fields)) {
+    my $string = $csv->string;
+    print $string, "\n";
+  } else {
+    my $err = $csv->error_input;
+    print "combine() failed on argument: ", $err, "\n";
+  }
+
+=head1 CAVEATS
+
+This module is based upon a working definition of CSV format which may not be
+the most general.
+
+=over 4
+
+=item 1 
+
+Allowable characters within a CSV field include 0x09 (tab) and the inclusive
+range of 0x20 (space) through 0x7E (tilde).
+
+=item 2
+
+A field within CSV may be surrounded by double-quotes.
+
+=item 3
+
+A field within CSV must be surrounded by double-quotes to contain a comma.
+
+=item 4
+
+A field within CSV must be surrounded by double-quotes to contain an embedded
+double-quote, represented by a pair of consecutive double-quotes.
+
+=item 5
+
+A CSV string may be terminated by 0x0A (line feed) or by 0x0D,0x0A
+(carriage return, line feed).
+
+=head1 AUTHOR
+
+Alan Citterman F<E<lt>alan@mfgrtl.comE<gt>>
+
+=head1 SEE ALSO
+
+perl(1)
+
+=cut
--- a/williamr/buglist_to_mediawiki.pl	Thu May 20 17:04:35 2010 +0100
+++ b/williamr/buglist_to_mediawiki.pl	Thu May 13 16:27:37 2010 +0100
@@ -16,12 +16,26 @@
 
 use strict;
 
+use FindBin;
+use lib "$FindBin::Bin\\..\\lib";
+use Text::CSV;
+
+my $csv = Text::CSV->new();
+
 print "{|\n";   # start of table
 
 while (my $line = <>)
   {
   chomp $line;
-  my @columns = split /\t/, $line;
+  
+  unless ($csv->parse($line))
+  {
+    my $err = $csv->error_input();
+    warn "Failed to parse line '$line': $err\n";
+    next;
+  }
+
+  my @columns = $csv->fields();
   
   next if scalar @columns < 2;    # skip dubious looking lines