releasing/cbrtools/perl/MLDBM/Sync.pm
changeset 602 3145852acc89
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/releasing/cbrtools/perl/MLDBM/Sync.pm	Fri Jun 25 18:37:20 2010 +0800
@@ -0,0 +1,596 @@
+# Copyright (c) 2001-2002 Joshua Chamas, Chamas Enterprises Inc.  All rights reserved.
+# Sponsored by development on NodeWorks http://www.nodeworks.com and Apache::ASP
+# http://www.apache-asp.org
+#
+# This program is free software; you can redistribute it
+# and/or modify it under the same terms as Perl itself.
+
+
+package MLDBM::Sync;
+$VERSION = '0.30';
+
+use MLDBM;
+use MLDBM::Sync::SDBM_File;
+use Data::Dumper;
+use Fcntl qw(:flock);
+use Digest::MD5 qw(md5_hex);
+use strict;
+use Carp qw(confess);
+no strict qw(refs);
+use vars qw($AUTOLOAD @EXT $CACHE_ERR $LOCK_SH $LOCK_EX $LOCK_UN);
+
+eval "use Tie::Cache;";
+if (($@)) {
+    $CACHE_ERR = $@;
+}
+
+$LOCK_SH = LOCK_SH;
+$LOCK_UN = LOCK_UN;
+$LOCK_EX = LOCK_EX;
+
+@EXT = ('.pag', '.dir', '');
+
+# Construct the tie object for an MLDBM::Sync database rooted at $file.
+# Captures the current MLDBM configuration globals (backend, serializer,
+# taint handling) so later re-ties under lock use the same settings.
+sub TIEHASH {
+    my ($class, $file, @args) = @_;
+
+    # Launder $file through a regex capture to clear the taint flag.
+    ($file) = $file =~ /^(.*)$/s;
+    my $lock_path = $file . ".lock";
+
+    return bless {
+	'file'         => $file,
+	'args'         => [ $file, @args ],
+	'lock_fh'      => $lock_path,
+	'lock_file'    => $lock_path,
+	'lock_num'     => 0,
+	'md5_keys'     => 0,
+	'pid'          => $$,
+	'keys'         => [],
+	'db_type'      => $MLDBM::UseDB,
+	'serializer'   => $MLDBM::Serializer,
+	'remove_taint' => $MLDBM::RemoveTaint,
+    }, $class;
+}
+
+# On object teardown, force-release any lock still held so the dbm is
+# untied (flushing i/o) and the lock file handle is closed.
+sub DESTROY {
+    my $self = shift;
+    return unless $self->{lock_num};
+    # Collapse nested lock counts to one so a single UnLock fully releases.
+    $self->{lock_num} = 1;
+    $self->UnLock;
+}
+
+# AUTOLOAD dispatches the four per-key tied-hash operations -- FETCH,
+# STORE, EXISTS, DELETE -- so each runs under the appropriate lock type:
+# shared for reads, exclusive for writes.
+sub AUTOLOAD {
+    my($self, $key, $value) = @_;
+    # Extract the bare method name from the fully qualified $AUTOLOAD.
+    $AUTOLOAD =~ /::([^:]+)$/;
+    my $func = $1;
+    grep($func eq $_, ('FETCH', 'STORE', 'EXISTS', 'DELETE'))
+      || die("$func not handled by object $self");
+
+    ## CHECKSUM KEYS
+    if(defined $key && $self->{md5_keys}) {
+	$key = $self->SyncChecksum($key);
+    }
+
+    # CACHE, short circuit if found in cache on FETCH/EXISTS
+    # after checksum, since that's what we store
+    my $cache = (defined $key) ? $self->{cache} : undef;
+    if($cache && ($func eq 'FETCH' or $func eq 'EXISTS')) {
+	my $rv = $cache->$func($key);
+	defined($rv) && return($rv);
+    }
+
+    my $rv;
+    # Reads take a shared flock; STORE/DELETE need the exclusive one.
+    if ($func eq 'FETCH' or $func eq 'EXISTS') {
+	$self->read_lock;
+    } else {
+	$self->lock;
+    }
+
+    {
+	# Honor the taint setting captured at TIEHASH time for this dbm op.
+	local $MLDBM::RemoveTaint = $self->{remove_taint};
+	if (defined $value) {
+	    $rv = $self->{dbm}->$func($key, $value);
+	} else {
+	    $rv = $self->{dbm}->$func($key);
+	}
+    }
+
+    $self->unlock;
+
+    # do after lock critical section, no point taking 
+    # any extra time there
+    # NOTE(review): on FETCH this passes an undef $value to the cache, so a
+    # value fetched from disk is not populated into the cache -- confirm
+    # this is the intended (write-through only) caching policy.
+    $cache && $cache->$func($key, $value);
+
+    $rv;
+}
+
+# Empty the database: CLEAR the underlying dbm, untie it, and remove its
+# backing files so the disk space is reclaimed immediately.
+sub CLEAR { 
+    my $self = shift;
+    
+    $self->lock;
+    $self->{dbm}->CLEAR;
+    # Drop the tie so the backing files can be unlinked beneath it.
+    $self->{dbm} = undef;
+    # delete the files to free disk space
+    my $unlinked = 0;
+    for (@EXT) {
+	my $file = $self->{file}.$_;	
+	next if(! -e $file);
+	if(-d $file) {
+	    # Directory-style stores (e.g. Tie::TextDir); a failed rmdir
+	    # (e.g. non-empty dir) only rates a warning here.
+	    rmdir($file) || warn("can't unlink dir $file: $!");
+	} else {
+	    unlink($file) || die("can't unlink file $file: $!");
+	}
+
+	$unlinked++;
+    }
+    if($self->{lock_num} > 1) {
+	$self->SyncTie; # recreate, not done with it yet
+    }
+
+    $self->unlock;
+    if($self->{lock_num} == 0) {
+	# only unlink if we are clear of all the locks
+	unlink($self->{lock_file});
+    }
+    
+    $self->{cache} && $self->{cache}->CLEAR;
+
+    1;
+};
+
+# don't bother with cache for first/next key since it'll kill
+# the cache anyway likely
+sub FIRSTKEY {
+    my $self = shift;
+
+    if($self->{md5_keys}) {
+	confess("can't get keys() or each() on MLDBM::Sync database ".
+		"with SyncKeysChecksum(1) set");
+    }
+
+    # Snapshot every key under a single shared lock so iteration stays
+    # stable even if other processes modify the dbm afterwards.
+    $self->read_lock;
+    my @snapshot;
+    for (my $k = $self->{dbm}->FIRSTKEY(); defined($k); $k = $self->{dbm}->NEXTKEY($k)) {
+	push(@snapshot, $k);
+    }
+    $self->unlock;
+    $self->{'keys'} = \@snapshot;
+
+    # Hand back the first key via the shared NEXTKEY path.
+    $self->NEXTKEY;
+}
+
+sub NEXTKEY {
+    my $self = shift;
+
+    # keys()/each() would expose meaningless MD5 digests in checksum mode.
+    confess("can't get keys() or each() on MLDBM::Sync database ".
+	    "with SyncKeysChecksum(1) set") if $self->{md5_keys};
+
+    # Serve the next key from the snapshot taken by FIRSTKEY; undef when
+    # the snapshot is exhausted, which ends the each()/keys() iteration.
+    return shift @{ $self->{'keys'} };
+}
+
+# Reduce an arbitrarily large key (or a reference to one) to a fixed-size
+# lookup token: MD5 hex digest, 'g', then the zero-padded original length.
+sub SyncChecksum {
+    my($self, $key) = @_;
+    # A reference lets callers avoid copying huge key strings.
+    my $data = ref($key) ? $$key : $key;
+    return join('g', md5_hex($data), sprintf("%07d", length($data)));
+}
+
+# Enable or resize the in-RAM Tie::Cache layer.  $size is a byte count or
+# a number with a K/M suffix, e.g. 100000, '100K', '1M'; the suffix is now
+# also accepted in lower case ('100k', '1m').  Dies if Tie::Cache failed
+# to load, or if $size is not a recognized size specification.
+sub SyncCacheSize {
+    my($self, $size) = @_;
+    # BUG FIX: report the saved load error, not $@, which any later eval
+    # (e.g. the one wrapping "use Tie::Cache") has long since clobbered.
+    $CACHE_ERR && die("need Tie::Cache installed to use this feature: $CACHE_ERR");
+
+    if ($size =~ /^(\d+)([MmKk])$/) {
+	# The regex admits only K/M, so no unreachable fallback is needed.
+	my($num, $type) = ($1, uc($2));
+	$size = ($type eq 'M') ? $num * 1024 * 1024 : $num * 1024;
+    } else {
+	($size =~ /^\d+$/) or die("$size must be bytes size for cache");
+    }
+    
+    if ($self->{cache}) {
+	$self->{cache}->CLEAR(); # purge old cache, to free up RAM maybe for mem leaks
+    }
+    
+    my %cache;
+    my $cache = tie %cache, 'Tie::Cache', { MaxBytes => $size };
+    $self->{cache} = $cache; # use non tied interface, faster
+}
+
+# (Re)tie to the underlying MLDBM database using the settings captured at
+# TIEHASH time.  Localizing the MLDBM globals keeps other MLDBM users in
+# the same process unaffected.  Returns (and stores) the tied object.
+sub SyncTie {
+    my $self = shift;
+    my %temp_hash;
+    my $args = $self->{args};
+    local $MLDBM::UseDB = $self->{db_type};
+    local $MLDBM::Serializer = $self->{serializer};
+    local $MLDBM::RemoveTaint = $self->{remove_taint};
+    # Only the tied object is kept; %temp_hash itself is discarded, since
+    # all access goes through the object interface under lock.
+    $self->{dbm} = tie(%temp_hash, 'MLDBM', @$args) || die("can't tie to MLDBM with args: ".join(',', @$args)."; error: $!");
+
+    $self->{dbm};
+}
+
+#### DOCUMENTED API ################################################################
+
+# Accessor/mutator for key checksumming.  With a defined argument, turns
+# MD5 key hashing on or off; without one, returns the current setting.
+sub SyncKeysChecksum {
+    my($self, $setting) = @_;
+    return $self->{md5_keys} unless defined $setting;
+    $self->{md5_keys} = $setting;
+}
+
+*read_lock = *ReadLock;
+# Shared (read) lock: same bookkeeping as Lock(), but flocks with LOCK_SH.
+sub ReadLock {
+    my $self = shift;
+    return $self->Lock(1);
+}
+
+*lock = *SyncLock = *Lock;
+# Acquire (or nest) the file lock.  A true $read_lock requests LOCK_SH,
+# otherwise LOCK_EX.  The first acquisition opens the lock file, flocks it
+# and ties to the dbm; nested calls only bump the count.  Returns the tied
+# dbm object on first acquisition, 1 on nested calls.  Dies (via confess)
+# on an attempted LOCK_SH -> LOCK_EX upgrade.
+sub Lock {
+    my($self, $read_lock) = @_;
+
+    if($self->{lock_num}++ == 0) {
+	my $file = $self->{lock_file};
+	my $rv = eval {
+	    # $self->{lock_fh} initially holds the lock file NAME; open()
+	    # uses it as a symbolic-ref filehandle (no strict 'refs' above).
+	    open($self->{lock_fh}, "+>$file") || die("can't open file $file: $!");
+	    flock($self->{lock_fh}, ($read_lock ? $LOCK_SH : $LOCK_EX))
+	      || die("can't ". ($read_lock ? "read" : "write") ." lock $file: $!");
+	    $self->{read_lock} = $read_lock;
+	    $self->SyncTie;
+	};
+	if ($@) {
+	    # BUG FIX: roll the count back on failure; otherwise DESTROY
+	    # would later call UnLock and die flocking a handle that was
+	    # never successfully opened and locked.
+	    $self->{lock_num}--;
+	    die $@;
+	}
+	$rv;
+    } else {
+	if ($self->{read_lock} and ! $read_lock) {
+	    $self->{lock_num}--; # roll back lock count
+	    # confess here to help developer track this down
+	    confess("Can't upgrade lock type from LOCK_SH to LOCK_EX! ".
+		    "This could happen if you tried to write to the MLDBM ".
+		    "in a critical section locked by ReadLock(). ".
+		    "Also the read expression my \$v = \$db{'key1'}{'key2'} will trigger a write ".
+		    "if \$db{'key1'} does not already exist, so this will error in a ReadLock() section"
+		    );
+	}
+	1;
+    }
+}
+
+*unlock = *SyncUnLock = *UnLock;
+# Release one level of lock nesting; on the last release, untie the dbm
+# (flushing its i/o), drop the flock, and close the lock file handle.
+# Always returns 1, and is a no-op when no lock is held.
+sub UnLock {
+    my $self = shift;
+
+    # Post-decrement: only the 1 -> 0 transition performs the real unlock.
+    if($self->{lock_num} && $self->{lock_num}-- == 1) {
+	$self->{lock_num} = 0;
+	# Untie first so pending writes reach disk before the lock drops.
+	undef $self->{dbm};
+	flock($self->{'lock_fh'}, $LOCK_UN) || die("can't unlock $self->{'lock_file'}: $!");
+	close($self->{'lock_fh'}) || die("can't close $self->{'lock_file'}");
+	$self->{read_lock} = undef;
+	1;
+    } else {
+	1;
+    }
+}
+
+# Total on-disk size in bytes of the DBM's backing files (.pag, .dir, or
+# the bare path), including member files of a directory-based store.
+sub SyncSize {
+    my $self = shift;
+    my $size = 0;
+    for my $ext (@EXT) {
+	my $file = $self->{file}.$ext;
+	next unless -e $file;
+	# BUG FIX: a directory's own stat size was previously added twice
+	# (once here and once again inside the -d branch); count it once.
+	$size += (stat($file))[7];
+
+	if(-d $file) {
+	    # Directory-based DBMs (e.g. Tie::TextDir): add each member.
+	    opendir(my $dh, $file) || next;
+	    for my $dir_file (readdir($dh)) {
+		next if $dir_file =~ /^\.\.?$/;
+		$size += (stat("$file/$dir_file"))[7];
+	    }
+	    closedir($dh);
+	}
+    }
+
+    $size;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+  MLDBM::Sync - safe concurrent access to MLDBM databases
+
+=head1 SYNOPSIS
+
+  use MLDBM::Sync;                       # this gets the default, SDBM_File
+  use MLDBM qw(DB_File Storable);        # use Storable for serializing
+  use MLDBM qw(MLDBM::Sync::SDBM_File);  # use extended SDBM_File, handles values > 1024 bytes
+  use Fcntl qw(:DEFAULT);                # import symbols O_CREAT & O_RDWR for use with DBMs
+
+  # NORMAL PROTECTED read/write with implicit locks per i/o request
+  my $sync_dbm_obj = tie %cache, 'MLDBM::Sync' [..other DBM args..] or die $!;
+  $cache{"AAAA"} = "BBBB";
+  my $value = $cache{"AAAA"};
+
+  # SERIALIZED PROTECTED read/write with explicit lock for both i/o requests
+  my $sync_dbm_obj = tie %cache, 'MLDBM::Sync', '/tmp/syncdbm', O_CREAT|O_RDWR, 0640;
+  $sync_dbm_obj->Lock;
+  $cache{"AAAA"} = "BBBB";
+  my $value = $cache{"AAAA"};
+  $sync_dbm_obj->UnLock;
+
+  # SERIALIZED PROTECTED READ access with explicit read lock for both reads
+  $sync_dbm_obj->ReadLock;
+  my @keys = keys %cache;
+  my $value = $cache{'AAAA'};
+  $sync_dbm_obj->UnLock;
+
+  # MEMORY CACHE LAYER with Tie::Cache
+  $sync_dbm_obj->SyncCacheSize('100K');
+
+  # KEY CHECKSUMS, for lookups on MD5 checksums on large keys
+  my $sync_dbm_obj = tie %cache, 'MLDBM::Sync', '/tmp/syncdbm', O_CREAT|O_RDWR, 0640;
+  $sync_dbm_obj->SyncKeysChecksum(1);
+  my $large_key = "KEY" x 10000;
+  $cache{$large_key} = "LARGE";
+  my $value = $cache{$large_key};
+
+=head1 DESCRIPTION
+
+This module wraps around the MLDBM interface, by handling concurrent
+access to MLDBM databases with file locking, and flushes i/o explicitly
+per lock/unlock.  The new [Read]Lock()/UnLock() API can be used to serialize
+requests logically and improve performance for bundled reads & writes.
+
+  my $sync_dbm_obj = tie %cache, 'MLDBM::Sync', '/tmp/syncdbm', O_CREAT|O_RDWR, 0640;
+
+  # Write locked critical section
+  $sync_dbm_obj->Lock;
+    ... all accesses to DBM LOCK_EX protected, and go to same tied file handles
+    $cache{'KEY'} = 'VALUE';
+  $sync_dbm_obj->UnLock;
+
+  # Read locked critical section
+  $sync_dbm_obj->ReadLock;
+    ... all read accesses to DBM LOCK_SH protected, and go to same tied files
+    ... WARNING, cannot write to DBM in ReadLock() section, will die()
+    ... WARNING, my $v = $cache{'KEY'}{'SUBKEY'} will trigger a write so not safe
+    ...   to use in ReadLock() section
+    my $value = $cache{'KEY'};
+  $sync_dbm_obj->UnLock;
+
+  # Normal access OK too, without explicit locking
+  $cache{'KEY'} = 'VALUE';
+  my $value = $cache{'KEY'};
+
+MLDBM continues to serve as the underlying OO layer that
+serializes complex data structures to be stored in the databases.
+See the MLDBM L<BUGS> section for important limitations.
+
+MLDBM::Sync also provides built in RAM caching with Tie::Cache
+and md5 key checksum functionality.
+
+=head1 INSTALL
+
+Like any other CPAN module, either use CPAN.pm, or perl -MCPAN C<-e> shell,
+or get the file MLDBM-Sync-x.xx.tar.gz, unzip, untar and:
+
+  perl Makefile.PL
+  make
+  make test
+  make install
+
+=head1 LOCKING
+
+The MLDBM::Sync wrapper protects MLDBM databases by locking
+and unlocking around read and write requests to the databases.
+Also necessary is for each new lock to tie() to the database
+internally, untie()ing when unlocking.  This flushes any
+i/o for the dbm to the operating system, and allows for
+concurrent read/write access to the databases.
+
+Without any extra effort from the developer, an existing 
+MLDBM database will benefit from MLDBM::Sync.
+
+  my $dbm_obj = tie %dbm, ...;
+  $dbm{"key"} = "value";
+
+As a write or STORE operation, the above will automatically
+cause the following:
+
+  $dbm_obj->Lock; # also ties
+  $dbm{"key"} = "value";
+  $dbm_obj->UnLock; # also unties
+
+Just so, a read or FETCH operation like:
+
+  my $value = $dbm{"key"};
+
+will really trigger:
+
+  $dbm_obj->ReadLock; # also ties
+  my $value = $dbm{"key"};
+  $dbm_obj->UnLock; # also unties
+
+However, these lock operations are expensive because of the 
+underlying tie()/untie() that occurs for i/o flushing, so 
+when bundling reads & writes, a developer may explicitly
+use this API for greater performance:
+
+  # tie once to database, write 100 times
+  $dbm_obj->Lock;
+  for (1..100) {
+    $dbm{$_} = $_ * 100;
+    ...
+  }
+  $dbm_obj->UnLock;
+
+  # only tie once to database, and read 100 times
+  $dbm_obj->ReadLock;
+  for(1..100) {
+    my $value = $dbm{$_};  
+    ...
+  }
+  $dbm_obj->UnLock;
+
+=head1 CACHING
+
+I built MLDBM::Sync to serve as a fast and robust caching layer
+for use in multi-process environments like mod_perl.  In order
+to provide an additional speed boost when caching static data,
+I have added a RAM caching layer with Tie::Cache, which 
+regulates the size of the memory used with its MaxBytes setting.
+
+To activate this caching, just:
+
+  my $dbm = tie %cache, 'MLDBM::Sync', '/tmp/syncdbm', O_CREAT|O_RDWR, 0640;
+  $dbm->SyncCacheSize(100000);  # 100000 bytes max memory used
+  $dbm->SyncCacheSize('100K');  # 100 Kbytes max memory used
+  $dbm->SyncCacheSize('1M');    # 1 Megabyte max memory used
+
+The ./bench/bench_sync.pl, run like "bench_sync.pl C<-c>" will run 
+the tests with caching turned on creating a benchmark with 50%
+cache hits.
+
+One run without caching was:
+
+ === INSERT OF 50 BYTE RECORDS ===
+  Time for 100 writes + 100 reads for  SDBM_File                  0.16 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  MLDBM::Sync::SDBM_File     0.17 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  GDBM_File                  3.37 seconds     17980 bytes
+  Time for 100 writes + 100 reads for  DB_File                    4.45 seconds     20480 bytes
+
+And with caching, with 50% cache hits:
+
+ === INSERT OF 50 BYTE RECORDS ===
+  Time for 100 writes + 100 reads for  SDBM_File                  0.11 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  MLDBM::Sync::SDBM_File     0.11 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  GDBM_File                  2.49 seconds     17980 bytes
+  Time for 100 writes + 100 reads for  DB_File                    2.55 seconds     20480 bytes
+
+Even for SDBM_File, this speedup is near 33%.
+
+=head1 KEYS CHECKSUM
+
+A common operation on database lookups is checksumming
+the key, prior to the lookup, because the key could be
+very large, and all one really wants is the data it maps
+to.  To enable this functionality automatically with 
+MLDBM::Sync, just:
+
+  my $sync_dbm_obj = tie %cache, 'MLDBM::Sync', '/tmp/syncdbm', O_CREAT|O_RDWR, 0640;
+  $sync_dbm_obj->SyncKeysChecksum(1);
+
+ !! WARNING: keys() & each() do not work on these databases
+ !! as of v.03, so the developer will not be fooled into thinking
+ !! the stored key values are meaningful to the calling application 
+ !! and will die() if called.
+ !!
+ !! This behavior could be relaxed in the future.
+ 
+An example of this might be to cache a XSLT conversion,
+which are typically very expensive.  You have the 
+XML data and the XSLT data, so all you do is:
+
+  # $xml_data, $xsl_data are strings
+  my $xslt_output;
+  unless ($xslt_output = $cache{$xml_data.'&&&&'.$xsl_data}) {
+    ... do XSLT conversion here for $xslt_output ...
+    $cache{$xml_data.'&&&&'.$xsl_data} = $xslt_output;
+  }
+
+What you save by doing this is having to create HUGE keys
+to lookup on, which no DBM is likely to do efficiently.
+This is the same method that File::Cache uses internally to 
+hash its file lookups in its directories.
+
+=head1 New MLDBM::Sync::SDBM_File
+
+SDBM_File, the default used for MLDBM and therefore MLDBM::Sync 
+has a limit of 1024 bytes for the size of a record.
+
+SDBM_File is also an order of magnitude faster for small records
+to use with MLDBM::Sync, than DB_File or GDBM_File, because the
+tie()/untie() to the dbm is much faster.  Therefore,
+bundled with MLDBM::Sync release is a MLDBM::Sync::SDBM_File
+layer which works around this 1024 byte limit.  To use, just:
+
+  use MLDBM qw(MLDBM::Sync::SDBM_File);
+
+It works by breaking up the STORE() values into small 128 
+byte segments, and spreading those segments across many records,
+creating a virtual record layer.  It also uses Compress::Zlib
+to compress STORED data, reducing the number of these 128 byte 
+records. In benchmarks, 128 byte record segments seemed to be a
+sweet spot for space/time efficiency, as SDBM_File created
+very bloated *.pag files for 128+ byte records.
+
+=head1 BENCHMARKS
+
+In the distribution ./bench directory is a bench_sync.pl script
+that can benchmark using the various DBMs with MLDBM::Sync.
+
+The MLDBM::Sync::SDBM_File DBM is special because is uses 
+SDBM_File for fast small inserts, but slows down linearly
+with the size of the data being inserted and read.
+
+The results for a dual PIII-450 linux 2.4.7, with a ext3 file system 
+blocksize 4096 mounted async on a RAID-1 2xIDE 7200 RPM disk were as follows:
+
+ === INSERT OF 50 BYTE RECORDS ===
+  Time for 100 writes + 100 reads for  SDBM_File                  0.16 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  MLDBM::Sync::SDBM_File     0.19 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  GDBM_File                  1.09 seconds     18066 bytes
+  Time for 100 writes + 100 reads for  DB_File                    0.67 seconds     12288 bytes
+  Time for 100 writes + 100 reads for  Tie::TextDir .04           0.31 seconds     13192 bytes
+
+ === INSERT OF 500 BYTE RECORDS ===
+ (skipping test for SDBM_File 100 byte limit)
+  Time for 100 writes + 100 reads for  MLDBM::Sync::SDBM_File     0.52 seconds    110592 bytes
+  Time for 100 writes + 100 reads for  GDBM_File                  1.20 seconds     63472 bytes
+  Time for 100 writes + 100 reads for  DB_File                    0.66 seconds     86016 bytes
+  Time for 100 writes + 100 reads for  Tie::TextDir .04           0.32 seconds     58192 bytes
+
+ === INSERT OF 5000 BYTE RECORDS ===
+ (skipping test for SDBM_File 100 byte limit)
+  Time for 100 writes + 100 reads for  MLDBM::Sync::SDBM_File     1.41 seconds   1163264 bytes
+  Time for 100 writes + 100 reads for  GDBM_File                  1.38 seconds    832400 bytes
+  Time for 100 writes + 100 reads for  DB_File                    1.21 seconds    831488 bytes
+  Time for 100 writes + 100 reads for  Tie::TextDir .04           0.58 seconds    508192 bytes
+
+ === INSERT OF 20000 BYTE RECORDS ===
+ (skipping test for SDBM_File 100 byte limit)
+ (skipping test for MLDBM::Sync db size > 1M)
+  Time for 100 writes + 100 reads for  GDBM_File                  2.23 seconds   2063912 bytes
+  Time for 100 writes + 100 reads for  DB_File                    1.89 seconds   2060288 bytes
+  Time for 100 writes + 100 reads for  Tie::TextDir .04           1.26 seconds   2008192 bytes
+
+ === INSERT OF 50000 BYTE RECORDS ===
+ (skipping test for SDBM_File 100 byte limit)
+ (skipping test for MLDBM::Sync db size > 1M)
+  Time for 100 writes + 100 reads for  GDBM_File                  3.66 seconds   5337944 bytes
+  Time for 100 writes + 100 reads for  DB_File                    3.64 seconds   5337088 bytes
+  Time for 100 writes + 100 reads for  Tie::TextDir .04           2.80 seconds   5008192 bytes
+
+=head1 AUTHORS
+
+Copyright (c) 2001-2002 Joshua Chamas, Chamas Enterprises Inc.  All rights reserved.
+Sponsored by development on NodeWorks http://www.nodeworks.com and Apache::ASP
+http://www.apache-asp.org
+
+This program is free software; you can redistribute it
+and/or modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+ MLDBM(3), SDBM_File(3), DB_File(3), GDBM_File(3)