#!/usr/local/bin/perl -w
# BioMirror/GenBankNewDaily.pm

=head1 NAME

BioMirror::GenBankNewDaily -- genbank daily update methods.

=head1 DESCRIPTION

This Perl module reads the GenBank nc-daily entries and rebuilds the
cumulative GenBank update from them.

=head1 AUTHOR

 Tim Cutts, 12th January 1999
 adapted by d.gilbert for use with BioMirror packages

=cut

package BioMirror::GenBankNewDaily;

# use strict;
BEGIN {
my %id=();

# Genbank files use // as the separator
## local $/ = "\n//\n"; ## see process()

my $debug = 0;
my $kept = 0;
my $n = 0;
my $fileno = 1;
}

sub main( $$$;$) {
	local($fromdir,$todir,$refArchivefiles,$viewonly)= @_;
	
	$debug= $BioMirror::debug;
	%id=();
	$kept = 0;
	$n = 0;
	$fileno = 1;
	
	my @infiles= @$refArchivefiles;
	chdir($fromdir); ## need this for sort unless append to each infile
	@infiles= sort byNewestFile @infiles;
	print STDERR "BioMirror::GenBankNewDaily:: - expanding\n from $fromdir\n to $todir\n @infiles\n"; ## if ($viewonly||$debug);
	return 0 unless scalar(@infiles);
	
	print STDERR  "chdir($todir)\n" if ($debug||$viewonly);
	chdir($todir) || die "Can't chdir($todir): $! ";
	##^^ ? drop this requirement?
	
		##? this is about 5+ GB - should we erase before regenerating or not?
		## if yes, may as well create .new as .seq instead.
		## since this (now) is the live public data, minimize missing time.
	$error= BioMirror::callSystem("/bin/rm -f gbcu*.seq");
	if ($error) { die "error removing old gbcu*.seq: $! "; }

	# Open the new cumulative flatfile for output
	open (NEW, ">gbcu$fileno.new") ||
    die "Could not open gbcu$fileno.new to write: $! ";

	foreach my $inf (@infiles) { process( "$fromdir/$inf", $viewonly); }
	
	close NEW;

	# Now we need to delete the old version and move the new one in its
	# place.

	for my $n (1..$fileno) {
	
	  if (-e "gbcu$n.seq")
	  {
		print  STDERR  "unlink gbcu$n.seq \n" if ($debug||$viewonly);
	  unless($viewonly) {
	    unlink("gbcu$n.seq") ||
			&CleanDie("Could not delete gbcu$n.seq: $!\n");
			}
	  }

		print  STDERR "rename gbcu$n.new gbcu$n.seq \n" if ($debug||$viewonly);
		unless($viewonly) {
	 		rename("gbcu$n.new", "gbcu$n.seq") ||
	      &CleanDie("Could not rename gbcu$n.new: $!\n");
	    my $infile0= "$fromdir/".$infiles[0];
			system("$BioMirror::CopyFileDate $infile0 gbcu$n.seq"); 
			## ???? set proper date so we can test for new data - screws srs index test?
	  	}
	}

	print  STDERR  join("\n",
		   "Records read:     $n",
		   "Records replaced: ".($n-$kept),
		   "Records written:  $kept\n");
	return $error;
}

  ## -M is file age in days.hrs before now: result == $a is newer than $b
  ## or  nc1234.. can be sorted by file name - biggest ## is newest
sub byNewestFile { return (-M $a) <=> (-M $b); }

sub process($;$) { ## dgg
	local( $archivef, $viewonly)= @_;
	
	if ($archivef =~ /.Z$/) { $Zcat= $BioMirror::Zcat; }
	else { $Zcat= $BioMirror::Gzcat; }

	print  STDERR " $Zcat $archivef| process() \n" if ($viewonly||$debug);
	return if ($viewonly);

	# Genbank files use // as the separator
	local $/ = "\n//\n";

	local(*INZ);
	open(INZ, "$Zcat $archivef|") || 
		&CleanDie("Can't $Zcat $archivef: $!");
	
	while (<INZ>)
	{
	  $n++;
	  print STDERR  "$n...\n" if (($n % 100000) == 0);

	  my($id) = /^LOCUS\s+(\S+)/;

	  # Because the wrapping shell script feeds us the files newest first,
	  # we ignore any ID that we have seen before (because the version we
	  # already have will be newer)

	  unless (defined $id) {
	    print  STDERR  "\n\nAWWOOOGGGAH!\n";
	    print  STDERR  $_;
	    print  STDERR  "\n";
	    next;
	  }

	  if (exists $id{$id})
	  {
	    print  STDERR "Already got LOCUS $id\n" if ($debug>1);
	    next;
	  }

	  $kept++;

	  $id{$id} = 1;

	  print NEW "$_" || &CleanDie("Could not write to new file: $!\n");

	  if (($kept % 100000) == 0){
	    close NEW;
	    $fileno++;
	    print  STDERR  "Starting gbcu$fileno.new\n";
	    open (NEW, ">gbcu$fileno.new") ||
		&CleanDie("Could not open gbcu$fileno.new to write: $!\n");
	  }

	}

} ## sub process


sub CleanDie {

  close NEW;
  for my $n (1..$fileno)
  {
    unlink("gbcu$n.new");
  }
  die(@_);

}

1; ## perly 

__END__

example
BioMirror::GenBankNewDaily:: - expanding
 from /c3/iubio/biomir-pub/biomirror/genbank/daily-nc/
 to /c3/iubio/data/gbnew/
 nc1124.flat.Z nc1123.flat.Z nc1122.flat.Z nc1121.flat.Z nc1120.flat.Z nc1119.fl
at.Z nc1118.flat.Z nc1117.flat.Z nc1116.flat.Z nc1115.flat.Z nc1114.flat.Z nc111
3.flat.Z nc1112.flat.Z nc1111.flat.Z nc1110.flat.Z nc1109.flat.Z nc1104.flat.Z n
c1103.flat.Z nc1102.flat.Z nc1031.flat.Z nc1030.flat.Z nc1029.flat.Z nc1028.flat
.Z nc1027.flat.Z nc1026.flat.Z nc1025.flat.Z nc1024.flat.Z nc1023.flat.Z nc1022.
flat.Z nc1021.flat.Z nc1020.flat.Z nc1019.flat.Z nc1018.flat.Z nc1017.flat.Z nc1
016.flat.Z
chdir(/c3/iubio/data/gbnew/)
system( /bin/rm -f gbcu*.seq )
 zcat /c3/iubio/biomir-pub/biomirror/genbank/daily-nc//nc1124.flat.Z| process()
1000...
2000...
.....
 zcat /c3/iubio/biomir-pub/biomirror/genbank/daily-nc//nc1123.flat.Z| process()
Already got LOCUS PTY238812
Already got LOCUS PCA249563
...
rename gbcu1.new gbcu1.seq
rename gbcu2.new gbcu2.seq
rename gbcu3.new gbcu3.seq
rename gbcu4.new gbcu4.seq
rename gbcu5.new gbcu5.seq
Records read:     503643
Records replaced: 10544
Records written:  493099
////
isOldTarget: /bio/mb/srss/510/index/genbanknew_id.inx older than /c3/iubio/data/
gbnew//gbcu2.seq?  yes
system( /bio/mb/srss/510/bin/solaris/srscheck -l 'GENBANKNEW' -o /tmp/srsindex.g
enbanknew; \
/tmp/srsindex.genbanknew
 )
======> must build indices for library "GENBANKNEW"
+ time srsbuild GENBANKNEW -xdir SRSINX: -odir SRSINX: -s unix -nn
...processing GENBANKNEW
...processing /c3/iubio/data/gbnew/gbcu2.seq
...processing /c3/iubio/data/gbnew/gbcu2.seq