#!/usr/local/bin/perl -w
# BioMirror/GenBankNewDaily.pm

=head1 NAME

BioMirror::GenBankNewDaily -- genbank daily update methods.

=head1 DESCRIPTION

This perl script reads in GenBank nc-daily entries,
rebuilds the cumulative GenBank update.

=head1 AUTHOR

Tim Cutts, 12th January 1999
adapted by d.gilbert for use with BioMirror packages

=cut

package BioMirror::GenBankNewDaily;

use strict;
use warnings;

## Run state shared by main(), process() and CleanDie().
## These are package variables on purpose: CleanDie() reads $fileno to know
## which gbcuN.new files to remove, and all three subs share the NEW handle.
our %id     = ();   # LOCUS names already written to the cumulative output
our $debug  = 0;    # copied from $BioMirror::debug at the start of main()
our $kept   = 0;    # records written so far
our $n      = 0;    # records read so far
our $fileno = 1;    # index of the gbcuN.new file currently being written

## Tunables; the defaults reproduce the previously hard-coded values.
our $records_per_file  = 100_000;   # start a new gbcuN.new after this many kept records
our $progress_interval = 100_000;   # print a progress line every this many records read

## Forward declaration so the prototype is known where main() calls process().
sub process($;$);

## main($fromdir, $todir, \@archivefiles [, $viewonly])
##
## Rebuilds the cumulative gbcuN.seq files in $todir from the archive files
## named in @archivefiles (names relative to $fromdir).
##   - archives are processed newest first, so the first copy seen of any
##     LOCUS is the one that is kept (see process());
##   - output is written to gbcuN.new and renamed to gbcuN.seq at the end;
##   - with a true $viewonly the planned steps are printed, process() reads
##     nothing, and the final unlink/rename is skipped.
## Returns 0 when there are no archive files, otherwise the (false) result of
## the BioMirror::callSystem() call that removed the old gbcu*.seq files.
## Dies on any failure; failures after output has started go through
## CleanDie(), which removes the partial gbcuN.new files first.
## NOTE(review): $fromdir is used again after chdir($todir), so it presumably
## has to be an absolute path -- confirm against callers.
sub main($$$;$) {
    my ($fromdir, $todir, $refArchivefiles, $viewonly) = @_;

    $debug  = $BioMirror::debug || 0;
    %id     = ();
    $kept   = 0;
    $n      = 0;
    $fileno = 1;

    my @infiles = @$refArchivefiles;

    ## The sort below stats the bare file names, so it only works from inside
    ## $fromdir.  A wrong order would silently keep stale records, so a failed
    ## chdir is fatal whenever there is something to sort.
    unless (chdir($fromdir)) {
        die "Can't chdir($fromdir): $!\n" if @infiles;
    }
    @infiles = sort byNewestFile @infiles;

    print STDERR "BioMirror::GenBankNewDaily:: - expanding\n from $fromdir\n to $todir\n @infiles\n";
    return 0 unless scalar(@infiles);

    print STDERR "chdir($todir)\n" if ($debug || $viewonly);
    chdir($todir) || die "Can't chdir($todir): $! ";

    ## The old cumulative files (about 5+ GB) are removed before regenerating.
    ## Since this is the live public data, the window without it should be
    ## kept as short as possible.
    ## NOTE(review): this removal and the open below also run when $viewonly
    ## is set, unless BioMirror::callSystem() itself honours view-only mode --
    ## confirm.
    my $error = BioMirror::callSystem("/bin/rm -f gbcu*.seq");
    if ($error) {
        die "error removing old gbcu*.seq: $! ";
    }

    ## Open the new cumulative flatfile for output.
    open(NEW, '>', "gbcu$fileno.new")
        || die "Could not open gbcu$fileno.new to write: $! ";

    foreach my $inf (@infiles) {
        process("$fromdir/$inf", $viewonly);
    }

    ## Buffered write errors only surface at close, so check it.
    close(NEW)
        or CleanDie("Could not close gbcu$fileno.new: $!\n");

    ## Now delete any old version and move each new file into its place.
    my $infile0 = "$fromdir/" . $infiles[0];    # newest archive; source of the file date
    for my $i (1 .. $fileno) {
        if (-e "gbcu$i.seq") {
            print STDERR "unlink gbcu$i.seq \n" if ($debug || $viewonly);
            unless ($viewonly) {
                unlink("gbcu$i.seq")
                    || CleanDie("Could not delete gbcu$i.seq: $!\n");
            }
        }

        print STDERR "rename gbcu$i.new gbcu$i.seq \n" if ($debug || $viewonly);
        unless ($viewonly) {
            rename("gbcu$i.new", "gbcu$i.seq")
                || CleanDie("Could not rename gbcu$i.new: $!\n");
            ## Give the output the date of the newest archive so later runs can
            ## test for new data.  ???? screws srs index test?
            system("$BioMirror::CopyFileDate $infile0 gbcu$i.seq");
        }
    }

    print STDERR join("\n",
        "Records read:     $n",
        "Records replaced: " . ($n - $kept),
        "Records written:  $kept\n");

    return $error;
}

## Sort comparator: newest file first.
## -M is the file age in days before now, so the smaller value is the newer
## file.  (nc1234.. could also be sorted by file name - biggest is newest.)
sub byNewestFile {
    return (-M $a) <=> (-M $b);
}

## process($archivef [, $viewonly])
##
## Decompresses one archive through a pipe and appends every record whose
## LOCUS name has not been seen yet to the current gbcuN.new file (handle
## NEW), starting a new output file every $records_per_file kept records.
## Updates the package counters $n, $kept and $fileno and the %id table.
## With a true $viewonly it only prints the command it would run.
## Read/write/open failures are fatal via CleanDie(); a failing decompressor
## exit status is reported on STDERR but does not abort the run.
sub process($;$) {    ## dgg
    my ($archivef, $viewonly) = @_;

    ## ".Z" archives use $BioMirror::Zcat, everything else $BioMirror::Gzcat.
    my $zcat = ($archivef =~ /\.Z$/) ? $BioMirror::Zcat : $BioMirror::Gzcat;

    print STDERR " $zcat $archivef| process() \n" if ($viewonly || $debug);
    return if ($viewonly);

    ## Genbank files use // as the separator.
    local $/ = "\n//\n";

    ## The command is passed as one string because $zcat may carry its own
    ## options; it is run the same way the two-argument pipe open ran it.
    open(my $inz, '-|', "$zcat $archivef")
        || CleanDie("Can't $zcat $archivef: $!");

    while (my $record = <$inz>) {
        $n++;
        print STDERR "$n...\n" if (($n % $progress_interval) == 0);

        my ($locus) = $record =~ /^LOCUS\s+(\S+)/;

        ## A chunk that does not start with a LOCUS line cannot be indexed;
        ## report it loudly and move on.
        unless (defined $locus) {
            print STDERR "\n\nAWWOOOGGGAH!\n";
            print STDERR $record;
            print STDERR "\n";
            next;
        }

        ## Because we are fed the files newest first, we ignore any ID that
        ## we have seen before (the version we already have will be newer).
        if (exists $id{$locus}) {
            print STDERR "Already got LOCUS $locus\n" if ($debug > 1);
            next;
        }

        $kept++;
        $id{$locus} = 1;

        ## Low-precedence "or" so that the result of print itself is tested.
        print NEW $record
            or CleanDie("Could not write to new file: $!\n");

        if (($kept % $records_per_file) == 0) {
            close(NEW)
                or CleanDie("Could not close gbcu$fileno.new: $!\n");
            $fileno++;
            print STDERR "Starting gbcu$fileno.new\n";
            open(NEW, '>', "gbcu$fileno.new")
                || CleanDie("Could not open gbcu$fileno.new to write: $!\n");
        }
    }

    ## Closing a pipe waits for the child and returns false if it failed.
    unless (close($inz)) {
        print STDERR "Warning: '$zcat $archivef' ",
            ($! ? "failed on close: $!" : "exited with status $?"), "\n";
    }

    return;
}    ## sub process
## CleanDie(@message)
##
## Abort after a failure part-way through a rebuild: close the output handle
## NEW, remove every gbcuN.new file numbered 1 .. $fileno (removal failures
## are ignored), then die with the caller's message.
sub CleanDie {
    close(NEW);
    unlink(map { "gbcu$_.new" } 1 .. $fileno);
    die @_;
}

1;    ## perly

__END__

example

BioMirror::GenBankNewDaily:: - expanding
 from /c3/iubio/biomir-pub/biomirror/genbank/daily-nc/
 to /c3/iubio/data/gbnew/
 nc1124.flat.Z nc1123.flat.Z nc1122.flat.Z nc1121.flat.Z nc1120.flat.Z
 nc1119.flat.Z nc1118.flat.Z nc1117.flat.Z nc1116.flat.Z nc1115.flat.Z
 nc1114.flat.Z nc1113.flat.Z nc1112.flat.Z nc1111.flat.Z nc1110.flat.Z
 nc1109.flat.Z nc1104.flat.Z nc1103.flat.Z nc1102.flat.Z nc1031.flat.Z
 nc1030.flat.Z nc1029.flat.Z nc1028.flat.Z nc1027.flat.Z nc1026.flat.Z
 nc1025.flat.Z nc1024.flat.Z nc1023.flat.Z nc1022.flat.Z nc1021.flat.Z
 nc1020.flat.Z nc1019.flat.Z nc1018.flat.Z nc1017.flat.Z nc1016.flat.Z
chdir(/c3/iubio/data/gbnew/)
system( /bin/rm -f gbcu*.seq )
 zcat /c3/iubio/biomir-pub/biomirror/genbank/daily-nc//nc1124.flat.Z| process()
1000...
2000...
.....
 zcat /c3/iubio/biomir-pub/biomirror/genbank/daily-nc//nc1123.flat.Z| process()
Already got LOCUS PTY238812
Already got LOCUS PCA249563
...
rename gbcu1.new gbcu1.seq
rename gbcu2.new gbcu2.seq
rename gbcu3.new gbcu3.seq
rename gbcu4.new gbcu4.seq
rename gbcu5.new gbcu5.seq
Records read: 503643
Records replaced: 10544
Records written: 493099

////

isOldTarget: /bio/mb/srss/510/index/genbanknew_id.inx older than /c3/iubio/data/gbnew//gbcu2.seq? yes
system( /bio/mb/srss/510/bin/solaris/srscheck -l 'GENBANKNEW' -o /tmp/srsindex.genbanknew; \
 /tmp/srsindex.genbanknew )
======> must build indices for library "GENBANKNEW"
+ time srsbuild GENBANKNEW -xdir SRSINX: -odir SRSINX: -s unix -nn
...processing GENBANKNEW
...processing /c3/iubio/data/gbnew/gbcu2.seq
...processing /c3/iubio/data/gbnew/gbcu2.seq