# BioMirror/NCBI.pm

=head1 NAME

BioMirror::NCBI -- BioMirror::Data packages for NCBI databanks

=cut

use strict;

# genbank => "$zpath/genbank  $dpath/gbfull gb{b,g,h,i,m,p,r,s,u,v}*.seq gbrel.txt ",
# gbest => "$zpath/genbank  $dpath/gbest gbest*.seq ",

package BioMirror::GenBank;
our @ISA = qw( BioMirror::Data );

# Constructor: hands the GenBank settings below to the parent constructor.
# Caller-supplied key/value pairs (@_) are appended after the defaults; whether
# they override the defaults is decided by the parent constructor.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'GENBANK',
        name      => 'GenBank',
        makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        mirror_dir => '$zpath/genbank/',
        # expanded_dir => '$dpath/gbfull/',
        expanded_dir => '$doffpath/gbfull/',
        online_dir   => '$dpath/gbfull/',

        dosummary => 1,    # add to summary table

        # Earlier source definition, kept under a separate key.
        source0 => {
            comment => 'GenBank Sequence Database',
            home    => 'NCBI',
            # url => 'ftp://ncbi.nlm.nih.gov/ncbi-genbank/',
            url => 'ftp://ftp.ncbi.nih.gov/ncbi-genbank/',
            web => 'http://www.ncbi.nlm.nih.gov/Genbank/GenbankOverview.html',
            get_patt     => '(\.seq\.(gz|Z)$|^gbrel\.txt|README|release\.notes)',
            exclude_patt => '(^vms|^daily)',      # ^genomes|
            local_ignore => '(^daily|local$)',    #^genomes|
            recursive    => 'false',
        },

        source => {
            comment => 'GenBank Sequence Database',
            home    => 'NCBI',
            url     => 'ftp://ftp.ncbi.nih.gov/ncbi-genbank/',
            web     => 'http://www.ncbi.nlm.nih.gov/Genbank/GenbankOverview.html',
            # get all now
            # get_patt => '(\.seq\.(gz|Z)$|^gbrel\.txt|README|release\.notes)',
            # exclude_patt => '(^vms|^daily)', # ^genomes|
            local_ignore => '(^daily|local$)',    #^genomes|
            recursive    => 'false',
        },

        data => [
            ## ? need data_pat => or such for data file regex patterns ?
            ## '(gb(?!est).*\.seq)', # nasty perl regex to skip gbest\d+.seq files
            '(\.seq)',
        ],
        docs => [    ## don't confuse w/ seq data - do we need these unexpanded?
            '(gbrel\.txt)',
            '(README)',
        ],
        @_
    );
    return $self;
}

# Returns a release string such as "111.0, 15 April 1999", or undef when
# nothing recognisable is found.  The text examined is whatever readChunk()
# returns for the first path reported by getDataPathnames(1).
#
# Expected header lines look like:
##   15 April 1999
##   NCBI-GenBank Flat File Release 111.0
sub getRelease {
    my $self = shift;
    my $rel  = undef;

    ## my $df= BioMirror::replaceVars( $self->expanded_dir . 'gbrel.txt');
    my ( $dir, $first ) = $self->getDataPathnames(1);

    # Nothing to read when no directory or no data file is reported.
    return $rel unless defined $dir && defined $first;

    my $buf = $self->readChunk( $dir . $first );
    return $rel unless defined $buf;

    if ( $buf =~ m/GenBank Flat File Release ([0-9\.]+)/ ) {
        $rel = "$1";
    }

    # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick.
    if ( $buf =~ m/Sequence Data Bank\s+([0-9]{2}\s[A-Za-z]+\s[0-9]{4})/ ) {
        # Do not emit a leading ", " when only the date line was found.
        $rel = defined $rel ? "$rel, $1" : "$1";
    }
    ##if ($buf =~ m/Genetic Sequence Data Bank\s*\n\s+([0-9]{2} [A-z]+ [0-9]{4})/)

    return $rel;
}

#-------------
##
## from NCBI: add gbgenome, unigene, taxonomy, others for biomir -? srs no/yes?
##

## switch to dailync - avoid repeated mirror of >2GB daily file

package BioMirror::GenBankNew;
our @ISA = qw( BioMirror::GenBank );

# Constructor: GenBank daily-nc update settings layered on BioMirror::GenBank.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb => 'GENBANKNEW',
        name  => 'GenBankNew',
        # makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        mirror_dir => '$zpath/genbank/daily-nc/',
        # expanded_dir => '$dpath/gbnew/',
        expanded_dir => '$doffpath/gbnew/',
        online_dir   => '$dpath/gbnew/',

        dosummary => 0,
        source    => {
            comment      => 'GenBank daily nc updates from NCBI',
            url          => 'ftp://ncbi.nlm.nih.gov/ncbi-genbank/daily-nc/',
            web          => 'http://www.ncbi.nlm.nih.gov/Database/',
            exclude_patt => '(^Old)',
            get_patt     => '(^nc.+\.flat\.(gz|Z)$|README)',
            # name_mappings => 's/\.flat/\.seq/',
            recursive        => 'false',
            max_delete_files => '100%',    # so old ones don't accumulate!
        },

        #! add method for this class to create gbcdu##.seq files from
        # nc##.flat files, using Tim Cutt's daily update perl (checks, removes dup LOCUSes)
        nodailycondense => 1,

        data => [ '(\.flat)=(.seq)', ],
        docs => [],    ## => [ 'gbcu.flat=gbcdu.seq', ], ## data => [ 'gbcu.flat', ],
        @_
    );
    return $self;
}

## override ::Data method to process NCBI Genbank daily-nc/nc*.flat files
#
# When the object's nodailycondense setting is true the call is delegated
# unchanged to the parent implementation.  Otherwise, for each entry in the
# data list, BioMirror::GenBankNewDaily::main() is run if indexing is forced
# ($BioMirror::forceindex) or BioMirror::isOldTarget() reports the expanded
# .seq file as out of date.
#
# Returns 0 (or the last status from GenBankNewDaily::main) on success and 1
# when the archive folder is missing or the condense step fails.
sub localArchiveToExpanded {
    my $self     = shift;
    my $fromdir0 = shift;    ##?

    if ( $self->nodailycondense ) {
        return $self->SUPER::localArchiveToExpanded($fromdir0);
    }

    my $error = 0;
    my @data  = @{ $self->data };
    # my @docs = @{$self->docs}; ##? do these also?
    # push(@data,@docs) if (@docs);

    my $fromdir = BioMirror::replaceVars( $self->mirror_dir );
    my $todir   = BioMirror::replaceVars( $self->expanded_dir );
    if ( !-d $fromdir ) {
        warn("Missing archive folder $fromdir\n");
        return 1;    ## && -d $todir -- not if doMakeDirs in main
    }

    foreach my $f (@data) {

        # A data entry is "source" or "source=target"; only the source half
        # is used here, the target half is not passed on.
        my ($fz) = split( /=/, $f, 2 );
        $fz = $f unless defined $fz;

        # A parenthesised source is a pattern to match against the archive
        # folder; anything else is taken as a literal file name.
        my @fz;
        if ( $fz =~ m/^\((.+)\)$/ ) {
            @fz = $self->matchfiles( $fromdir, $1 );
        }
        else {
            @fz = ($fz);
        }

        ## dang - need file date test to see if archivef is newer than expanded
        ## before running this long job
        # print "GenBankNew::localArchiveToExpanded fromdir=$fromdir todir=$todir\n" if ($BioMirror::debug);
        # print "GenBankNew::localArchiveToExpanded files=@fz\n" if ($BioMirror::debug);

        ## subclass part is here -- eval to prevent keep from die'ing
        my $isold = $BioMirror::forceindex;
        if ( !$isold && opendir( my $tod, $todir ) ) {
            my @testfile = grep { /\.seq$/ } readdir($tod);
            closedir($tod);

            # With no expanded .seq file yet, pass the bare directory path
            # (as before) without interpolating an undefined value.
            my $first     = defined $testfile[0] ? $testfile[0] : '';
            my $expandedf = "$todir/$first";    ##?? "$fromdir/$fz" pattern
            $isold = BioMirror::isOldTarget( "$fromdir/*.flat", $expandedf );
        }

        if ($isold) {
            eval {    ##? @fz is empty
                require BioMirror::GenBankNewDaily;
                $error = BioMirror::GenBankNewDaily::main( $fromdir, $todir, \@fz,
                    $BioMirror::view );
            };
            my $eval_err = $@;    # capture before anything can clobber it
            if ( $eval_err || $error ) {
                # $@ is empty when main() merely returned a failure status.
                warn( $eval_err
                    ? $eval_err
                    : "BioMirror::GenBankNewDaily::main failed with status $error\n" );
                return 1;
            }
        }
    }

    return $error;
}

## ? must override this also to return proper file list?
#
# Returns ($expanded_dir, @names) where @names are the entries of the expanded
# directory whose name contains ".seq".  The list is empty when the directory
# cannot be opened.  Both arguments are accepted for interface compatibility
# but do not influence the result.
sub getDataPathnames {
    my $self = shift;
    my ( $expandedlist, $adddocs ) = @_;    # currently unused, see above

    my $todir = BioMirror::replaceVars( $self->expanded_dir );
    my @files = ();

    ## ? support for !$expandedlist ?
    if ( opendir( my $dh, $todir ) ) {
        @files = grep { /\.seq/ } readdir($dh);
        closedir($dh);
    }
    return ( $todir, @files );
}

#-------------
##? drop this as section - do with GenBank ?
#package BioMirror::GenBankEst; # @ISA = qw( BioMirror::GenBank ); # #sub new { # my $class= shift; # my $self = $class->SUPER::new( # srsdb => 'GBEST', # name => 'GenBankEst', # # sourceflags => $BioMirror::Data::kSuperSource, # # makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex, # makeflags => 0, # # mirror_dir => '$zpath/genbank/', # expanded_dir => '$dpath/gbest/', ## keep separate from gbfull for SRS uses # # # data => [ '(gbest.*\.seq)', ], # data => [ ], # docs => [ ], # @_ ); # return $self; #} #------------- #package BioMirror::GenBankNewCum; # @ISA = qw( BioMirror::GenBank ); # #sub new { # my $class= shift; # my $self = $class->SUPER::new( # srsdb => 'GENBANKNEWCUM', # name => 'GenBankNewCum', # makeflags => 0, # comment => 'switch to daily-nc to avoid daily mirror of >2GB cum. file', # # mirror_dir => '$zpath/genbank/daily/', # expanded_dir => '', #'$dpath/gbnew/', # source => { # comment => 'GenBank daily updates from NCBI', # url => 'ftp://ncbi.nlm.nih.gov/ncbi-genbank/daily/', # get_patt => '(^gbcu\.flat\.Z$|README)', # exclude_patt => '(^Old)', # recursive => 'false', # }, # # data => [], # ## data => [ 'gbcu.flat=gbcdu.seq', ], # @_ ); # return $self; #} # #------------- # this still exists aside from newer ncbigenomes section # ? 
# package BioMirror::GBGenomesOld;
#   @ISA = qw( BioMirror::GenBank );
#
# sub new {
#   my $class= shift;
#   my $self = $class->SUPER::new(
#     srsdb => 'GBGENOMES',
#     name => 'GenBank Genomes',
#     makeflags => 0,
#
#     mirror_dir => '$zpath/genbank/genomes/',
#     expanded_dir => '', #'$dpath/genomes/',
#
#     dosummary => 1, # add to summary table
#     source => {
#       home => 'NCBI',
#       # comment => 'Genome section, Genbank from NCBI',
#       comment => 'Whole genome sequence section of GenBank',
#       url => 'ftp://ftp.ncbi.nih.gov/ncbi-genbank/genomes/',
#       # web => 'http://www.ncbi.nlm.nih.gov/Database/',
#       web => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Genome',
#       get_patt => '(\.gbk|\.gbs|\.ptt|README$)',
#       exclude_patt => '(Old/|yst_\d)',
#       recursive => 'true',
#     },
#     data => [],
#
#     ## get only these file kinds:
#     # *.gbk = GenBank flat file format
#     # *.gbs = GenBank summary file format
#     # *.ptt = Protein Table
#
#     @_ );
#   return $self;
# }

package BioMirror::GBGenomes;
our @ISA = qw( BioMirror::GenBank );

# Constructor: mirror-only settings (makeflags 0, no expanded_dir) for the
# NCBI genomes FTP area.  Caller pairs in @_ are appended after the defaults.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'GBGENOMES',
        name      => 'GenBank Genomes',
        makeflags => 0,

        mirror_dir   => '$zpath/ncbigenomes/',
        expanded_dir => '',    #'$dpath/genomes/',

        dosummary => 1,        # add to summary table
        source    => {
            home    => 'NCBI',
            comment => 'Whole genome sequence section of GenBank',
            url     => 'ftp://ftp.ncbi.nih.gov/genomes/',
            web     => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Genome',
            # get all
            # get_patt => '(\.gbk|\.gbs|\.ptt|README$)',
            # sexclude_patt => '(Old/|yst_\d)',
            recursive => 'true',
        },
        data => [],

        ## get only these file kinds ??
        # *.gbk = GenBank flat file format
        # *.gbs = GenBank summary file format
        # *.ptt = Protein Table

        @_
    );
    return $self;
}

#-------------

package BioMirror::RefSeq;
our @ISA = qw( BioMirror::GenBank );

# Constructor: RefSeq cumulative files, expanded from the archive and SRS
# indexed (see makeflags).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'REFSEQ',
        name      => 'RefSeq',
        makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        mirror_dir => '$zpath/refseq/cumulative',    #? or part of locuslink?
        ##expanded_dir => '$dpath/refseq/',
        expanded_dir => '$doffpath/refseq/',
        online_dir   => '$dpath/refseq/',

        dosummary => 1,    # add to summary table
        source    => {
            home => 'NCBI',
            # comment => 'RefSeq from NCBI',
            comment      => 'NCBI Reference Sequences',
            url          => 'ftp://ftp.ncbi.nih.gov/refseq/',
            web          => 'http://www.ncbi.nlm.nih.gov/LocusLink/refseq.html',
            get_patt     => '(rscu.gbff.Z$|rscu.gnp.Z$|README)',
            exclude_patt => '(^release|^daily|^LocusLink)',
            local_ignore => '(^genomes|local$)',
            recursive    => 'true',
        },

        data => [
            'rscu.gbff.Z=rscu_gbff.seq',
            'rscu.gnp.Z=rscu_gnp.seq',
        ],
        # $file:rscu_gbff -- GenBank flat file format, nucleotide sequence records
        # $file:rscu_gnp -- GenPept flat file format, protein sequence records
        # ! genbank/genomes will migrate to refseq as NC_ records

        # docs => [ ## don't confuse w/ seq data - do we need these unexpanded?
        #   '(README)',
        # ],
        @_
    );
    return $self;
}

#-------------

package BioMirror::BlastDB;
## no srs index - maybe blast service later
## this one should mirror ~ once/month
our @ISA = qw( BioMirror::Data );

# Constructor: mirror-only settings for the NCBI BLAST database files.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'BLASTDB',
        name      => 'BlastDB',
        makeflags => 0,    ## not yet

        mirror_dir   => '$zpath/blast/',
        expanded_dir => '',    ## not yet

        dosummary => 1,        # add to summary table
        source    => {
            home         => 'NCBI',
            comment      => 'Biosequence databases for BLAST searches',
            url          => 'ftp://ftp.ncbi.nih.gov/blast/db/',
            web          => 'http://www.ncbi.nlm.nih.gov/BLAST/blast_databases.html',
            get_patt     => '(\.Z$|README)',
            exclude_patt => '(^month|^old)',
            local_ignore => '(^month|local$)',
            recursive    => 'false',
        },
        data => [ ],
        docs => [ ],
        @_
    );
    return $self;
}

#-------------

package BioMirror::BlastDBDaily;
## this one should mirror ~ nightly
our @ISA = qw( BioMirror::BlastDB );

# Constructor: same FTP area as BlastDB, restricted to the "month*" files.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb => 'BLASTDBNEW',
        name  => 'BlastDB Daily',
        # makeflags => 0, ## not yet
        # mirror_dir => '$zpath/blast/',
        # expanded_dir => '', ## not yet

        dosummary => 0,
        source    => {
            comment      => 'Blast DB daily updates from NCBI',
            url          => 'ftp://ftp.ncbi.nih.gov/blast/db/',
            get_patt     => '(^month.*\.Z$)',
            local_ignore => '(^(?!month).*)',
            recursive    => 'false',
        },
        # data => [ ],
        # docs => [ ],
        @_
    );
    return $self;
}

#-------------

package BioMirror::UniGene;
our @ISA = qw( BioMirror::Data );

# Constructor: UniGene collection, expanded from the archive and SRS indexed.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'UNIGENE',
        name      => 'UniGene',
        makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        mirror_dir   => '$zpath/unigene/',
        expanded_dir => '$dpath/unigene/',

        dosummary => 1,    # add to summary table
        source    => {
            home    => 'NCBI',
            comment => 'Unique Gene Sequence Collection for Human, Mouse, Rat, and Zebrafish',
            url     => 'ftp://ftp.ncbi.nih.gov/repository/unigene/',
            web     => 'http://www.ncbi.nlm.nih.gov/UniGene/',
            # NOTE(review): unlike the other '(...)' patterns in this file this
            # one starts with '+' and has no parentheses; as a plain Perl regex
            # it would not compile.  Left untouched -- confirm whether the
            # mirror layer gives a leading '+' a special meaning.
            exclude_patt => '+^Old|\~$',
            recursive    => 'false',
        },

        data => [ '(\.data)' ],
        @_
    );
    return $self;
}

# Disabled release reader (the FIXME_ prefix keeps it from overriding
# getRelease).  Note that data[0] is the pattern '(\.data)', so the path built
# below is not a real file name.
# Expected header:  Release 25.00, June 30, 1998
sub FIXME_getRelease {
    my $self = shift;
    my $rel  = undef;

    my $df  = BioMirror::replaceVars( $self->expanded_dir . ${ $self->data }[0] );
    my $buf = $self->readChunk($df);
    return $rel unless defined $buf;

    # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick.
    if ( $buf =~ m/Release ([0-9\.]+), ([0-9A-Za-z, ]+)/ ) {
        $rel = "$1, $2";
    }
    return $rel;
}

package BioMirror::UniGeneMM;
our @ISA = qw( BioMirror::UniGene );

# Constructor: SRS-index-only variant of UniGene (srsdb MMUNIGENE).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb       => 'MMUNIGENE',
        name        => 'UniGeneMM',
        makeflags   => $BioMirror::Data::kDoSrsIndex,
        sourceflags => $BioMirror::Data::kSuperSource,
        dosummary   => 0,
        @_
    );
    return $self;
}

package BioMirror::UniGeneRN;
our @ISA = qw( BioMirror::UniGene );

# Constructor: SRS-index-only variant of UniGene (srsdb RNUNIGENE).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb       => 'RNUNIGENE',
        name        => 'UniGeneRN',
        makeflags   => $BioMirror::Data::kDoSrsIndex,
        sourceflags => $BioMirror::Data::kSuperSource,
        dosummary   => 0,
        @_
    );
    return $self;
}

package BioMirror::UniGeneDR;
our @ISA = qw( BioMirror::UniGene );

# Constructor: SRS-index-only variant of UniGene (srsdb DRUNIGENE).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb       => 'DRUNIGENE',
        name        => 'UniGeneDR',
        makeflags   => $BioMirror::Data::kDoSrsIndex,
        sourceflags => $BioMirror::Data::kSuperSource,
        dosummary   => 0,
        @_
    );
    return $self;
}

#-------------
## moved to BioMirror/MeowGenes.pm
## package BioMirror::LocusLink;

#-------------

package BioMirror::Taxonomy_NCBI;
our @ISA = qw( BioMirror::Data );

# Constructor: mirror-only settings for the NCBI taxonomy dump.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'TAXONOMY_NCBI',
        name      => 'Taxonomy',
        makeflags => 0,    # $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        dosummary => 0,    # add to summary table
        source    => {
            home    => 'NCBI',
            comment => 'Species names',    ## comment => 'Taxonomy data from NCBI',
            url      => 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/',
            web      => 'http://www.ncbi.nlm.nih.gov/Taxonomy/',
            get_patt => '(\.gz$|readme|README)',
            recursive => 'true',
        },

        mirror_dir   => '$zpath/taxonomy/ncbi/',
        expanded_dir => '',    # '$dpath/taxonomy/',
        data         => [ ],
        @_
    );
    return $self;
}

#-------------
## dummy for summary

package BioMirror::Taxonomy;
our @ISA = qw( BioMirror::Data );

# Constructor: summary-table placeholder; defines no srsdb and no data files.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        name      => 'Taxonomy',
        makeflags => 0,
        dosummary => 1,    # add to summary table
        source    => {
            home    => 'NCBI, EBI',
            comment => 'Species names',
            url     => 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/',
            web     => 'http://www.ncbi.nlm.nih.gov/Taxonomy/',
        },
        mirror_dir => '$zpath/taxonomy/',
        @_
    );
    return $self;
}

#-------------

package BioMirror::Prints;
our @ISA = qw( BioMirror::Data );

# Constructor: PRINTS data in Blocks format, expanded and SRS indexed.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'PRINTS',
        name      => 'Prints',
        makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        ## source should be constant information for each BioMirror::Data class
        ## but want to support alternate url types (http: ftp: rsync: others?)
        dosummary => 0,
        source    => {    ## ftp-mirror package info
            comment       => 'PRINTS from NCBI',
            url           => 'ftp://ftp.ncbi.nih.gov/repository/blocks/unix/',
            get_patt      => '(^prints)',
            name_mappings => 's|^prints[^\/]*/||',    ## cut the leading dir
            exclude_patt  => '(^Old|\.tar$)',
            recursive     => 'true',
        },

        ## need to support remote mirrors ... but don't define in each data subclass
        ## mirror_dir should be constant for all BioMirrors (but for $zpath prefix)
        ## see blocks dir name change bug
        mirror_dir   => '$zpath/blocks/data-prints/',    ##?
        expanded_dir => '$dpath/prints/',

        data => [ 'prints.dat', ],
        @_
    );
    return $self;
}

# Returns "<version>, <date>" taken from what readChunk() returns for the
# first data file under expanded_dir, or undef if no header is recognised.
# Expected header:  Prints Database 22.0 in Blocks Format, March 1999
sub getRelease {
    my $self = shift;
    my $rel  = undef;

    my $df  = BioMirror::replaceVars( $self->expanded_dir . ${ $self->data }[0] );
    my $buf = $self->readChunk($df);
    return $rel unless defined $buf;

    # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick.
    if ( $buf =~ m/Prints Database ([0-9\.]+)\s+in Blocks Format,\s+([0-9A-Za-z ]+)/ ) {
        $rel = "$1, $2";
    }
    return $rel;
}

#-------------

package BioMirror::Blocks;
our @ISA = qw( BioMirror::Data );

# Constructor: Blocks database, expanded from the archive and SRS indexed.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb     => 'BLOCKS',
        name      => 'Blocks',
        makeflags => $BioMirror::Data::kFromArchive|$BioMirror::Data::kDoSrsIndex,

        dosummary => 1,    # add to summary table
        source    => {
            home     => 'NCBI',
            comment  => 'Highly conserved regions of proteins',
            url      => 'ftp://ftp.ncbi.nih.gov/repository/blocks/unix/',
            web      => 'http://blocks.fhcrc.org/',
            get_patt => '(^blocks-11|^README)',
            ## dang ^ want to get only most recent blocks- and blocksplus- versions !
            ## and put to staticly named mirror_dir
            name_mappings => 's|^blocks[^\/]*/||',    ## cut the leading dir
            exclude_patt  => '(^Old|\.tar$)',
            recursive     => 'true',
        },

        ## bad - blocks data goes into variable dir w/ version#
        ## need to deal w/ changeable source folders, like sprot data name
        ## '(sprot[0-9]+.dat.Z)=seq.dat',
        mirror_dir   => '$zpath/blocks/data-blocks/',    ##
        expanded_dir => '$dpath/blocks/',

        data => [ 'blocks.dat', ],
        @_
    );
    return $self;
}

# Returns "<version>, <date>" taken from what readChunk() returns for the
# first data file under expanded_dir, or undef if no header is recognised.
# Expected header:  Blocks Database Version 11.0, July 1998
sub getRelease {
    my $self = shift;
    my $rel  = undef;

    my $df  = BioMirror::replaceVars( $self->expanded_dir . ${ $self->data }[0] );
    my $buf = $self->readChunk($df);
    return $rel unless defined $buf;

    # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick.
    if ( $buf =~ m/Blocks Database Version ([0-9\.]+)\s*,\s*([0-9A-Za-z ]+)/ ) {
        $rel = "$1, $2";
    }
    return $rel;
}

#-------------

package BioMirror::BlocksPlus;    ##
# NOTE(review): in the flattened source it was unclear whether this @ISA line
# had been commented out.  It is kept active because new() below calls
# SUPER::new, which cannot resolve without a parent class.
our @ISA = qw( BioMirror::Blocks );
## for now just mirror BlocksPlus, don't srsindex
## gone ; jun03

# Constructor: mirror-only settings for the blocksplus files (no expanded_dir,
# empty data list).
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        srsdb => 'BLOCKSPLUS',
        name  => 'Blocks Plus',
        ##makeflags => $BioMirror::Data::kFromArchive, ## |$BioMirror::Data::kDoSrsIndex -- not here

        dosummary => 0,
        source    => {
            comment       => 'BLOCKS Plus from NCBI',
            url           => 'ftp://ftp.ncbi.nih.gov/repository/blocks/unix/',
            get_patt      => '(^blocksplus)',
            name_mappings => 's|^blocksplus[^\/]*/||',    ## cut the leading dir
            exclude_patt  => '(^Old|\.tar$)',
            recursive     => 'true',
        },

        mirror_dir   => '$zpath/blocks/blocksplus',
        expanded_dir => '',    ## '$dpath/blocks/blocksplus',

        data => [],            ## [ 'blocks.dat', ],
        @_
    );
    return $self;
}

#-------------
## moved to MeowGenes.pm
## package BioMirror::CElegans;

1;