# BioMirror/SRS.pm # Bio-Mirror perl packages # extract from srsperls.pl # d.gilbert ## oct01 - edit srs5 to srs6 for iubio, elsewhere ## ! ## ! add method to update srs .i files to reflect new databank files - genbank/embl esp. ## ( srs doesn't deal with file match patterns ). ## ? maybe also add general srs .i/.is management methods here? ## ## add method to wgetz srs .i, .is, .it from other servers - collect archive of them? ## e.g. http://kalo/srsbin/cgi-bin/wgetz?-fun+PageIcarusFile+-l+PIR+-ifile+i ## ## offer to ftp fetch srs5.tar.Z if not installed? ## =head1 NAME BioMirror::SRS - SRS Sequence Retrieval System, using perl calls via system(). =cut package BioMirror::SRS; $VERSION = "1.0"; sub Version { $VERSION; } use Carp (); $sourceUrl = 'ftp://ftp.ebi.ac.uk/pub/software/unix/srs/srs5.1.0.tar.gz'; $homeServer = 'http://srs.ebi.ac.uk/'; #$wgetz= '/srs5bin/cgi-bin/wgetz'; $wgetz= '/srs6bin/cgi-bin/wgetz'; #$allservers= 'http://srs.ebi.ac.uk/srs5list.html'; ## still there, oct01 $allservers='http://www.lionbio.co.uk/publicsrs.html'; $SRSLibIds= 'srsdb-libids.i'; # special cut of srsdb.i for preserving libid # %SkipIfile= ('srsdb.i'=>1, 'srsgen.i' =>1, 'readseq.i' =>1, $SRSLibIds=>1); $view= 0; $debug= 0; $forceindex= 0; $SRSsymbols= '\[\]\:\&'; ## this probably includes more, need to test $DefaultBoolean= '&'; ## '|' is alternative $kServers = 1; $kDocument = 2; =head1 BioMirror::SRS::init($SRSRoot) Initialize SRS environment. $SRRoot is path to SRS installation (level of index/ etc/ bin/ ...) $ENV{SRSROOT} overrides $SRRoot value =cut sub init { local($srsroot) = @_; ##? return if ($didInit); if ($ENV{'SRSROOT'}) { $srsroot = $ENV{'SRSROOT'}; } ## parameters? $view= $BioMirror::view; $debug= $BioMirror::debug; $forceindex= $BioMirror::forceindex; $OS = `uname`; chop($OS); if ($OS eq 'SunOS' && (`uname -r` =~ /^[5-9]/)) { $OS = 'Solaris'; } $_= $OS; OSCASE: { if (/Solaris/){ $OS_SPECIFIC = "solaris"; last OSCASE; } if (/SunOS/) { $OS_SPECIFIC = "sunos"; last OSCASE; } if (/OSF1/) { $OS_SPECIFIC = "osf"; last OSCASE; } if (/ULTRIX/) { $OS_SPECIFIC = "ultrix"; last OSCASE; } if (/IRIX/) { $OS_SPECIFIC = "irix"; last OSCASE; } if (/IRIX64/) { $OS_SPECIFIC = "irix64"; last OSCASE; } if (/Linux/) { $OS_SPECIFIC = "linux"; last OSCASE; } if (/AIX/) { $OS_SPECIFIC = "aix"; last OSCASE; } if (/HP-UX/) { $OS_SPECIFIC = "hpux"; last OSCASE; } ## add /Rhapsody/ == macosx ## die "Unsupported operating system: $OS"; Carp::croak("Unsupported operating system: $OS"); } # SRS internal directories $ENV{"SRSROOT"} = $SRSROOT = $srsroot; $ENV{"SRSDAT"}= $SRSDAT= "$SRSROOT/etc"; $ENV{"SRSDB"}= $SRSDB= "$SRSROOT/icarus/db"; $ENV{"SRSICA"}= $SRSICA= "$SRSROOT/icarus/util"; $ENV{"SRSSOU"}= $SRSSOU= "$SRSROOT/src"; $ENV{"SRSETC"}= $SRSETC= "$SRSROOT/etc"; $ENV{"SRSWWW"}= $SRSWWW= "$SRSROOT/www"; $ENV{"SRSWWWTMP"}= $SRSWWWTMP= "$SRSROOT/tmp"; $ENV{"SRSDOC"}= $SRSDOC= "${SRSWWW}/doc"; $ENV{"SRSDEMO"}= $SRSDEMO= "$SRSROOT/demo"; $ENV{"SRSMAN"}= $SRSMAN= "${SRSWWW}/man"; $ENV{"SRSINX"}= $SRSINX= "$SRSROOT/index"; $ENV{"SRSSEC"}= $SRSSEC= "$SRSROOT/etc/${OS_SPECIFIC}"; $ENV{"SRSEXE"}= $SRSEXE= "$SRSROOT/bin/${OS_SPECIFIC}"; if (! -d $SRSEXE ) { my $msg= "\nCan't locate SRS executables at $SRSEXE\n" . "Fetch SRS software from\n $sourceUrl\n"; Carp::croak($msg); ## die "$msg "; } $SRSChecker= $ENV{'SRSEXE'} . "/srscheck"; $SRSIndexer= $ENV{'SRSEXE'} . "/srsbuild"; $SRSIndexDir= $ENV{'SRSINX'}; $didInit= 1; } =head1 BioMirror::SRS::install() install - information for installing SRS software =cut sub install { print <`; my @proc = split ("\n", $tmp); foreach (@proc) { if (/srsbuild/ || /srscheck/ ) { return 1; # some indexing job is running already } } my($srsupf)= "/tmp/srsupd$$.csh"; my($cmd) = < /dev/null 2>&1"); return $error; } =head1 BioMirror::SRS::setRelease($dbname, $release) set data release number for SRS dataset $dbname - SRS dataset, $release - value to set =cut sub setRelease($$) { local($dbname, $release)= @_; return 0 unless($dbname && $release); print STDERR "setRelease for $dbname to $rel\n" if ($debug||$view); return &BioMirror::callSystem("$SRSIndexer -rel \'$release\' $dbname"); } =head1 BioMirror::SRS::info($dbname) get info from SRS dataset =cut sub info($) { local($dbname)= @_; return 0 unless($dbname); ## return &BioMirror::callSystem("$SRSEXE/getz -info $dbname"); ##? instead return string of output for further editing/formatting? return `$SRSEXE/getz -info $dbname`; } =head1 BioMirror::SRS::libraries() get SRS data library list =cut sub libraries() { return &BioMirror::callSystem("$SRSEXE/getz -libs"); } sub libInfoUrl { my($server,$srsdb) = @_; ##s5# return "$server$wgetz?-fun+pagelibinfo+-info+$srsdb"; return "$server$wgetz?-page+LibInfo+-lib+$srsdb"; } =head1 BioMirror::SRS::allInfo($srsserver,$flags,$refdbarray) Get info for all known SRS databanks, including databank name, documentation, and available SRS servers with their data release dates. $srsserver - optional remote SRS server with DATABANKS databank, default will try getz query of local DATABANKS databank $flags - | $SRS::kServers - get only server info - | $SRS::kDocument - get only documentation $refdbarray - optional list of SRS DB names to fetch info, as array ref returns hash of hashes, keyed on DB name. See also SRS::getSrsServers(), SRS::fetchIcarusDoc() =cut ## return other info here? or rely on other calls? sub allInfo { local($srsserver,$flags,$refdbarray)= @_; ##? add flag to get doc, server info separately ? my $doserv = (!$flags || ($flags & $kServers) != 0); my $dodocs = (!$flags || ($flags & $kDocument) != 0); my %getdb= undef; my $ngetdbs; if ($refdbarray) { foreach (@$refdbarray) { $ngetdbs++; $getdb{ uc($_) }= 1; } } local(*D); my %dbs; my ($fdoc, $frel); if ($srsserver) { ## needs testing ! $fdoc= "/tmp/srsalldb$$.doc"; $frel= "/tmp/srsalldb$$.rel"; my $striphtml= 1; my $surl= $srsserver . "$wgetz?-f+'nam%20enn'+[databanks-nam:*]"; my $text= BioMirror::getDocFromUrl( $surl, $striphtml); open(D,">$frel"); print D $text; close(D); $surl= $srsserver . "$wgetz?-f+'nam%20des'+[databanks-nam:*]"; $text= BioMirror::getDocFromUrl( $surl, $striphtml); open(D,">$fdoc"); print D $text; close(D); } my ($db, $svn); if (-r $frel) { open(D,$frel); } else { open(D,"$SRSEXE/getz '[databanks-nam:*]' -f 'nam enn'|"); } while () { if (/does not exist/) { $error=1; last; } ## error: SRSICA:srsquery.i:61: error: unknown object, data bank "bob" does not exist ## SWISSPROT at Adlib, CAB International, Wallingford, UK if ( /\s*([A-Z0-9]+) at (.+)/ ) { $db= $1; $svn= $2; if ($ngetdbs && !$getdb{$db}) { $db= undef; next; } my %rec= %{$dbs{$db}}; %rec= () unless(defined %rec); $rec{name}= $db; if ($doserv) { my %svr= %{$rec{servers}}; %svr= () unless(defined %svr); # $svr{$svn}= 1; ## don't need dup key=value $svr{$svn}->{name}= $svn; $rec{servers}= \%svr; } $dbs{$db}= \%rec; } ## The current release 57 has 3046471 entries and was indexed 01-Jan-1999. elsif ( $db && $svn && /release (\S+) has (\S+) entries and was indexed (\S+)/ ) { my $rel= $1; my $nent= $2; my $date= $3; if ($doserv) { my %rec= %{$dbs{$db}}; # %rec= () unless(defined %rec); my %svr= %{$rec{servers}}; # %svr= () unless(defined %svr); $svr{$svn}->{rel}= $rel; $svr{$svn}->{count}= $nent; $svr{$svn}->{date}= $date; $rec{servers}= \%svr; $dbs{$db}= \%rec; } $db= $svn= undef; } } close(D); if (!$error && $dodocs) { $db= undef; my $doc; if (-r $fdoc) { open(D,$fdoc); } else { open(D,"$SRSEXE/getz '[databanks-nam:*]' -f 'nam des'|"); } ## ? do one (w)getz call for each nam? so WORD at text of des isn't confused w/ nam ? while () { if ( /\s*([A-Z0-9]+) at (.+)/ ) { my $ndb= $1; $svn= $2; if ($doc && $db) { my %rec= %{$dbs{$db}}; $rec{doc} .= $doc; $dbs{$db}= \%rec; } $db= $ndb; $doc= undef; if ($ngetdbs && !$getdb{$db}) { $db= undef; next; } } elsif ( /\S/ ) { ##? all other is desc? $doc .= $_; } } close(D); } unlink $fdoc if ($fdoc); unlink $frel if ($frel); return %dbs; } =head1 BioMirror::SRS::forceindex($turnon) force indexing (true/false) for SRS index, when index dates don't require it. =cut #' sub forceindex($) { local($turnon) = @_; $forceindex= $turnon; } =head1 BioMirror::SRS::getDataPathnames($dbname) return directory and data file(s) for SRS databank. result is ($directory,@files) =cut sub getDataPathnames($) { local($dbname)= @_; return 0 unless($dbname); my $dpath; my @files; my %files; my $info= info($dbname); my @info= split(/\n/,$info); my ($nextfiles,$nextdir); foreach (@info) { chomp; if ('') { $nextdir= 0; $nextfiles= 0; } elsif ($nextdir && /\s+\"(\S+)\"/) { #" $dpath= $1; $nextdir= 0; } elsif ($nextfiles && /\s+(\S+)/) { my @ff= split(' '); # / / ?? /\s/ ?? my $f; foreach $f (@ff) { # screen out buggy dups: pir1.dat pir1. my $fm= $f; $fm =~ s/\..*//; if ($f =~ /\..+/ || !$files{$fm}) { $files{$fm}= $f; } } ## push(@files, @ff); } elsif (/Directory:/) { $nextdir= 1; } elsif (/Filename\(s\):/) { $nextfiles= 1; } } foreach (sort keys %files) { push(@files,$files{$_}); } $dpath =~ s=SRSROOT:=$SRSROOT/=; ##? do we want this subst. return ($dpath,@files); } ## need srscheck call w/ no libs to get commands to build links !!! # build all "index" links # srsbuild -i GENBANK PIR # build all "read" links # srsbuild SWISSNEW =head1 BioMirror::SRS::check($dbname, $dpath, @files) Call srscheck and srsbuild if indicated by srscheck. If data is newer than data index, new index will be built. Returns 1 if did update, 0 otherwise. $dbname - srs databank name $dpath - path to data files @files - data files =cut sub check( $ $ @) { local($dbname, $dpath, @files) = @_; ## ^^ change to \@files and add \@dbnames local($db,$db2); if ($dbname =~ / /) { ($db,$db2) = split(' ',$dbname,2); } # ^^ do all transfac's at once, others? else { $db= $dbname; } $db = lc($db); local $index= "$SRSIndexDir/$db" . '_id.inx'; ##? or '_acc.inx' push( @files, "$SRSDB/$db.i", "$SRSDB/$db.is"); local $needindex= $forceindex; if (!$needindex) { local $df; foreach $df ( @files ) { $df = "$dpath/$df" if ($df !~ m+/+ ); $needindex= &BioMirror::isOldTarget($df,$index); last if $needindex; } } if ( $needindex ) { ## replace srsbuild w/ srscheck ## check opts: -l $dbname -o build.script -xdir 'SRSINX:' -odir $tempIndexOutputDir local($srsupf)= "/tmp/srsindex.$db"; local($cmd) = <$outfile") || die "Can't create $SRSDB/srsdb.i"; print DF "## $SRSDB/srsdb.i\n"; print DF "## autogenerated by $BioMirror::AgentID\n"; print DF "## $date\n\n"; ## print DF "\$dataRoot = '$dataRoot'\n\n"; ##? don't need? my $ic; foreach $ic (sort @ic) { next if ($SkipIfile{$ic}); ## next if ($ic eq 'srsdb.i' || $ic eq 'srsgen.i' || $ic eq $SRSLibIds); print DF "file:\"SRSDB:$ic\"\n"; } print DF "\n\$site:[name:unix\n\tlibs:{\n"; my $obj; foreach $obj (@dataclasses) { my $srsdb= uc($obj->getSrsdb()); my $dosrs= (($obj->makeflags & $BioMirror::Data::kDoSrsIndex) != 0); if ($srsdb && $dosrs) { my $todir = BioMirror::replaceVars( $obj->expanded_dir ); print DF "\t\t\$libloc:[\@".$srsdb."_DB dir:\"$todir\"]\n"; } } print DF " \t}\n]\n"; readLibIds("$SRSDB/$SRSLibIds"); print DF "\n\$srsdb:[\n\tlibIds:{\n"; foreach $obj (@dataclasses) { my $srsdb= uc($obj->getSrsdb()); my $dosrs= (($obj->makeflags & $BioMirror::Data::kDoSrsIndex) != 0); if ($srsdb && $dosrs) { my $lid= getLibId($srsdb); print DF "\t\t\$libid:[$lid lib:\@?".$srsdb."_DB]\n"; } } print DF "\t}\n]\n"; close(DF); } return 0; } sub getLibId { local($srsdb)= @_; my $lid= $libIds{$srsdb}; return $lid if ($lid); do { $newlibid++; } while ( $idLib{ $newlibid } ); $libIds{$srsdb}= $newlibid; $idLib{$newlibid}= $srsdb; return $newlibid; } sub readLibIds { ## format of line: $libid:[8 lib:@?SWISSNEW_DB] local($idf)= @_; local(*F); if (open(F,$idf)) { while () { if (/\$libid\W+(\d+)\s*lib\W+(\w+)/) { my $lid= $1; my $srsdb= uc($2); $srsdb =~ s/_DB$//; $libIds{$srsdb}= $lid; ## global hash of ids $idLib{$lid}= $srsdb; # and converse } } close(F); } } =head1 BioMirror::SRS::updateDbFileList( $srsdb, $reffilelist, $outfile ) Edit databank.i file to reflect available databank.i files $srsdb -- name of databank to edit $reffilelist -- ref of @list of $srsdb data files to include $outfile -- optional output file (default is SRSDB:$srsdb.i) =cut sub updateDbFileList { local( $srsdb, $reffilelist, $outfile ) = @_; my $date= `date`; my $infile= "$SRSDB/".lc($srsdb).".i"; $outfile= "$infile.new" unless($outfile); my @dfiles= @$reffilelist; if (!scalar(@dfiles)) { @dfiles= ('*'); } local(*D,*DF); if (open(D,$infile)) { BioMirror::saveOldFile($outfile); open(DF,">$outfile") || die "Can't create $outfile"; print DF "##bm $SRSDB/".lc($srsdb).".i\n"; print DF "##bm auto-updated by $BioMirror::AgentID\n"; print DF "##bm $date\n\n"; my $inlib= 0; my $infiles= 0; while () { next if /^\#\#bm /; my $didprint= 0; if (/^\s*${srsdb}_DB\s*:\s*\$library\s*:\s*\[/i) { $inlib= 1; } elsif ($inlib && /]/) { $inlib= 0; } elsif ($inlib) { if (/\s*files\s*:\s*\{/i) { $infiles= 1; } elsif ($infiles && /\}/) { my $df; foreach $df (sort @dfiles) { $df =~ s/\..+//; ## drop file suffix? print DF "\t\$file:$df\n"; } $infiles= 0; } elsif ($infiles) { $didprint= 1; } } print DF $_ unless $didprint; } close(D); close(DF); } return 0; } =head1 BioMirror::SRS::fetchIcarusDoc($srsurl, $srsdb, $ifile, $outfile) Fetch and print Icarus document from SRS server. Requires LWP packages. $srsurl -- srs server, such as http://srs.ebi.ac.uk/ $srsdb -- srs db name, such as EMBL $ifile -- icarus file = ( i, is, it, db, gen) $outfile -- optional output (default to stdout) =cut ## add method to wgetz srs .i, .is, .it from other servers - collect archive of them? ## e.g. http://kalo/srsbin/cgi-bin/wgetz?-fun+PageIcarusFile+-l+PIR+-ifile+i ## ? revise to get all ifiles for [all] dbs from a given server? sub fetchIcarusDoc { local($srsurl,$srsdb,$ifile,$outfile) = @_; Carp::carp("Bad url '$srsurl'") unless ($srsurl =~ m=^http://=i); ## if (!$srsdb) - get all dbs at server? Carp::carp("Bad srsdb '$srsdb'") unless ($srsdb); ## if (!$ifile) get all for db ? $ifile= 'i' if ($ifile !~ /^(i|is|it|db|gen)$/); my $striphtml= 1; #my $surl= $srsurl . "$wgetz?-fun+PageIcarusFile+-l+$srsdb+-ifile+$ifile"; my $surl= $srsurl . "$wgetz?-page+icarusFile+-l+$srsdb+-ifile+$ifile"; my $text= BioMirror::getDocFromUrl( $surl, $striphtml); if (length($text) < 10) { } #? error, at least don't make file elsif ($outfile) { open(OUT,">$outfile"); print OUT $text; close(OUT); } else { print $text; } return 0; } ## this cute trick gets all db info !? -- what srsdatabanks uses # http://magpie//srs5bin/cgi-bin/wgetz?-fun+pageflatinfo sub getDblist { local($srsurl) = @_; Carp::carp("Bad url '$srsurl'") unless ($srsurl =~ m=^http://=i); my $striphtml= 0; #my $surl= $srsurl . "$wgetz?-fun+pageliblist"; my $surl= $srsurl . "$wgetz?-page+databanks"; ## http://iubio.bio.indiana.edu/srsx/bin/wgetz?-fun+pageliblist+-color+plain ## http://srs.ebi.ac.uk/srs5bin/cgi-bin/wgetz?-fun+pageliblist print STDERR "getDblist $surl\n" if $debug; my $text= BioMirror::getDocFromUrl( $surl, $striphtml); my @lines= split(/\n/, $text); my @dblist; foreach (@lines) { ## ##if (/\-info\+(\w+)/) { if (/\-lib\+(\w+)/) { push(@dblist,$1); print STDERR "db=$1\n" if $debug; } } return @dblist; } sub getSrsServers { my $striphtml= 0; my $text= BioMirror::getDocFromUrl( $allservers, $striphtml); my @lines= split(/\n/, $text); my %serverhash; foreach (@lines) { ## http://expasy.proteome.org.au:80/srs5/ if ( /HREF=\"([^\"]+)\">([^<]+) name $serverhash{$2}= $1; ## name => url } } return %serverhash; } 1; # perly __END__ ## get all srsdb info getz '[databanks-nam:*]' -f 'nam enn' |more getz '[databanks-nam:*]' -f 'nam des' |more kalo% getz '[databanks-nam:*]' -f 'nam enn' |more SWISSPROT at Adlib, CAB International, Wallingford, UK The current release 36 has 74019 entries and was indexed 19-Sep-1998. EMBL at Adlib, CAB International, Wallingford, UK The current release 57 has 3046471 entries and was indexed 01-Jan-1999. EMBLNEW at Adlib, CAB International, Wallingford, UK The current release has 92702 entries and was indexed 02-Jan-1999. kalo% getz '[databanks-nam:*]' -f 'nam des' |more SWISSPROT at Adlib, CAB International, Wallingford, UK SWISS-PROT Protein Sequence Database

SWISS-PROT is a curated protein sequence database which strives to ... EMBLNEW at Adlib, CAB International, Wallingford, UK SWISSNEW at Adlib, CAB International, Wallingford, UK PRINTS at Adlib, CAB International, Wallingford, UK This is release 18.0 of the PRINTS database. PRINTS is a compendium of protein fingerprints. A ########////////// sub readDatabanksDoc { local($dbdocfile,$srsdb)= @_; ##? use srs getz '[databanks-fld:...] query ? local(*D); if (open(D,$dbdocfile)) { } } GENBANK_DB:$library:[GENBANK group:@SEQUENCE_LIBS partSize:200000 subentries:@GenbankFeatures_DB format:@GENBANK_FORMAT cachesize:1024 maxNameLen:10 ifiles:{'genbank.i' 'genbank.is'} files:{ # 'gb*' ## !? something in gbpln1 is dying w/ out-of-mem error, regardless of partSize $file:gbpln1 $file:gbpln2 } ] makeSrsdb: $site:[name:unix libs:{ $libloc:[@SWISSPROT_DB dir:"$dataRoot/swissprot/"] $libloc:[@SWISSNEW_DB dir:"$dataRoot/swissnew/"] } ] ## libid sets order of display ... get from somewhere? $srsdb:[ libIds:{ $libid:[1 lib:@?SWISSPROT_DB] $libid:[8 lib:@?SWISSNEW_DB] } ] wgetz.c -fun options } functions[] = { "PageStart", PageStart, "PageReinspectQuery", PageReinspectQuery, "PageDownloadSetOpt", PageDownloadSetOpt, "PageDownloadSet", PageDownloadSet, "PageQueryForm", PageQueryForm, "PageQueryForm4", PageQueryForm4, "PageQueryForm5", PageQueryForm5, "PageField1Info", PageField1Info, "PageField2Info", PageField2Info, "PageField3Info", PageField3Info, "PageField4Info", PageField4Info, "PageQueryManager", PageQueryManager, "PageConstructQuery5", PageConstructQuery5, "PageConstructQuery4", PageConstructQuery4, "PageSelectLibrary", PageSelectLibrary, "PageViewCreate", PageViewCreate, "PageViewSelectFields", PageViewSelectFields, "PageViewManager", PageViewManager, "PageViewEdit", PageViewEdit, "PageEntry", PageEntry, "PageViewSelectedEntries", PageViewSelectedEntries, "PageLaunchAppl", PageLaunchAppl, "PageDoAppl", PageDoAppl, "PageQueryExpression", PageQueryExpression, "PageCombineQueries", PageCombineQueries, "PageIndexBrowseOpt", PageIndexBrowseOpt, "PageIndexBrowse", PageIndexBrowse, "PageValues2Query", PageValues2Query, "PageLinkOpt", PageLinkOpt, "PageLinkEntryOpt", PageLinkEntryOpt, "PageLinkToLibs", PageLinkToLibs, "PageMoreEntries", PageMoreEntries, "PageLibList", PageLibList, ##! "PageLibNetwork", PageLibNetwork, "PageLibInfo", PageLibInfo, "PageApplInfo", PageApplInfo, "PageFlatInfo", PageFlatInfo, ##! "PageLinkInfo", PageLinkInfo, "PageIcarusFile", PageIcarusFile, ##! "PageDbStatus", PageDbStatus, "PageEnv", PageEnv,