#!usr/bin/perl

use strict;

# program to read a list of NCBI accession numbers
# and extract the fasta sequences from genbank
# Rebecca Hamer February 2008

print "enter input file name containing list of protein accessions:\n";
my $file = <STDIN>;
chomp $file;

open GIS, $file or die "cannot open file of Gi numbers\n$!";
my @gi = <GIS>;
close GIS;

print "enter output file name e.g. protein_seqs.fasta:\n";
my $outfile = <STDIN>;
chomp $outfile;

open SEQS, ">$outfile" or die "cannot open $outfile\n$!";

foreach my $gi(@gi){

    my $fasta = genbank_seq($gi);
    
    print "fasta is:\n$fasta\n\n";

    print SEQS "$fasta\n";

}

close SEQS;



sub genbank_seq  {
    
    use LWP;
    use HTTP::Request::Common;
    
    my $ua = new LWP::UserAgent;
    
    my ($gi) = @_;
    my $fasta = "seq not found";
    my $sleep=0;
    
    do{
	sleep($sleep);
	
	my $res = $ua->request(GET "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$gi&rettype=fasta");
	
	if($res->is_success){
	    $fasta = $res->content;
	}
	else{
	    $fasta = $res->error_as_HTML;
	}
	$sleep=$sleep+1;
    }
    until (($fasta=~/>gi/) or ($fasta =~ /ERROR: Cannot find/i) or ($sleep>30));
    
    if ($sleep>30){
	$fasta = "timeout: seq from $gi not found";
    }
    elsif ($fasta =~ /ERROR: cannot find/i){
	$fasta = "error: Gi $gi number not found";
    }
    elsif ($fasta eq ""){
	$fasta = "seq from $gi not found";
    }

    if($fasta =~ /(>gi.*)/s){
	$fasta = $1;
	#print "fasta is:\n$fasta\n";
    }

    return $fasta;
}


