############################## Perl XI - BioPerl Problem Set ############################# Preparation: 1. Download uniprot_sprot: wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz 2. Unzip the file. gunzip uniprot_sprot.fasta.gz PROBLEM 1: Bio::DB::Fasta 1. write a script to retrieve all IDs from uniprot_sprot.fasta using the get_all_ids method from Bio::DB:Fasta. 2. Search through the list of IDs for all IDs that contain the term "HDAC" 3. Print the sequences for these proteins, in FASTA format. #!/usr/bin/perl #PerlXI_Bioperl_Prob_1.pl use strict; use warnings; use Bio::DB::Fasta; my $BioDB_file = shift; my $BioDB_obj = Bio::DB::Fasta->new ($BioDB_file); my @FASTA_ids = $BioDB_obj->get_all_ids; #Getting all ids into array #print $FASTA_ids[1],"\n"; #to see each id individually #print @FASTA_ids; #all entries in one file foreach my $FASTA_ids (@FASTA_ids) { if ($FASTA_ids =~ /HDAC/){ #print $FASTA_ids,"\n"; #Checking to see if retrieved HDAC ids my $ID_Seq_obj = $BioDB_obj->get_Seq_by_id($FASTA_ids); #need to refer to original object my $Sequence_string = $ID_Seq_obj -> seq; #pull out sequence my $Description_string = $ID_Seq_obj -> desc; #pull out description print ">$FASTA_ids $Description_string\n$Sequence_string\n"; } } PROBLEM 2: Bio::SeqIO 1. Write a script using Bio::SeqIO to retrieve the CDS translation from a genbank file. #usr/bin/perl #PerlXI_Bioperl_Prob_2.pl use strict; use warnings; use Bio::SeqIO; my $Genbank_file = shift; my $GenBank_obj = Bio::SeqIO->new(-file => $Genbank_file, -format => "genbank"); my $Seqobj = $GenBank_obj->next_seq; foreach my $feature ($Seqobj->get_SeqFeatures){ print $feature->primary_tag,"\n"; #my $tag = $GenBank_seq->get_all_tags; if ($feature->primary_tag eq "CDS"){ my ($translation) = $feature->get_tag_values("translation"); #printing seqs. for CDS with all other tags print $translation,"\n"; } } PROBLEM 3: Bio::SearchIO 1. On the command line, run the following command to format the file for using as a blast database: makeblastdb -in uniprot_sprot.fasta -dbtype prot 2. Here is how you can blast your favorite seq against swissprot: blastp -query query.fasta -db uniprot_sprot.fasta -evalue 1e-10 -out query_v_sprot.blastout **You can find additional information on BLAST+ at: http://www.ncbi.nlm.nih.gov/books/NBK1763/ 3. Run Blast with 3 of the protein sequences from Problem 1. * use the uniprot_sprot.fasta as your database * run blast with an e-value cut-off of 1e-10 4. Parse your Blast output. For Hits with "significance" less than or equal to 1e-50 retrieve every HSP and print in a tab delimited format: * QUERY Name * HIT Name * HSP Evalue #! /usr/bin/perl #PerlXI_Bioperl_Prob3.pl use strict; use warnings; use Bio::SearchIO; # use the bioperl object #my ($infile, $outfile) = @ARGV; #my ($informat, $outformat) = ('fasta', 'fasta'); # Bioperl-style access my $infile = shift; my $obj = Bio::SearchIO->new( -file => $infile, -format => 'blast' ); my $cutoff = 1e-50; while (my $result = $obj->next_result){ my $id = $result->query_name(); #print "$id\n"; while ( my $hit = $result->next_hit) { my $hit_name = $hit->name(); my $significance = $hit->significance(); #print "Significance: $significance\n"; if ($significance < $cutoff){ while ( my $hsp = $hit->next_hsp){ my $evalue = $hsp->evalue(); print "QUERY Name: ", "$id\n"; print "HIT Name: ", "$hit_name\n"; print "HSP Evalue: ", "$evalue\n"; } } } }