#! /usr/bin/perl -w ## converts blast output of consensus sequences to one containing actual (raw) sequences ($blastIn,$dbRaw)=@ARGV; $prog=$0; $prog=~s/.*\///; if($#ARGV<1){ print "ERROR: arguments not specified!\n"; print "$prog converts an alignment file generated by BLAST (standard format) on the database of consensus sequences to an alignmet file containing actual (raw) sequences\n"; print "$prog takes two arguments:\n"; print "\t1: blast output file (standard format generated with -m 0 parameter)\n"; print "\t2: a file (fasta format) containing actual (raw) sequences of consensus sequences used for BLAST search\n"; print "\n"; print "$prog blast_file actual_sequences_file\n"; exit; } if(! defined $blastIn){ die "ERROR: blast file not specified (first argument)\n"; } if(! defined $dbRaw){ die "ERROR: sequence database file not specified (second argument)\n"; } $h_blastIns{$blastIn}=1; ## parse to get the sequence ids needed undef %h_id2len; foreach $fileIn (sort keys %h_blastIns){ open(FHIN,$fileIn) || die "ERROR: failed to read file=$fileIn\n"; $firstLine=; if($firstLine !~ /^BLAST/){ die "ERROR: $fileIn is not in the right format\n"; } while(){ if(/^>(\S+)/){ $id=$1; $h_id2len{$id}=1; } elsif(/^\s+Length =\s*(\d+)/){ $len=$1; $h_id2len{$id}=$len; } } close FHIN; } ## read sequences undef %h_id2seq; open(FHIN,$dbRaw) || die "ERROR: failed to read file=$dbRaw\n"; $read=0; while(){ if(/^>(\S+)/){ $id=$1; if(defined $h_id2len{$id}){ $read=1; if(defined $h_id2seq{$id}){ print STDERR "INFO: found multiple entries for $id in $dbRaw. Using only the first one\n"; $read = 0; } } else{ $read=0; } }elsif($read==1){ s/\s//g; $h_id2seq{$id}.=$_; } } close FHIN; foreach $id (sort keys %h_id2len){ if(! defined $h_id2seq{$id}){ die "ERROR: did not find an entry for $id in $dbRaw\n"; } } foreach $id (sort keys %h_id2seq){ $lenCheck=length($h_id2seq{$id}); if($lenCheck != $h_id2len{$id}){ die "ERROR: discrepancy in sequence length for $id . The length found in $dbRaw is $lenCheck while the length indicated in $blastIn is $h_id2len{$id}\n"; } } foreach $fileIn (sort keys %h_blastIns){ open(FHIN,$fileIn) || die "ERROR: failed to open file=$fileIn\n"; while(){ if(/^>(\S+)/){ $id=$1;} elsif(/^Sbjct:/){ ($edge1,$beg,$specer1,$gfrag,$specer2,$end,$edge2)= ($_=~/^(Sbjct:\s*)(\d+)(\s*)(\S+)(\s*)(\d+)(\s*)/); $newfrag=""; $spos=$beg; for $i (0 .. length($gfrag)-1){ if(substr($gfrag,$i,1) ne "-"){ $newfrag.=substr($h_id2seq{$id},$spos-1,1); $spos++; }else{ $newfrag.="-"; } } $_=$edge1.$beg.$specer1.$newfrag.$specer2.$end.$edge2; } print $_; } close FHIN; }