# Author: Robert Waterhouse #!/usr/local/bin/perl use strict; # list the run_* directories with tarzipped single copy sequences # if you did not name your first exercise folder 'rmw1' then you will have to edit this here my @all_runs=split(/\s+/,`ls ~/rmw1/run_*/single_copy_busco_sequences.tar.gz`); # hash (dictionary) for saving the sequences undef my %spe2eog2seq; # hash (dictionary) for saving the species per orthologous group (BUSCO) undef my %eog2spe; # cycle through runs to extract the sequences foreach my $run (@all_runs) { # get the assembly name my $assembly=''; if($run=~/run_(\S{6})\/single_copy_busco_sequences.tar.gz/) { $assembly=$1; } print "Reading\t$assembly\t$run\n"; # make a temporary directory into which to extract the sequences if(-d "temp_seqs") { `rm -r temp_seqs`; } `mkdir temp_seqs`; # extract to temp_seqs `tar -xzf $run -C temp_seqs`; # read in these (.faa for the proteins, ignore .fna the nucleotides) sequences and save # count number of SC BUSCOs found my $counter=0; foreach my $fastafile (split(/\s+/,`ls temp_seqs/single_copy_busco_sequences/*.faa`)) { my $fasta=`cat $fastafile`; my ($header,$protein)=split(/\n/,$fasta); my $eog=''; if($header=~/>(EOG\S{8}):/) { $eog=$1; } # capture the orthologous group (BUSCO) identifer $spe2eog2seq{$assembly}{$eog}=$protein; push(@{$eog2spe{$eog}},$assembly); $counter++; } print "Found $counter SC BUSCOs for $assembly\n"; `rm -r temp_seqs`; } # make directory to keep all your sequences if(-d "busco_seqs") { `rm -r busco_seqs`; } `mkdir busco_seqs`; # cycle through each orthologous group (BUSCO) and keep only those with all species print "Now looking for universal SC BUSCOs ... "; my $unicount=0; foreach my $eog (sort keys %eog2spe) { if(scalar(@{$eog2spe{$eog}}) == scalar(@all_runs)) { open(OUT,">busco_seqs/$eog\.fas") || die $!; foreach my $assembly (sort @{$eog2spe{$eog}}) { print OUT ">$assembly\n" . $spe2eog2seq{$assembly}{$eog} . "\n"; } close(OUT); $unicount++; } } print "DONE - found $unicount universal SC BUSCOs\n";