# Author: Robert Waterhouse
#!/usr/local/bin/perl

use strict;

# list the run_* directories with tarzipped single copy sequences
# if you did not name your first exercise folder 'rmw1' then you will have to edit this here
my @all_runs=split(/\s+/,`ls ~/rmw1/run_*/single_copy_busco_sequences.tar.gz`);

# hash (dictionary) for saving the sequences
undef my %spe2eog2seq;
# hash (dictionary) for saving the species per orthologous group (BUSCO)
undef my %eog2spe;

# cycle through runs to extract the sequences
foreach my $run (@all_runs) {
    
    # get the assembly name
    my $assembly='';
    if($run=~/run_(\S{6})\/single_copy_busco_sequences.tar.gz/) { $assembly=$1; }
    
    print "Reading\t$assembly\t$run\n";
    
    # make a temporary directory into which to extract the sequences
    if(-d "temp_seqs") { `rm -r temp_seqs`; }
    `mkdir temp_seqs`;
    
    # extract to temp_seqs
    `tar -xzf $run -C temp_seqs`;
    
    # read in these (.faa for the proteins, ignore .fna the nucleotides) sequences and save
    # count number of SC BUSCOs found
    my $counter=0;
    foreach my $fastafile (split(/\s+/,`ls temp_seqs/single_copy_busco_sequences/*.faa`)) {
	my $fasta=`cat $fastafile`;
	my ($header,$protein)=split(/\n/,$fasta);
	my $eog='';
	if($header=~/>(EOG\S{8}):/) { $eog=$1; } # capture the orthologous group (BUSCO) identifer
	$spe2eog2seq{$assembly}{$eog}=$protein;
	push(@{$eog2spe{$eog}},$assembly);
	$counter++;	
    }
    print "Found $counter SC BUSCOs for $assembly\n";
    `rm -r temp_seqs`;
}

# make directory to keep all your sequences
if(-d "busco_seqs") { `rm -r busco_seqs`; }
`mkdir busco_seqs`;

# cycle through each orthologous group (BUSCO) and keep only those with all species
print "Now looking for universal SC BUSCOs ... ";
my $unicount=0;
foreach my $eog (sort keys %eog2spe) {
    if(scalar(@{$eog2spe{$eog}}) == scalar(@all_runs)) {
	open(OUT,">busco_seqs/$eog\.fas") || die $!;
	foreach my $assembly (sort @{$eog2spe{$eog}}) {
	    print OUT ">$assembly\n" . $spe2eog2seq{$assembly}{$eog} . "\n";
	}
	close(OUT);
	$unicount++;
    }
}
print "DONE - found $unicount universal SC BUSCOs\n";