#!/usr/bin/perl -w

# Description: compute p-value of BLS
# Input format (bls):
# 1:COG0001H COG0007H:NC_000917 NC_003030 NC_003364 NC_004557:1.113456
# 2:COG0001H COG0007H COG1587H COG1648H:NC_003030 NC_003364 NC_004557:0.877630

use strict;

die "usage: cmptBLSpval.pl <mcsFile> <rdmMcsDir> > output\n" unless @ARGV == 2;

open IN, $ARGV[0] or die "Can't open $ARGV[0] for input: $!";
my @Line = <IN>;
close IN;
my @id = ();
my @cogs = ();
my @genomes = ();
my @bls = ();
for (my $i=0; $i < @Line; $i++)
{
	chomp($Line[$i]);
	if($Line[$i] eq "") { next; }
	my @fields = split /:/, $Line[$i];
	push(@id, $fields[0]);
	my @c = split / /, $fields[1];
	push(@cogs, \@c);
	my @g = split / /, $fields[2];
	push(@genomes, \@g);
	push(@bls, $fields[3]);
}

my %genome_hist_hash = ();
for (my $i=0; $i < scalar(@id); $i++)
{
	my @orgs = @{$genomes[$i]};
	# read in the random sampled bls
	foreach my $org (@orgs)
	{
		if (not exists $genome_hist_hash{$org})
		{
			my %size_hist = ();
			open IN, "<$ARGV[1]/$org.rdm.bls" or die "Can't open $ARGV[1]/$org.rdm.bls for input: $!";
			while(<IN>)
			{
				chomp;
				my @fields = split /:/;
				my @c = split / /, $fields[1];
				if (not exists $size_hist{scalar(@c)})
				{
					my @hist = ();
					$size_hist{scalar(@c)} = \@hist;
				}
				push(@{$size_hist{scalar(@c)}}, $fields[3]);
			}
			close IN;
			foreach my $hist (values %size_hist)
			{
				@{$hist} = sort {$b <=> $a} (@{$hist});
			}
			$genome_hist_hash{$org} = \%size_hist;
		}
	}

	# compute p-value
	my $size = scalar(@{$cogs[$i]});
	my $score = $bls[$i];
	my $num_ge = 0;
	my $num_all = 0;
	#my $org = "NC_000913";
	foreach my $org (@orgs)
	{
		my @hist = @{${$genome_hist_hash{$org}}{$size}};
		foreach my $s (@hist)
		{
			if ($s >= $score) { $num_ge++; }
			else { last; }
		}
		$num_all += scalar(@hist);
	}
	my $max_pval;
	if( $num_all == 0) { $max_pval = 1.0; }
	else { $max_pval = $num_ge/$num_all; }
	#print $id[$i], ":", scalar(@{$cogs[$i]}), ":", scalar(@{$genomes[$i]}), ":", $bls[$i], ":", $max_pval, "\n";
	print $id[$i], ":", join(" ", @{$cogs[$i]}), ":", join(" ", @{$genomes[$i]}), ":", $bls[$i], ":", $max_pval, "\n";
}
