#!/usr/bin/perl -w

# Description: for each genome, list the distinct sizes (number of COGs) of the gene clusters that contain it
# Input format (mcs):
# 1:COG0001H COG0007H: NC_000917 NC_003030 NC_003364 NC_004557
# 2:COG0001H COG0007H COG1587H COG1648H: NC_003030 NC_003364 NC_004557

use strict;

# Require exactly one argument: the path of the mcs file to read.
die "usage: getClusterSizes.pl <mcsFile> > output\n" unless @ARGV == 1;

# Three-argument open with a lexical handle: the file name is always treated
# as a plain path opened for reading, even if it begins with '>' or ends
# with '|'.
open(my $in, '<', $ARGV[0]) or die "Can't open $ARGV[0] for input: $!";

# Maps each genome identifier to a reference to the list of cluster sizes
# (number of COGs) of every cluster that lists that genome. Sizes may repeat.
my %genome_CSHash = ();

# Process the file one record at a time instead of slurping it into memory.
while (my $line = <$in>)
{
	chomp($line);
	next if $line eq "";

	# Record layout: "<id>:<COG list>:<genome list>"; skip shorter records.
	my @fields = split /:/, $line;
	next if scalar(@fields) < 3;

	# split ' ' ignores leading whitespace and collapses runs of whitespace.
	# The genome list starts with a space after the colon, so splitting on
	# / / would produce a bogus empty-string genome as its first element.
	my @cogs = split ' ', $fields[1];
	my @genomes = split ' ', $fields[2];

	foreach my $org (@genomes)
	{
		# push autovivifies the array reference the first time a genome is seen.
		push(@{$genome_CSHash{$org}}, scalar(@cogs));
	}
}
close($in);

# Emit one line per genome: "<genome>\t<size1>,<size2>,...", where the sizes
# are the distinct cluster sizes recorded for that genome, in ascending
# numeric order.
# Genomes are visited in sorted order because Perl's hash key order is not
# stable between runs; sorting makes the output reproducible and diffable.
foreach my $org (sort keys %genome_CSHash)
{
	# Collapse repeated sizes by using them as hash keys.
	my %seen = map { $_ => 1 } @{$genome_CSHash{$org}};
	my @sizes = sort { $a <=> $b } keys %seen;
	print $org, "\t", join(",", @sizes), "\n";
}
