PathwayLinker-functional-similarity-Z-scores.pl

We used this Perl script to compute the Z scores quantifying how significantly two interacting proteins are members of the same signaling pathway(s). Each possible combination of interaction data sources is analyzed and independently from this each signaling pathway database is allowed (one signaling database at a time). A detailed description is here and the shell command running this program (and containing command-line parameters) is here.


#!/usr/bin/perl
use strict; use warnings;

# =============== parameters =================

if( 6 != @ARGV )
{
    die "
\tUsage: $0 \\
\t       <in:   directory containing protein data> \\
\t       <in:   input file pattern: UniProtKB data files> \\
\t       <in:   start the random seed variable from this value> \\
\t       <in:   number of random seeds for the randomized analysis with one data set> \\
\t       <in:   unreviewed (TrEBML) proteins allowed? 1: yes, 0:no> \\
\t       <out:  the name of the output file (for the results of the analysis) starts with this string>

";
}

my %PAR = ( "inDir"    => $ARGV[0],
            "UniProtKBinFilePattern" => $ARGV[1],
            "rndSeed"  => $ARGV[2],
            "nRndSeed" => $ARGV[3],
            "trembl",  => $ARGV[4],
            "outFileNameStart"       => $ARGV[5],
            #
            # list of interaction types for each organism
            "intTypeList cel" => "stringdb biogrid ccsbwi8 stringexp ccsbgenetic",
            "intTypeList dme" => "droidotherphysical stringdb biogrid droidcuragen droidfinley droidhybrigenics ".
                                 "stringexp droidgenetic",
            "intTypeList hsa" => "hprd stringdb biogrid stringexp", 
            #
#           # URL template for downloading the UniProtKB accession (AC) -> taxonomy ID mapping
#           "urlTemplate_ac2taxId" => "http://www.uniprot.org/uniprot/?query=QUERY&format=tab&columns=id,taxon",
#           # number of ACs for which the mapping should be downloaded in one set
#           "download_groupSize" => "20",
#           # wget
#           "wget" => "/usr/bin/wget",
#           #
#           # taxonomy ID --> organism code
#           "taxid2orgCode 6239" => "cel",
#           "taxid2orgCode 7227" => "dme",
#           "taxid2orgCode 9606" => "hsa",
            #
            # UniProtKB code of an organism --> 3-letter code of the organism
            "orgUniprotCode_2_orgThreeLetterCode CAEEL" => "cel",
            "orgUniprotCode_2_orgThreeLetterCode DROME" => "dme",
            "orgUniprotCode_2_orgThreeLetterCode HUMAN" => "hsa",
    );

# =============== function definitions ==============

sub currentTime
{
    my @months = ('January','February','March','April','May','June',
                  'July','August','September','October',
                  'November','December');
    my @days = ('Sun','Mon','Tue',
             'Wed','Thu','Fri','Sat');
    my ($sec,$min,$hour,$mday,$mon,$year,$wday) = 
        (localtime(time))[0,1,2,3,4,5,6];
    #
    # patch: for years beyond 2000, the year is sometimes wrongly 100,101,etc instead of 2000,2001 -- correct this by adding 100 to the year
    if($year<2000){ $year+=1900; }
    # date
    my $date = "$days[$wday] $months[$mon] $mday $year $hour:$min:$sec";

    # done
    return $date;
}

# ----------------------------------------------------------------

sub init
{
    my ($par) = @_;

    # loop through the list of organisms for which interaction types are available
    for my $orgCode (map{ /^intTypeList\s+(\S+)/; $1 } grep{ /^intTypeList/ } keys %$par){
        # loop through the list of interaction types in this organism
        my @intTypeList = split m/\s+/, $$par{"intTypeList ".$orgCode};
        # for each interaction type save all possible interaction type sets in which it is a member
        for my $group (@{&powerset(\@intTypeList)}){
            for my $intType (@$group){
                ++${ ${ ${$$par{"int2org2groupList"}}{$intType} }{$orgCode} }{ join(",",sort @$group) };
            }
        }
    }

#    # test
#    for my $int (sort keys %int2org2groupList){
#	print $int."\n";
#	for my $orgCode (sort keys %{$int2org2groupList{$int}}){
#	    print $int."\t".$orgCode."\n";
#	    for my $groupList (sort keys %{${$int2org2groupList{$int}}{$orgCode}}){
#		print $int."\t".$orgCode."\t".$groupList."\n";
#	    }
#	    print "\n";
#	}
#	print "\n";
#   }

}

# -----------------------------------------------

sub powerset
{
    my ($list) = @_;

    return
	[
	    map {
		my $n = $_;
		[@$list[grep {$n & 1 << $_} 0..$#$list]]
	    } (1..2**($#$list + 1))
	];
}

# -----------------------------------------------

# knowing the directory of a protein return its primary UniProtKB accession (pAC)
sub protDir2pac
{
    my ($protDir) = @_;

    if( -d $protDir."/primary" ){
	#my $retval = ( map{ s/\/\s*$//; (split m/\//)[-1] } glob($protDir."/primary/*") )[ 0 ];
	#if( "1" eq $retval ){ local $|=1; print $protDir."\n".join("\n",glob($protDir."/primary/*"))."\n"; }
	#return $retval;
	return ( map{ s/\/\s*$//; (split m/\//)[-1] } glob($protDir."/primary/*") )[ 0 ];
    }
    else{
	return "";
    }
}

# -----------------------------------------------

sub read_ac2orgCode
{
    my ($par,$ac2orgCode) = @_;

    # loop through the list of input files
    for my $inFile (glob($$par{"UniProtKBinFilePattern"})){
	# open file (unzip before opening, if necessary)
	my $in = ($inFile=~/\.(gz|Z|zip)$/) ? "gzip -dc $inFile|" : $inFile;
	open IN, "$in" or die "Error, cannot open \'$in\'\n";
	# read input records ( input record separator: //\n )
	local $/ = "//\n";
	while(my$record=<IN>){
	    # save the 3-letter organism code and proceed only if the organism is one of these: cel, dme, hsa
	    if( $record =~ / (?: ^ | \n ) ID \s+ \S+? _( CAEEL | DROME | HUMAN ) /x ){
		my $orgCode = $$par{"orgUniprotCode_2_orgThreeLetterCode ".$1};		
		# save the UniProtKB accessions (ACs) from each AC line
		my @acList; for my $acLine (grep{/^AC/} split m/\n/,$record){ push @acList, ($acLine =~ /(\S{6});/g); }
		#{local$|=1; print $record.":\n".join(", ",@acList)."\n-----------------------------\n";} # test
		# for each UniProtKB accession (AC) save the 3-letter code of the organism
		for my $ac (@acList){ $$ac2orgCode{$ac} = $orgCode }
	    }
	}
	close IN;
    }

#    # test
#    for my $ac (sort keys %$ac2orgCode){ print $ac."\t".$$ac2orgCode{$ac}."\n"; } exit(1);
}

# -----------------------------------------------

#sub download_pac2taxId
#{
#    my ($par,$pacList,$pac2taxId) = @_;
#
#    # download the mapping in groups of size <download_groupSize>
#    my @list;
#    while(@$pacList){
#	# list the ACs for which the mapping should be downloaded now
#	while( scalar @$pacList && $$par{"download_groupSize"} > scalar @list ){
#	    my $item = shift @$pacList; push @list, $item;
#	}
#	# the URL for downloading the mapping
#	my $url = $$par{"urlTemplate_ac2taxId"};
#	my $q = "(".join("+or+",map{"accession:".$_}@list).")";
#	$url =~ s/QUERY/$q/;
#	# download and save the mapping
#	open IN, $$par{wget}." \"".$url."\" -O - -o /dev/null | "; my @lines = <IN>; close IN;
#	shift @lines; # remove first line (header)
#	for(@lines){ if(/^\s*(\S+)\s+(\S+)\s*$/){ # save AC -> taxonomy ID pairs
#	    $$pac2taxId{$1} = $2;
#	}}
#    }
#}

# -----------------------------------------------

sub listJaccard
{
    my ($listA,$listB) = @_;

    # if either of the two lists is empty, then return 0
    if( 0 == scalar @$listA || 0 == scalar @$listB ){
	return 0.0;
    }
    else{
	# the union and intersection of the two lists
	my %union, my %isect;
	for (@$listA, @$listB){ $union{$_}++ || $isect{$_}++ }

	return ( 0.0 + scalar keys %isect ) / ( 0.0 + scalar keys %union );
    }
}

# -----------------------------------------------

# with the given parameters list the signaling pathway memberships of the protein with the primary UniProtKB accession <pac>
sub pwList
{
    my ($pac2sigDb2pwList,$pac,$sigDb,$pwList) = @_;

    # default: empty list
    @$pwList = ();
    # if there are values, then save them
    if( defined ${$$pac2sigDb2pwList{$pac}}{$sigDb} && scalar keys %{${$$pac2sigDb2pwList{$pac}}{$sigDb}} ){
	@$pwList = sort keys %{${$$pac2sigDb2pwList{$pac}}{$sigDb}};
    }
}

# -----------------------------------------------

sub analyzeInteractions
{
    my ($par,$type) = @_;

    # ----------- read data ----------------

    # read the list of primary UniProtKB accesions
    my %pacList;
    # read the signaling pathway memberships of all proteins by primary UniProtKB accession (pAC)
    my %pac2sigDb2pwList;
    # read the interactions of proteins by interaction type
#    my %pac2pacNei2intType;
    my %pac2pacNei;
    my %pacSortedPair2intType;
    # the list of signaling pathway databases
    my %sigDbList;

    # loop through the list of all interacting protein pairs
    my $baseDir = $$par{inDir}."/by_protein_ac/".$type;
    # loop through the list of proteins in these directories
    my $d = $baseDir;
    for my $dir1 (glob($d."/*")){ $d = $dir1;
	for my $dir2 (glob($d."/*")){ $d = $dir2;
	    for my $dir3 (glob($d."/*")){ $d = $dir3;
	        for my $dir4 (glob($d."/*")){ $d = $dir4;
		    for my $dir5 (glob($d."/*")){ $d = $dir5;
		        for my $dir6 (glob($d."/*")){	
			    $d = $dir6;
			    # save the primary AC of this protein
			    # proceed only if the primary AC is defined, i.e., the returned value is not an empty string
			    if( ( my $pac = &protDir2pac( $d ) ) !~ /^\s*$/ ){
#if("1"eq$pac){local$|=1;print "d: ".$d."\npac: ".$pac."\n------------\n";}#test
				# save this pAC into the list of primary accessions
				++$pacList{$pac};
				# list the signaling pathway memberships of this protein by source
				if( glob($d."/signaling_pathway_membership_by_source/*") ){
				for my $sigDbDir (map{s/\/\s*$//;$_} glob($d."/signaling_pathway_membership_by_source/*")){
				    my $sigDb = (split m/\//, $sigDbDir)[-1];
				    for my $pw (map{ s/\/\s*$//; (split m/\//)[-1] } glob($sigDbDir."/*")){
					++${ ${ $pac2sigDb2pwList{$pac} }{$sigDb} }{$pw};
					++$sigDbList{$sigDb};
				    }
				}}
				# loop through the list of interactors of the current protein
				if( glob($d."/interactions_by_interactor/*")){
				for my $acNeiDir (map{s/\/\s*$//;$_} glob($d."/interactions_by_interactor/*")){
				    my $acNei = (split m/\//, $acNeiDir)[-1];
				    # primary AC of the neighbor protein
				    # proceed only if the primary AC is defined
				    if( ( my $pacNei = &protDir2pac( $baseDir."/".join("/",split m//,$acNei) ) ) !~ /^\s*$/ ){
#if("1"eq$pacNei){local$|=1;print "d: ".$d."\nacNeiDir: ".$acNeiDir."\npacNei: ".$pacNei."\n------------\n";}else{local$|=1;print"OK: ".$pacNei."\n";}#test
					# save this pAC into the list of primary accessions
					++$pacList{$pacNei};
					# loop through the list of interaction types
					# through which these two proteins interact
					for my $intTypeDir (glob($acNeiDir."/*")){
					    my $intType = (split m/\//, $intTypeDir)[-1];
#					    ++${ ${ $pac2pacNei2intType{$pac   } }{$pacNei} }{$intType};
#					    ++${ ${ $pac2pacNei2intType{$pacNei} }{$pac   } }{$intType};
					    ++${ $pac2pacNei{$pac   } }{$pacNei};
					    ++${ $pac2pacNei{$pacNei} }{$pac   };
					    ++${ $pacSortedPair2intType{join(" ",sort($pac,$pacNei))} }{$intType};
					}
				    }
				}}
			    }
			}
		    }
		}
	    }
	}
    }

    # ----------- download data ---------------
    # for each primary AC get its organism from the UniProt data files
    my %ac2orgCode;
    &read_ac2orgCode(\%$par,\%ac2orgCode);
    # test
    {local$|=1; if( grep{ !defined $ac2orgCode{$_} } sort keys %pacList){
	print "No orgCode for the following pACs: \n".join("\n",grep{ !defined $ac2orgCode{$_} } sort keys %pacList)."\n";}}

# test
{local$|=1;print"dat start\t".(scalar keys %pacSortedPair2intType)." pairs\t".&currentTime()."\n";}
    # ----------- analyze -----------
    # the list of taxonomy IDs
    my %orgCodeList = map{$_=>1} values %ac2orgCode;
    # loop through the list of organisms
    #for my $orgCode (sort keys %orgCodeList){
    for my $orgCode (qw<dme cel hsa>){ # do 'dme' first
	#
	# ------------ analysis of real data -------------
	# the list of Jaccard correlation values for a given
	# group of allowed interaction types, signaling database, organism code
	my %intGroup2sigDb2jacList;
	# loop through the list of proteins from this organism
	for my $pac (grep{$orgCode eq $ac2orgCode{$_}} sort keys %pac2pacNei){
	    # loop through the list of those neighbor proteins from this organism
	    # that are ascii-betically larger than <pac>
	    for my $pacNei (grep{$pac lt $_ && $orgCode eq $ac2orgCode{$_}} sort keys %{$pac2pacNei{$pac}}){
		# loop through the list of the available signaling pathway membership databases (sources)
		for my $sigDb (sort keys %sigDbList){
		    # list the signaling pathways in this database to which the two proteins belong
		    my @pwListA; &pwList(\%pac2sigDb2pwList,$pac,   $sigDb,\@pwListA);
		    my @pwListB; &pwList(\%pac2sigDb2pwList,$pacNei,$sigDb,\@pwListB);
		    #
		    # proceed only, if at least one of the two proteins has at least one pathway membership
		    # in other words: use only those pairs of interacting neighbor proteins from which at least one is a member of at least one signaling pathway
		    if( (scalar @pwListA) + (scalar @pwListB) ){
			# get the jaccard correlation value
			my $jac = &listJaccard(\@pwListA,\@pwListB);
			# loop through the interaction types connecting these two proteins
			# and list the interaction type groups to which these interaction types belong
			my %groupList;
			for my $intType (keys %{$pacSortedPair2intType{ join(" ",sort($pac,$pacNei)) }}){
			    for my $group (keys %{ ${ ${$$par{"int2org2groupList"}}{$intType} }{$orgCode} }){
				++$groupList{$group};
			    }
			}
			# for each interaction type group save the Jaccard correlation
			for my $group (keys %groupList){
			    push @{ ${ $intGroup2sigDb2jacList{$group} }{$sigDb} }, $jac;
			}
		    }
		}
	    }
	}
	# loop through the lists of interaction type groups and signaling databases
	for my $group (keys %intGroup2sigDb2jacList){
	    for my $sigDb (keys %{$intGroup2sigDb2jacList{$group}}){
		# get average and standard deviation of the Jaccard correlation values saved for these parameters
		# NOTE: we assume here that the list is non-empty
		my $avg, my $stdDev; &avgStdDev( \@{ ${ $intGroup2sigDb2jacList{$group} }{$sigDb} }, \$avg, \$stdDev );
		# save the average and standard deviation of the Jaccard correlations
		$$par{"jacAvgStdDevN acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb} =
		    { "avg" => $avg, "stdDev" => $stdDev, "n" => (scalar @{ ${ $intGroup2sigDb2jacList{$group} }{$sigDb} }) };
	    }
	}

	# ------------ analysis of randomized data -------------
	# the list of primary accessions to pick from
	my @pacList = sort keys %pacList;
	# loop through the requested number of random seeds
	for (1..$$par{"nRndSeed"}){
{local$|=1;print"orgCode: ".$orgCode."\trndSeed: ".$$par{"rndSeed"}."\t".&currentTime()."\n";} # test
	    # increment random seed and then seed the random number generator with this seed
	    srand( ++$$par{"rndSeed"} );
	    # RND_intGroup2sigDb2jacList is used only for the current randomized (RND) case, i.e., with the current random seed
	    # it is the list of Jaccard correlation values for the current parameters (parameters include the current random seed)
	    my %RND_intGroup2sigDb2jacList;
	    # loop through the lists of interaction type groups and signaling databases that we have seen in the real data set
	    for my $group (keys %intGroup2sigDb2jacList){
		for my $sigDb (keys %{$intGroup2sigDb2jacList{$group}}){
		    #
		    # -------- select protein pairs ----------
		    # select the requested number of protein pairs randomly (protein = pac = primary UniProtKB accession)
		    # such that in each pair at least one protein is a member of at least one signaling pathway
		    my %RND_pacPairList;
		    # the number of protein pairs should be equal to the number of proteins in the real data set with the same parameters
		    while( scalar keys %RND_pacPairList < ${$$par{"jacAvgStdDevN acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb}}{"n"} ){
			# the two randomly selected proteins
			my $pacA = $pacList[int(rand(scalar@pacList))];
			my $pacB = $pacList[int(rand(scalar@pacList))];
			# use this protein pair for the randomized test case only if
			if(    # the two proteins are different
			       $pacA ne $pacB 
			       # and at least one of the two proteins is a member of at least one signaling pathway
			    && (    (    defined       ${$pac2sigDb2pwList{$pacA}}{$sigDb}
				      && scalar keys %{${$pac2sigDb2pwList{$pacA}}{$sigDb}} )
				 || (    defined       ${$pac2sigDb2pwList{$pacB}}{$sigDb}
				      && scalar keys %{${$pac2sigDb2pwList{$pacB}}{$sigDb}} )
			       )
			  )
			{
			    ++$RND_pacPairList{join(" ",sort($pacA,$pacB))};
			}
		    }
		    #
		    # -------- analyze the selected protein pairs ----------
		    for my $pacSortedPair (sort keys %RND_pacPairList){
			my ($pacA,$pacB) = split m/\s+/, $pacSortedPair;

			# list the signaling pathways in this database to which the two proteins belong
			my @pwListA; &pwList(\%pac2sigDb2pwList,$pacA,$sigDb,\@pwListA);
			my @pwListB; &pwList(\%pac2sigDb2pwList,$pacB,$sigDb,\@pwListB);
			#
			# get the jaccard correlation value
			my $jac = &listJaccard(\@pwListA,\@pwListB);
			# loop through the interaction types connecting these two proteins
			# and list the interaction type groups to which these interaction types belong
			my %groupList;
			for my $intType (keys %{$pacSortedPair2intType{ join(" ",sort($pacA,$pacB)) }}){
			    for my $group (keys %{ ${ ${$$par{"int2org2groupList"}}{$intType} }{$orgCode} }){
				++$groupList{$group};
			    }
			}
			# for each interaction type group save the Jaccard correlation
			for my $group (keys %groupList){
			    push @{ ${ $RND_intGroup2sigDb2jacList{$group} }{$sigDb} }, $jac;
			}
		    }
		}
	    }

	    # loop through the lists of interaction type groups and signaling databases
	    for my $group (keys %RND_intGroup2sigDb2jacList){
		for my $sigDb (keys %{$RND_intGroup2sigDb2jacList{$group}}){
		    # get average and standard deviation of the Jaccard correlation values saved for these parameters
		    # NOTE: we assume here that the list is non-empty
		    my $avg, my $stdDev; &avgStdDev( \@{ ${ $RND_intGroup2sigDb2jacList{$group} }{$sigDb} }, \$avg, \$stdDev );
		    # save the average and standard deviation of the Jaccard correlations
		    # 'rnd' in the key means that these are the results from a randomized test
		    $$par{"jacAvgStdDevN_rnd rndSeed:".$$par{rndSeed}." acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb} =
		        { "avg" => $avg, "stdDev" => $stdDev, "n" => (scalar @{ ${ $RND_intGroup2sigDb2jacList{$group} }{$sigDb} }) };
		}
	    }
	}

	# -------- compute Z scores ------------
	# loop through the lists of interaction type groups and signaling databases
	for my $group (keys %intGroup2sigDb2jacList){
	    for my $sigDb (keys %{$intGroup2sigDb2jacList{$group}}){
		my $a1 = ${ $$par{"jacAvgStdDevN acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb} }{"avg"};
		my @a2list = map{ ${ $$par{ $_ } }{ "avg" } }
                             grep{ m/jacAvgStdDevN_rnd/ && m/acType\:$type/ && m/orgCode\:$orgCode/ && m/intGroup\:$group/ && m/sigDb\:$sigDb/ } 
		             keys %$par;
		# average and std. deviation of the average Jaccard correlations
		my $a2avg, my $a2stdDev; &avgStdDev( \@a2list, \$a2avg, \$a2stdDev );
		$$par{"jacZ acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb} =
		    (   # IF   the std.dev. (of the average Jaccard correlations computed from the randomized cases) is zero
			# THEN the Z score is undefined 
			# ELSE it is defined
                        0.0 == $a2stdDev 
		      ? undef
		      : ( ${ $$par{"jacAvgStdDevN acType:".$type." orgCode:".$orgCode." intGroup:".$group." sigDb:".$sigDb} }{"avg"} - $a2avg ) / $a2stdDev
		    );
	    }
	}
    }
}

# -----------------------------------------------

sub avgStdDev
{
    my ($numList,$avg_ref,$stdDev_ref) = @_;

    # --- get the avg./std.dev. of the items in the numerical list ---
    my $sum, my $sumSqr, my $n = scalar @$numList;
    for (@$numList){ $sum += $_; $sumSqr += $_ * $_; }

    # --- average and std.deviation ---
    if( 0 == $n ){
	$$avg_ref = undef;
	$$stdDev_ref = undef;
    }
    else{
	$$avg_ref    =       $sum / ( 1.0 * $n );
	$$stdDev_ref =  ( $sumSqr / ( 1.0 * $n ) - $$avg_ref ** 2.0 ) ** 0.5;
    }
}

# -----------------------------------------------

sub printResults
{
    my ($par,$type) = @_;

    # open outfile, print file header
    my $outFile = $$par{"outFileNameStart"}."z-scores--".$type.".txt";
    open OUT, ">".$outFile or die "Error, cannot write to \'$outFile\'\n";
    print OUT << "STOP";
# Z score
#\tAC type: TrEMBL (i.e., unreviewed) allowed ? 1: yes, 0: no
#\t\tOrganism code
#\t\t\tSource (database) of signaling pathways
#\t\t\t\tList of allowed interaction types

STOP
    # print data
    for my $key (grep{/^jacZ/} sort keys %$par){
	my ($acType,$orgCode,$intGroup,$sigDb) = ( $key =~ / acType\:(\S+) \s+ orgCode\:(\S+) \s+ intGroup\:(\S+) \s+ sigDb\:(\S+) /x );
	# print data only for the requested type
	if( $acType eq $type ){
	    my $acTypeNum = ("reviewed-only" eq $acType ? "0" : "1");
	    print OUT join( "\t", ( $$par{$key}, $acTypeNum, $orgCode, $sigDb, $intGroup ) )."\n";
	}
    }
    # close outfile
    close OUT;
}

# =================== main ====================

&init(\%PAR);
my $type = ($PAR{trembl} ? "reviewed-and-unreviewed" : "reviewed-only");
&analyzeInteractions(\%PAR,$type); 
&printResults(\%PAR,$type);