PathwayLinkerDb.pl

We compiled PathwayLinker's data with this Perl script.

This is the documentation of the resulting data set. This is the documentation of all downloads.

Move your mouse over the code area and look at its
top right corner where a set of icons for downloading, copy-pasting, etc. will appear.


#!/usr/bin/perl
use strict; use warnings; use List::Util qw<min max>; use List::MoreUtils qw<uniq>; use Switch;

# ================== parameters ==================

# The script expects exactly 26 positional command-line arguments.
# With any other argument count it aborts and prints the usage text below,
# which lists the arguments in the order in which they are unpacked from @ARGV.
# NOTE(review): the usage text spells the C. elegans source "CSSB", whereas the
# variable names and source tags elsewhere in this file use "CCSB" -- confirm which is intended.
if( 26 != @ARGV )
{
    die "
\tUsage: $0 \\

\t       <infile:    BioGrid all interactions (cel, dme, hsa)> \\
\t       <in:        list of selected taxonomy IDs, use this: \"6239 7227 9606\"> \\
\t       <infile:    BioGrid identifier mappings> \\
\t       <infile:    CSSB genetic interactions (cel)> \\
\t       <infile:    CSSB WI8 interactions (cel)> \\

\t       <infile:    DroID \"Curagen\" interactions (dme)> \\
\t       <infile:    DroID \"Finley\" interactions (dme)> \\
\t       <infile:    DroID genetic interactions (dme)> \\
\t       <infile:    DroID \"Hybrigenics\" interactions (dme)> \\
\t       <infile:    DroID other physical interactions (dme)> \\

\t       <infile:    HPRD interactions (hsa)> \\
\t       <infile:    STRING database dump file (cel, dme, hsa)> \\
\t       <in:        STRING score threshold (e.g., 400)> \\
\t       <infiles:   (file name pattern) KEGG gene map files: pathway - gene pairs (cel, dme, hsa)> \\
\t       <infile:    info on KEGG signaling pathways: pathway ID --> name, short name> \\

\t       <infile:    Reactome pathways 'stid' file (hsa)> \\
\t       <infile:    info on Reactome signaling pathways: pathway ID --> name, short name> \\
\t       <infiles:   (file name pattern) SignaLink pathways (cel, dme, hsa)> \\
\t       <infile:    info on SignaLink pathways (all are signaling pathways): pathway ID --> name, short name> \\
\t       <infiles:   (file name pattern) list of abbreviated UniProt data files containing lines starting with AC or ID> \\

\t       <infile:    (file name pattern) UniProt data file of invertebrates
\t                                       containing AC, ID line starts, ORF and CG\\d+ patterns> \\
\t       <infile:    (file name pattern) UniProt data file containing AC, ID, DE, GN line starts> \\
\t       <in/outdir: FlyBase reference pages> \\
\t       <outdir:    main directory for output data> \\
\t       <outfile:   unconverted protein names> \\

\t       <outfile:   list of the UniProt ACs of all proteins that interact or are pathway members>

";
}

# save command-line variables
# The 26 arguments are unpacked strictly by position, in the same order as they
# are listed in the usage text; do not reorder one list without the other.
my (#
    # interactions: BioGrid (all selected species) and CCSB
    $INFILE_BIOGRID_ALL_INTERACTIONS,
    $SELECTED_TAXONOMY_ID_LIST,
    $INFILE_BIOGRID_ID_MAPPINGS,
    $INFILE_CCSB_GENETIC_INTERACTIONS,
    $INFILE_CCSB_WI8_INTERACTIONS,
    #
    # interactions: DroID, HPRD and STRING (with the STRING score threshold)
    $INFILE_DROID_CURAGEN_INTERACTIONS,
    $INFILE_DROID_FINLEY_INTERACTIONS,
    $INFILE_DROID_GENETIC_INTERACTIONS,
    $INFILE_DROID_HYBRIGENICS_INTERACTIONS,$INFILE_DROID_OTHER_PHYSICAL_INTERACTIONS,
    $INFILE_HPRD_INTERACTIONS,$INFILE_STRING,$IN_STRING_SCORE_THRESHOLD,
    #
    # pathway memberships and pathway names (KEGG, Reactome, SignaLink)
    $INFILE_PATTERN_KEGG_GENE_MAP,$INFILE_KEGG_SIGNALING_PATHWAY_INFO,
    $INFILE_REACTOME_PATHWAYS_STID,$INFILE_REACTOME_SIGNALING_PATHWAY_INFO,
    $INFILE_PATTERN_SIGNALINK_PATHWAY_MEMBERSHIPS,$INFILE_SIGNALINK_PATHWAY_INFO,
    #
    # UniProt data files (file name patterns)
    $INFILE_PATTERN_UNIPROT_DATA_AC_ID,
    $INFILE_PATTERN_UNIPROT_INVERTEBRATES_AC_ID_CG_ORF,
    $INFILE_PATTERN_UNIPROT_AC_ID_DE_GN,
    #
    # FlyBase reference pages (directory used for both input and output, see usage text)
    $DIR_FLYBASE_REFERENCE_PAGES,
    #
    # output data: main output directory and the two output files
    $OUT_MAIN_DATA_DIR,$OUTFILE_UNCONVERTED_PROTEIN_NAMES,$OUTFILE_AC_LIST) = @ARGV;

# parameters
# %PAR is the script-wide settings table. All values are strings, except "taxIdList" (array reference).
# Lookup tables are flattened into compound keys of the form "<table name> <key>",
# e.g. "orgCode2taxId cel"; the reader subs below build such keys by string concatenation
# on their $par argument (presumably a reference to this hash -- the call sites are not in this part of the file).
my %PAR = ( "queryMaxSize" => "50",
	    "uniprotApiUrl" => "http://www.uniprot.org/mapping/?from=FROM&to=TO&format=tab&query=QUERY",
	    # NOTE(review): presumably the pause (in seconds) between downloads -- confirm where it is used
	    "wget_sleep"   => "1", 
	    #
	    # number of shell commands to be executed together
	    "shCmdGroupSize" => "2000",
	    #
	    # URL directory of FlyBase reference reports
	    "urlDir_FlyBaseReports" => "http://flybase.org/reports",
	    # 
	    # list of taxonomy IDs
	    "taxIdList" => [ "6239", "7227", "9606" ],
	    # 
	    # taxonomy ID -> organism 3-letter code
	    "taxId2orgCode 6239" => "cel",
	    "taxId2orgCode 7227" => "dme",
	    "taxId2orgCode 9606" => "hsa",
	    #
	    # organism 3-letter code -> taxonomy ID
	    "orgCode2taxId cel" => "6239",
	    "orgCode2taxId dme" => "7227",
	    "orgCode2taxId hsa" => "9606",
	    #
	    # get the code of the organism from the name of the file
	    # (the SignaLink membership reader looks for worm|fly|human in the file name)
	    "orgCommonName2orgCode worm"  => "cel", 
	    "orgCommonName2orgCode fly"   => "dme",
	    "orgCommonName2orgCode human" => "hsa", 
	    #
	    # KEGG protein name prefixes (prepended to a protein ID to form a KEGG ID)
	    "keggPrefix cel" => "cel:",
	    "keggPrefix dme" => "dme:Dmel_",
	    "keggPrefix hsa" => "hsa:",
	    # 
	    # ------ AC -> name mapping ----------
	    # URL template
	    "urlTemplate_ac2nameMapping" => "http://www.uniprot.org/uniprot/?query=AC&format=tab&columns=id,genes",
	    # number of ACs (group size) for which the names are downloaded together
	    "acGroupSize_ac2nameMapping" => "50",
	    #
	    # 5-letter name of the organism from UniProtKB --> 3-letter code
	    "orgUpCode2orgCode CAEEL" => "cel",
	    "orgUpCode2orgCode DROME" => "dme",
	    "orgUpCode2orgCode HUMAN" => "hsa",
 );

# =================== function definitions ======================

sub init
{
    # Start-up hook: print the current time stamp followed by a newline.
    # STDOUT is switched to autoflush for the duration of this sub only.
    local $| = 1;
    print &currentTime()."\n";
}

# -------------------------------------------------------

sub readFile
{
    # Purpose: read a whole (optionally compressed) text file into memory.
    # Input:   $inFile -- path of the input file; names ending in .gz, .Z or .zip
    #                     are decompressed on the fly with "gzip -dc"
    # Returns: reference to an array holding the lines of the file (line ends kept)
    # Dies:    if the file does not exist or cannot be opened
    my ($inFile) = @_;

    # check, if the input file exists (regular file or symbolic link)
    -f $inFile or -l $inFile or die "Error, input file does not exist: \'$inFile\'\n";

    # open the input through a lexical file handle;
    # the list form of the pipe open passes the file name to gzip without involving the shell,
    # so that file names with spaces or shell metacharacters are handled correctly
    my $fh;
    if( $inFile =~ /\.(gz|Z|zip)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot read from \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot read from \'$inFile\': $!\n";
    }

    # read data, close input file, return data
    my @data = <$fh>;
    close $fh;
    return \@data;
}

# ----------------------------------------

sub currentTime
{
    # Returns the current local time as a string of the form
    #   "<weekday> <month name> <day of month> <year> <hour>:<min>:<sec>"
    # The numeric parts are not zero-padded.
    my @weekdayNames = qw<Sun Mon Tue Wed Thu Fri Sat>;
    my @monthNames   = qw<January February March April May June
                          July August September October November December>;

    my @timeParts = localtime(time);
    my ($seconds,$minutes,$hours,$dayOfMonth,$monthIndex,$yearNow,$weekdayIndex) = @timeParts[0..6];

    # localtime reports the year as an offset from 1900 (e.g., 110 for 2010): convert it to the full year
    $yearNow += 1900 if $yearNow < 2000;

    # done
    return join(" ", $weekdayNames[$weekdayIndex], $monthNames[$monthIndex], $dayOfMonth, $yearNow,
                     join(":", $hours, $minutes, $seconds));
}

# ----------------------------------------------------------------

sub read_BioGrid_interactions_pmids
{
    # Purpose: append the BioGrid interactions of the selected species to the list of PPIs.
    # Input:   $inFile       -- BioGrid interaction file (tab-separated, first line is a header);
    #                           names ending in .gz, .zip or .Z are read through "gzip -dc"
    #          $selTaxIdList -- whitespace-separated list of the selected taxonomy IDs
    #          $ppiList      -- reference to the list of PPIs (NOT cleared, only appended to)
    # Output:  one hash per saved interaction with the keys
    #          proteinA_id, proteinB_id (BioGrid IDs), source ("biogrid"), taxonomy_id, pmidList
    # Dies:    if the input file cannot be opened
    my ($inFile,$selTaxIdList,$ppiList) = @_;
    my %selTaxIdList_hash = map{$_=>1} split m/\s+/, $selTaxIdList;

    # do NOT clear output data: ppiList

    # open infile through a lexical handle (unzip before opening, if necessary);
    # the list form of the pipe open keeps the file name away from the shell
    my $fh;
    if( $inFile =~ /\.(gz|zip|Z)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!";
    }
    # discard header
    my $header = <$fh>;

    # read data lines
    while( my $line = <$fh> ){
        chomp $line;
        # get the BioGrid ID and the taxonomy ID of both proteins and the PubMed ID of the publication for this PPI
        # (columns 0,1,8,9,10); a column that does not have the form "<GRID|pubmed|taxid>:<number>" yields undef
        # instead of silently re-using the number captured from the previous column
        my ($protA_biogridId,$protB_biogridId,$pmidList,$protA_taxId,$protB_taxId) =
            map{ defined $_ && / (?: GRID | pubmed | taxid ) : ( \d+ ) /x ? $1 : undef } (split m/\t/,$line)[0,1,8,9,10];
        # skip lines in which a protein ID or a taxonomy ID is missing
        next unless defined $protA_biogridId && defined $protB_biogridId
                 && defined $protA_taxId     && defined $protB_taxId;
        # IF   both proteins are in the same requested species,
        # THEN save the interaction
        if( defined $selTaxIdList_hash{$protA_taxId} && $protA_taxId eq $protB_taxId ){
            push @$ppiList, { "proteinA_id" => $protA_biogridId, "proteinB_id" => $protB_biogridId, "source" => "biogrid",
                              "taxonomy_id" => $protA_taxId,
                              "pmidList"    => [ defined $pmidList ? split(m/\,/, $pmidList) : () ], };
        }
    }
    close $fh;

    # test
    {local$|=1;print &currentTime()."\t"."biogrid interactions: ".(scalar grep{$$_{source} eq "biogrid"} @$ppiList)."\n";}
}

# ---------------------------------------------------------

sub read_BioGridId_2_AC_and_EntrezGeneId
{
    # Purpose: for each BioGrid interaction in the list of PPIs, look up the UniProt accessions (ACs)
    #          and the Entrez Gene IDs of both interactors in the BioGrid identifier mapping file.
    # Input:   $inFile  -- BioGrid identifier mapping file (tab-separated; the data lines follow
    #                      the header line that starts with BIOGRID_ID)
    #          $ppiList -- reference to the list of PPIs (NOT cleared, modified in place)
    # Output:  for the "biogrid" PPIs the keys proteinA_acList / proteinB_acList and
    #          proteinA_entrezGeneIdList / proteinB_entrezGeneIdList are set (sorted lists),
    #          but only where at least one ID has been found
    # Dies:    if the input file cannot be opened or has no BIOGRID_ID header line
    my ($inFile,$ppiList) = @_;

    # do NOT clear output data: ppiList

    # --- list the BioGrid IDs from ppiList ---
    my %biogridIdList;
    # loop through the list of PPIs (protein-protein interactions) from BioGrid
    for my $ppi ( grep { "biogrid" eq $$_{source} } @$ppiList ){
        # save both protein IDs into the list of BioGrid protein IDs
        for ( @$ppi{qw<proteinA_id proteinB_id>} ){ ++$biogridIdList{$_} }
    }

    # --- map these BioGrid IDs to UniProt accessions (ACs) and Entrez Gene IDs ---
    # BioGrid ID -> list of UniProt accessions, -> list of Entrez Gene IDs
    my (%biogridId2acList, %biogridId2entrezGeneIdList);
    # open infile through a lexical handle (unzip before opening, if necessary)
    my $fh;
    if( $inFile =~ /\.(gz|zip|Z)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!";
    }
    # discard header (the line above the first data line starts with BIOGRID_ID);
    # stop with an error at the end of the file instead of looping forever, if there is no such line
    my $headerFound = 0;
    while( my $line = <$fh> ){
        if( $line =~ /^BIOGRID_ID/ ){ $headerFound = 1; last; }
    }
    $headerFound or die "Error, no header line starting with BIOGRID_ID in \'$inFile\'\n";
    #
    # read data lines
    while( my $line = <$fh> ){
        chomp $line;
        my ($biogridId,$otherId,$typeOf_otherId) = (split m/\t/, $line)[0,1,2];
        # skip incomplete lines
        next unless defined $biogridId && defined $otherId && defined $typeOf_otherId;
        # IF   the BioGrid ID is one of the selected IDs
        # AND  the ID is non-empty
        # AND  the ID is from UniProt (Swiss-Prot or TrEMBL) or Entrez Gene
        # THEN save the ID
        next unless defined $biogridIdList{$biogridId} && $otherId !~ /^\s*$/;
        # the alternation is grouped, so that both anchors apply to both type names
        if   ( $typeOf_otherId =~ /^(?:SWISSPROT|TREMBL)$/ ){ ++$biogridId2acList{$biogridId}{$otherId} }
        elsif( $typeOf_otherId =~ /^ENTREZ_GENE$/          ){ ++$biogridId2entrezGeneIdList{$biogridId}{$otherId} }
    }
    close $fh;

    # --- save the UniProt ACs for the interactions from BioGrid ---
    # loop through the list of PPIs (protein-protein interactions) from BioGrid
    for my $ppi ( grep { "biogrid" eq $$_{"source"} } @$ppiList ){
        # take both interactors: A and B
        for my $AorB (qw<A B>){
            # the current BioGrid protein ID
            my $id = $$ppi{"protein".$AorB."_id"};
            # save the UniProt AC(s) for the current protein
            if( defined $biogridId2acList{$id} && scalar keys %{$biogridId2acList{$id}} ){
                @{$$ppi{"protein".$AorB."_acList"}} = sort keys %{$biogridId2acList{$id}};
            }
            # save the Entrez Gene ID(s) for the current protein
            if( defined $biogridId2entrezGeneIdList{$id} && scalar keys %{$biogridId2entrezGeneIdList{$id}} ){
                @{$$ppi{"protein".$AorB."_entrezGeneIdList"}} = sort keys %{$biogridId2entrezGeneIdList{$id}};
            }
        }
    }

    # test
    {local$|=1;print &currentTime()."\t"."biogrid 2 ac: ".            (scalar keys %biogridId2acList).          "\n";}
    {local$|=1;print &currentTime()."\t"."biogrid 2 entrez gene id: ".(scalar keys %biogridId2entrezGeneIdList)."\n";}
}

# ---------------------------------------------------------

sub read_CCSB_interactions
{
    # Purpose: append the interactions of one CCSB data set (C. elegans) to the list of PPIs.
    # Input:   $inFile      -- CCSB interaction file (first line is a header; the first two
    #                          whitespace-separated items of each data line are the two protein IDs)
    #          $ccsbSubtype -- name of the CCSB data set, appended to "ccsb" to form the source tag
    #          $ppiList     -- reference to the list of PPIs (only appended to)
    #          $par         -- reference to the parameter hash (key used: "orgCode2taxId cel")
    # Dies:    if the input file cannot be opened
    # Note:    an earlier version carried a temporary ("#tmp") debugging limit that saved an
    #          interaction only while the list of PPIs held fewer than 10 items; this limit
    #          has been removed, all data lines are saved now.
    my ($inFile,$ccsbSubtype,$ppiList,$par) = @_;

    # --- local variables ---
    # organism code for the CCSB data sets: cel (C. elegans)
    my $orgCode = "cel";
    my $source  = "ccsb".$ccsbSubtype;

    # open input file through a lexical handle (unzip before opening, if necessary);
    # "or" is used (not "||"), so that a failed open is really detected
    my $fh;
    if( $inFile =~ /\.(gz|Z|zip)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    # discard header line
    my $header = <$fh>;
    # read data lines: read the first two items from each line
    while( my $line = <$fh> ){
        if( $line =~ /^\s*(\S+)\s+(\S+)/ ){
            my ($id1,$id2) = ($1,$2);
            push @$ppiList, { "proteinA_id" => $id1, "proteinB_id" => $id2, "source" => $source,
                              "taxonomy_id" => $$par{"orgCode2taxId ".$orgCode}, };
        }
    }
    close $fh;

    # test
    {local$|=1;print &currentTime()."\t"."ccsb".$ccsbSubtype.": ".(scalar grep{$source eq $$_{"source"}} @$ppiList)."\n";}
}

# --------------------------------------------------------

sub read_DroId_interactions
{
    # Purpose: append the interactions of one DroID data set (D. melanogaster) to the list of PPIs.
    # Input:   $inFile       -- DroID interaction file (tab-separated, first line is a header)
    #          $droidSubtype -- curagen | finley | hybrigenics | genetic | otherphysical;
    #                           it selects the columns to be read and is appended to "droid"
    #                           to form the source tag; with any other value nothing is saved
    #          $ppiList      -- reference to the list of PPIs (only appended to)
    #          $par          -- reference to the parameter hash (key used: "orgCode2taxId dme")
    # Output:  one hash per saved interaction with the keys proteinA_id, proteinB_id, source,
    #          taxonomy_id and, depending on the data set, pmidList or refIdList
    # Dies:    if the input file cannot be opened
    my ($inFile,$droidSubtype,$ppiList,$par) = @_;

    # --- local variables ---
    # organism code for the DroId data set: dme (D. melanogaster)
    my $orgCode = "dme";
    my $source  = "droid".$droidSubtype;
    my $taxId   = $$par{"orgCode2taxId ".$orgCode};
    # Curagen, Finley and Hybrigenics share one file layout (plain if/elsif replaces the Switch source filter)
    my $pmidIsInColumn7 = scalar grep { $droidSubtype eq $_ } qw<curagen finley hybrigenics>;

    # open input file through a lexical handle (unzip before opening, if necessary);
    # "or" is used (not "||"), so that a failed open is really detected
    my $fh;
    if( $inFile =~ /\.(gz|Z|zip)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    # discard header line
    my $header = <$fh>;
    # read data lines
    while( my $line = <$fh> ){
        # chomp and split this data line
        chomp $line; my @items = split m/\t/, $line;
        # the IDs of the two proteins are in the first two columns; skip lines without them (e.g., blank lines)
        my ($id1,$id2) = @items[0,1];
        next unless defined $id1 && defined $id2;
        # remove html tags (e.g., <up> and </up>) from each protein's ID
        for ( $id1, $id2 ){ s/\<.*?\>//g; }
        # the items saved for all data sets
        my %ppi = ( "proteinA_id" => $id1, "proteinB_id" => $id2, "source" => $source, "taxonomy_id" => $taxId );
        #
        # separately for each input file
        if( $pmidIsInColumn7 ){
            # DroID Curagen, Finley, Hybrigenics -- the PubMed ID is in the column with the index 7
            # (note that the first column has the index 0)
            my $pmidField = $items[7];
            # IF there is a valid PubMed ID, THEN save it too; the interaction is saved in both cases
            if( defined $pmidField && $pmidField =~ /PMID/ ){
                $ppi{"pmidList"} = [ $pmidField =~ /PMID:(\d+)/g ];
            }
            push @$ppiList, \%ppi;
        }
        elsif( "genetic" eq $droidSubtype ){
            # DroID genetic -- each interaction has one or more reference reports
            # (refIdList: comma-separated list of reference report IDs in the column with the index 2)
            my $refIdList = $items[2];
            $ppi{"refIdList"} = [ defined $refIdList ? split(m/\,/, $refIdList) : () ];
            push @$ppiList, \%ppi;
        }
        elsif( "otherphysical" eq $droidSubtype ){
            # DroID other physical -- the URL holding the PubMed IDs is read from the column with the index 3
            # NOTE(review): the earlier comment named the column with the index 4, the code has always read
            #               index 3 -- confirm against the file format
            my $pmidUrl = $items[3];
            # extract the list of PubMed IDs from the URL; interactions without such a URL are not saved
            if( defined $pmidUrl && $pmidUrl =~ m/list\_uids=(.+?)\&/ ){
                $ppi{"pmidList"} = [ split m/\,/, $1 ];
                push @$ppiList, \%ppi;
            }
        }
    }
    close $fh;

    # test
    {local$|=1;print &currentTime()."\t"."droid".$droidSubtype.": ".(scalar grep{$source eq $$_{"source"}} @$ppiList)."\n";}
}

# -------------------------------------------------------------

sub read_HPRD_interactions
{
    # Purpose: append the HPRD interactions (H. sapiens) to the list of PPIs.
    # Input:   $inFile  -- HPRD interaction file (tab-separated, every line is a data line);
    #                      names ending in .gz, .Z or .zip are read through "gzip -dc"
    #          $ppiList -- reference to the list of PPIs (only appended to)
    #          $par     -- reference to the parameter hash (key used: "orgCode2taxId hsa")
    # Output:  one hash per line with the keys proteinA_id, proteinB_id (columns 2 and 5,
    #          the first column has the index 0), source ("hprd"), taxonomy_id and
    #          pmidList (one item: the unchanged content of column 7)
    # Dies:    if the input file cannot be opened
    my ($inFile,$ppiList,$par) = @_;

    # --- local variables ---
    # organism code for the HPRD data set: hsa (H. sapiens)
    my $orgCode = "hsa";

    # open input file through a lexical handle (unzip before opening, if necessary);
    # "or" is used (not "||"), so that a failed open is really detected
    my $fh;
    if( $inFile =~ /\.(gz|Z|zip)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }

    # read data lines
    while( my $line = <$fh> ){
        chomp $line;
        my ($id1,$id2,$pmid) = (split m/\t/, $line)[2,5,7];
        # NOTE(review): the reference column is saved as one list item, it is not split at commas
        #               as in the other readers -- confirm whether it can hold several PubMed IDs
        push @$ppiList, { "proteinA_id" => $id1, "proteinB_id" => $id2, "source" => "hprd", "pmidList" => [ $pmid ],
                          "taxonomy_id" => $$par{"orgCode2taxId ".$orgCode}, };
    }
    close $fh;
}

# ------------------------------------------------------------

sub read_STRING_interactions_db_exp
{
    # Purpose: append the STRING interactions that are supported by the "exp" and / or the "db"
    #          score to the list of PPIs.
    # Input:   $inFile         -- STRING dump file (whitespace-separated columns; used columns,
    #                             counted from 0: 0 and 1 = "<taxonomy ID>.<protein ID>" of the
    #                             two proteins, 6 = "exp" score, 7 = "db" score)
    #          $scoreThreshold -- a score is accepted, if it is at least this high
    #          $selTaxIdList   -- whitespace-separated list of the selected taxonomy IDs
    #          $ppiList        -- reference to the list of PPIs (only appended to)
    # Output:  one hash per accepted score ("db" first, then "exp") with the keys
    #          proteinA_id, proteinB_id, taxonomy_id, source ("stringdb" or "stringexp"), score
    # Dies:    if the input file cannot be opened
    my ($inFile,$scoreThreshold,$selTaxIdList,$ppiList) = @_;
    my %selTaxIdList_hash = map{$_=>1} split m/\s+/, $selTaxIdList;

    # open input file through a lexical handle (unzip before opening, if necessary);
    # "or" is used (not "||"), so that a failed open is really detected
    my $fh;
    if( $inFile =~ /\.(gz|Z|zip)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    # read data lines
    while( my $line = <$fh> ){
        chomp $line;
        my @fields = split m/\s+/, $line;
        # a missing column, "0" and "-" are all replaced with the empty string
        my ($proteinA_taxId_and_protId,$proteinB_taxId_and_protId,$expScore,$dbScore) =
            map{ my $item = $fields[$_]; defined $item && $item !~ /^\s*(0|\-)\s*$/ ? $item : "" } (0,1,6,7);
        # split "<taxonomy ID>.<protein ID>" (both parts remain undefined, if the format is different)
        my ($proteinA_taxId,$proteinA_id) = $proteinA_taxId_and_protId =~ /^\s*(\d+)\.(\S+)\s*$/;
        my ($proteinB_taxId,$proteinB_id) = $proteinB_taxId_and_protId =~ /^\s*(\d+)\.(\S+)\s*$/;
        #
        # IF   the two proteins have the same requested known taxonomy ID and known protein names
        # THEN consider the interaction for saving
        next unless defined $proteinA_taxId && defined $proteinA_id
                 && defined $proteinB_taxId && defined $proteinB_id
                 && $proteinA_taxId eq $proteinB_taxId
                 && defined $selTaxIdList_hash{$proteinA_taxId};
        #
        # normalize both protein names (the two taxonomy IDs are identical here)
        for my $proteinId ($proteinA_id,$proteinB_id){
            # - for C.elegans: keep only the part before the 1st period and
            #   the number(s) immediately following the first period
            #   (this removes the closing letter and the digit after the 2nd period)
            if( "6239" eq $proteinA_taxId ){ $proteinId =~ s/^(.+?\.\d+).*$/$1/; }
            # - for D.melanogaster: remove the variant suffix (-PA, -PB, ...) from protein names
            if( "7227" eq $proteinA_taxId ){ $proteinId =~ s/\-P[A-Z]$//; }
        }
        #
        # IF   the "db" or the "exp" score is a non-zero number that is at least as high as the threshold value,
        # THEN save the interaction and that score
        for my $type (qw<db exp>){
            my $score = $type eq "db" ? $dbScore : $expScore;
            if( $score =~ /[1-9]/ && $score >= $scoreThreshold ){
                push @$ppiList, { "proteinA_id" => $proteinA_id,   "proteinB_id" => $proteinB_id, "taxonomy_id" => $proteinA_taxId,
                                  "source"      => "string".$type, "score"       => $score, };
            }
        }
    }
    close $fh;
}

# ------------------------------------------------------------

sub read_KEGG_signalingPathwayInfo
{
    # Purpose: read the info file of the KEGG pathways and append one record per line to $spwInfo.
    # Input:   $inFile  -- info file; the items of a line are separated with semicolons:
    #                      mark ; pathway ID ; name [ ; short name ]
    #          $spwInfo -- reference to the list of pathway info records (only appended to)
    # Output:  records with the keys pathway_id (zero-padded to 5 characters), pathway_name,
    #          pathway_shortName, pathway_isSignaling ("1", if the mark is an "x", else "0"), source ("kegg")
    my ($inFile,$spwInfo) = @_;
    my $source = "kegg";

    # process the lines of the input file one by one
    for my $line (@{ &readFile($inFile) }){
        chomp $line;
        my @fields = split /\s*\;\s*/, $line;

        # --- reading the variables ---
        # all four items default to the empty string;
        # a line with 4 items has a short name as its last item, a line with 3 items has none,
        # a line with any other number of items leaves all four items empty
        my ($mark,$id,$name,$shortName) = ("","","","");
        if   ( 4 == @fields ){ ($mark,$id,$name,$shortName) = @fields; }
        elsif( 3 == @fields ){ ($mark,$id,$name)            = @fields; }

        # --- adjusting the variables ---
        # pad the pathway ID with zeros from the front to make it (at least) five characters long
        if( 5 > length $id ){ $id = ("0" x (5 - length $id)).$id; }
        # in the names: replace apostrophe with &apos; , replace double quote mark with &quot;
        for my $text ($name,$shortName){
            $text =~ s/\'/\&apos\;/g;
            $text =~ s/\"/\&quot\;/g;
        }
        # the pathway is a signaling pathway, if its mark is a single "x"
        my $isSignaling = "0";
        if( $mark =~ / ^\s* x \s* $ /x ){ $isSignaling = "1"; }

        # --- save info about this signaling pathway ---
        push @$spwInfo, { "pathway_id"          => $id,
                          "pathway_name"        => $name,
                          "pathway_shortName"   => $shortName,
                          "pathway_isSignaling" => $isSignaling,
                          "source"              => $source };
    }

    #test
    {
        local $| = 1;
        my $numSaved = scalar grep{$source eq $$_{"source"}} @$spwInfo;
        print "Read KEGG signaling pathway info: ".$numSaved."\n";
    }
}

# ------------------------------------------------------------

sub read_KEGG_signalingPathwayMemberships
{
    # Purpose: read the protein - pathway pairs of the KEGG signaling pathways.
    # Input:   $inFilePattern -- file name pattern (expanded with glob) of the KEGG gene map files;
    #                            each file name has to contain cel_gene_map, dme_gene_map or hsa_gene_map
    #          $spwInfo       -- reference to the list of pathway info records; the "kegg" records
    #                            with a true pathway_isSignaling select the pathways to be saved
    #          $pwmList       -- reference to the list of pathway memberships (only appended to)
    #          $par           -- reference to the parameter hash (keys used: "orgCode2taxId <organism code>")
    # Output:  one hash per protein - signaling pathway pair with the keys
    #          source ("kegg"), orgCode, protein_id, pathway_id, taxonomy_id
    # Dies:    on a wrong file name, an unreadable file or a data line with fewer than two items
    my ($inFilePattern,$spwInfo,$pwmList,$par) = @_;
    my $source = "kegg"; # source of the data

    # the list of the IDs of signaling pathways in KEGG
    my %sPwIdList = map{$$_{"pathway_id"}=>1} grep{defined $$_{"pathway_isSignaling"} && $$_{"pathway_isSignaling"} && $source eq $$_{"source"}} @$spwInfo;

    # loop through the list of input files
    for my $inFile (sort glob($inFilePattern)){
        # 3-letter code of the organism
        my $orgCode; if( $inFile =~ /(cel|dme|hsa)_gene_map/ ){ $orgCode = $1; } else{ die "Wrong file name format: ".$inFile."\n"; }
        # open input file through a lexical handle (unzip before reading, if necessary);
        # the list form of the pipe open keeps the file name away from the shell
        my $fh;
        if( $inFile =~ /\.(gz|zip|Z)$/ ){
            open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot read from \'gzip -dc $inFile|\': $!\n";
        }
        else{
            open $fh, "<", $inFile or die "Error, cannot read from \'$inFile\': $!\n";
        }
        # read for each protein: the name of the protein and the list of its KEGG pathways
        while( my $line = <$fh> ){
            chomp $line; my @items = split m/\s+/, $line;
            # proceed only, if there are at least two items in this line, i.e., a protein ID and at least one pathway ID
            if( 2 > scalar @items ){ die "Wrong data line format in \'$inFile\':\n".$line."\n"; }
            my $protein_id = shift @items;
            # save only the signaling pathways of KEGG
            for my $pathway_id (grep{defined $sPwIdList{$_}} @items){
                push @$pwmList, { "source" => $source, "orgCode" => $orgCode, "protein_id" => $protein_id, "pathway_id" => $pathway_id,
                                  "taxonomy_id" => $$par{"orgCode2taxId ".$orgCode}, };
            }
        }
        close $fh;
    }

    # test
    {local$|=1;print"Read KEGG signaling pathway memberships: ".(scalar grep{$$_{"source"} eq $source} @$pwmList)."\n";}
}

# ------------------------------------------------------------

sub read_Reactome_signalingPathwayInfo
{
    # Purpose: read the info file of the Reactome signaling pathways and append one record per line to $spwInfo.
    # Input:   $inFile  -- info file with three tab-separated items per line:
    #                      pathway ID, name, short name (lines of any other form are skipped)
    #          $spwInfo -- reference to the list of pathway info records (only appended to)
    # Output:  records with the keys pathway_id, pathway_name, pathway_shortName,
    #          source ("reactome") and pathway_isSignaling (always "1")
    # Dies:    if the input file cannot be opened
    my ($inFile,$spwInfo) = @_;
    my $source = "reactome";

    # --- read data ---
    # open the input file through a lexical handle (unzip before opening, if necessary);
    # the list form of the pipe open keeps the file name away from the shell
    my $fh;
    if( $inFile =~ /\.(gz|zip|Z)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    while( my $line = <$fh> ){
        if( $line =~ / ^ \s* (.+?) \t (.+?) \t (.+?) $ /x ){
            my ($pathway_id,$pathway_name,$pathway_shortName) = ($1,$2,$3);
            push @$spwInfo, { "pathway_id" => $pathway_id, "pathway_name" => $pathway_name, "pathway_shortName" => $pathway_shortName,
                              "source" => $source, "pathway_isSignaling" => "1" };
        }
    }
    close $fh;

    #test
    {local$|=1;print"Read Reactome signaling pathway info: ".(scalar grep{$source eq $$_{"source"}} @$spwInfo)."\n";}
}

# ------------------------------------------------------------

sub read_Reactome_signalingPathwayMemberships
{
    # Purpose: read the protein - pathway pairs of the Reactome signaling pathways (H. sapiens).
    # Input:   $inFile  -- Reactome file; the first two whitespace-separated items of a line
    #                      are read as the protein ID and the pathway ID
    #          $spwInfo -- reference to the list of pathway info records; the "reactome" records
    #                      with a true pathway_isSignaling select the pathways to be saved
    #          $pwmList -- reference to the list of pathway memberships (only appended to)
    #          $par     -- reference to the parameter hash (key used: "orgCode2taxId hsa")
    # Output:  one hash per saved pair with the keys pathway_id, protein_id, source ("reactome"),
    #          orgCode ("hsa"), taxonomy_id
    # Dies:    if the input file cannot be opened
    my ($inFile,$spwInfo,$pwmList,$par) = @_;

    # --- local variables ---
    my $source = "reactome"; # source
    my $orgCode = "hsa"; # organism in Reactome: H. sapiens

    # the list of the IDs of signaling pathways in Reactome
    my %sPwIdList = map{$$_{"pathway_id"}=>1} grep{defined $$_{"pathway_isSignaling"} && $$_{"pathway_isSignaling"} && $source eq $$_{"source"}} @$spwInfo;

    # --- read data ---
    # open the input file through a lexical handle (unzip before opening, if necessary);
    # the list form of the pipe open keeps the file name away from the shell
    my $fh;
    if( $inFile =~ /\.(gz|zip|Z)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    while( my $line = <$fh> ){
        if( $line =~ / ^ \s* (\S+) \s+ (\S+) /x ){
            my ($ac,$pathway_id) = ($1,$2);
            # save only signaling pathways
            if( defined $sPwIdList{$pathway_id} ){
                push @$pwmList, {"pathway_id" => $pathway_id, "protein_id" => $ac, "source" => $source, "orgCode" => $orgCode,
                                 "taxonomy_id" => $$par{"orgCode2taxId ".$orgCode}, };
            }
        }
    }
    close $fh;

    # test
    {local$|=1;print"Read Reactome signaling pathway memberships: ".(scalar grep{$$_{"source"} eq $source} @$pwmList)."\n";}
}

# ------------------------------------------------------------

sub read_SignaLink_signalingPathwayInfo
{
    # Purpose: read the info file of the SignaLink pathways and append one record per data line to $spwInfo.
    # Input:   $inFile  -- info file; blank lines and lines starting with # are skipped, the first two
    #                      whitespace-separated items of a data line are the pathway ID and its name
    #          $spwInfo -- reference to the list of pathway info records (only appended to)
    # Output:  records with the keys pathway_id, pathway_name, pathway_shortName (same as the name),
    #          source ("signalink") and pathway_isSignaling (always "1")
    # Dies:    if the input file cannot be opened
    my ($inFile,$spwInfo) = @_;
    my $source = "signalink";

    # --- read data ---
    # open the input file through a lexical handle (unzip before opening, if necessary);
    # the list form of the pipe open keeps the file name away from the shell
    my $fh;
    if( $inFile =~ /\.(gz|zip|Z)$/ ){
        open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'gzip -dc $inFile|\': $!\n";
    }
    else{
        open $fh, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
    }
    while( my $line = <$fh> ){
        # skip blank and comment lines
        next if $line =~ / ^ \s* ( $ | \# ) /x;
        chomp $line;
        # split at whitespace; the special pattern " " ignores leading whitespace, so that
        # an indented data line does not produce an empty pathway ID
        my ($pathway_id,$pathway_name) = split " ", $line;
        my $pathway_shortName = $pathway_name;
        push @$spwInfo, { "pathway_id" => $pathway_id, "pathway_name" => $pathway_name, "pathway_shortName" => $pathway_shortName,
                          "source" => $source, "pathway_isSignaling" => "1" };
    }
    close $fh;

    #test
    {local$|=1;print"Read SignaLink signaling pathway info: ".(scalar grep{$source eq $$_{"source"}} @$spwInfo)."\n";}
}

# ------------------------------------------------------------

sub read_SignaLink_signalingPathwayMemberships
{
    # Purpose: read the protein - pathway pairs of the SignaLink pathways.
    # Input:   $inFilePattern -- file name pattern (expanded with glob) of the SignaLink files;
    #                            each file name has to contain worm, fly or human
    #          $pwmList       -- reference to the list of pathway memberships (only appended to)
    #          $par           -- reference to the parameter hash (keys used:
    #                            "orgCommonName2orgCode <worm|fly|human>", "orgCode2taxId <organism code>")
    # File format (as read here): first line is a header, the items of a data line are separated with
    #          semicolons; item 2 (counted from 0) is the protein ID, items 4-18 are "yes" flags:
    #          two columns (named ncore and core in the original code) per pathway for
    #          EGF, WNT, TGF, IGF, NOTCH, HH, JAKSTAT, and one column for NHR
    # Output:  one hash per protein - pathway pair with the keys
    #          source ("signalink"), orgCode, protein_id, pathway_id, taxonomy_id;
    #          the pathways of a protein are saved in the fixed order listed above
    # Dies:    on a file name without worm, fly or human, or on an unreadable file
    my ($inFilePattern,$pwmList,$par) = @_;

    # --- local variables ---
    my $source = "signalink"; # data source: SignaLink
    # pathway ID followed by the indexes of the column(s) that flag the membership in this pathway
    my @pathwayColumns = ( [ "EGF",      4,  5 ], [ "WNT",   6,  7 ], [ "TGF",  8,  9 ], [ "IGF",     10, 11 ],
                           [ "NOTCH",   12, 13 ], [ "HH",   14, 15 ], [ "JAKSTAT", 16, 17 ], [ "NHR", 18 ], );

    # --- read data ---
    # loop through the list of input files
    for my $inFile (glob($inFilePattern)){
        # get the organism from the file name; stop, if the name does not tell the organism
        # (otherwise a capture left over from an earlier match would be used)
        my $orgCode;
        if( $inFile =~ /(worm|fly|human)/ ){ $orgCode = $$par{"orgCommonName2orgCode ".$1}; }
        else{ die "Wrong file name format: ".$inFile."\n"; }
        # open input file through a lexical handle (unzip before reading, if necessary);
        # the list form of the pipe open keeps the file name away from the shell
        my $fh;
        if( $inFile =~ /\.(gz|zip|Z)$/ ){
            open $fh, "-|", "gzip", "-dc", $inFile or die "Error, cannot read from \'gzip -dc $inFile|\': $!\n";
        }
        else{
            open $fh, "<", $inFile or die "Error, cannot read from \'$inFile\': $!\n";
        }
        my $header = <$fh>; # discard first line (header)
        # read for each protein: the name of the protein and the list of its SignaLink pathways
        while( my $line = <$fh> ){
            chomp $line;
            my @fields = split m/\;/, $line;
            my $protein_id = $fields[2];
            # save the pathways of the current protein:
            # the protein is a member of a pathway, if at least one of the pathway's columns holds "yes"
            # (columns missing from a short line count as "not yes")
            for my $pathway (@pathwayColumns){
                my ($pathway_id,@columnIndexes) = @$pathway;
                next unless grep { defined $fields[$_] && "yes" eq $fields[$_] } @columnIndexes;
                push @$pwmList, { "source" => $source, "orgCode" => $orgCode, "protein_id" => $protein_id, "pathway_id" => $pathway_id,
                                  "taxonomy_id" => $$par{"orgCode2taxId ".$orgCode}, };
            }
        }
        close $fh;
    }

    # test
    {local$|=1;
     print"Read SignaLink signaling pathway memberships: ".(scalar grep{$$_{"source"} eq $source} @$pwmList)."\n";
     my %orgCodeList = map{$$_{"orgCode"}=>1}@$pwmList;
     for my $orgCode (sort keys %orgCodeList){
         print "\t".$orgCode.": ".(scalar grep{$orgCode eq $$_{"orgCode"} && $$_{"source"} eq $source} @$pwmList)."\n";
     }
    }
}

# ---------------------------------------------------------

sub map_unmapped_BioGridIds_via_EntrezGeneId_2_allAC
{
    # Purpose: for the BioGrid interactors that have no UniProt accession (AC) yet,
    #          obtain the ACs through their Entrez Gene IDs.
    # Input:   $ppiList -- reference to the list of PPIs (modified in place)
    #          $par     -- reference to the parameter hash (handed over to otherId2acList)
    # Output:  proteinA_acList / proteinB_acList is set (sorted list) for each BioGrid interactor
    #          that had no AC and for which at least one AC has been found
    my ($ppiList,$par) = @_;

    # the PPIs from BioGrid
    my @biogridPpis = grep{"biogrid" eq $$_{"source"}} @$ppiList;
    # helper: which interactors (A, B) of the given PPI have no (or an empty) list of ACs at the moment
    my $sidesWithoutAc = sub {
        my ($ppi) = @_;
        return grep{!defined $$ppi{"protein".$_."_acList"} || 0 == scalar @{$$ppi{"protein".$_."_acList"}}} qw<A B>;
    };

    # ---- get the list Entrez Gene IDs for those BioGrid interactors that have no UniProt accession (AC) yet -----
    # key: Entrez Gene ID, value: taxonomy ID of the interaction
    my %entrezGeneId2taxId;
    for my $ppi (@biogridPpis){
        for my $side ($sidesWithoutAc->($ppi)){
            for my $entrezGeneId (@{$$ppi{"protein".$side."_entrezGeneIdList"}}){
                $entrezGeneId2taxId{$entrezGeneId} = $$ppi{"taxonomy_id"};
            }
        }
    }
    # test
    print "Entrez Gene IDs: ".(scalar keys %entrezGeneId2taxId)."\n";

    # --- map these Entrez Gene IDs to UniProt accessions (ACs) ---
    my %entrezGeneId2acList;
    &otherId2acList( \%entrezGeneId2taxId, "P_ENTREZGENEID", \%entrezGeneId2acList, \%$par );

    # --- save the UniProt ACs for those interactors in BioGrid that have no UniProt accession (AC) yet ---
    for my $ppi (@biogridPpis){
        for my $side ($sidesWithoutAc->($ppi)){
            my $entrezGeneIdList = $$ppi{"protein".$side."_entrezGeneIdList"};
            # collect the ACs of all Entrez Gene IDs saved for this interactor
            my %acsFound;
            if( defined $entrezGeneIdList && scalar @$entrezGeneIdList ){
                for my $entrezGeneId (@$entrezGeneIdList){
                    my $acSet = $entrezGeneId2acList{$entrezGeneId};
                    next unless defined $acSet && scalar keys %$acSet;
                    ++$acsFound{$_} for keys %$acSet;
                }
            }
            # save the ACs only, if there is at least one
            next unless scalar keys %acsFound;
            @{$$ppi{"protein".$side."_acList"}} = sort keys %acsFound;
        }
    }

    # test: how many BioGrid IDs do not have a UniProt AC yet
    my %list;
    for my $ppi (@biogridPpis){
        ++$list{ $$ppi{"protein".$_."_id"} } for $sidesWithoutAc->($ppi);
    }
    # test
    {
        local $| = 1;
        print "Unmapped BioGrid IDs: ".(scalar keys %list)."\n";
    }
}

# ---------------------------------------------------------

sub map_CCSB_proteinIds2allAC
{
    # Purpose: map the protein IDs of the CCSB interactions to UniProt accessions (ACs).
    #          Each ID is tried both as a WormBase ID and, with the KEGG prefix of
    #          C. elegans in front, as a KEGG ID; the ACs found in the two ways are merged.
    # Input:   $ppiList -- reference to the list of PPIs (modified in place)
    #          $par     -- reference to the parameter hash (key used here: "keggPrefix cel")
    # Output:  proteinA_acList / proteinB_acList is set (sorted list) for each CCSB interactor
    #          for which at least one AC has been found
    my ($ppiList,$par) = @_;

    # the PPIs from the CCSB sources (the source tag starts with "ccsb")
    my @ccsbPpis = grep{$$_{"source"} =~ /^ccsb/} @$ppiList;

    # get the list of protein IDs from CCSB interactions
    # key: protein ID (plain, or with the KEGG prefix), value: taxonomy ID for that protein ID
    my (%ccsbId2taxId, %wormPepId2taxId, %keggId2taxId);
    for my $ppi (@ccsbPpis){
        my $taxId = $$ppi{"taxonomy_id"};
        for my $id (@$ppi{qw<proteinA_id proteinB_id>}){
            $ccsbId2taxId{$id}    = $taxId;
            $wormPepId2taxId{$id} = $taxId;
            $keggId2taxId{ $$par{"keggPrefix cel"}.$id } = $taxId;
        }
    }

    # map the protein names to ACs by assuming that they are WormBase IDs,
    # then by assuming that they are KEGG IDs
    my %wormPepId2acList;
    &otherId2acList( \%wormPepId2taxId, "WORMBASE_ID", \%wormPepId2acList, \%$par );
    my %keggId2acList;
    &otherId2acList( \%keggId2taxId,    "KEGG_ID",     \%keggId2acList,    \%$par );

    # --- save the UniProt ACs for the interactions obtained from the CCSB sources ---
    for my $ppi (@ccsbPpis){
        # take both interactors: A and B
        for my $side (qw/A B/){
            my $id     = $$ppi{"protein".$side."_id"};
            my $keggId = $$par{"keggPrefix cel"}.$id;
            # merge the ACs found for the current protein ID in the two mappings
            my @acSets = ( $wormPepId2acList{$id}, $keggId2acList{$keggId} );
            my %acsNow;
            for my $acSet (grep{ defined $_ } @acSets){
                ++$acsNow{$_} for keys %$acSet;
            }
            # save the AC(s) only, if there is at least one
            next unless scalar keys %acsNow;
            @{$$ppi{"protein".$side."_acList"}} = sort keys %acsNow;
        }
    }
}

# ---------------------------------------------------------

sub map_DroId_proteinIds2allAC
{
    # Purpose: map the protein IDs of the DroId interactions to UniProt accessions (ACs).
    #          Each ID is tried as a FlyBase ID, as an Ensembl Genomes ID and as an
    #          Ensembl Genomes protein ID; the ACs found in the three ways are merged.
    # Input:   $ppiList -- reference to the list of PPIs (modified in place)
    #          $par     -- reference to the parameter hash (handed over to otherId2acList)
    # Output:  proteinA_acList / proteinB_acList is set (sorted list) for each DroId interactor
    #          for which at least one AC has been found
    my ($ppiList,$par) = @_;

    # the PPIs from the DroId sources (the source tag starts with "droid")
    my @droidPpis = grep{$$_{"source"} =~ /^droid/} @$ppiList;

    # get the list of protein IDs from DroId interactions
    # key: DroId identifier or protein, value: taxonomy ID of the protein
    # (the same ID -> taxonomy ID pairs are stored for each of the ID types to be tried)
    my (%droidId2taxId, %flyBaseId2taxId, %ensgId2taxId, %ensgpId2taxId);
    for my $ppi (@droidPpis){
        my $taxId = $$ppi{"taxonomy_id"};
        for my $id (@$ppi{qw<proteinA_id proteinB_id>}){
            $droidId2taxId{$id}   = $taxId;
            $flyBaseId2taxId{$id} = $taxId;
            $ensgId2taxId{$id}    = $taxId;
            $ensgpId2taxId{$id}   = $taxId;
        }
    }

    # map the protein names to ACs by assuming that they are FlyBase IDs,
    # then by assuming that they are Ensembl gene or gene/protein IDs
    my %flyBaseId2acList;
    &otherId2acList(\%flyBaseId2taxId, "FLYBASE_ID",           \%flyBaseId2acList, \%$par);
    my %ensgId2acList;
    &otherId2acList(\%ensgId2taxId,    "ENSEMBLGENOME_ID",     \%ensgId2acList,    \%$par);
    my %ensgpId2acList;
    &otherId2acList(\%ensgpId2taxId,   "ENSEMBLGENOME_PRO_ID", \%ensgpId2acList,   \%$par);

    # --- save the UniProt ACs for the interactions obtained from the DroId sources ---
    for my $ppi (@droidPpis){
        # take both interactors: A and B
        for my $side (qw/A B/){
            my $id = $$ppi{"protein".$side."_id"};
            # merge the ACs found for the current protein ID in the three mappings
            my @acSets = ( $flyBaseId2acList{$id}, $ensgId2acList{$id}, $ensgpId2acList{$id} );
            my %acsNow;
            for my $acSet (grep{ defined $_ } @acSets){
                ++$acsNow{$_} for keys %$acSet;
            }
            # save the AC(s) only, if there is at least one
            next unless scalar keys %acsNow;
            @{$$ppi{"protein".$side."_acList"}} = sort keys %acsNow;
        }
    }
}

# ---------------------------------------------------------

sub read_DroId_genetic_PubMedIds_from_FlyBaseReferencePages
{
    # Collect the PubMed IDs supporting each DroID genetic interaction from
    # FlyBase reference report pages.
    #
    # Parameters:
    #   $mainDir : local directory holding the reference report pages, one file per
    #              reference ID (the file is named exactly as the reference ID)
    #   $ppiList : reference to the list of interaction records; only records with
    #              source "droidgenetic" are processed, their "refIdList" is read
    #   $par     : reference to the parameter hash; "urlDir_FlyBaseReports" and
    #              "wget_sleep" are used when a page has to be downloaded
    #
    # Effect: stores the numerically sorted, unique PubMed IDs of each processed
    # record under "pmidList".
    # Dies if a reference report file cannot be opened.
    my ($mainDir,$ppiList,$par) = @_;

    # loop through the list of DroId genetic interactions
    for my $ppi (grep { "droidgenetic" eq $$_{"source"} } @$ppiList){
        # the list of PubMed IDs supporting this interaction
        my %pmidList;
        # loop through the list of reference report IDs
        for my $refId (@{$$ppi{"refIdList"}}){
            # the input file containing the reference report
            my $refFile = $mainDir."/".$refId;
            # IF this reference report page is not yet available in the local directory, THEN download it
            # (and wait afterwards, so that successive downloads are spaced out)
            if( !(-f $refFile) ){
                my $shCmd = "wget \"".$$par{urlDir_FlyBaseReports}."/".$refId.".html"."\" -O \"".$refFile."\" -o /dev/null; "."sleep ".$$par{"wget_sleep"}.";";
                `$shCmd`;
            }
            # read the whole reference file;
            # the low-precedence "or" makes sure that a failed open is really reported
            open(my $in, "<", $refFile) or die "Error, cannot open \'$refFile\'\n";
            my $refFileLines = join("",<$in>);
            close $in;
            # the PubMed IDs are taken from the first list_uids=... parameter found in the page
            if( $refFileLines =~ /list_uids\=(.+?)\"/ ){
                my $pmidListFromThisFile = $1;
                # each reference page may contain more than one PubMed ID, save all of them
                for my $pmid ($pmidListFromThisFile =~ /(\d+)/g){
                    ++$pmidList{$pmid};
                }
            }
        }
        # save the full list of PubMed IDs
        @{$$ppi{"pmidList"}} = sort{$a<=>$b} keys %pmidList;
    }
}

# ---------------------------------------------------------

sub map_HPRD_proteinIds2allAC
{
    # Attach UniProt accessions (ACs) to both interactors of every HPRD interaction,
    # treating the HPRD protein identifiers as RefSeq protein accessions.
    #
    # Parameters:
    #   $ppiList : reference to the list of interaction records (hash references);
    #              only records whose "source" contains "hprd" are processed
    #   $par     : reference to the parameter hash, handed on to 'sub otherId2acList'
    #
    # Effect: for each interactor with at least one AC found, the sorted list of ACs
    # is stored in the record under "proteinA_acList" / "proteinB_acList".
    my ($ppiList,$par) = @_;

    # the interaction records coming from HPRD
    my @hprdPpiList = grep { $_->{"source"} =~ /hprd/ } @$ppiList;

    # key: protein ID, value: taxonomy ID of the protein (both interactors of each record)
    my %hprdId2taxId;
    for my $ppi (@hprdPpiList){
        for my $AorB (qw<A B>){
            $hprdId2taxId{ $ppi->{"protein".$AorB."_id"} } = $ppi->{"taxonomy_id"};
        }
    }
    # independent copies of the same table, one for each further ID type tried below
    my %flyBaseId2taxId = %hprdId2taxId;
    my %ensgId2taxId    = %hprdId2taxId;
    my %ensgpId2taxId   = %hprdId2taxId;

    # NOTE(review): the results of the following three conversions are never read in this sub
    # (they look like leftovers copied from the DroID mapping); the calls are kept because
    # possible side effects of 'sub otherId2acList' are not visible here -- TODO confirm and remove
    my (%flyBaseId2acList, %ensgId2acList, %ensgpId2acList);
    &otherId2acList( \%flyBaseId2taxId, "FLYBASE_ID",           \%flyBaseId2acList, \%$par );
    &otherId2acList( \%ensgId2taxId,    "ENSEMBLGENOME_ID",     \%ensgId2acList,    \%$par );
    &otherId2acList( \%ensgpId2taxId,   "ENSEMBLGENOME_PRO_ID", \%ensgpId2acList,   \%$par );

    # map RefSeq accessions (from HPRD) to UniProt ACs
    my %hprdId2acList;
    &otherId2acList( \%hprdId2taxId, "P_REFSEQ_AC", \%hprdId2acList, \%$par );

    # --- save the UniProt ACs for the interactions obtained from HPRD ---
    for my $ppi (@hprdPpiList){
        # take both interactors: A and B
        for my $AorB (qw<A B>){
            my $acs = $hprdId2acList{ $ppi->{"protein".$AorB."_id"} };
            # save the UniProt AC(s) only if at least one was found
            if( defined $acs && scalar keys %$acs ){
                @{ $ppi->{"protein".$AorB."_acList"} } = sort keys %$acs;
            }
        }
    }
}

# ---------------------------------------------------------

sub map_STRING_proteinIds2allAC
{
    # Attach UniProt accessions (ACs) to both interactors of every STRING interaction
    # that belongs to one of the selected organisms.
    #
    # Parameters:
    #   $ppiList      : reference to the list of interaction records (hash references);
    #                   only records whose "source" starts with "string" are processed
    #   $selTaxIdList : whitespace separated list of the selected taxonomy IDs
    #   $par          : reference to the parameter hash; "keggPrefix cel" and
    #                   "keggPrefix dme" are read here, the hash is also handed on
    #                   to 'sub otherId2acList'
    #
    # Effect: for each interactor with at least one AC found, the sorted list of ACs
    # is stored in the record under "proteinA_acList" / "proteinB_acList".
    my ($ppiList,$selTaxIdList,$par) = @_;

    # the selected (requested) organisms
    my %selTaxIdList_hash = map { ( $_ => 1 ) } split m/\s+/, $selTaxIdList;

    # taxonomy ID --> ID types tried for the protein names of that organism (in this order)
    my %taxId2otherIdTypes = (
        "6239" => [ "ENSEMBL_TRS_ID", "WORMPEP_ID", "KEGG_ID" ],
        "7227" => [ "GERMONLINE_ID", "KEGG_ID" ],
        "9606" => [ "ENSEMBL_PRO_ID" ],
    );
    # taxonomy ID --> key of the parameter holding the KEGG prefix of that organism
    # C. elegans:      e.g. F23H11.3 is tried also as its KEGG variant (cel:F23H11.3)
    # D. melanogaster: e.g. CG1112   is tried also as its KEGG variant (dme:Dmel_CG1112)
    my %taxId2keggPrefixKey = ( "6239" => "keggPrefix cel", "7227" => "keggPrefix dme" );

    # the interaction records coming from STRING
    my @stringPpiList = grep { $_->{"source"} =~ /^string/ } @$ppiList;

    # --- list protein names separately for each organism ---
    # first key: organism (taxonomy ID), second key: protein ID, value: taxonomy ID of the protein
    # the first key and the value are identical, but this structure is needed for compatibility with 'sub otherId2acList'
    my %org2protId2taxId;
    for my $ppi (@stringPpiList){
        my $taxId     = $ppi->{"taxonomy_id"};
        my $prefixKey = $taxId2keggPrefixKey{$taxId};
        for my $AorB (qw<A B>){
            my $protId = $ppi->{"protein".$AorB."_id"};
            $org2protId2taxId{$taxId}{$protId} = $taxId;
            # worm and fly: save also the KEGG variant of the protein name
            if( defined $prefixKey ){
                $org2protId2taxId{$taxId}{ $par->{$prefixKey}.$protId } = $taxId;
            }
        }
    }

    # --- conversion of protein names to UniProt accessions (ACs) ---
    # loop through the list of the selected (requested) organisms and save data for them only
    for my $taxId (sort keys %selTaxIdList_hash){
        #
        # conversion: all ID types of the organism fill the same output table
        my %otherId2acList;
        my @otherIdTypes;
        if( exists $taxId2otherIdTypes{$taxId} ){ @otherIdTypes = @{ $taxId2otherIdTypes{$taxId} }; }
        for my $otherIdType (@otherIdTypes){
            &otherId2acList( \%{$org2protId2taxId{$taxId}}, $otherIdType, \%otherId2acList, \%$par );
        }
        #
        # save names for the STRING interactions in the current organism
        my $prefixKey = $taxId2keggPrefixKey{$taxId};
        for my $ppi (grep { $taxId eq $_->{"taxonomy_id"} } @stringPpiList){
            # take both interactors
            for my $AorB (qw<A B>){
                my $protId = $ppi->{"protein".$AorB."_id"};
                #
                # worm and fly: IF ACs have been found for the KEGG variant of the name,
                # THEN save them for the original protein name, too
                if( defined $prefixKey ){
                    my $keggAcs = $otherId2acList{ $par->{$prefixKey}.$protId };
                    if( defined $keggAcs && scalar keys %$keggAcs ){
                        for my $ac (keys %$keggAcs){
                            ++$otherId2acList{$protId}{$ac};
                        }
                    }
                }
                # save the UniProt AC(s) for the current protein
                my $acs = $otherId2acList{$protId};
                if( defined $acs && scalar keys %$acs ){
                    @{ $ppi->{"protein".$AorB."_acList"} } = sort keys %$acs;
                }
            }
        }
    }
}

# ---------------------------------------------------------

sub map_KEGG_proteinIds2allAC
{
    # Attach UniProt accessions (ACs) to the KEGG pathway membership records
    # of the selected organisms.
    #
    # Parameters:
    #   $pwmList      : reference to the list of pathway membership records (hash
    #                   references); only records with source "kegg" are processed
    #   $selTaxIdList : whitespace separated list of the selected taxonomy IDs
    #   $par          : reference to the parameter hash; "taxId2orgCode <taxonomy ID>"
    #                   gives the organism code, the hash is also handed on to
    #                   'sub otherId2acList'
    #
    # Effect: for each record with at least one AC found, the sorted list of ACs is
    # stored in the record under "protein_acList". Prints one progress line.
    my ($pwmList,$selTaxIdList,$par) = @_;
    my $source = "kegg";

    # progress report: the number of distinct KEGG protein IDs
    {
        my %idList;
        for my $pwm (grep { $source eq $_->{"source"} } @$pwmList){
            $idList{ $pwm->{"protein_id"} } = 1;
        }
        local $| = 1;
        print "KEGG ID mapping: ".(scalar keys %idList)."\n";
    }

    # returns the sorted union of the keys (UniProt ACs) of all defined hash references passed in
    my $sortedUnionOfAcs = sub {
        my %acList;
        for my $acs (grep { defined } @_){
            $acList{$_} = 1 for keys %$acs;
        }
        return sort keys %acList;
    };

    # --- convert protein names to UniProt accessions ---
    # loop through the organism codes belonging to the selected taxonomy IDs
    for my $orgCode (map { $par->{"taxId2orgCode ".$_} } split m/\s+/, $selTaxIdList){
        #
        # C. elegans: each name is tried as WormBase ID, WormBase Protein ID and KEGG ID
        if( $orgCode eq "cel" ){
            my @orgPwmList = grep { $source eq $_->{"source"} && $_->{"orgCode"} eq $orgCode } @$pwmList;
            #
            # key: protein ID, value: taxonomy ID
            my %wormBaseId2taxId = map { ( $_->{"protein_id"} => $_->{"taxonomy_id"} ) } @orgPwmList;
            my %wormProtId2taxId = %wormBaseId2taxId;
            my %keggId2taxId     = map { ( $orgCode.":".$_ => $wormBaseId2taxId{$_} ) } keys %wormBaseId2taxId;
            #
            # ID conversion
            my (%wormBaseId2acList, %wormProtId2acList, %keggId2acList);
            &otherId2acList( \%wormBaseId2taxId, "WORMBASE_ID",     \%wormBaseId2acList, \%$par );
            &otherId2acList( \%wormProtId2taxId, "WORMBASE_PRO_ID", \%wormProtId2acList, \%$par );
            &otherId2acList( \%keggId2taxId,     "KEGG_ID",         \%keggId2acList,     \%$par );
            #
            # save the ACs collected from all three conversions
            for my $pwm (@orgPwmList){
                my $id = $pwm->{"protein_id"};
                if( !defined $id ){ $id = ""; }
                my @acList = $sortedUnionOfAcs->( $wormBaseId2acList{$id}, $wormProtId2acList{$id}, $keggId2acList{$orgCode.":".$id} );
                if( scalar @acList ){ @{ $pwm->{"protein_acList"} } = @acList; }
            }
        }
        #
        # D. melanogaster: each name is tried as KEGG ID
        elsif( $orgCode eq "dme" ){
            my @orgPwmList = grep { $source eq $_->{"source"} && $_->{"orgCode"} eq $orgCode } @$pwmList;
            #
            # key: KEGG ID of the protein, value: taxonomy ID of the protein
            my %keggId2taxId = map { ( $orgCode.":".$_->{"protein_id"} => $_->{"taxonomy_id"} ) } @orgPwmList;
            my %keggId2acList;
            &otherId2acList( \%keggId2taxId, "KEGG_ID", \%keggId2acList, \%$par );
            #
            # save the ACs; a record without protein ID is looked up with the empty name
            for my $pwm (@orgPwmList){
                my $id_kegg = "";
                if( defined $pwm->{"protein_id"} ){ $id_kegg = $orgCode.":".$pwm->{"protein_id"}; }
                my @acList = $sortedUnionOfAcs->( $keggId2acList{$id_kegg} );
                if( scalar @acList ){ @{ $pwm->{"protein_acList"} } = @acList; }
            }
        }
        #
        # H. sapiens: each name is tried as Entrez Gene ID
        elsif( $orgCode eq "hsa" ){
            my @orgPwmList = grep { $source eq $_->{"source"} && $_->{"orgCode"} eq $orgCode } @$pwmList;
            #
            # key: protein ID, value: taxonomy ID of the protein
            my %entGenId2taxId = map { ( $_->{"protein_id"} => $_->{"taxonomy_id"} ) } @orgPwmList;
            my %entGenId2acList;
            &otherId2acList( \%entGenId2taxId, "P_ENTREZGENEID", \%entGenId2acList, \%$par );
            #
            # save the ACs
            for my $pwm (@orgPwmList){
                my $id = $pwm->{"protein_id"};
                if( !defined $id ){ $id = ""; }
                my @acList = $sortedUnionOfAcs->( $entGenId2acList{$id} );
                if( scalar @acList ){ @{ $pwm->{"protein_acList"} } = @acList; }
            }
        }
    }
}

# ---------------------------------------------------------

sub map_KEGG_flyProteinsCG
{
    # Add UniProt accessions (ACs) to pathway membership records through the
    # CG<number> names found in UniProt flat file records of D. melanogaster.
    #
    # Parameters:
    #   $pwmList       : reference to the list of pathway membership records (hash references)
    #   $inFilePattern : file name pattern (expanded with glob) of the UniProt data files;
    #                    files ending with .gz, .zip or .Z are read through "gzip -dc"
    #   $par           : reference to the parameter hash (not used here)
    #
    # Effect: every record whose "protein_id" contains a CG<number> name gets the
    # sorted union of its already known ACs and the ACs of that CG name under
    # "protein_acList". Prints the number of distinct unmapped KEGG fly protein IDs
    # before and after the mapping. Dies if an input file cannot be opened.
    my ($pwmList,$inFilePattern,$par) = @_;

    # number of distinct KEGG fly protein IDs having no UniProt accession (AC) yet
    # (keyed by the protein ID itself, so that repeated records of a protein count once)
    my $countUnmappedKeggIds = sub {
        my %keggIdList;
        for my $pwm (@$pwmList){
            next if !( "kegg" eq $$pwm{"source"} && "dme" eq $$pwm{"orgCode"} );
            next if defined $$pwm{"protein_acList"} && scalar @{$$pwm{"protein_acList"}};
            ++$keggIdList{ defined $$pwm{"protein_id"} ? $$pwm{"protein_id"} : "" };
        }
        return scalar keys %keggIdList;
    };

    # test
    {local $|=1; print "Unmapped KEGG IDs before: ".$countUnmappedKeggIds->()."\n";}

    # for each CG\d+ type name list its UniProt accessions (ACs)
    my %cgId2acList;

    # read input files
    for my $inFile (glob($inFilePattern)){
        # open input file (unzip before reading, if necessary);
        # the list form of the pipe open passes the file name to gzip without any shell
        my $in;
        if( $inFile =~ /\.(gz|zip|Z)$/ ){
            open($in, "-|", "gzip", "-dc", $inFile) or die "Error, cannot open \'gzip -dc $inFile|\'\n";
        }
        else{
            open($in, "<", $inFile) or die "Error, cannot open \'$inFile\'\n";
        }
        # read records separated by //\n from the input file
        local $/ = "//\n";
        while( my $record = <$in> ){
            # read only records for D. melanogaster, read the UniProt accession (AC)
            # NOTE(review): only 6-character accessions are recognised by this pattern
            if( $record =~ / ^ID \s+ \S+?_DROME .+? \n \s* AC \s+ (\S{6}); /xs ){
                my $ac = $1;
                # every CG<number> name occurring anywhere in the record is linked to this AC
                for my $cgId ($record =~ / ( CG \d+ ) /xg){
                    ++${ $cgId2acList{$cgId} }{$ac};
                }
            }
        }
        close $in;
    }

    # convert protein IDs to UniProt accessions (ACs)
    for my $pwm (@$pwmList){
        # records without protein ID cannot contain a CG name
        next if !defined $$pwm{"protein_id"};
        if( $$pwm{"protein_id"} =~ / ( CG \d+ ) /x ){
            my $cgId = $1;
            # save already known ACs
            my %acList;
            if( defined $$pwm{"protein_acList"} && scalar @{$$pwm{"protein_acList"}} ){
                %acList = map { ( $_ => 1 ) } @{$$pwm{"protein_acList"}};
            }
            # save the ACs listed above
            if( defined $cgId2acList{$cgId} && scalar keys %{$cgId2acList{$cgId}} ){
                for my $ac (keys %{$cgId2acList{$cgId}}){
                    ++$acList{$ac};
                }
            }
            if( scalar keys %acList ){ @{$$pwm{"protein_acList"}} = sort keys %acList; }
        }
    }

    # test again
    {local $|=1; print "Unmapped KEGG IDs after: ".$countUnmappedKeggIds->()."\n";}
}

# ---------------------------------------------------------

sub map_Reactome_proteinIds2allAC
{
    # Attach UniProt accessions (ACs) to the Reactome pathway membership records,
    # treating the Reactome protein identifiers as UniProt accessions or UniProt IDs.
    #
    # Parameters:
    #   $pwmList      : reference to the list of pathway membership records (hash
    #                   references); only records with source "reactome" and a
    #                   defined "protein_id" are processed
    #   $selTaxIdList : list of the selected taxonomy IDs (not used here)
    #   $par          : reference to the parameter hash, handed on to 'sub otherId2acList'
    #
    # Effect: for each record with at least one AC found, the sorted list of ACs is
    # stored in the record under "protein_acList". Prints one progress line.
    my ($pwmList,$selTaxIdList,$par) = @_;
    my $source = "reactome";

    # the Reactome records that do have a protein ID
    my @reactomePwmList = grep { $source eq $_->{"source"} && defined $_->{"protein_id"} } @$pwmList;

    # key: protein ID, value: taxonomy ID
    my %protId2taxId;
    for my $pwm (@reactomePwmList){
        $protId2taxId{ $pwm->{"protein_id"} } = $pwm->{"taxonomy_id"};
    }

    # progress report: the number of distinct protein IDs
    {
        local $| = 1;
        print "Reactome ID mapping: ".(scalar keys %protId2taxId)." all\n";
    }

    # assuming that these are UniProt accessions (ACs) or IDs, convert them to UniProt ACs
    my %protId2acList;
    &otherId2acList( \%protId2taxId, "ACC+ID", \%protId2acList, \%$par );

    # save converted names for each protein
    for my $pwm (@reactomePwmList){
        my $acs = $protId2acList{ $pwm->{"protein_id"} };
        if( defined $acs && scalar keys %$acs ){
            @{ $pwm->{"protein_acList"} } = sort keys %$acs;
        }
    }
}

# ---------------------------------------------------------

sub map_SignaLink_proteinIds2allAC
{
    # Attach UniProt accessions (ACs) to the SignaLink pathway membership records
    # of the selected organisms.
    #
    # Parameters:
    #   $pwmList      : reference to the list of pathway membership records (hash
    #                   references); only records with source "signalink" are processed
    #   $selTaxIdList : whitespace separated list of the selected taxonomy IDs
    #   $par          : reference to the parameter hash; "taxId2orgCode <taxonomy ID>"
    #                   and "keggPrefix dme" are read here, the hash is also handed on
    #                   to 'sub otherId2acList'
    #
    # Effect: for each record with at least one AC found, the sorted list of ACs is
    # stored in the record under "protein_acList". Prints two progress lines.
    my ($pwmList,$selTaxIdList,$par) = @_;
    my $source = "signalink";

    # progress report: the number of distinct SignaLink protein IDs
    {
        my %protId2taxId = map { ( $_->{"protein_id"} => $_->{"taxonomy_id"} ) } grep { $source eq $_->{"source"} } @$pwmList;
        local $| = 1;
        print "SignaLink ID mapping: ".(scalar keys %protId2taxId)."\n";
    }

    # --- convert protein names to UniProt accessions ---
    # loop through the organism codes belonging to the selected taxonomy IDs
    for my $orgCode (map { $par->{"taxId2orgCode ".$_} } split m/\s+/, $selTaxIdList){
        #
        # the conversions to be tried for this organism (in this order);
        # idType: ID type name for 'sub otherId2acList',
        # prefix: text put in front of the protein name before it is tried as this ID type
        my @conversionList;
        if( $orgCode eq "cel" ){
            # C. elegans: WormPep IDs and KEGG IDs
            @conversionList = ( { idType => "WORMPEP_ID", prefix => "" },
                                { idType => "KEGG_ID",    prefix => $orgCode.":" } );
        }
        elsif( $orgCode eq "dme" ){
            # D. melanogaster: GermOnline IDs and KEGG IDs
            @conversionList = ( { idType => "GERMONLINE_ID", prefix => "" },
                                { idType => "KEGG_ID",       prefix => $par->{"keggPrefix dme"} } );
        }
        elsif( $orgCode eq "hsa" ){
            # H. sapiens: Ensembl protein IDs
            @conversionList = ( { idType => "ENSEMBL_PRO_ID", prefix => "" } );
        }
        else{
            # nothing is done for any other organism
            next;
        }

        # the SignaLink records of the current organism
        my @orgPwmList = grep { $source eq $_->{"source"} && $orgCode eq $_->{"orgCode"} } @$pwmList;

        # key: protein ID, value: taxonomy ID
        my %protId2taxId = map { ( $_->{"protein_id"} => $_->{"taxonomy_id"} ) } @orgPwmList;

        # prepare a separate input table (prefixed ID --> taxonomy ID) and an empty output table
        # for every conversion; all inputs are built before the first conversion is run
        for my $conversion (@conversionList){
            my $prefix = $conversion->{prefix};
            my %otherId2taxId = map { ( $prefix.$_ => $protId2taxId{$_} ) } keys %protId2taxId;
            $conversion->{id2taxId}  = \%otherId2taxId;
            $conversion->{id2acList} = {};
        }
        #
        # conversion
        for my $conversion (@conversionList){
            &otherId2acList( $conversion->{id2taxId}, $conversion->{idType}, $conversion->{id2acList}, \%$par );
        }

        # --- save converted names for each protein of the current organism in the pathway membership list ---
        for my $pwm (@orgPwmList){
            my $id = $pwm->{"protein_id"};
            if( !defined $id ){ $id = ""; }
            # union of the UniProt accessions (ACs) found through any of the conversions
            my %acList;
            for my $conversion (@conversionList){
                my $acs = $conversion->{id2acList}{ $conversion->{prefix}.$id };
                if( defined $acs ){
                    $acList{$_} = 1 for keys %$acs;
                }
            }
            if( scalar keys %acList ){ @{ $pwm->{"protein_acList"} } = sort keys %acList; }
        }
    }

    # progress report: the number of SignaLink records having at least one AC
    {
        my $mappedCount = grep { defined $_->{"protein_acList"} && scalar @{ $_->{"protein_acList"} } && $source eq $_->{"source"} } @$pwmList;
        local $| = 1;
        print "Mapped SignaLink signaling pathway member proteins: ".$mappedCount." SignaLink of ".(scalar @$pwmList)." total\n";
    }
}

# ------------------------------------------------------------

sub write_unconverted_protein_names
{
    # Write the list of protein identifiers that could not be mapped to any
    # UniProt accession (AC), grouped by data source.
    #
    # Parameters:
    #   $outFile : name of the output file (overwritten if it exists)
    #   $ppiList : reference to the list of interaction records; an interactor counts as
    #              unmapped if its "proteinA_acList" / "proteinB_acList" is missing or empty
    #   $pwmList : reference to the list of pathway membership records; a record counts as
    #              unmapped if its "protein_acList" is missing or empty
    #
    # Output: a commented header followed by one "<source>TAB<protein ID>" line per
    # distinct unmapped identifier, sorted by source and then by identifier.
    # Dies if the output file cannot be opened or completely written.
    my ($outFile,$ppiList,$pwmList) = @_;

    # for each data source list unmapped protein IDs
    my %src2protIdList;
    # loop through the list of protein-protein interactions
    for my $ppi (@$ppiList){
        # take both interactors
        for my $AorB (qw<A B>){
            if( !defined $$ppi{"protein".$AorB."_acList"} || 0 == scalar @{$$ppi{"protein".$AorB."_acList"}} ){
                ++${ $src2protIdList{$$ppi{"source"}} }{ $$ppi{"protein".$AorB."_id"} };
            }
        }
    }
    # loop through the list of pathway memberships
    for my $pwm (@$pwmList){
        if( !defined $$pwm{"protein_acList"} || 0 == scalar @{$$pwm{"protein_acList"}} ){
            # test: report records lacking the source or the protein ID
            if( !defined $$pwm{"source"} || !defined $$pwm{"protein_id"} ){ print "ppi source undef: ".join(", ",map{$_.":".$$pwm{$_}} sort keys %$pwm)."\n"; }
            ++${$src2protIdList{ $$pwm{"source"} }}{ $$pwm{"protein_id"} };
        }
    }

    # open outfile, print file header
    # (the three-argument open takes the file name literally, whatever characters it contains)
    open(my $out, ">", $outFile) or die "Error, cannot write to \'$outFile\'\n";
    print $out "# List of protein identifiers not mapped to UniProt accessions (ACs)\n".
        "#\n".
        "# Source\tProtein identifier\n\n";
    # print unmapped IDs
    for my $src (sort keys %src2protIdList){
        for my $protId (sort keys %{$src2protIdList{$src}}){
            print $out $src."\t".$protId."\n";
        }
    }
    # errors of buffered writing show up only here
    close($out) or die "Error, cannot finish writing \'$outFile\'\n";
}

# ------------------------------------------------------------

sub mkdir
{
    # Schedule the creation of a directory.
    #
    # Parameters:
    #   $dir       : name of the directory
    #   $shCmdList : reference to the list of shell commands to be executed later
    #
    # Effect: IF the directory does not exist now, THEN a shell command creating it
    # is appended to the list; nothing is created by this sub itself.
    my ($dir,$shCmdList) = @_;

    if( ! -d $dir ){
        # the shell command tests for the directory again, because the list of not yet
        # executed shell commands may already contain another command making this directory
        my $shCmd = sprintf('if [ ! -d "%s" ]; then mkdir "%s"; fi', $dir, $dir);
        push @$shCmdList, $shCmd;
    }
}

# ------------------------------------------------------------

sub write_interactions
{
    # Register the output directories describing the protein-protein interactions.
    #
    # Parameters:
    #   $outMainDir        : main output directory
    #   $ppiList           : reference to the list of interaction records; only records where
    #                        both "proteinA_acList" and "proteinB_acList" are non-empty are used
    #   $unreviewedAllowed : IF true, THEN all ACs are used, otherwise only pairs of reviewed ACs
    #   $acList            : reference to the list of AC records with the fields "AC" and "isReviewed"
    #   $par               : reference to the parameter hash; every directory is registered by
    #                        incrementing its counter in the hash stored under "dirList"
    #
    # Directory layout below <outMainDir>/by_protein_ac/<reviewed-only | reviewed-and-unreviewed>:
    #   <one directory level per character of the AC of the 1st protein>
    #     /interactions_by_source/<source>/<AC of the 2nd protein>[/pmid/<PubMed ID>]
    #     /interactions_by_interactor/<AC of the 2nd protein>/<source>[/pmid/<PubMed ID>]
    # Each interaction is saved both ways (A with B, and B with A).
    # No directory is created here, the paths are only counted.
    my ($outMainDir,$ppiList,$unreviewedAllowed,$acList,$par) = @_;

    # AC --> whether it is reviewed
    my %ac2isR;
    for my $acRef (@$acList){ $ac2isR{ $acRef->{"AC"} } = $acRef->{"isReviewed"}; }

    # goes down from a directory through the listed steps, registers the directory
    # reached after each step and returns the last one
    my $registerPath = sub {
        my ($dir,@dirSteps) = @_;
        for my $dirStep (@dirSteps){
            $dir .= "/".$dirStep;
            ++$par->{dirList}{$dir};
        }
        return $dir;
    };

    # depending on whether unreviewed proteins are allowed, register the necessary subdirectories
    my $revUnrev = "reviewed-only";
    if( $unreviewedAllowed ){ $revUnrev = "reviewed-and-unreviewed"; }
    my $outDir = $registerPath->($outMainDir, "by_protein_ac", $revUnrev);

    for my $ppi (@$ppiList){
        # both proteins should have at least one UniProt AC
        my $acListA = $ppi->{"proteinA_acList"};
        my $acListB = $ppi->{"proteinB_acList"};
        next if !( defined $acListA && scalar @$acListA && defined $acListB && scalar @$acListB );

        # whether PubMed IDs are available for this interaction
        my $hasPmid = ( defined $ppi->{"pmidList"} && scalar @{ $ppi->{"pmidList"} } );

        # save this interaction both ways: (i) B is an interactor of A, (ii) A is an interactor of B
        for my $direction ( [$acListA,$acListB], [$acListB,$acListA] ){
            my ($acList1,$acList2) = @$direction;
            for my $ac1 (@$acList1){ for my $ac2 (@$acList2){
                # proceed, if EITHER unreviewed are allowed OR both ACs are reviewed
                next if !( $unreviewedAllowed || ( $ac2isR{$ac1} && $ac2isR{$ac2} ) );

                # the directory of the 1st protein: one level for each character of its AC
                my $protDir = $registerPath->($outDir, split(m//, $ac1));

                # the 2nd protein is saved (a) by source and interactor, (b) by interactor and source
                for my $dirSteps ( [ "interactions_by_source",     $ppi->{"source"}, $ac2 ],
                                   [ "interactions_by_interactor", $ac2, $ppi->{"source"} ] )
                {
                    my $subDir = $registerPath->($protDir, @$dirSteps);
                    # IF we do have PubMed IDs, THEN save each of them into a separate subdirectory
                    if( $hasPmid ){
                        my $pmidDir = $registerPath->($subDir, "pmid");
                        for my $pmid (@{ $ppi->{pmidList} }){ ++$par->{dirList}{ $pmidDir."/".$pmid }; }
                    }
                }
            }}
        }
    }
}

# ------------------------------------------------------------

sub write_signalingPathwayMemberships
{
    # Register the output directories describing the signaling pathway memberships.
    #
    # Parameters:
    #   $outMainDir        : main output directory
    #   $pwmList           : reference to the list of pathway membership records; only records
    #                        with a non-empty "protein_acList" are used
    #   $unreviewedAllowed : IF true, THEN all ACs are used, otherwise only reviewed ACs
    #   $acList            : reference to the list of AC records with the fields "AC" and "isReviewed"
    #   $par               : reference to the parameter hash; every directory is registered by
    #                        incrementing its counter in the hash stored under "dirList"
    #
    # Directory layout below <outMainDir>/by_protein_ac/<reviewed-only | reviewed-and-unreviewed>:
    #   <one directory level per character of the AC>
    #     /signaling_pathway_membership_by_source/<source>/<pathway ID>
    # No directory is created here, the paths are only counted.
    my ($outMainDir,$pwmList,$unreviewedAllowed,$acList,$par) = @_;

    # AC --> whether it is reviewed
    my %ac2isR;
    for my $acRef (@$acList){ $ac2isR{ $acRef->{"AC"} } = $acRef->{"isReviewed"}; }

    # goes down from a directory through the listed steps, registers the directory
    # reached after each step and returns the last one
    my $registerPath = sub {
        my ($dir,@dirSteps) = @_;
        for my $dirStep (@dirSteps){
            $dir .= "/".$dirStep;
            ++$par->{dirList}{$dir};
        }
        return $dir;
    };

    # depending on whether unreviewed proteins are allowed, register the necessary subdirectories
    my $revUnrev = "reviewed-only";
    if( $unreviewedAllowed ){ $revUnrev = "reviewed-and-unreviewed"; }
    my $outDir = $registerPath->($outMainDir, "by_protein_ac", $revUnrev);

    for my $pwm (@$pwmList){
        # the protein should have at least one UniProt AC
        my $pwmAcList = $pwm->{"protein_acList"};
        next if !( defined $pwmAcList && scalar @$pwmAcList );

        for my $ac (@$pwmAcList){
            # proceed, if EITHER unreviewed are allowed OR the AC is reviewed
            next if !( $unreviewedAllowed || $ac2isR{$ac} );

            # the directory of the protein: one level for each character of its AC
            my $protDir = $registerPath->($outDir, split(m//, $ac));
            # below that: the source database of the pathway, and then the ID of the pathway
            $registerPath->($protDir, "signaling_pathway_membership_by_source", $pwm->{"source"}, $pwm->{"pathway_id"});
        }
    }
}

# ------------------------------------------------------------

# for all UniProtKB accessions (ACs): register the AC --> primary AC mapping
sub write_AC_2_primaryAC
{
    # Parameters:
    #   $outMainDir        : main output directory
    #   $ppiList, $pwmList : not used here
    #   $unreviewedAllowed : IF true, THEN all ACs are used, otherwise only reviewed ACs
    #   $acInfoList        : reference to the list of AC records with the fields
    #                        "AC", "primaryAC" and "isReviewed"
    #   $par               : reference to the parameter hash; every directory is registered by
    #                        incrementing its counter in the hash stored under "dirList"
    #
    # Directory layout below <outMainDir>/by_protein_ac/<reviewed-only | reviewed-and-unreviewed>:
    #   <one directory level per character of the AC>/primary/<primary AC>
    # No directory is created here, the paths are only counted.
    # NOTE(review): records are not tested for having a primary AC here,
    # presumably the caller passes only such records -- TODO confirm
    my ($outMainDir,$ppiList,$pwmList,$unreviewedAllowed,$acInfoList,$par) = @_;

    # goes down from a directory through the listed steps, registers the directory
    # reached after each step and returns the last one
    my $registerPath = sub {
        my ($dir,@dirSteps) = @_;
        for my $dirStep (@dirSteps){
            $dir .= "/".$dirStep;
            ++$par->{dirList}{$dir};
        }
        return $dir;
    };

    # depending on whether unreviewed proteins are allowed, register the necessary subdirectories
    my $revUnrev = "reviewed-only";
    if( $unreviewedAllowed ){ $revUnrev = "reviewed-and-unreviewed"; }
    my $outDir = $registerPath->($outMainDir, "by_protein_ac", $revUnrev);

    # for each UniProt accession (AC) save its primary UniProt accession (AC)
    for my $acRef (@$acInfoList){
        # proceed, if EITHER unreviewed are allowed OR the AC is reviewed
        next if !( $unreviewedAllowed || $acRef->{"isReviewed"} );

        # the directory of the protein: one level for each character of its AC
        my $protDir = $registerPath->($outDir, split(m//, $acRef->{"AC"}));
        # below that: the subdirectory "primary", and then one named as the primary AC itself
        $registerPath->($protDir, "primary", $acRef->{"primaryAC"});
    }
}

# ------------------------------------------------------------

sub write_originalProteinNames
{
    # Register the output directories that list, for each UniProt accession (AC),
    # the original protein names by data source.
    #
    # Parameters:
    #   $outMainDir        : main output directory
    #   $ppiList           : reference to the list of interaction records
    #   $pwmList           : reference to the list of pathway membership records
    #   $unreviewedAllowed : IF true, THEN all ACs are used, otherwise only reviewed ACs
    #   $acList            : reference to the list of AC records with the fields "AC" and "isReviewed"
    #   $par               : reference to the parameter hash; every directory is registered by
    #                        incrementing its counter in the hash stored under "dirList"
    #
    # Directory layout below <outMainDir>/by_protein_ac/<reviewed-only | reviewed-and-unreviewed>:
    #   <one directory level per character of the AC>
    #     /original_name_by_database_source/<source>/<coded original name>
    # The coded original name is the list of the character codes of the name joined by "_".
    # No directory is created here, the paths are only counted.
    my ($outMainDir,$ppiList,$pwmList,$unreviewedAllowed,$acList,$par) = @_;

    # local variables: AC --> whether it is reviewed
    my %ac2isR; for my $acRef (@$acList){ $ac2isR{ $$acRef{ "AC" } } = $$acRef{ "isReviewed" } }

    # original name of a protein with its characters converted to numbers
    my $codeName = sub { return join( "_", map { ord } split m//, $_[0] ); };

    # --- list all UniProt accessions (ACs) ---
    # AC --> source database --> coded original name --> number of occurrences
    # (a nested table keeps the three items apart, whatever characters a source name contains)
    my %ac2src2name;
    #
    # list all UniProt accessions (ACs) from the interactions
    for my $ppi (@$ppiList){
        # take both interactors
        for my $AorB (qw<A B>){
            # if this interactor does have at least one AC, then list all of its ACs
            my $ppiAcList = $$ppi{"protein".$AorB."_acList"};
            next if !( defined $ppiAcList && scalar @$ppiAcList );
            my $originalName = $codeName->( $$ppi{"protein".$AorB."_id"} );
            for my $ac (@$ppiAcList){
                ++$ac2src2name{$ac}{ $$ppi{"source"} }{$originalName};
            }
        }
    }
    # list all UniProt accessions (ACs) from the pathway membership lists
    for my $pwm (@$pwmList){
        # if this protein does have at least one AC, then list all of its ACs
        my $pwmAcList = $$pwm{"protein_acList"};
        next if !( defined $pwmAcList && scalar @$pwmAcList );
        my $originalName = $codeName->( $$pwm{"protein_id"} );
        for my $ac (@$pwmAcList){
            ++$ac2src2name{$ac}{ $$pwm{"source"} }{$originalName};
        }
    }

    # --- save all original names ---- two successive selection levels, 1st: by protein AC, 2nd: by data source ---
    #
    # depending on whether unreviewed proteins are allowed, register the necessary subdirectories
    my $outDir = $outMainDir;
    for my $dirStep ("by_protein_ac", ($unreviewedAllowed ? "reviewed-and-unreviewed" : "reviewed-only")){ ++${ $$par{dirList} }{ ($outDir.="/".$dirStep) } }
    for my $ac (sort keys %ac2src2name){
        # proceed, if
        # EITHER  unreviewed are allowed
        # OR      unreviewed are not allowed and the AC is reviewed
        next if !( $unreviewedAllowed || (defined $ac2isR{$ac} && $ac2isR{$ac}) );
        for my $src (sort keys %{$ac2src2name{$ac}}){
            for my $name (sort keys %{$ac2src2name{$ac}{$src}}){
                # . the subdirectory of the current AC: one level for each character of the AC
                #   (registered once for every distinct AC, source, name triple)
                my $dir = $outDir; for my $char (split m//,$ac){ ++${ $$par{dirList} }{ ($dir.="/".$char) } }
                # . three steps deeper: fixed name, source database, coded original name
                for my $dirStep ( "original_name_by_database_source", $src, $name ){
                    ++${ $$par{dirList} }{ ($dir.="/".$dirStep) };
                }
            }
        }
    }
}

# ------------------------------------------------------------

# Registers the output directories that describe each signaling pathway.
#
# Parameters:
#   outMainDir - root of the output directory tree
#   spwInfo    - reference to a list of pathway records (hash refs) with the
#                keys "source", "pathway_id", "pathway_name",
#                "pathway_shortName" and "pathway_isSignaling"
#   par        - parameter hash reference; every path is counted in par{dirList}
#
# For every record flagged as signaling the following paths are registered
# (the directories themselves are not created here):
#   <outMainDir>/by_pathway/<source>/<pathway ID>/pathway_name/<encoded name>
#   <outMainDir>/by_pathway/<source>/<pathway ID>/pathway_shortName/<encoded short name>
# where "encoded" means the decimal character codes joined by "_".
sub write_signalingPathwayInfo
{
    my ($outMainDir,$spwInfo,$par) = @_;

    # text -> character codes joined by "_"
    my $encodeChars = sub { return join( "_", map{ord} split m//, $_[0] ) };

    # append each step to the path, count every intermediate path in
    # par{dirList} and return the deepest path
    my $registerPath = sub {
        my ($path,@steps) = @_;
        for my $step (@steps){
            $path .= "/".$step;
            ++$par->{dirList}{$path};
        }
        return $path;
    };

    # one pass over all records:
    # . "<source> <pathway ID>" -> name / short name (taken from ALL records)
    # . source -> set of pathway IDs (taken from signaling records only)
    my (%nameOf, %shortNameOf, %signalingIdsOf);
    for my $rec (@$spwInfo){
        my ($src,$pwId) = ( $rec->{"source"}, $rec->{"pathway_id"} );
        my $key = $src." ".$pwId;
        $nameOf{$key}      = $rec->{"pathway_name"};
        $shortNameOf{$key} = $rec->{"pathway_shortName"};
        $signalingIdsOf{$src}{$pwId} = 1 if $rec->{"pathway_isSignaling"};
    }

    # register the directories source by source, pathway by pathway
    for my $src (keys %signalingIdsOf){
        for my $pwId (sort keys %{ $signalingIdsOf{$src} }){
            my $key        = $src." ".$pwId;
            my $pathwayDir = $registerPath->( $outMainDir, "by_pathway", $src, $pwId );
            $registerPath->( $pathwayDir, "pathway_name",      $encodeChars->( $nameOf{$key} ) );
            $registerPath->( $pathwayDir, "pathway_shortName", $encodeChars->( $shortNameOf{$key} ) );
        }
    }
}

# ------------------------------------------------------------

# Reads UniProt accessions (ACs) from "//"-separated UniProt records.
#
# Parameters:
#   inFilePattern - glob pattern of the input files; files ending in .gz, .Z
#                   or .zip are read through "gzip -dc"
#   acList_all    - reference to the output list; it is emptied first and then
#                   receives one hash ref per AC with the keys "AC",
#                   "isReviewed", "isPrimary", "primaryAC", "orgCode" and
#                   "taxonomy_id"
#   par           - parameter hash reference
#                   read:    par{"orgUpCode2orgCode <CAEEL|DROME|HUMAN>"},
#                            par{"orgCode2taxId <organism code>"}
#                   written: par{"ac2orgCode <AC>"} = organism code of the AC
#                            (all old keys starting with "ac2orgCode" are
#                            deleted first)
#
# Only records whose ID line names CAEEL, DROME or HUMAN are saved.
# The first AC of a record is its primary AC.
sub readAClist_saveAC2pAC_saveReviewedAC_saveAC2orgCode
{
    my ($inFilePattern,$acList_all,$par) = @_;

    # --- clear output data ---
    @$acList_all = ();
    delete @{$par}{ grep{/^ac2orgCode/} keys %$par };

    # --- read UniProt accessions (ACs) and reviewed/unreviewed information ---
    for my $inFile (glob($inFilePattern)){
        # open the file with an explicit mode and a lexical handle, so that
        # special characters in the file name are never interpreted by Perl or
        # by a shell (unzip through gzip, if necessary)
        my $in;
        if( $inFile =~ /\.(?:gz|Z|zip)$/ ){
            open $in, "-|", "gzip", "-dc", $inFile or die "Error, cannot open \'$inFile\' through gzip: $!\n";
        }
        else{
            open $in, "<", $inFile or die "Error, cannot open \'$inFile\': $!\n";
        }

        # read input records ( input record separator: //\n )
        local $/ = "//\n";
        while( my $record = <$in> ){
            # a record counts as reviewed, if it contains the text "Reviewed"
            # (the match is case sensitive, so "Unreviewed" does not count)
            my $isReviewed = ($record =~ m/Reviewed/) ? "1" : "0";
            my @lines = split m/\n/, $record;

            # the accessions from the AC lines: each is 6 characters followed by ";"
            my @acList;
            for my $acLine (grep{/^AC/} @lines){ push @acList, ($acLine =~ /(\S{6});/g); }

            # the organism of the record from its ID line(s)
            my $orgIsRequested = 0;
            my ($orgCode,$taxonomy_id);
            for my $line (grep{/^ID/} @lines){
                if( $line =~ /^ID\s*\S+?_(CAEEL|DROME|HUMAN)/ ){
                    ++$orgIsRequested;
                    $orgCode     = $$par{"orgUpCode2orgCode ".$1};
                    $taxonomy_id = $$par{"orgCode2taxId ".$orgCode};
                }
            }

            # save the ACs only for the requested organisms
            next unless $orgIsRequested;
            for my $ac (@acList){
                my $isPrimary = $ac eq $acList[0] ? "1" : "0";
                # save the entire record
                push @$acList_all,
                { "AC" => $ac, "isReviewed" => $isReviewed, "isPrimary" => $isPrimary, "primaryAC" => $acList[0],
                  "orgCode" => $orgCode, "taxonomy_id" => $taxonomy_id, };
                # save the organism code for the current AC
                $$par{"ac2orgCode ".$ac} = $orgCode;
            }
        }
        close $in;
    }

    # progress message: the number of ACs saved
    {local$|=1;print &currentTime."\t"."pac,rev: ".(scalar @$acList_all)."\n";}
}

# --------------------------------------------------------------

# Maps gene/protein identifiers of another database ("other IDs") to UniProt
# accessions (ACs) with a web mapping service queried through wget.
#
# Parameters:
#   otherId2taxId  - hash ref: other ID -> taxonomy ID; its keys are the IDs
#                    to be mapped
#   otherIdType    - text substituted for "FROM" in the URL template
#   otherId2acList - output hash ref: other ID -> { AC -> number of times seen }
#   par            - parameter hash reference; the keys read are
#                    "queryMaxSize"  the maximal number of IDs per request,
#                    "uniprotApiUrl" the URL template with the placeholders
#                                    FROM, TO and QUERY,
#                    "wget_sleep"    the argument of the shell command "sleep"
#                                    run before each request,
#                    "taxId2orgCode <taxonomy ID>" and "ac2orgCode <AC>"
#
# A mapping is saved only if the organism code of the returned AC is known and
# equals the organism code of the queried ID; for every other result line a
# warning is printed.
sub otherId2acList
{
    my ($otherId2taxId,$otherIdType,$otherId2acList,$par) = @_;

    # the list of other IDs to be mapped
    my @otherIdList = sort keys %$otherId2taxId;

    # the size of a query group: at least 1, otherwise the loop below would
    # never consume the list
    my $queryMaxSize = $$par{"queryMaxSize"};
    $queryMaxSize = 1 unless defined $queryMaxSize && $queryMaxSize >= 1;
    $queryMaxSize = int $queryMaxSize;

    # --- do the conversion in groups of at most queryMaxSize IDs ---
    while(@otherIdList){
        my @query = splice( @otherIdList, 0, $queryMaxSize );

        # the URL for requesting the ID mapping
        my $url = $$par{"uniprotApiUrl"};
        $url =~ s/FROM/$otherIdType/;
        $url =~ s/TO/ACC/;
        my $query = join("+",@query);
        $url =~ s/QUERY/$query/;

        # download the mapping
        # NOTE(review): the URL is placed into a shell command, so the IDs are
        # assumed to contain no double quotes or other shell metacharacters
        my $shCmd = "sleep ".$$par{"wget_sleep"}."; wget \"$url\" -o /dev/null -O -";
        open my $in, "-|", $shCmd or die "Error, cannot start the ID mapping download: $!\n";

        # discard header line
        <$in>;
        # loop through data lines and save UniProt accessions (ACs)
        while( my $line = <$in> ){
            chomp $line;
            my ($otherId,$ac) = split m/\t/, $line;

            # the organism code of the queried ID and that of the returned AC;
            # both stay undefined for a malformed line or an unknown ID / AC
            my $taxId     = defined $otherId ? $$otherId2taxId{$otherId}         : undef;
            my $orgCode   = defined $taxId   ? $$par{"taxId2orgCode ".$taxId}    : undef;
            my $acOrgCode = defined $ac      ? $$par{"ac2orgCode ".$ac}          : undef;

            # IF    the AC returned by the mapping service belongs to the same organism,
            # THEN  save this AC as an equivalent (mapping) of the otherId
            if( defined $orgCode && defined $acOrgCode && $acOrgCode eq $orgCode ){
                ++${ $$otherId2acList{$otherId} }{ $ac };
            }
            # else: write warning message
            else{
                print "Warning, mapping query result line not saved for orgCode=".(defined $orgCode ? $orgCode : "")
                    .", otherId=".(defined $otherId ? $otherId : "").", line=$line\n";
            }
        }
        close $in;
    }
}

# -------------------------------------------------------------------------

# Executes a list of shell commands in groups and empties the list.
#
# Parameters:
#   shCmdList - reference to the list of shell commands; the executed commands
#               are removed, so the list is empty when the sub returns
#   par       - parameter hash reference; par{"shCmdGroupSize"} is the maximal
#               number of commands joined with "; " into one shell call
#
# A missing or smaller-than-1 group size is treated as 1; without this the
# loop would never remove a command and would never end.
# The output and the exit status of the commands are not checked.
sub execShellCmdList
{
    my ($shCmdList,$par) = @_;

    # the number of commands per shell call: at least 1
    my $groupSize = $$par{"shCmdGroupSize"};
    $groupSize = 1 unless defined $groupSize && $groupSize >= 1;
    $groupSize = int $groupSize;

    while(@$shCmdList){
        # How many of them should be executed now? Not more than groupSize of the remaining commands
        my $n = min( scalar @$shCmdList, $groupSize );
        # take them off the list and execute them in one shell call
        my $shCmd = join( "; ", splice( @$shCmdList, 0, $n ) );
        `$shCmd`;
    }
}

# -------------------------------------------------------------------------

# Creates all directories registered in par{dirList}.
#
# Parameters:
#   par - parameter hash reference
#         par{dirList}          hash ref whose keys are the directory paths
#         par{"shCmdGroupSize"} maximal number of directories per "mkdir -p" call
#
# The sorted paths are passed to "mkdir -p" in groups. mkdir is called without
# a shell, so a path containing spaces or shell metacharacters is created as
# it is. A missing or smaller-than-1 group size is treated as 1, otherwise the
# loop would never end. Progress messages are printed to the standard output.
sub write_all_data
{
    my ($par) = @_;
    my @dirList = sort keys %{$$par{dirList}};

    # the number of directories per mkdir call: at least 1
    my $groupSize = $$par{"shCmdGroupSize"};
    $groupSize = 1 unless defined $groupSize && $groupSize >= 1;
    $groupSize = int $groupSize;

    # the number of directories handled so far
    my $numDone = 0;
    {local$|=1;print "".&currentTime."\t"."-- starting directories: ".(scalar @dirList)."\n";}

    # make the requested directories in groups of at most <groupSize>
    while(@dirList){
        my @group = splice( @dirList, 0, $groupSize );
        system( "mkdir", "-p", @group ) == 0
            or warn "Warning, 'mkdir -p' failed for a group of ".(scalar @group)." directories\n";
        # count the directories of this group (the last group may be smaller)
        $numDone += scalar @group;
        if( ! ($numDone % 10000) ){ local $|=1; print &currentTime."\tdone: ".$numDone."\n"; }
    }
    {local$|=1;print "".&currentTime."\t"."-- done\n";}
}

# ------------------------------------------------

# Lists the UniProt accessions (ACs) that have a data directory below
# <outMainDir>/by_protein_ac/reviewed-and-unreviewed and writes them to a file.
#
# Parameters:
#   outMainDir - root of the output directory tree
#   outFile    - name of the output file (written by
#                write_itemList_fromHashKeys_withStats)
#   par        - parameter hash reference (not used here)
#
# An AC is recognized by its directory
#   .../<c1>/<c2>/<c3>/<c4>/<c5>/<c6>/original_name_by_database_source
# where c1..c6 are the six characters of the AC. The six one-character
# components must stand directly before "original_name_by_database_source",
# so one-character directory names inside outMainDir cannot be mistaken for
# a part of the AC.
sub read_write_ac_list
{
    my ($outMainDir,$outFile,$par) = @_;

    # the list of UniProt accessions (ACs)
    my %acList;

    # list the directory headers printed by "ls -R" for the name directories
    my $currentDir = $outMainDir."/by_protein_ac/reviewed-and-unreviewed";
    my $shCmd = "ls -R ".$currentDir."/*/*/*/*/*/*|grep original_name_by_database_source:";
    open my $in, "-|", $shCmd or die "Error, cannot list directories\n";
    while( my $line = <$in> ){
        if( $line =~ m{ / ([^\s/]) / ([^\s/]) / ([^\s/]) / ([^\s/]) / ([^\s/]) / ([^\s/]) / original_name_by_database_source: }x ){
            ++$acList{ $1.$2.$3.$4.$5.$6 };
        }
    }
    close $in;

    # write the list of ACs
    &write_itemList_fromHashKeys_withStats($outFile,\%acList);
}

# -------------------------------------------------

# Reads one name for each UniProt accession (AC) and stores it in the AC records.
#
# Parameters:
#   acRefList     - reference to a list of AC records (hash refs with the key
#                   "AC"); each record receives a one-element "nameList"
#   inFilePattern - glob pattern of the "//"-separated UniProt data files;
#                   files ending in .gz, .Z or .zip are read through "gzip -dc"
#   par           - parameter hash reference (not used here)
#
# The name of a record is taken from its first "GN   Name=...;" entry, or, if
# there is none, from the part of the ID at the start of the record that
# precedes "_". All ACs of the record get this name. An AC without a name gets
# the AC itself as its name and a warning is printed.
sub read_ac2name_saveMapping
{
    my ($acRefList,$inFilePattern,$par) = @_;

    # the mapping
    my %ac2name;

    # read the input files and for each UniProt accession (AC) save a name
    for my $inFile (glob($inFilePattern)){
        # open the file with an explicit mode and a lexical handle, so that
        # special characters in the file name are never interpreted by Perl or
        # by a shell (unzip through gzip, if necessary)
        my $in;
        if( $inFile =~ /\.(?:gz|Z|zip)$/ ){
            open $in, "-|", "gzip", "-dc", $inFile or die "Error: cannot open \'$inFile\' through gzip: $!\n";
        }
        else{
            open $in, "<", $inFile or die "Error: cannot open \'$inFile\': $!\n";
        }

        # read // - separated records
        local $/ = "//\n";
        while( my $record = <$in> ){
            # save accessions from the AC lines
            my @acList;
            for my $acLine (grep{/^AC/} split m/\n/, $record){ push @acList, ($acLine =~ /(\S+?);/g); }
            # a record without any AC (e.g., trailing text after the last
            # record) has nothing to be named
            next unless @acList;

            # 1st attempt: the gene name from the GN line, 2nd: from the ID line
            my $name;
            if(    $record =~ /\nGN\s+Name=(\S+?);/ ){ $name = $1; }
            elsif( $record =~ /^ID\s+(\S+?)_/       ){ $name = $1; }

            if( defined $name ){ for my $ac (@acList){ $ac2name{$ac} = $name; } }
            else{ print "Warning: no name for ".$acList[0]."\n"; }
        }
        close $in;
    }

    # loop through the references of all used UniProt accessions (ACs):
    # - IF   there is a name known for the current AC
    #   THEN save it
    #   ELSE save the AC itself as the name and write a warning
    for my $acRef (@$acRefList){
        my $ac = $$acRef{"AC"};
        if( defined $ac2name{$ac} ){
            @{$$acRef{"nameList"}} = ($ac2name{$ac});
        }
        else{
            @{$$acRef{"nameList"}} = ($ac);
            print "Warning: no name for this AC: ".$ac."\n";
        }
    }
}

# -------------------------------------------------

# Registers, for each UniProt accession (AC), the directories holding its
# gene names.
#
# Parameters:
#   outMainDir        - root of the output directory tree
#   acList            - reference to a list of AC records (hash refs) with the
#                       keys "AC", "isReviewed" and "nameList"
#   unreviewedAllowed - true: all ACs are used, below "reviewed-and-unreviewed";
#                       false: only reviewed ACs are used, below "reviewed-only"
#   par               - parameter hash reference; every path is counted in
#                       par{dirList}
#
# Registered paths (the directories themselves are not created here):
#   <outMainDir>/by_protein_ac/<reviewed-...>/<c1>/.../<cN>/gene_names_from_uniprot/<encoded name>
# where c1..cN are the characters of the AC and the encoded name is the list
# of the decimal character codes of the gene name joined by "_".
sub write_AC_2_name
{
    my ($outMainDir,$acList,$unreviewedAllowed,$par) = @_;

    # AC --> whether it is reviewed (for a repeated AC the last record decides)
    my %reviewedFlagOf = map { $_->{"AC"} => $_->{"isReviewed"} } @$acList;

    # the common root of all protein directories of this run
    my $rootDir = $outMainDir;
    my $reviewLevel = $unreviewedAllowed ? "reviewed-and-unreviewed" : "reviewed-only";
    for my $step ("by_protein_ac", $reviewLevel){
        $rootDir .= "/".$step;
        ++$par->{dirList}{$rootDir};
    }

    AC_RECORD:
    for my $record (@$acList){
        my $ac = $record->{"AC"};
        # unreviewed ACs are skipped, unless they are allowed
        next AC_RECORD unless $unreviewedAllowed || $reviewedFlagOf{$ac};

        # the directory of the protein: one level for each character of the AC
        my $protDir = $rootDir;
        for my $char (split m//, $ac){
            $protDir .= "/".$char;
            ++$par->{dirList}{$protDir};
        }

        # the directory containing the names of the protein
        my $nameDir = $protDir."/gene_names_from_uniprot";
        ++$par->{dirList}{$nameDir};

        # one subdirectory for each gene name (numerically encoded)
        for my $geneName (@{ $record->{"nameList"} }){
            my $encodedName = join( "_", map{ord} split m//, $geneName );
            ++$par->{dirList}{ $nameDir."/".$encodedName };
        }
    }
}

# -------------------------------------------------

# Writes the keys of a hash to a file, sorted, one key per line.
#
# Parameters:
#   outFile  - name of the output file (overwritten, if it exists)
#   itemList - hash ref; only its keys are used
#
# The file starts with two comment lines (the second one gives the number of
# items) and an empty line. Dies, if the file cannot be opened or if closing
# it reports a write error.
sub write_itemList_fromHashKeys_withStats
{
    my ($outFile,$itemList) = @_;
    my @items = sort keys %$itemList;
    my $nItem = scalar @items;

    # --- open outfile (explicit mode, so the file name is never read as a mode) ---
    open my $out, ">", $outFile or die "Error, cannot write to \'$outFile\'\n";

    # --- write header, then the sorted list of items ---
    print {$out} "# List of items, one item per line\n",
                 "# Number of items: $nItem\n",
                 "\n",
                 join("\n",@items)."\n";

    # buffered write errors show up only when the file is closed
    close $out or die "Error, cannot finish writing \'$outFile\': $!\n";
}

# ----------------------------------------------

# Prints node and link counts of the interaction data and node counts of the
# signaling pathway data, before or after the ID mapping.
#
# Parameters:
#   acList          - reference to the list of AC records (hash refs with the
#                     keys "AC", "isPrimary" and "isReviewed")
#   ppiList         - reference to the list of interactions (hash refs with
#                     "source", "taxonomy_id", "proteinA_id", "proteinB_id"
#                     and, after the mapping, "proteinA_acList", "proteinB_acList")
#   pwmList         - reference to the list of pathway memberships (hash refs
#                     with "source", "taxonomy_id", "protein_id" and, after
#                     the mapping, "protein_acList")
#   before_or_after - "before": count the original IDs;
#                     "after":  count the primary ACs (interactions) and the
#                               reviewed primary ACs (pathway memberships)
#   par             - parameter hash reference; par{"taxIdList"} lists the
#                     taxonomy IDs of the grouped type "string"
#
# Side effect in "after" mode: each processed interaction record gets
# "proteinA_primary_acList" / "proteinB_primary_acList" and each pathway
# membership record gets "protein_primary_swp_acList". The lists are set anew
# on every pass, so grouped types and repeated calls do not duplicate entries.
#
# Dies, if a record has to be processed and before_or_after is neither
# "before" nor "after".
sub write_stats_beforeAfter_idMapping
{
    my ($acList,$ppiList,$pwmList,$before_or_after,$par) = @_;

    # --- local variables ---
    # AC --> whether it is primary / reviewed (the keys of the AC records are "isPrimary", "isReviewed")
    my (%ac2isPrimary, %ac2isReviewed);
    for my $ac (@$acList){
        $ac2isPrimary{  $$ac{"AC"} } = $$ac{"isPrimary" };
        $ac2isReviewed{ $$ac{"AC"} } = $$ac{"isReviewed"};
    }

    # --- list interaction types ---
    # all detailed interaction types
    my %ppiTypeList; for my $ppi (@$ppiList){ ++$ppiTypeList{ $$ppi{"source"}." ".$$ppi{"taxonomy_id"} } }
    # grouped interaction types
    #   CCSB
    ++$ppiTypeList{ "ccsb 6239" };
    #   DroId
    ++$ppiTypeList{ "droid 7227" };
    #   STRING
    for my $taxId (@{$$par{"taxIdList"}}){ ++$ppiTypeList{ "string". " " . $taxId } }

    # --- interaction statistics ---
    # for each interaction type list the nodes and links and count both
    for my $ppiType (sort keys %ppiTypeList){
        # the list of interactions of this type and the list of nodes in them
        my (%ppiList_thisType, %nodeList);
        # interaction source and organism
        my ($source,$taxId) = split m/\s+/, $ppiType;
        # a prefix match on the source is used (instead of an equality) to allow for grouped interaction types;
        # the source is quoted, so it is matched as plain text
        for my $ppi (grep {$$_{"source"} =~ /^\Q$source\E/ && $taxId eq $$_{"taxonomy_id"}} @$ppiList){
            # before mapping the IDs the interactors are named proteinA_id, proteinB_id
            if( "before" eq $before_or_after ){
                for my $AorB (qw<A B>){
                    ++$nodeList{ $$ppi{"protein".$AorB."_id"} };
                }
                ++$ppiList_thisType{ join( " ", sort map{$$ppi{"protein".$_."_id"}} qw<A B> ) };
            }
            # after mapping the IDs the interactors have AC lists
            elsif( "after" eq $before_or_after ){
                for my $AorB (qw<A B>){
                    # the distinct primary ACs to which the name/id of the current protein has been mapped
                    my (@primary, %seen);
                    for my $ac (@{$$ppi{"protein".$AorB."_acList"}}){
                        if( defined $ac2isPrimary{$ac} && 1 == $ac2isPrimary{$ac} && !$seen{$ac}++ ){
                            push @primary, $ac;
                        }
                    }
                    # set (not extend) the list saved in the record
                    $$ppi{"protein".$AorB."_primary_acList"} = \@primary;
                    # IF    the protein has more than one primary AC,
                    # THEN  send a warning message
                    if( 1 < scalar @primary ){
                        local $| = 1; print "Warning, more than one primary AC for a protein: ".join(", ",@primary)."\n";
                    }
                }

                # --- save the list of proteins and the list of interactions ---
                my @primaryA = @{$$ppi{"proteinA_primary_acList"}};
                my @primaryB = @{$$ppi{"proteinB_primary_acList"}};
                # save all primary ACs of both proteins
                for my $psac (@primaryA, @primaryB){ ++$nodeList{$psac} }
                # a link is an unordered pair of primary ACs
                for my $psacA (@primaryA){
                    for my $psacB (@primaryB){
                        my @pair = sort { $a cmp $b } ($psacA, $psacB);
                        ++$ppiList_thisType{ join( " ", @pair ) };
                    }
                }
            }
            else{
                die "Error, no such option for 'before_or_after': ".$before_or_after."\n";
            }
        }
        local $| = 1;
        print $before_or_after."\tInteractions, source/organism: ".$ppiType."\tNodes: ".(scalar keys %nodeList)."\tLinks: ".(scalar keys %ppiList_thisType)."\n";
    }

    # --- signaling pathway membership statistics ---
    my %pwmTypeList; for my $pwm (@$pwmList){ ++$pwmTypeList{ $$pwm{"source"}." ".$$pwm{"taxonomy_id"} } }
    for my $pwmType (sort keys %pwmTypeList){
        # the list of proteins from this source of signaling pathways
        my %nodeList;
        # pathway source and organism
        my ($source,$taxId) = split m/\s+/, $pwmType;
        # an equality is used here (source eq source), because there are no grouped source types for pathways
        for my $pwm (grep {$$_{"source"} eq $source && $taxId eq $$_{"taxonomy_id"}} @$pwmList){
            # before mapping the IDs the protein is named protein_id
            if( "before" eq $before_or_after ){
                ++$nodeList{ $$pwm{"protein_id"} };
            }
            # after mapping the IDs each protein has an AC list
            elsif( "after" eq $before_or_after ){
                # the distinct Swiss-Prot (reviewed) primary ACs to which the name/id of this protein has been mapped
                my (@primarySwp, %seen);
                for my $ac (@{$$pwm{"protein_acList"}}){
                    if(    defined $ac2isPrimary{ $ac} && 1 == $ac2isPrimary{ $ac}
                        && defined $ac2isReviewed{$ac} && 1 == $ac2isReviewed{$ac}
                        && !$seen{$ac}++ )
                    {
                        push @primarySwp, $ac;
                    }
                }
                # set (not extend) the list saved in the record
                $$pwm{"protein_primary_swp_acList"} = \@primarySwp;
                # IF    the protein has more than one primary Swiss-Prot AC,
                # THEN  send a warning message
                if( 1 < scalar @primarySwp ){
                    local $| = 1;
                    print "Warning, more than one Swiss-Prot primary AC for a protein: ".join(", ",@primarySwp)."\n";
                }
                # --- save the list of proteins ---
                for my $psac (@primarySwp){ ++$nodeList{$psac} }
            }
            else{
                die "Error, no such option for 'before_or_after': ".$before_or_after."\n";
            }
        }
        local $| = 1;
        print $before_or_after."\tSignaling pathway source/organism: ".$pwmType."\tNodes: ".(scalar keys %nodeList)."\n";
    }
}

# ================== main ================

# Overview of the main program (the order of the steps matters):
#   1. read the UniProt accessions (ACs) and a name for each AC
#   2. read the interactions from all interaction databases
#   3. read the signaling pathway information and the pathway memberships
#   4. print statistics before the ID mapping
#   5. map the protein identifiers of each data source to UniProt ACs
#   6. print statistics after the ID mapping
#   7. register the output directories in $PAR{dirList}
#   8. make all registered directories (write_all_data)
# NOTE(review): init() and the upper-case variables ($INFILE_..., %PAR, ...) are
# defined outside this part of the file -- presumably init() sets them from the
# command line arguments; confirm there.
&init();

# ---------------------------- variables ------------------------------------

my @ppiList; # the list of PPIs (Protein-Protein Interactions)
my @spwInfo; # information about signaling pathways: name and short name of pathway
my @pwmList; # pathway membership of proteins (pwm: pathway membership)
my @acList;  # information about UniProt accessions (ACs)


# ---------- protein name conversions and protein lists for all data sources ------------------------------------------
# for each UniProt accession (AC): save its primary AC, save whether that AC is reviewed
# (this call also fills the "ac2orgCode <AC>" entries of %PAR, which otherId2acList reads)
&readAClist_saveAC2pAC_saveReviewedAC_saveAC2orgCode($INFILE_PATTERN_UNIPROT_DATA_AC_ID,\@acList,\%PAR);

# --------------- map all UniProt accessions (ACs) to gene/protein names -----------------
# (adds a "nameList" to each record of @acList; write_AC_2_name reads it)
&read_ac2name_saveMapping(\@acList,$INFILE_PATTERN_UNIPROT_AC_ID_DE_GN,\%PAR);


# --------------- read interactions with PubMed IDs (when available) --------------------

# --- BioGrid ---
# read BioGrid interactions
&read_BioGrid_interactions_pmids($INFILE_BIOGRID_ALL_INTERACTIONS,$SELECTED_TAXONOMY_ID_LIST,\@ppiList);

# --- CCSB ---
# read CCSB interactions
&read_CCSB_interactions($INFILE_CCSB_GENETIC_INTERACTIONS,"genetic",\@ppiList,\%PAR);
&read_CCSB_interactions($INFILE_CCSB_WI8_INTERACTIONS,"wi8",\@ppiList,\%PAR);

# --- DroId ---
# read DroId interactions
&read_DroId_interactions($INFILE_DROID_CURAGEN_INTERACTIONS,       "curagen",      \@ppiList,\%PAR);
&read_DroId_interactions($INFILE_DROID_FINLEY_INTERACTIONS,        "finley",       \@ppiList,\%PAR);
&read_DroId_interactions($INFILE_DROID_GENETIC_INTERACTIONS,       "genetic",      \@ppiList,\%PAR);
&read_DroId_interactions($INFILE_DROID_HYBRIGENICS_INTERACTIONS,   "hybrigenics",  \@ppiList,\%PAR);
&read_DroId_interactions($INFILE_DROID_OTHER_PHYSICAL_INTERACTIONS,"otherphysical",\@ppiList,\%PAR);
# for the DroId genetic interactions read the PubMed IDs from the reference pages of FlyBase
&read_DroId_genetic_PubMedIds_from_FlyBaseReferencePages($DIR_FLYBASE_REFERENCE_PAGES,\@ppiList,\%PAR);

# --- HPRD ---
# read HPRD interactions
&read_HPRD_interactions($INFILE_HPRD_INTERACTIONS,\@ppiList,\%PAR);

# --- STRING ---
# read STRING interactions: "db" and "exp"
&read_STRING_interactions_db_exp($INFILE_STRING,$IN_STRING_SCORE_THRESHOLD,$SELECTED_TAXONOMY_ID_LIST,\@ppiList);


# ------------------------- read info for signaling pathways and memberships in signaling pathways -------------------------

# --- KEGG ---
# read info for a selected group of KEGG's pathways (signaling pathways):
# pathway ID -> (i) pathway name, (ii) pathway short name
&read_KEGG_signalingPathwayInfo($INFILE_KEGG_SIGNALING_PATHWAY_INFO,\@spwInfo);
# read KEGG pathway memberships only for signaling pathways
&read_KEGG_signalingPathwayMemberships($INFILE_PATTERN_KEGG_GENE_MAP,\@spwInfo,\@pwmList,\%PAR);

# --- Reactome ---
# read info for a selected group of pathways (signaling pathways) of Reactome:
# pathway ID -> (i) pathway name, (ii) pathway short name
&read_Reactome_signalingPathwayInfo($INFILE_REACTOME_SIGNALING_PATHWAY_INFO,\@spwInfo);
# read Reactome pathway memberships only for signaling pathways
&read_Reactome_signalingPathwayMemberships($INFILE_REACTOME_PATHWAYS_STID,\@spwInfo,\@pwmList,\%PAR);

# --- SignaLink ---
# read info for the pathways of SignaLink (they are all signaling pathways):
# pathway ID -> (i) pathway name, (ii) pathway short name
&read_SignaLink_signalingPathwayInfo($INFILE_SIGNALINK_PATHWAY_INFO,\@spwInfo);
# read SignaLink pathway memberships from all SignaLink pathways
# (all SignaLink pathways are signaling pathways, so there's no need to select from them)
&read_SignaLink_signalingPathwayMemberships($INFILE_PATTERN_SIGNALINK_PATHWAY_MEMBERSHIPS,\@pwmList,\%PAR);

# --- test: write interaction numbers by interaction type, write protein numbers for the interactions and pathways ---
# ("before": the proteins are counted by their original identifiers)
&write_stats_beforeAfter_idMapping(\@acList,\@ppiList,\@pwmList,"before",\%PAR);


# -------------------- convert protein names to UniProt accessions (ACs) ----------------------------

# --- BioGrid ---
# read BioGrid ID -> UniProt AC mappings
&read_BioGridId_2_AC_and_EntrezGeneId($INFILE_BIOGRID_ID_MAPPINGS,\@ppiList);
# list unmapped BioGrid IDs and map them to UniProt accessions (ACs) via Entrez Gene IDs
&map_unmapped_BioGridIds_via_EntrezGeneId_2_allAC(\@ppiList,\%PAR);

# --- CCSB ---
# map protein identifiers to UniProt ACs
&map_CCSB_proteinIds2allAC(\@ppiList,\%PAR);

# --- DroId ---
# map protein identifiers to UniProt ACs
&map_DroId_proteinIds2allAC(\@ppiList,\%PAR);

# --- HPRD ---
# map protein identifiers to UniProt ACs
&map_HPRD_proteinIds2allAC(\@ppiList,\%PAR);

# --- STRING ---
# map protein identifiers to UniProt ACs
&map_STRING_proteinIds2allAC(\@ppiList,$SELECTED_TAXONOMY_ID_LIST,\%PAR);

# --- KEGG ---
# map protein identifiers to UniProt ACs
&map_KEGG_proteinIds2allAC(\@pwmList,$SELECTED_TAXONOMY_ID_LIST,\%PAR);
# map KEGG names of fly proteins
&map_KEGG_flyProteinsCG(\@pwmList,$INFILE_PATTERN_UNIPROT_INVERTEBRATES_AC_ID_CG_ORF,\%PAR);

# --- Reactome ---
# map protein identifiers to UniProt ACs
&map_Reactome_proteinIds2allAC(\@pwmList,$SELECTED_TAXONOMY_ID_LIST,\%PAR);

# --- SignaLink ---
# map protein identifiers to UniProt ACs
&map_SignaLink_proteinIds2allAC(\@pwmList,$SELECTED_TAXONOMY_ID_LIST,\%PAR);

# --- test: write interaction numbers by interaction type, write protein numbers for the interactions and pathways ---
# ("after": the proteins are counted by the UniProt ACs to which they have been mapped)
&write_stats_beforeAfter_idMapping(\@acList,\@ppiList,\@pwmList,"after",\%PAR);


# ------------------------------------------- write output data ---------------------------------------------------

# --- ppi, pwm ---
# . list unconverted protein names
&write_unconverted_protein_names($OUTFILE_UNCONVERTED_PROTEIN_NAMES,\@ppiList,\@pwmList);

# --- by_protein: write output data by proteins -- do all without and with unreviewed proteins ---
# 0: reviewed proteins only, 1: reviewed and unreviewed proteins
# (write_AC_2_name uses the subdirectories "reviewed-only" and "reviewed-and-unreviewed" for the two cases)
for my $unreviewedAllowed (qw<0 1>){

    # . save each signaling pathway source and below that each pathway ID (in which this protein is a member)
    &write_signalingPathwayMemberships($OUT_MAIN_DATA_DIR,\@pwmList,$unreviewedAllowed,\@acList,\%PAR);
    # . A) by protein and then by interaction database: interacting other ACs, and the PubMed ID for each (where available)
    # . B) by interaction database and then by protein
    &write_interactions($OUT_MAIN_DATA_DIR,\@ppiList,$unreviewedAllowed,\@acList,\%PAR);
    # . for each UniProt accession (AC) save its primary AC
    &write_AC_2_primaryAC($OUT_MAIN_DATA_DIR,\@ppiList,\@pwmList,$unreviewedAllowed,\@acList,\%PAR);
    # . write the original names of proteins
    &write_originalProteinNames($OUT_MAIN_DATA_DIR,\@ppiList,\@pwmList,$unreviewedAllowed,\@acList,\%PAR);
    # . write the gene/protein name for each Uniprot accession (AC)
    &write_AC_2_name($OUT_MAIN_DATA_DIR,\@acList,$unreviewedAllowed,\%PAR);
}

# --- by_pathway ---
# . pathway names, short names
&write_signalingPathwayInfo($OUT_MAIN_DATA_DIR,\@spwInfo,\%PAR);


# ====================== write all data (make subdirectories) ========================

# write_AC_2_name and write_signalingPathwayInfo only register directory paths in $PAR{dirList}
# (presumably the other write_... subs called above do the same);
# the directories are made here with "mkdir -p"
&write_all_data(\%PAR);