03_download_uniprot_suggestions.pl

This Perl script is used to query the order of completions via a UniProt search. During running, it verboses the search. If it was successful (there were results), the time the search took is echoed. Unsuccessful searches are filled out later in the next step.

This is the documentation of all downloads.

Move your mouse over the code area and look at its top right corner where a set of icons for downloading, copy-pasting, etc. will appear.


#!/usr/bin/perl
use strict;
use warnings;

use Time::HiRes qw(time);
use Tree::Prefix;

# =============== parameters ===============

our $limit=10;		# number of suggestions to show
my $min_char=2;
our $i=0;

my @species=qw/drome caeel human/;

# ======================= initialize =======================

# these are excluded from the search, because the query interface reserves them
my @stopwords=qw/and or not/;

our %stopwords;
map { $stopwords{$_}="" } @stopwords;

# UniProt IDs
my %species2org=(
	'human' => 9606,
	'caeel' => 6239,
	'drome' => 7227
);

my $tree=new Tree::Prefix;
my $pwd=$tree->pwd;

# ======================= read from file =======================

for my $species (@species) {
	#if ($species eq "drome") { next; }
	$tree->set_basedir($pwd."/".$species);

	print STDERR "loading species: $species\n";

	# download all possible queries from UniProt and save a limited list of suggestions
	download_tree_from_uniprot($tree, "", $species2org{$species});
}

# =============== recursively processes a tree, loading Uniprot word order into it ===============
sub download_tree_from_uniprot {
	my $tree=shift;
	my $prefix=shift;
	my $species=shift;

	# go through all the first-level letters in the tree
	#for my $letter (keys %$tree) {
	for my $letter (@{$tree->list_dir($prefix)}) {
		#if ($letter eq 'ls') { next; }

		my $entered=$prefix.$letter;
		if (!$tree->exists_file($entered) && length $entered >= $min_char && !exists $stopwords{$entered}) {
			# don't query again existing data

			print STDERR $entered;
			my $start_time=time();

			my $search="$entered*";
			# it is faster for short prefixes to omit the *-wildcard
			#if (length $entered <= 3) {
			if (length $entered == 2) {
				$search=$entered;
			}

			my $tab=my_get( "http://www.uniprot.org/uniprot/".
                                 "?query=organism:$species+AND+$search+reviewed%3Ayes&format=tab".
                                 "&columns=domains,id,entry%20name,genes,protein%20names&sort=score");

			if ($tab ne "") {
				my @tab=split/\n/, $tab;
				shift @tab;		# drop header line

				my @all_the_words=();		# in order to remove duplicates
				for my $line (@tab) {
					# for the words, we omit all kinds of parentheses, commas
					$line =~ s/[\(\),_\[\]\{\}]/ /g;

					my @words=split/[ \t]/, $line;

					# look for prefix in the selected line
					my $quoteentered=quotemeta $entered;
					@words=grep { /^$quoteentered/i } @words;

					# change "Case" to "case"
					@words=map { if (/^([A-Z])([-a-z]+)$/) { lc $_ } else { $_ } } @words;

					# omit words ending in non-letter-or-number
					@words=grep { /[a-zA-Z0-9]$/ } @words;

					@all_the_words=(@all_the_words, @words);
				}
				my $all_the_words=drop_duplicates(\@all_the_words);
				my @limited_words=@$all_the_words[0..min($limit-1, $#$all_the_words)];

				#$tree->insert_files($entered, $all_the_words);
				$tree->insert_files($entered, \@limited_words);

				my $end_time=time();
				my $duration=int(($end_time-$start_time)*100)/100;

				print STDERR " ($duration"."s)";
			} else {
				# for empty search results, save an empty results file
				# (in order not to repeat processing this prefix next time)

				my @a=();
				$tree->insert_files($entered, \@a);
			}

			print STDERR "\n";
		} else {
			if ($i++ > 1000) {
				print STDERR "$entered\n";
				$i=0;
			}
		}

		download_tree_from_uniprot($tree, $entered, $species);
	}
}

# =============== drop duplicates from a list, preserving the original order ===============
sub drop_duplicates {
	my $list=shift;
	my %elements;

	my @new_list;

	for my $element (@$list) {
		if (!exists $elements{lc $element}) {
			$elements{lc $element}="";
			push @new_list, $element;
		}
	}

	return \@new_list;
}

sub my_get {
    my $url=shift;
    my @args=("-qO-", $url);

    open WGET, "-|", "wget", @args;
    my @all=<WGET>;
    close WGET;

    return join "", @all;
}

# ========== smallest element of given parameters ==========
sub min {
	my $min=100000000000;
	for (@_) {
		if ($_<$min) { $min=$_ }
	}

	return $min;
}