#!/usr/bin/perl

# extract_community_binding_data.pl - extract user-specified data, in a user-specified format, from the community binding data
# Written by Brett Trost
# University of Saskatchewan
# Last updated Aug. 8, 2006
# This script may be used and modified by anyone.

use strict;
use warnings;

# Get the data from the file.
open DATA, "<complete_binding_data.txt";
my @recs = <DATA>;
close DATA;

shift @recs; # First line is not a record.

# Set defaults for command-line options.
my $format           = "%o\t%a\t%l\t%c\t%s\t%i\t%I";
my $binders_only     = 0;
my $nonbinders_only  = 0;
my $threshold        = 500;
my $binder_name      = "binder";
my $nonbinder_name   = "nonbinder";
my $desired_length   = 0;
my $desired_allele   = "";
my $desired_organism = "";
my $desired_cv       = "";
my $output_file      = "";


my $help_msg = <<HELP;

In June 2006, B. Peters et al. published a set of over 48,000 peptides, each of which has had its
MHC class I binding affinity determined experimentally by one of two labs:

The lab of Alessandro Sette at the La Jolla Institute for Allergy and Immunology

and

The lab of Soren Buus at the University of Copenhagen

The citation for the paper is "Peters, B., Bui, H. H., Frankild, S., Nielsen, M., Lundegaard, C., et al. (2006)
"A community resource benchmarking predictions of peptide binding to MHC-I molecules", PLoS Comput Biol 2(6):e65.

The dataset can be downloaded from http://mhcbindingpredictions.immuneepitope.org/.  To use this program, you must download 
a copy of the dataset and place it in the same folder as this script.  The filename of the dataset must be "complete_binding_data.txt".

This set represents the largest set of MHC class I binding data produced thus far, and has the desirable property 
that the measurements are readily comparable because those tested in the same lab were determined using identical protocols.
Although each lab did use a different procedure, the authors state that the level of agreement between
the measurements produced in the two labs is very good.  

Gathering MHC-binding peptides from the literature or from online databases is problematic,
since different peptides may have been tested using different assays or different experimental conditions,
making it very difficult to compare IC50 values.  The authors suggest that these peptides be the data by which all
future MHC-binding prediction tools can be both trained and cross-validated.

This program is designed to allow the user to easily:
1) Extract data from the dataset that match specific criteria; for example, the user might want records
that are for specific allele(s), or for specific organism(s).

2) Arrange the data into a format conducive to the application for which the data are being used.  For example, 
a new MHC-binding prediction method might accept input formatted in a certain way, and this script makes it easy to
change the format from that used in the "complete_binding_data.txt" file to that accepted by a specific training algorithm.

The program accepts the following command-line options:

-b		
			Display binders only.  Binders are peptides that have IC50 less than or equal to the threshold.  The default threshold is 500 nM,
			but can be changed using the -t option.
		
-n		
			Display nonbinders only.  Nonbinders are peptides that have IC50 greater than the threshold.  The default threshold is 500 nM,
			but can be changed using the -t option.
		
-f	<format string>
		
			Specify the format to use in outputting the data.  The original datafile contains 7 fields, separated by tabs.  
			In the string you specify for formatting, each of these fields is identified by a % sign, followed by a letter.
			For each record, the appropriate values will be substituted in place of each %<letter>.
			
			The %<letter> corresponding to each attribute in a record are as follows:
		
			organism			%o
			allele				%a
			peptide length			%l
			cross-validation number		%c
			peptide sequence		%s
			inequality			%i
			IC50				%I
			
			In addition, you can also include the following in your format string.
			
			
			binder/nonbinder		%b
			
			%b will be replaced by "binder" if the peptide is a binder according to the specified threshold.  If 
			the -y (--binder-name) options is set, the specified string will be printed in lieu of "binder".
			Likewise, %b will be replaced by "nonbinder" if the peptide is a nonbinder according to the specified threshold.
			If the -z (--nonbinder-name) option is set, the specified string will be printed in lieu of "nonbinder".
			
			You do not have to use all of the fields in your format string.
			
			Use "\t" to represent tabs, and "\n" to represent new lines.
			
			Example:
			Consider this line in the "complete_binding_data.txt" file:
			
					human	HLA A*1101	9	3	AITTPQMTL	>	20000.0
			
			Then the format string "%s has IC50 %i %I" will produce the output
			
					AITTPQMTL has IC50 > 20000.0
			
			Likewise, the format string "%s is a %b to %a" will produce the output
			
					AITTPQMTL is a nonbinder to HLA A*1101
			
			This is assuming that the value of -t (--threshold) is smaller than 20000.0,
			and that -z (--nonbinder-name) is set to "nonbinder".
			
			The default format is "%o\t%a\t%l\t%c\t%s\t%i\t%I", which is exactly the same format
			as is present in the database file.  This is to facilitate the usage of the program
			to extract specific data (all binders, all human data, etc.) without changing the format of the data.
			
-t	<threshold>
		
			Set the threshold used to distinguish binders from nonbinders.  
			The default value is 500 nM.
		
-y	<binder name>
		
			Specify the string to be printed in place of %b when the peptide
			is a binder (IC50 is less than the threshold).  Default is "binder".
			
-z	<nonbinder name>
		
			Specify the string to be printed in place of %b when the peptide
			is a nonbinder (IC50 is greater than the threshold).  Default is "nonbinder".
			
-l	<lengths>
		
			Only print records whose peptides are of the specified length.  
			You may specify multiple lengths, separated by commas.
			
-a	<alleles>
		
			Only print records for the specified allele(s).  You may specify multiple alleles, separated by commas.
			Note that the argument to -a should be enclosed in quotes, both due to the spaces in the names
			of alleles, and due to the asterisk (*) in the allele names, which the shell will attempt to
			interpret as a wildcard.  Alternatively, you may escape these characters with a backslash.
			The following two lines are equivalent:
			
				extract_community_binding_data.pl -a "HLA A*0201"
				
			and
				
				extract_community_binding_data.pl -a HLA\ A\*0201
				
			But the following will NOT work:
			
				extract_community_binding_data.pl -a HLA A*0201
			
-o	<organisms>
		
			Only print data for organism <organism>.  You may specify multiple organisms, separated by commas.
			
-c	<cv numbers>
		
			Only print data for the specified cv numbers.  You may specify multiple cv numbers, separated by commas.
			Each cv number must be between 0 and 4, inclusive. 
			
-O	<output file>

			Write the output to the file <output file> instead of to the screen.
			
-h	
			Print this help message and exit.
	
	
--binders-only		Same as -b
			
--nonbinders-only	Same as -n

--format		Same as -f
			
--binder-name		Same as -y

--nonbinder-name	Same as -z

--lengths		Same as -l

--alleles		Same as -a

--organisms		Same as -o

--cvs			Same as -c

--output-file		Same as -O

--help			Same as -h


Examples:

The command

		extract_community_binding_data.pl -f "%s" -b -a "HLA A*0201" -l 9 -t 50

will print the sequences (only) of all 9-mers that bind to HLA A*0201 with an affinity
better than 50 nM.

Similarly, the command

		extract_community_binding_data.pl -a "HLA A*0201, H-2 Kk, Mamu B*01" -l "9, 10" -n -O binding_data.txt

will print all records for which the allele is HLA A*0201, H-2 Kk, or Mamu B*01, and where the length of the sequence is 9 or 10,
and where the peptide is a nonbinder (IC50 > 500 nM, because the default threshold of 500 nM has not been changed using the -t option). 
The output will be written to the file binding_data.txt, as specified by the -O option.

-----------------

Please send bugs, suggestions or comments to:

Brett Trost
brt381\@mail.usask.ca

Also, feel free to modify and/or distribute this script as you see fit.

HELP

	
# Get the command-line options
for (my $i = 0; $i <= $#ARGV; $i++) {

	if ($ARGV[$i] eq "-h" or $ARGV[$i] eq "--help") {
		print $help_msg;
		exit;
	}
	elsif ($ARGV[$i] eq "-b" or $ARGV[$i] eq "--binders-only") {
		$binders_only = 1;
	}
	
	elsif ($ARGV[$i] eq "-n" or $ARGV[$i] eq "--nonbinders-only") {
		$nonbinders_only = 1;
	}
	
	elsif ($ARGV[$i] eq "-f" or $ARGV[$i] eq "--format") {
		$format = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-t" or $ARGV[$i] eq "--threshold") {
		$threshold = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-y" or $ARGV[$i] eq "--binder-name") {
		$binder_name = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-z" or $ARGV[$i] eq "--nonbinder-name") {
		$nonbinder_name = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-l" or $ARGV[$i] eq "--lengths") {
		$desired_length = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-a" or $ARGV[$i] eq "--alleles") {
		$desired_allele = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-o" or $ARGV[$i] eq "--organisms") {
		$desired_organism = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-c" or $ARGV[$i] eq "--cvs") {
		$desired_cv = $ARGV[++$i];
	}
	
	elsif ($ARGV[$i] eq "-O" or $ARGV[$i] eq "--output-file") {
		$output_file = $ARGV[++$i];
	}
	
	else {
		print "Invalid command-line option \"$ARGV[$i]\" specified.  Please try again or use the -h option for help\n";
		exit;
	}
}


$format =~ s/\\t/\t/g; # Convert the user's tabs (the actual string "\t") to actual tabs
$format =~ s/\\n/\n/g; # Convert the user's new lines (the actual string "\n") to actual newlines

# Get rid of non-number characters in the threshold (for example, the user might input "50 nM", but the program only wants to see the "50".
$threshold =~ s/[^\d.]//g; 


######### Process the -l (--lengths) argument to put each length requested into an array #########

my @desired_lengths;

if ($desired_length =~ /,/) {
	@desired_lengths = split /\s*,\s*/, $desired_length;
} else {
	$desired_lengths[0] = $desired_length;
}

######### Process the -a (--alleles) argument to put each allele requested into an array #########

my @desired_alleles;

if ($desired_allele =~ /,/) {
	@desired_alleles = split /\s*,\s*/, $desired_allele;
} else {
	$desired_alleles[0] = $desired_allele;
}

######### Process the -o (--organisms) argument to put each organism requested into an array #########

my @desired_organisms;

if ($desired_organism =~ /,/) {
	@desired_organisms = split /\s*,\s*/, $desired_organism;
} else {
	$desired_organisms[0] = $desired_organism;
}

######### Process the -c (--cvs) argument to put each organism requested into an array #########

my @desired_cvs;

if ($desired_cv =~ /,/) {
	@desired_cvs = split /\s*,\s*/, $desired_cv;
} else {
	$desired_cvs[0] = $desired_cv;
}


### Set it up to write to the appropriate place (STDOUT, or to the specified file)
my $filehandle;

if ($output_file) { 				# If the user has specified an output file, then write to that file
	open OUTPUT_FILE, ">$output_file";
	$filehandle = \*OUTPUT_FILE;
} else { 					# Otherwise, just write to the terminal.
	$filehandle = \*STDOUT;
}


##################### Process the records #####################

foreach my $rec (@recs) {
	my ($organism, $allele, $length, $cv, $sequence, $inequality, $IC50) = split "\t", $rec;
	
	$IC50 =~ s/[^\d.]//g; # Get rid of strange characters (and newline)
	
	########### Determine if the current record matches the user's criteria ###########
	
	### Check if this peptide's length matches one of the user's requested lengths ###
	
	my $length_ok = 0;
	if ($desired_lengths[0] == 0) { # The user did not want to restrict the search by length (that is, they did not specify the -l option on the command line)
		$length_ok = 1;
	} else {
	
		for (my $i = 0; $i <= $#desired_lengths; $i++) {
			
			if ($desired_lengths[$i] == $length) {
				$length_ok = 1;
			}
		}
	}
	
	### Check if this record's allele matches one of the user's requested alleles ###
	
	my $allele_ok = 0;
	if ($desired_alleles[0] eq "") {  # The user did not want to restrict the search by allele (that is, they did not specify the -a option on the command line)
		$allele_ok = 1;
	} else {
	
		for (my $i = 0; $i <= $#desired_alleles; $i++) {
			
			if ($desired_alleles[$i] eq $allele) {
				$allele_ok = 1;
			}
		}
	}
	
	### Check if the organism for this record matches one of the user's requested organisms ###
	
	my $organism_ok = 0;
	if ($desired_organisms[0] eq "") { # The user did not want to restrict the search by organism (that is, they did not specify the -o option on the command line)
		$organism_ok = 1;
	} else {
	
		for (my $i = 0; $i <= $#desired_organisms; $i++) {
			
			if ($desired_organisms[$i] eq $organism) {
				$organism_ok = 1;
			}
		}
	}
	
	### Check if the cross-validation number for this record matches one of the user's requested cross-validation numbers ###
	
	my $cv_ok = 0;
	if ($desired_cvs[0] eq "") { # The user did not want to restrict the search by cv number (that is, they did not specify the -c option on the command line)
		$cv_ok = 1;
	} else {
	
		for (my $i = 0; $i <= $#desired_cvs; $i++) {
			
			if ($desired_cvs[$i] eq $cv) {
				$cv_ok = 1;
			}
		}
	}
	
	
	 # If any of the fields in this record didn't meet the specified criteria, go on to the next record.
	next if !$length_ok or !$allele_ok or !$organism_ok or !$cv_ok; 
	
	# Replace the data in the user's format string with the actual values
	my $output_string = $format;
	
	$output_string =~ s/%o/$organism/;
	$output_string =~ s/%a/$allele/;
	$output_string =~ s/%l/$length/;
	$output_string =~ s/%c/$cv/;
	$output_string =~ s/%s/$sequence/;
	$output_string =~ s/%i/$inequality/;
	$output_string =~ s/%I/$IC50/;
	
	
	if ($IC50 <= $threshold) {
		$output_string =~ s/%b/$binder_name/;
		next if $nonbinders_only;
	} else {
		$output_string =~ s/%b/$nonbinder_name/;
		next if $binders_only;
	}
	
	
	print $filehandle "$output_string\n";
}

close $filehandle;
	

