#!/usr/bin/perl
###################################################################################################
# Recover the genes associated with traits of interest using the flank markers information available on Cattle QTL in the
# Animal QTLdb.
# You may distribute this script under the same terms as perl itself
# Francislon Silva <francislon at cpqrr dot fiocruz dot br>
# GGBC Fiocruz-MG - 22_10_2014
###################################################################################################
use strict;
use Getopt::Long;
###################################################################################################
# Declaring variables
###################################################################################################
use constant{
NAME => 'name',
POS_START => 'start',
POS_END => 'END',
QTL=>'qtl',
CATEGORY_QTL=>'cat_qtl',
INFO =>'info',
DBSNP_ID =>'dbsnpid',
FILE => 'file'
};
my $file_genes_list; # export genes that exists in QTL regions
my $file_dbsnp; # dbsnp database file
my $file_gff; # all genes annotated
my $file_qtl; # qtl database file
my $file_output; # output file with SNPs found in any gene described in a GFF file
my $file_output_no_rs; # output file with 'FlankersMarkers' (from QTL database file) not found in dbsnp file
my $file_output_gene_qtl; # output file with genes from any output exported in this script
my $file_output_leftovers; # output file with SNPs around 100 kb from any gene described in a GFF file
my $count_lines_gene_list = 0; # quantity of lines in $file_genes_list
my $count_lines_dbsnp = 0; # quantity of lines in $file_dbsnp
my $count_lines_gff = 0; # quantity of lines in $file_gff
my $count_lines_qtl = 0; # quantity of lines in $file_qtl
my %hash_count = (); # hash to manage the file reading progress
my %hash_qtl=(); # hash to store all QTL information from QTL file
my %hash_dbsnp=(); # hash to store all dbSNP information from dbSNP file
my %hash_snp_position_gene=(); # hash to store snp position and gene related
my %hash_leftovers=(); # hash to store position of SNPs around 100 kb from any gene described in a GFF file
my %hash_leftovers_genes=(); # hash to store information about genes with SNPs 100 kb around the gene
my %hash_leftovers_inserted=();# hash to control what genes can be exported in the output
my %hash_genes_list=(); # hash to store genes from $file_genes_list [only to facilitate the search]
my @array_genes_list=(); # array to store genes from $file_genes_list [to keep the order of the file]
my %hash_genes_in_files=(); # hash to store genes that will be exported if the $file_genes_list is provided
my $help; # flag to know if help is requested
###################################################################################################
# Receiving parameters
###################################################################################################
GetOptions('d=s' => \$file_dbsnp,
'g=s' => \$file_gff,
'q=s' => \$file_qtl,
'h=s' => \$file_genes_list,
'o=s' => \$file_output,
'help|?' => \$help
);
$| = 1; # forces a flush right away and after every write or print on the currently selected output channel
###################################################################################################
# Calling the main function
###################################################################################################
main();
###################################################################################################
# The main function
###################################################################################################
sub main{
validate_parameters(); # This function validates if the parameters are correct
pre_processing(); # This function counts the quantity of lines from each file
# if $file_genes_list is provided then we will read the file
if(defined $file_genes_list){
print "Reading $file_genes_list...\n";
$file_output_gene_qtl = $file_output.".gene_qtl";
read_genes_list();
}
print "Reading $file_qtl...\n";
read_qtl();
print "Reading $file_dbsnp...\n";
read_dbsnp();
print "Reading $file_gff...\n";
read_gff();
if(defined $file_genes_list){
export_gene_qtl();
}
print "\n\nFiles [$file_output], [$file_output".".flank_no_rs]";
if(defined $file_genes_list){
print ", [$file_output_gene_qtl] and [$file_output".".remaining] generated...\n\n"
}
print "Process finished!\n";
}
###################################################################################################
# This function counts the quantity of lines from each file
###################################################################################################
sub pre_processing{
if(defined $file_genes_list){
$count_lines_gene_list = `wc -l $file_genes_list`;
}
$count_lines_dbsnp = `wc -l $file_dbsnp`;
$count_lines_gff = `wc -l $file_gff`;
$count_lines_qtl = `wc -l $file_qtl`;
}
###################################################################################################
# This function reads a file with one gene per line
# If the gene list file is provided we will export all genes in that file who was exported
# in another exported file
###################################################################################################
sub read_genes_list{
my $count = 0;
my $wrap_line = 0;
%hash_count = ();
open(IN, $file_genes_list);
while(<IN>){
chomp;
my $line = $_;
unless(/^(\s|\t)*$/){
my @columns = split /\s+/;
$columns[0] =~ s/^>//g;
$columns[0] =~ s/\s//g;
$hash_genes_list{$columns[0]} = 1;
push(@array_genes_list, $columns[0]);
}
# the following lines are just to calculate and printing the percentage of progress
my $percent_lines = int(($count++ * 100)/$count_lines_gene_list);
unless(exists $hash_count{$percent_lines}){
print " | $percent_lines%";
$wrap_line++;
$hash_count{$percent_lines} = $percent_lines;
if( ($percent_lines > 0 and $wrap_line % 10 == 0) or $percent_lines==100 ){
print "\n";
}
}
}
unless(exists $hash_count{100}){
print " | 100%\n";
}
close(IN);
}
###################################################################################################
# This function reads a QTL file.
# We are looking for genes in QTL regions that we can verify if there are SNPs in those genes
###################################################################################################
sub read_qtl{
my $count = 0;
my $wrap_line = 0;
%hash_count = ();
open(IN, $file_qtl);
$file_output_no_rs = $file_output.".flank_no_rs";
# the file $file_output_no_rs will export all QTL regions which have at least one gene starting with no 'rs'
open(OUT, ">".$file_output_no_rs);
while(<IN>){
chomp;
my $line = $_;
unless(/^(\s|\t)*$/){
unless(/^#/){
my @columns = split /\t/;
my $chr = $columns[0];
$chr =~ s/^Chr\.(\w+)$/$1/g;
my $qtl = $columns[1];
my $category_qtl = $columns[2];
my $start = $columns[3];
my $end = $columns[4];
my $info_column = $columns[8];
my %hash = (POS_START=>$start, POS_END=>$end,
QTL=>$qtl,INFO=>$info_column);
$chr = lc($chr);
if($chr == 'x'){ # chromosome X will be called 30 to facilitate the comparison between files
$chr = 30;
}
my $flankers_markers;
if($info_column =~ /^.+;FlankMarkers\=([^;]+);.+$/){ # the gene information in each QTL region is located at the property 'FlankMarkers'
$flankers_markers = $1;
my @genes_flankers = split(/,/, $flankers_markers); # each 'FlankMarkers' can have more than one gene separated by comma
my $has_no_rs = 0;
for(@genes_flankers){
my $gene = $_;
if(/^rs.+$/){ # we are looking for genes with rs*, which means that the gene has SNPs in dbSNP
if(exists $hash_qtl{$chr}{$gene}{$category_qtl}){
my $array = $hash_qtl{$chr}{$gene}{$category_qtl};
push(@$array, \%hash);
}else{
my @array=(\%hash);
$hash_qtl{$chr}{$gene}{$category_qtl} = \@array;
}
}else{
if(exists $hash_genes_list{$gene}){
my %hash = (NAME => $gene, CATEGORY_QTL=>$category_qtl, DBSNP_ID=>'not found', FILE=>$file_output_no_rs);
if(exists $hash_genes_in_files{$gene}){
my $array = $hash_genes_in_files{$gene};
push(@$array, \%hash);
}else{
my @array=(\%hash);
$hash_genes_in_files{$gene} = \@array;
}
}
$has_no_rs = 1;
}# end if ^rs
}
if($has_no_rs){ # if this QTL has at least one gene name starting with no 'rs'
print OUT "$line\n";
}
}
}
}
# the next lines are just to calculate and printing the percentage of progress
my $percent_lines = int(($count++ * 100)/$count_lines_qtl);
unless(exists $hash_count{$percent_lines}){
print " | $percent_lines%";
$wrap_line++;
$hash_count{$percent_lines} = $percent_lines;
if( ($percent_lines > 0 and $wrap_line % 10 == 0) or $percent_lines==100 ){
print "\n";
}
}
}
unless(exists $hash_count{100}){
print " | 100%\n";
}
close(OUT);
close(IN);
}
###################################################################################################
# This function reads a GFF file.
# We will save all genes from this GFF in a hash to search if this genes have SNPs from dbSNP.
###################################################################################################
sub read_gff{
open(IN, $file_gff);
open(OUT, ">".$file_output);
print OUT sprintf("#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
"Chr", "'Gene Name'","'Gene Start'", "'Gene End'", "'dbSNP ID'", "'dbSNP pos'", "'QTL db'", "'QTL Category'", "'QTL Start'", "'QTL End'", "'QTL Info'");
my $count = 0;
my $wrap_line = 0;
%hash_count = ();
my %hash_order=();
while(my ($chr, $hash) = each(%hash_snp_position_gene)){
my @array_pos = sort{$a<=>$b} keys %$hash;
$hash_order{$chr}=\@array_pos;
}
my $last_end=-1;
my $last_start=-1;
my %hash_backup_pos=();
my %hash_backup_genes=();
my $c = 0;
while(<IN>){
chomp;
unless(/^(\s|\t)*$/){
unless(/^(#|GS)/){
my @columns = split /\t/;
$columns[2] = lc($columns[2]);
if($columns[2] eq "gene"){
my $last_col = $columns[-1];
my @array_last_col = split(/;/, $last_col);
if($array_last_col[0] =~ /^gene_id "GK(\d{6})\.\d:(\w+)".*$/){
my $chr = int($1);
my $gene = $2;
if(exists $hash_order{$chr}){
my $array_pos = $hash_order{$chr};
my $pos = 0;
if($columns[3] > $columns[4]){
my $aux = $columns[3];
$columns[3] = $columns[4];
$columns[4] = $aux;
}
my $start = $columns[3];
my $end = $columns[4];
if(!exists $hash_backup_pos{$chr}){
my @array = ();
$hash_backup_pos{$chr} = \@array;
}
my $array_backup_pos = $hash_backup_pos{$chr};
do{
if(scalar(@$array_pos)>0){
$pos = shift(@$array_pos);
if($pos >= $start and $pos <= $end){
print OUT sprintf("%s\t%s\t%d\t%d\t%s\t%d",
$chr, $gene,$start, $end, $hash_snp_position_gene{$chr}{$pos}, $pos);
my $hash_qtl_innter = $hash_qtl{$chr}{$hash_snp_position_gene{$chr}{$pos}};
while(my ($key, $value) = each(%$hash_qtl_innter)){
for(@$value){
print OUT sprintf("\t%s\t%s\t%d\t%d\t%s",
$_->{QTL}, $key, $_->{POS_START},
$_->{POS_END}, $_->{INFO});
}
}
print OUT "\n";
if(exists $hash_genes_list{$gene}){
my $category_qtl = "";
my $hash_qtl_innter =
$hash_qtl{$chr}{$hash_snp_position_gene{$chr}{$pos}};
while(my ($key, $value) = each(%$hash_qtl_innter)){
if($category_qtl eq ""){
$category_qtl = $key;
}else{
$category_qtl .= ",".$key;
}
}
my %hash = (NAME => $gene, CATEGORY_QTL=>$category_qtl,
DBSNP_ID=>$hash_snp_position_gene{$chr}{$pos}, FILE=>$file_output);
if(exists $hash_genes_in_files{$gene}){
my $array = $hash_genes_in_files{$gene};
push(@$array, \%hash);
}else{
my @array=(\%hash);
$hash_genes_in_files{$gene} = \@array;
}
}
}elsif($pos <= $end){
if(($pos >= $start-100000 and $pos <= $start) or
($pos <= $end+100000 and $pos >= $end) ){
if(not exists $hash_leftovers_inserted{$chr}{$pos}{$gene}){
if(not exists $hash_leftovers{$chr}{$pos}){
$hash_leftovers{$chr}{$pos} = 1;
}
my %hash_inner = (NAME=>$gene,
POS_START=>$start, POS_END=>$end);
if(exists $hash_leftovers_genes{$chr}{$pos}){
my $array_left =
$hash_leftovers_genes{$chr}{$pos};
push(@$array_left, \%hash_inner);
}else{
my @array_left = (\%hash_inner);
$hash_leftovers_genes{$chr}{$pos} =
\@array_left;
}
push(@$array_backup_pos, $pos);
$hash_leftovers_inserted{$chr}{$pos}{$gene} = 1;
}
}else{
push(@$array_backup_pos, $pos);
}
}else{
unshift(@$array_pos, $pos);
}
}
}while($pos <= $end and scalar(@$array_pos)>0);
$last_end = $end;
$last_start = $start;
if(not exists $hash_backup_genes{$chr}){
my @array = ();
$hash_backup_genes{$chr} = \@array;
}
my $array_genes_by_chr = $hash_backup_genes{$chr};
my %hash_gene = (NAME=>$gene, POS_START=>$start, POS_END=>$end);
push(@$array_genes_by_chr, \%hash_gene);
}
}
}
}
}
# the next lines are just to calculate and printing the percentage of progress
my $percent_lines = int(($count++ * 100)/$count_lines_gff);
unless(exists $hash_count{$percent_lines}){
print " | $percent_lines%";
$wrap_line++;
$hash_count{$percent_lines} = $percent_lines;
if( ($percent_lines > 0 and $wrap_line % 10 == 0) or $percent_lines==100 ){
print "\n";
}
}
}
unless(exists $hash_count{100}){
print " | 100%\n";
}
close(IN);
close(OUT);
while(my ($key, $value) = each(%hash_order)){
if(exists $hash_backup_pos{$key}){
my $array = $hash_backup_pos{$key};
push(@$array, @$value);
}else{
$hash_backup_pos{$key} = $value;
}
}
while(my ($chr, $value) = each(%hash_backup_pos)){
my @array_pos = sort {$a<=>$b} @$value;
my $array_genes = $hash_backup_genes{$chr};
for(@array_pos){
my $pos = $_;
my $quant_remove = 0;
my $pos_remove=0;
for(my $i = 0; $i < scalar @$array_genes; $i++){
my $hash_inner = $array_genes->[$i];
my $start = $hash_inner->{POS_START};
my $end = $hash_inner->{POS_END};
my $gene = $hash_inner->{NAME};
if($pos + 100000 < $start){
$quant_remove++;
$pos_remove = $i;
last;
}elsif( ($pos >= $start-100000 and $pos <= $start) or
($pos <= $end+100000 and $pos >= $end)){
if(not exists $hash_leftovers_inserted{$chr}{$pos}{$gene}){
if(not exists $hash_leftovers{$chr}{$pos}){
$hash_leftovers{$chr}{$pos} = 1;
}
if(exists $hash_leftovers_genes{$chr}{$pos}){
my $array = $hash_leftovers_genes{$chr}{$pos};
push(@$array, $hash_inner);
}else{
my @array = ($hash_inner);
$hash_leftovers_genes{$chr}{$pos} = \@array;
}
$hash_leftovers_inserted{$chr}{$pos}{$gene} = 1;
}
}
}
splice(@$array_genes,0,$quant_remove);
}
}
$file_output_leftovers = $file_output.".remaining";
open(OUT, ">".$file_output_leftovers);
print OUT sprintf("#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
"'dbSNP ID'", "Chr", "'Gene Name'","'Gene Start'", "'Gene End'", "'dbSNP pos'", "'QTL db'", "'QTL Category'", "'QTL Start'", "'QTL End'", "'QTL Info'");
my @array_chr = sort {$a<=>$b} keys %hash_leftovers;
for(@array_chr){
my $chr = $_;
my @array_pos = sort {$a<=>$b} keys %{$hash_leftovers{$chr}};
for(@array_pos){
my $pos = $_;
my @array_gene_inner = sort{$a->{POS_START}<=>$b->{POS_START}} @{$hash_leftovers_genes{$chr}{$pos}};
my @closest_markers=();
my $minimum = 100001;
for(@array_gene_inner){
my $hash_gene_inner = $_;
my $start = $hash_gene_inner->{POS_START};
my $end = $hash_gene_inner->{POS_END};
my $distance = 0;
if($pos <= $start ){
$distance = $start - $pos;
}else{
$distance = $pos - $end;
}
if($distance < $minimum){
$minimum = $distance;
@closest_markers=();
push(@closest_markers, $hash_gene_inner);
}elsif($distance == $minimum){
push(@closest_markers, $hash_gene_inner);
}
}
for(@closest_markers){
my $hash_gene_inner = $_;
my $gene = $hash_gene_inner->{NAME};
my $start = $hash_gene_inner->{POS_START};
my $end = $hash_gene_inner->{POS_END};
print OUT sprintf("%s\t%s\t%s\t%d\t%d\t%d",
$hash_snp_position_gene{$chr}{$pos}, $chr, $gene,$start, $end, $pos);
my $hash_qtl_innter = $hash_qtl{$chr}{$hash_snp_position_gene{$chr}{$pos}};
while(my ($key, $value) = each(%$hash_qtl_innter)){
for(@$value){
print OUT sprintf("\t%s\t%s\t%d\t%d\t%s", $_->{QTL}, $key, $_->{POS_START},
$_->{POS_END}, $_->{INFO});
}
}
print OUT "\n";
if(exists $hash_genes_list{$gene}){
my $category_qtl = "";
my $hash_qtl_innter = $hash_qtl{$chr}{$hash_snp_position_gene{$chr}{$pos}};
while(my ($key, $value) = each(%$hash_qtl_innter)){
if($category_qtl eq ""){
$category_qtl = $key;
}else{
$category_qtl .= ",".$key;
}
}
my %hash = (NAME => $gene, CATEGORY_QTL=>$category_qtl, DBSNP_ID=>$hash_snp_position_gene{$chr}{$pos}, FILE=>$file_output_leftovers);
if(exists $hash_genes_in_files{$gene}){
my $array = $hash_genes_in_files{$gene};
push(@$array, \%hash);
}else{
my @array=(\%hash);
$hash_genes_in_files{$gene} = \@array;
}
}
}
}
}
close(OUT);
}
#########################################################################################################
# This function export all genes provided by $file_genes_list if those genes were exported in another file.
#########################################################################################################
sub export_gene_qtl{
$file_output_gene_qtl = $file_output.".gene_list";
open(OUT, ">".$file_output_gene_qtl);
print OUT sprintf("#%s\t%s\t%s\t%s\n", "'Gene Name'", "'QTL Category'", "'DBSNP ID'", "'File associated'");
for(@array_genes_list){
my $gene = $_;
if(exists $hash_genes_in_files{$gene}){
my $array = $hash_genes_in_files{$gene};
for(@$array){
print OUT sprintf("%s\t%s\t%s\t%s\n", $_->{NAME}, $_->{CATEGORY_QTL}, $_->{DBSNP_ID}, $_->{FILE});
}
}
}
close(OUT);
}
#########################################################################################################
# This function reads a dbSNP file and store the information in a hash.
#########################################################################################################
sub read_dbsnp{
open(IN, $file_dbsnp);
my $count = 0;
my $wrap_line = 0;
%hash_count = ();
while(<IN>){
chomp;
unless(/^(\s|\t|#)*$/){
my @columns = split /\t/;
my $chr = $columns[0];
if(lc($chr) == 'x'){
$chr = 30;
}
my $pos = $columns[1];
my $gene = $columns[2];
if(exists $hash_qtl{$chr}{$gene}){
$hash_dbsnp{$chr}{$gene}=$pos;
$hash_snp_position_gene{$chr}{$pos} = $gene;
}
}
else{
$count_lines_dbsnp--;
}
# the next lines are just to calculate and printing the percentage of progress
my $percent_lines = int(($count++ * 100)/$count_lines_dbsnp);
unless(exists $hash_count{$percent_lines}){
print " | $percent_lines%";
$wrap_line++;
$hash_count{$percent_lines} = $percent_lines;
if( ($percent_lines > 0 and $wrap_line % 10 == 0) or $percent_lines==100 ){
print "\n";
}
}
}
unless(exists $hash_count{100}){
print " | 100%\n";
}
close(IN);
}
#########################################################################################################
# This function validates the parameters provided from user to the script
#########################################################################################################
sub validate_parameters{
my $allExists = 1;
my $fileExists = 1;
if ( defined $help ) {
print usage();
exit 0;
}
unless ( defined $file_dbsnp ) {
$allExists = 0;
}
unless ( defined $file_gff ) {
$allExists = 0;
}
unless ( defined $file_qtl ) {
$allExists = 0;
}
unless ( defined $file_output ) {
$allExists = 0;
}
if ( defined $file_genes_list ) {
unless ( -f $file_genes_list ) {
print STDERR "$file_genes_list doesn't exists.\n";
$fileExists = 0;
}
}
if ($allExists) {
unless ( -f $file_dbsnp ) {
print STDERR "$file_dbsnp doesn't exists.\n";
$fileExists = 0;
}
unless ( -f $file_gff ) {
print STDERR "$file_gff doesn't exists.\n";
$fileExists = 0;
}
unless ( -f $file_qtl ) {
print STDERR "$file_qtl doesn't exists.\n";
$fileExists = 0;
}
}
unless ($allExists) {
print usage();
exit 0;
}
unless ($fileExists) {
print STDERR "Program execution aborted.\n";
exit 0;
}
}
#########################################################################################################
# This function return the text explaining how to use the script
#########################################################################################################
sub usage{
my $usage = <FOO;
Usage:
perl $0 -g gff_file -q qtl_file -d dbsnp_file -o output_file [-h genes_list]
FOO
return $usage;
}