#!/usr/bin/perl

## Date and Version
my $version= '2.0';
my $version_date = '08_aug_2012';

##################################################
## Documentation POD
##################################################

##################################################
## Modules
##################################################
## Perl modules
use strict;
use warnings;
use diagnostics;

use Getopt::Long;
use File::Basename;
use Data::Dumper;
use File::Copy;

# BioPerl modules
use Bio::SeqIO;
use Bio::DB::Fasta;

##################################################
## Globals variables
##################################################

# Initializations
my $Start_time_hr = humanReadableDate();
my $help = undef();
my $DEBUG = 0;
my $Control_lvl = 1;
my %Parameters;

# Default values
my $Default_ORF_type = 'auto';
my $Default_Sequence_type = 'auto';

##################################################
## Manage options
##################################################

# Get options from command line
GetOptions (	'help|h'				=> \$help,
			'orf_file|orf=s'			=> \$Parameters{'ORF_file'},
			'orf_type|otype=s'			=> \$Parameters{'ORF_type'},
			'sequence_file|seq=s'		=> \$Parameters{'Sequence_file'},
			'sequence_type|stype=s'		=> \$Parameters{'Sequence_type'},
			'output|out=s'				=> \$Parameters{'Output_prefix'},
			'exec:s'					=> \$Parameters{'Execution_mode'}
		);

# Display help message if needed
if (defined($help)) {
	displayHelpMessage();
}

# Check parameters
checkParameters(\%Parameters);

# Generate additional output file name
generateOutputfileNames(\%Parameters);

# Backup old data
saveOldResults(\%Parameters);

##################################################
## Main code
##################################################

# Welcome message
print "###########################################################\n";
print "#         Welcome in ORF extractor (Version $version)          #\n"; 
print "###########################################################\n";
print "Start time: " . $Start_time_hr . "\n\n";

# Initializations
my $Ref_to_ORF_positions = undef();

# Verbose
print 'Selected ORF input file: ' . $Parameters{'ORF_file'} . "\n";
print 'Selected Sequence input file: ' . $Parameters{'Sequence_file'} . "\n\n";

print 'Indicated ORF input file format: ' . $Parameters{'ORF_type'} . "\n";
print 'Indicated sequence input file format: ' . $Parameters{'Sequence_type'} . "\n\n";

# Check the type/format of the ORF input file
if ($Parameters{'ORF_type'} eq 'auto') {
	$Parameters{'ORF_type'} = determineORFType($Parameters{'ORF_file'});
	
	if ($Parameters{'ORF_type'} eq 'Unknown') {
		print STDERR "Error: ORF extractor can only extract ORF positions from getORF/Prot4EST standard output file !\n";
		exit(1);
	}
}

# Check the type/format of the sequence input file
if ($Parameters{'Sequence_type'} eq 'auto') {
	$Parameters{'Sequence_type'} = determineSeqType($Parameters{'Sequence_file'});
	
	if ($Parameters{'Sequence_type'} eq 'Unknown') {
		print STDERR "Error: ORF extractor can only extract CDS from Fasta/PseudoFasta/ALR sequence file !\n";
		exit(1);
	}
}

# Recover all ORF positions
if ($Parameters{'ORF_type'} =~ /getORF/i) {
	$Ref_to_ORF_positions = getOrfPositionsFromGetORF($Parameters{'ORF_file'});
} else { # prot4EST
	$Ref_to_ORF_positions = getOrfPositionsFromProt4EST($Parameters{'ORF_file'});
}

# Extract the ORF part of each sequences of the Fasta/PseudoFasta/ALR input file and write into a new file
if ($Parameters{'Sequence_type'} =~ /Fasta|PseudoFasta/i) {
	get_CDS_and_UTR_from_Fasta($Ref_to_ORF_positions, \%Parameters);
} else { # ALR
	get_CDS_and_UTR_from_ALR($Ref_to_ORF_positions, \%Parameters);
}

print 'Output files:' . "\n\n";
print '  - 5\'UTR: ' . $Parameters{'Five_prime_UTR_file'} . "\n";
print '  - CDS: ' . $Parameters{'CDS_file'} . "\n";
print '  - 3\'UTR: ' . $Parameters{'Three_prime_UTR_file'} . "\n\n";

print "Stop time: " . humanReadableDate() . "\n";
print "###########################################################\n";
print "#                     End of execution                    #\n"; 
print "###########################################################\n";


##################################################
## Filename management related functions
##################################################

sub generateOutputfileNames {
	
	# Recovers parameters
	my $Ref_to_parameters = shift;
	
	# Some explanations: 
	# In local mode the names of the additionnal output files are derived from the name of the main input file
	# However, the current version of Galaxy Workflow Manager is not able to retrieve output files with non HARD CODED names (The value of the from_work_dir attribute is not interpreted !!)
	# Therefore it's impossible to use derived name in Galaxy mode
	
	if ($Ref_to_parameters->{'Execution_mode'} eq 'local') {
		# Names derived from the main input file name
		$Ref_to_parameters->{'Five_prime_UTR_file'} = $Ref_to_parameters->{'Output_prefix'} . '.utr5';
		$Ref_to_parameters->{'CDS_file'} = $Ref_to_parameters->{'Output_prefix'} . '.cds';
		$Ref_to_parameters->{'Three_prime_UTR_file'} = $Ref_to_parameters->{'Output_prefix'} . '.utr3';
	} else {
		# Hard coded names (Galaxy mode)
		$Ref_to_parameters->{'Five_prime_UTR_file'} = 'ORF_extractor_5_UTR';
		$Ref_to_parameters->{'CDS_file'} = 'ORF_extractor_CDS';
		$Ref_to_parameters->{'Three_prime_UTR_file'} = 'ORF_extractor_3_UTR';
	}
	
	return 0;
}


##################################################
## Backup related functions
##################################################

sub saveOldResults {
	
	# Recovers parameters
	my $Ref_to_parameters = shift;

	# Initializations
	my @list_of_global_file = ('Five_prime_UTR_file', 'CDS_file', 'Three_prime_UTR_file');
	
	# Rename an already existing global file / Special Contig file if needed
	foreach my $Global_file_name (@list_of_global_file) {
		rename($Ref_to_parameters->{$Global_file_name}, $Ref_to_parameters->{$Global_file_name} . '.old.' . $Start_time_hr) if (-e $Ref_to_parameters->{$Global_file_name});
	}
	
	return 0;
}


##################################################
## Divination related functions
##################################################

sub determineORFType {
	
	# Recovers parameters
	my $ORF_input_file = shift;
	
	# Initializations
	my $type = 'Unknown';
	
	print ' Determination of the real ORF input file format' . "\n";
	
	# Analyse the first line of the ORF input file to determine if it has been generated by getORF or Prot4EST
	# Prot4est example line: Contig1	145	348
	# getORF example line: >Contig1_1 [145 - 348]
	open (ORF_FILE_TEST, '<' . $ORF_input_file) or die ('Error: Cannot open/read file: ' . $ORF_input_file . "\n");

	while (my $Current_line = <ORF_FILE_TEST>){
		if ($Current_line =~ /^(\w+)\t(\d+)\t(\d+)$/) {
			$type = 'prot4EST';
			last;
		} elsif ($Current_line =~ /^>(.*)?\[/) {
			$type = 'getORF';
			last;
		}
	}

	close (ORF_FILE_TEST);
	
	print '  --> Detected file format: ' . $type . "\n\n";
	
	return $type;
}

## Some explanation about sequence file format detection ##

# Fasta/PseudoFasta example lines:
# >Contig1	Description1
# ATGCTATGCTATGCTATGCTATGCTATGCTATGCTATGCTATGCTATGCTATGCTATGCT

# ALR example lines:
# >Contig1
# maj	M/P	Ciona|GA02R Ciona|GA12A Ciona|GA12B Ciona|GA12C
# T	M	3	4	3	7 # Monomorphic position
# A	P	8[8/0/0/0]	7[2/5/0/0]	7[0/7/0/0]	10[8/2/0/0] # Polymorphic position

sub determineSeqType {
	
	# Recovers parameters
	my $Sequence_input_file = shift;
	
	# Initializations
	my $type = 'Unknown';
	my $Line_counter = 0;
	
	print ' Determination of the real sequence input file format' . "\n";
	
	# Analyse the first lines of the sequence input file to determine if it's a Fasta/PseudoFasta or ALR file
	open (SEQ_FILE_TEST, '<' . $Sequence_input_file) or die ('Error: Cannot open/read file: ' . $Sequence_input_file . "\n");

	while (my $Current_line = <SEQ_FILE_TEST>) {
		
		if ($Line_counter == 0 && $Current_line !~ /^>([\w\.]+)/) {
			# If the first line is not a standard comment line which begins by a ">" character then the analyzed sequence file is not valid
			last;
			
		} elsif ($Line_counter == 1 && $Current_line =~ /^[ACGTN]+$/) {
			$type = 'Fasta';
			last;
			
		} elsif ($Line_counter == 2 && $Current_line =~ /^[ACGTN]{1}\t[MP]{1}\t[\d\[\/\]]+/) {
			$type = 'ALR';
			last;
			
		} elsif ($Line_counter > 2) {
			last;
		}
			
		$Line_counter++;
	}

	close (SEQ_FILE_TEST);
	
	print '  --> Detected file format: ' . $type . "\n\n";
	
	return $type;
}

##################################################
## ORF related functions
##################################################

sub getOrfPositionsFromGetORF {
	
	# Recovers parameters
	my $ORF_input_file = shift;
	
	# Initializations
	my %Best_ORF_positions;
	
	print ' Search for the best/longest ORF of each contig (getORF mode)' . "\n";
	
	# Browse the ORF file and collect the start and stop position of the longest ORF for each contig
	my $SeqIO_input_object  = Bio::SeqIO->new ( '-file' => $ORF_input_file, '-format' => 'FASTA' );
	
	while (my $Current_ORF = $SeqIO_input_object->next_seq)  {

		# Get usefull informations about the current ORF
		my ($ORF_id, $ORF_length) =  ($Current_ORF->display_id(), $Current_ORF->length());
		my ($Contig_name) = ($ORF_id =~ /(.*?)(_\d+)*$/); # Keep the real contig name only (Example: Contig555_1 -> Contig555) - Works for getORF in All or Best mode
		my ($ORF_start, $ORF_stop) = ($Current_ORF->description =~ /\[(\d+)\s*-\s*(\d+)\]/);
		
		# If we are on the first ORF of a contig we store its information
		if (! defined($Best_ORF_positions{$Contig_name})) {
			$Best_ORF_positions{$Contig_name} = { 'ORF_ID' => $ORF_id, 'ORF_start' => $ORF_start, 'ORF_stop' => $ORF_stop, 'ORF_length' => $ORF_length };
		
		# Otherwise we check if the current ORF is the longer than the previous one
		# If that's the case we considered it as the new best ORF
		} else {
			if ($Best_ORF_positions{$Contig_name}->{'ORF_length'} < $ORF_length) {
				$Best_ORF_positions{$Contig_name} = { 'ORF_ID' => $ORF_id, 'ORF_start' => $ORF_start, 'ORF_stop' => $ORF_stop, 'ORF_length' => $ORF_length };
			}
		}
	}

	# Verbose and DEBUG
	if (scalar(keys %Best_ORF_positions) > 0) {
		print '  --> Research complete: ' . scalar(keys %Best_ORF_positions) . " ORF collected\n\n";
	}
	
	print Dumper(%Best_ORF_positions) if $DEBUG;
	return \%Best_ORF_positions;
}

sub getOrfPositionsFromProt4EST {
	
	# Recovers parameters
	my $ORF_input_file = shift;
	
	# Initializations
	my %Prot4EST_positions;
	
	print ' Search for the best/longest ORF of each contig (prot4EST mode)' . "\n";
	
	# Load all ORF positions into a hash table
	open (PROT4EST, '<' . $ORF_input_file) or die ('Error: Cannot open/read file: ' . $ORF_input_file . "\n");

	while (my $Current_line = <PROT4EST>){
		if ($Current_line !~ /^[#@>]/) {
			# Split the current line and save informations into the hash table
			my ($Name, $Start, $Stop) = split(/\t/, $Current_line);
			$Prot4EST_positions{$Name} = { 'ORF_start' => $Start, 'ORF_stop' => $Stop };
		}
	}
	
	close (PROT4EST);
	
	# Verbose and DEBUG
	if (scalar(keys %Prot4EST_positions) > 0) {
		print '  --> Research complete: ' . scalar(keys %Prot4EST_positions) . " ORF collected\n\n";
	}
	
	print Dumper(%Prot4EST_positions) if $DEBUG;
	
	return \%Prot4EST_positions;
}


##################################################
## Fasta/Pseudofasta related functions
##################################################

sub get_CDS_and_UTR_from_Fasta {
	
	# Recovers parameters
	my ($Ref_to_ORF_data, $Ref_to_parameters) = @_;
	
	# Initializations
	my $Sequence_file_basename = basename($Ref_to_parameters->{'Sequence_file'});
	my $Symlink = $Sequence_file_basename . '.sym';
	
	my $sort_rule = 'InAlphaNumericOrder';
	my ($Extracted_CDS, $Rejected_sequences, $Discarded_sequences) = (0, 0, 0);
	
	# Creation of a symlink to the Fasta file to force the creation of the index file in the current directory (instead of the Galaxy's dataset directory)
	symlink($Ref_to_parameters->{'Sequence_file'}, $Symlink);
	
	# Index the fasta input file for faster processing
	my ($fasta_db, $Ref_to_ID_list) = indexFastaFile($Symlink, $Sequence_file_basename);

	# Browse the Fasta/PseudoFasta input file, extract CDS and UTR regions and write them into the main output file
	print ' Extract the coding part and UTR regions of each sequence of the input Fasta/PseudoFasta file' . "\n";
	
	# Examinate one element of the ID_list to determine which sort rules is needed
	my $Tests_keys = $Ref_to_ID_list->[0];

	if ($Tests_keys =~ /Contig.*/) { $sort_rule = 'ByContigNumberAsc'; }
	elsif ($Tests_keys =~ /gi.*/) { $sort_rule = 'ByGenbankIdentifierAsc'; }

	# Extract and write CDS
	foreach my $Pfas_ID (sort $sort_rule @{$Ref_to_ID_list}) {
		# Split the sequence identifier to extract the contig_name
		my ($Contig_name, $Species_name, $Individual, $Allele) = split(/\|/, $Pfas_ID, 4);
		
		# Check if there is an ORF which correspond to the current contig name
		if (defined($Ref_to_ORF_data->{$Contig_name})) {
			# Access to the fasta sequence corresponding to the current ORF
			my $Seq_object = $fasta_db->get_Seq_by_id($Pfas_ID);
			
			# Recover ORF boundaries
			my ($begin, $end, $seq_length) = ($Ref_to_ORF_data->{$Contig_name}->{'ORF_start'}, $Ref_to_ORF_data->{$Contig_name}->{'ORF_stop'}, $Seq_object->length());
			
			if ($begin < $end && $seq_length < $end) {
				print '  -> Warning: ORF stop is outside of the ALR sequence (Forward) ! Trinity_ORF problem ! Contig "' . $Contig_name . '" will be discarded..' . "\n";
				$Discarded_sequences++;
				next;
				
			} elsif ($end < $begin && $seq_length < $begin) {
				print '  -> Warning: ORF start is outside of the ALR sequence (Reverse) ! Trinity_ORF problem ! Contig "' . $Contig_name . '" will be discarded..' . "\n";
				$Discarded_sequences++;
				next;
			}
			
			$Extracted_CDS++;
			
			# Extract and write CDS and UTR regions in different output files (ORF boundaries are collected in the ORF_data hash table)
			extractAndWriteWithBioPerl($Ref_to_parameters, $Seq_object, $begin, $end, $seq_length);
		
		} else { $Rejected_sequences++; }
	}

	print "\n" . '  --> Number of extracted CDS: ' .  $Extracted_CDS . "\n";
	print '  --> Number of rejected sequences: ' .  $Rejected_sequences . "\n";
	print '  --> Number of discarded sequences (ORF limits problem): ' .  $Discarded_sequences . "\n\n";
	
	# Remove index file and symlink if needed
	if (-e $Symlink . '.index') { unlink($Symlink . '.index') }
	if (-e $Symlink) { unlink($Symlink) }

	return 0;
}

sub indexFastaFile {
	
	# Recovers parameters
	my ($Fasta_file_to_index, $Alias) = @_;
	
	# Verbose
	print ' Fasta/PseudoFasta file indexation (' . $Alias .  ')' . "\n";
	
	# Indexation via BioPerl
	my $Fasta_database = Bio::DB::Fasta->new($Fasta_file_to_index);
	
	# Get and count sequence identifiers
	my @Elements = sort($Fasta_database->get_all_ids());
	print '  --> Number of indexed sequences: ' . scalar(@Elements) . "\n\n";
	
	return ($Fasta_database, \@Elements);
}

sub extractAndWriteWithBioPerl {
	
	# Recovers parameters
	my ($Ref_to_parameters, $Seq_in_obj, $ORF_begin, $ORF_end, $Sequence_length) = @_;
	
	# Initializations
	my $Sequence_ID = $Seq_in_obj->id();
	
	# Creation of an output stream for each type of extracted sequences
	my $Sequence_out_object_UTR5 = Bio::SeqIO->new(-format => 'fasta', -file => '>>' . $Ref_to_parameters->{'Five_prime_UTR_file'});
	my $Sequence_out_object_CDS = Bio::SeqIO->new(-format => 'fasta', -file => '>>' . $Ref_to_parameters->{'CDS_file'});
	my $Sequence_out_object_UTR3 = Bio::SeqIO->new(-format => 'fasta', -file => '>>' . $Ref_to_parameters->{'Three_prime_UTR_file'});
	
	# Initialize new sequence object for each type of desired subsequences
	my $UTR5 = Bio::Seq->new(-display_id => $Sequence_ID);
	my $CDS = Bio::Seq->new(-display_id => $Sequence_ID);
	my $UTR3 = Bio::Seq->new(-display_id => $Sequence_ID);
	
	# Get CDS and UTR regions (depending on the ORF direction) - Note: subseq start at 1 (not 0 like an array)
	if ($ORF_begin < $ORF_end){ # Forward ORFS
		$UTR5->seq($Seq_in_obj->subseq(1, $ORF_begin-1)) if ($ORF_begin > 1);
		$CDS->seq($Seq_in_obj->subseq($ORF_begin, $ORF_end));
		$UTR3->seq($Seq_in_obj->subseq($ORF_end+1, $Sequence_length)) if ($ORF_end < $Sequence_length);
		
	}else{ # Reverse ORFS
		if ($ORF_begin < $Sequence_length) {
			my $RevSeq_object_UTR5 = Bio::Seq->new(-id => $Sequence_ID, -seq => $Seq_in_obj->subseq($ORF_begin+1, $Sequence_length));
			$UTR5->seq($RevSeq_object_UTR5->revcom->seq());
		}
		
		my $RevSeq_object_CDS = Bio::Seq->new(-id => $Sequence_ID, -seq => $Seq_in_obj->subseq($ORF_end, $ORF_begin));
		$CDS->seq($RevSeq_object_CDS->revcom->seq());
		
		if ($ORF_end > 1) {
			my $RevSeq_object_UTR3 = Bio::Seq->new(-id => $Sequence_ID, -seq => $Seq_in_obj->subseq(1, $ORF_end-1));
			$UTR3->seq($RevSeq_object_UTR3->revcom->seq());
		}
	}

	# Write the new CDS sequence in fasta format in the defined output file
	$Sequence_out_object_CDS->write_seq($CDS);
	
	# Write UTR regions in the preselected output files
	$Sequence_out_object_UTR5->write_seq($UTR5) if ($UTR5->length() > 0);
	$Sequence_out_object_UTR3->write_seq($UTR3) if ($UTR3->length() > 0);
	
	return 0;
}


##################################################
## ALR related functions
##################################################

sub get_CDS_and_UTR_from_ALR {
	
	# Recovers parameters
	my ($Ref_to_ORF_data, $Ref_to_parameters) = @_;
	
	# Initializations
	my ($Extracted_CDS, $Rejected_sequences, $Discarded_sequences) = (0, 0, 0);
	my $sort_rule = 'InAlphaNumericOrder';
	
	# "Index" ALR file
	my $Ref_to_Contig_data = indexALRFile($Ref_to_parameters->{'Sequence_file'}, basename($Ref_to_parameters->{'Sequence_file'}));
	
	# Examinate one element of the Contig_data hash table to determine which sort rules is needed
	my $Tests_keys = (keys(%{$Ref_to_Contig_data}))[0];

	if ($Tests_keys =~ /Contig.*/) { $sort_rule = 'ByContigNumberAsc'; }
	elsif ($Tests_keys =~ /gi.*/) { $sort_rule = 'ByGenbankIdentifierAsc'; }
	
	# Browse the ALR input file, extract CDS and UTR regions and write them into the main output file
	print ' Extract the coding part and UTR regions of each sequence of the input ALR file' . "\n";
	
	foreach my $Contig_name (sort $sort_rule (keys %{$Ref_to_Contig_data})) { 
		
		if (defined($Ref_to_ORF_data->{$Contig_name})) {
			
			
			# Get the CDS sequence and UTR regions depending on the ORF strand (ORF boundaries are collected in the ORF_data hash table)
			my ($Ref_to_5_prime, $Ref_to_CDS, $Ref_to_3_prime) = getSubSequencesFromAlr($Ref_to_Contig_data, $Contig_name, $Ref_to_ORF_data->{$Contig_name}->{'ORF_start'}, $Ref_to_ORF_data->{$Contig_name}->{'ORF_stop'});
			
			# Get the number of element for each array ref
			my ($Five_length, $CDS_length, $Three_length) = (scalar(@{$Ref_to_5_prime}), scalar(@{$Ref_to_CDS}),scalar(@{$Ref_to_3_prime}));
			
			if ($Five_length == 0 && $CDS_length == 0 && $Three_length == 0) {
				$Discarded_sequences++;
			} else {
				$Extracted_CDS++;
				
				# Add collected subsequences to the appropriate output file
				writeSubSequence($Ref_to_5_prime, $Ref_to_parameters->{'Five_prime_UTR_file'}) if ($Five_length > 0);
				writeSubSequence($Ref_to_CDS, $Ref_to_parameters->{'CDS_file'}) if ($CDS_length > 0);
				writeSubSequence($Ref_to_3_prime, $Ref_to_parameters->{'Three_prime_UTR_file'}) if ($Three_length > 0);
			}
		} else { $Rejected_sequences++; }
	}
	
	print "\n" . '  --> Number of extracted CDS: ' .  $Extracted_CDS . "\n";
	print '  --> Number of rejected sequences: ' .  $Rejected_sequences . "\n";
	print '  --> Number of discarded sequences (ORF limits problem): ' .  $Discarded_sequences . "\n\n";

	return 0;
}

sub indexALRFile {
	
	# Recovers parameters
	my ($ALR_file_to_index, $Alias) = @_;
	
	# Initializations
	my $Contig_counter = 0;
	my ($Current_file_content, $Current_contig_name) = ('', '');
	my %Contig_data;
	
	# Verbose
	print ' ALR file indexation (' . $Alias .  ')' . "\n";
	
	# Read the ALR input file and store all contig's data into a hash table
	open (ALR, '<' . $ALR_file_to_index) or die ('Error: Cannot open/read file: ' . $ALR_file_to_index . "\n");

	while (my $Current_line = <ALR>){

		if ($Current_line =~ /^[^#|@]/) { # If the current line ins't a comment line we analyse it
			
			if ($Current_line =~ /^>([\w\.]+)/) {
				
				if ($Contig_counter > 0) {
					# Update hash
					$Contig_data{$Current_contig_name} = $Current_file_content;
					
					# Reinitializations
					$Current_file_content = '';
				}
				
				# Store the new contig name for the next processContig call
				$Current_contig_name = $1;
				
				# Increase the number of treated contigs
				$Contig_counter++;
			}
			
			$Current_file_content .= $Current_line;
		}
	}
	
	# Don't forget to store the last contig's data
	$Contig_data{$Current_contig_name} = $Current_file_content;
	
	close (ALR);
	
	print '  --> Number of indexed sequences: ' . scalar(keys %Contig_data) . "\n\n";
	
	return \%Contig_data;

	return 0;
}

sub getSubSequencesFromAlr {
	
	# Recovers parameters
	my ($Ref_to_Contig_data, $Contig_name, $ORF_start, $ORF_stop) = @_;
	
	# Initializations
	my (@Five_prime_UTR, @CDS_lines, @Three_prime_UTR);
	my ($Ref_to_reverse_5_prime, $Ref_to_reverse_CDS, $Ref_to_reverse_3_prime) = ([], [], []);
	
	# Get all data lines
	my @Alr_lines = split(/\n/, $Ref_to_Contig_data->{$Contig_name});
	
	# Extract header lines
	my @Header_lines = splice(@Alr_lines, 0, 2);
	
	# Get the number of position of the ALR sequence (i.e. The number of element of the ALR_lines array without the two header lines)
	my $ALR_length = scalar(@Alr_lines);

	# Get ORFs and UTR regions
	if ($ORF_start < $ORF_stop){ # Forward ORFS
		print $Header_lines[0] . ' (Forward) --> Start: ' . $ORF_start . ' - Stop: ' . $ORF_stop . ' - Nb ALR positions: ' . $ALR_length . "\n" if $DEBUG;
		
		# Jump over the current contig if there is a problem with Trinity ORF positions
		if ($ALR_length < $ORF_stop) {
			print '  -> Warning: ORF limit is outside of the ALR sequence (Forward) ! Trinity_ORF problem ! Contig "' . $Contig_name . '" will be discarded..' . "\n";
		} else {
			# Five prime UTR extraction - The 5'UTR region for the forward contig is at the BEGINNING of the sequence
			if ($ORF_start > 1) { # If ORF_start = 1 then the CDS begin at the first position of the contig
				print '   -> Has a 5\'UTR region ! (0..' . ($ORF_start-2) . ")\n" if $DEBUG;
				push(@Five_prime_UTR, @Header_lines);
				push(@Five_prime_UTR, @Alr_lines[0..($ORF_start-2)]);
			}
			
			# CDS extraction
			push(@CDS_lines, @Header_lines);
			push(@CDS_lines, @Alr_lines[($ORF_start-1)..($ORF_stop-1)]);
			
			# Three prime UTR extraction - The 3'UTR region for the forward contig is at the END of the sequence
			if ($ORF_stop <= $#Alr_lines) { # Because the array start at position 0, if a sequence has a size of 20, then the last element of the tab is tab[19]
				print '   -> Has a 3\'UTR region ! (' . $ORF_stop . '..' . $#Alr_lines . ")\n" if $DEBUG;
				push(@Three_prime_UTR, @Header_lines);
				push(@Three_prime_UTR, @Alr_lines[$ORF_stop..$#Alr_lines]);
			}
		}
		
		return (\@Five_prime_UTR, \@CDS_lines, \@Three_prime_UTR);
		
	} else { # Reverse ORFs   -   Important note: remember that in this case ORF_start > ORF_stop ! ORF_stop is actually the start of the ORF part on the original sequence
		print $Header_lines[0] . ' (Reverse) --> Start: ' . $ORF_start . ' - Stop: ' . $ORF_stop . ' - Nb ALR positions: ' . $ALR_length . "\n" if $DEBUG;
		
		if ($ALR_length < $ORF_start) {
			print '  -> Warning: ORF limit is outside of the ALR sequence (Reverse) ! Trinity_ORF problem ! Contig "' . $Contig_name . '" will be discarded..' . "\n";
		} else {
			# Five prime UTR extraction - The 5'UTR region for the reverse contig is at the END of the original sequence
			if ($ORF_start <= $#Alr_lines) {
				print '   -> Has a 5\'UTR region ! (' . $ORF_start . '..' . $#Alr_lines . ")\n" if $DEBUG;
				@Five_prime_UTR = @Alr_lines[$ORF_start..$#Alr_lines];
				$Ref_to_reverse_5_prime = reverseComplementAlr(\@Five_prime_UTR, \@Header_lines, $Contig_name);
			}
			
			# CDS extraction
			@CDS_lines = @Alr_lines[($ORF_stop-1)..($ORF_start-1)];
			$Ref_to_reverse_CDS = reverseComplementAlr(\@CDS_lines, \@Header_lines, $Contig_name);
			
			# Three prime UTR extraction - The real 3'UTR region for the reverse contig is at the BEGINNING of the original sequence
			if ($ORF_stop > 1) {
				print '   -> Has a 3\'UTR region ! (0..' . ($ORF_stop-2) . ")\n" if $DEBUG;
				@Three_prime_UTR = @Alr_lines[0..($ORF_stop-2)];
				$Ref_to_reverse_3_prime = reverseComplementAlr(\@Three_prime_UTR, \@Header_lines, $Contig_name);
			}
		}
		
		return ($Ref_to_reverse_5_prime, $Ref_to_reverse_CDS, $Ref_to_reverse_3_prime);
	}
}

sub reverseComplementAlr {
	
	# Recovers parameters
	my ($Ref_to_subsequence_array, $Ref_to_header_lines, $Contig_name) = @_;
	
	# Initializations
	my %Complement = (  'A' => 'T', 'T' => 'A','C' => 'G', 'G' => 'C',
						'N' => 'N', '-' => '-', 'X' => 'X', '.' => '.',
						'R' => 'Y', 'Y' => 'R', 'M' => 'K', 'K' => 'M', 'B' => 'V', 'V' => 'B', 'H' => 'D', 'D' => 'H',
						'S' => 'S', 'W' => 'W');
	
	my @Reverse_Alr = ();
	
	# Reverse complement the ALR sequence and counters
	foreach my $Line_to_complement (@{$Ref_to_subsequence_array}) {
		# Split the line to separate counters from the base and the type
		my ($Base, $Type, $Individual_data) = split(/\t/, $Line_to_complement, 3);
		print 'Base: ' . $Base . ' - Type: ' . $Type . ' - Idata: ' . $Individual_data . "\n" if($DEBUG);
		
		if ($Individual_data eq "") {
			print 'Warning: There is a problem with the format of a position in contig ' . $Contig_name . ' !!' . "\n";
			print '  -> Corresponding line: ' . $Line_to_complement . "\n\n";
		}
		
		# Complement line depending on its type
		if ($Type eq 'M') {
			# For Monomorphic position we just need to change the base
			$Line_to_complement = $Complement{$Base} . "\t" . $Type . "\t" . $Individual_data;
		} else {
			# For Polymorphic position we have to change the bases and change the internal order of every X[a/c/g/t] block
			# Example: A	P	18[10/0/8/0]	15[15/0/0/0]	27[20/0/3/4] --> becomes --> T	P	18[0/8/0/10]	15[0/0/0/15]	27[4/3/0/20]

			my @Individuals = split(/\t/, $Individual_data);

			foreach my $Counters (@Individuals) {
				if ($Counters =~ /(\d+)\[(\d+)\/(\d+)\/(\d+)\/(\d+)\]/) {
					$Counters = $1 . '[' . $5 . '/' . $4 . '/' .$3 . '/' .$2 . ']';
				}
			}
			$Line_to_complement = $Complement{$Base} . "\t" . $Type . "\t" . join("\t", @Individuals);
		}
	}
	
	# Reverse sequence
	@Reverse_Alr = reverse(@{$Ref_to_subsequence_array});
	
	# Add header lines at the beginning of the final array
	unshift(@Reverse_Alr, @{$Ref_to_header_lines});
	
	return \@Reverse_Alr;
}

sub writeSubSequence {
	
	# Recovers parameters
	my ($Ref_to_subsequence, $Selected_output_file) = @_;
	
	if (scalar(@{$Ref_to_subsequence}) <= 0) {
		print "I'm here even if there is no fucking entry damnit !!\n";
	}
	
	# Write subsequence
	open (OUT, '>>' . $Selected_output_file) or die ('Error: Cannot create/open file: ' . $Selected_output_file . "\n");
	
	if ($Control_lvl == 1) {
		foreach my $Lines_to_write (@{$Ref_to_subsequence}) {
			print OUT $Lines_to_write . "\n" if (defined($Lines_to_write));
		}
	} else {
		foreach my $Lines_to_write (@{$Ref_to_subsequence}) {
			print OUT $Lines_to_write . "\n";
		}
	}
	
	close(OUT);
	
	return 0;
}

##################################################
## Custom sort rules
##################################################

sub InAlphaNumericOrder {
	$a cmp $b;
}

# sort using explicit subroutine name
sub ByContigNumberAsc {
	($a =~ /Contig(\d+)/)[0] <=> ($b =~ /Contig(\d+)/)[0];
}

sub ByGenbankIdentifierAsc {
	($a =~ /gi.(\d+)/)[0] <=> ($b =~ /gi.(\d+)/)[0];
}


##################################################
## Basic functions
##################################################

sub displayHelpMessage {

	print('################################################' . "\n");
	print('#        ORF Extractor  -  Help section        #' . "\n"); 
	print('################################################' . "\n\n");

	print "Usage example (short names): ORF_extractor.pl -orf sample.orf -otype auto -seq sample.pfas -stype auto -out sample\n\n";
	
	print "Usage example (long names): ORF_extractor.pl -orf_file sample.orf -orf_type auto -sequence_file sample.pfas -sequence_type auto -output sample\n\n\n";
	
	print "Here is the list of authorized parameters :\n\n";
	
	print " -help => Display this help message.\n\n";
	
	print " -orf_file/-orf file => Path and name of the ORF input file in getORF/Prot4EST format (mandatory).\n\n";
	
	print " -orf_type/-otype string => Origin/Format of the ORF input file (optional). Possible values are [getORF|prot4EST|auto].\n";
	print "    If set to <auto> then ORF_extractor will try to find it out by himself.\n";
	print "\tIf not specified, the following default value will be used: " . $Default_ORF_type . "\n\n";
	
	print " -sequence_file/-seq file => Path and name of the sequence input file in Fasta/PseudoFasta/ALR format (mandatory).\n\n";
	
	print " -sequence_type/-stype string => Format of the sequence input file (optional). Possible values are [Fasta|PseudoFasta|ALR|auto].\n";
	print "    If set to <auto> then ORF_extractor will try to find it out by himself.\n";
	print "\tIf not specified, the following default value will be used: " . $Default_Sequence_type . "\n\n";
	
	print " -output/-out string => Prefix name for the three output files (5'UTR, CDS, 3'UTR) in Fasta/PseudoFasta or ALR format (optional).\n";
	print "\tIf not specified, a default prefix name (derived from the name of the sequence input file) will be used.\n\n";

	exit (1);
}

sub checkParameters {
	
	# Recovers parameters
	my $Ref_to_parameters = shift;
	
	# Execution mode
	if (!defined($Ref_to_parameters->{'Execution_mode'}) || $Ref_to_parameters->{'Execution_mode'} eq "") {
		$Ref_to_parameters->{'Execution_mode'} = 'local';
	}
	
	# Input files - Path and name
	if (!defined($Ref_to_parameters->{'ORF_file'})) {
		print 'Error: No ORF input file defined !' . "\n\n";
		displayHelpMessage();
	} else {
		checkInputFile($Ref_to_parameters->{'ORF_file'}, 'ORF');
	}
	
	if (!defined($Ref_to_parameters->{'Sequence_file'})) {
		print 'Error: No Fasta/PseudoFasta/ALR input file defined !' . "\n\n";
		displayHelpMessage();
	} else {
		checkInputFile($Ref_to_parameters->{'Sequence_file'}, 'Fasta/PseudoFasta/ALR');
	}
	
	# Input files - Type
	if (!defined($Ref_to_parameters->{'ORF_type'}))  {
		print 'Warning: The orf_type/otype parameter has been omited ! Default value ("' . $Default_ORF_type . '") will be used...' . "\n\n";
		$Ref_to_parameters->{'ORF_type'} = $Default_ORF_type;
		
	} elsif ($Ref_to_parameters->{'ORF_type'} !~ /^(getORF|prot4EST|auto)$/i) {
		print 'Warning: The value (' . $Ref_to_parameters->{'ORF_type'} . ') of the orf_type/otype parameter given to this module is not valid ! Default value ("' . $Default_ORF_type . '") will be used...' . "\n\n";
		$Ref_to_parameters->{'ORF_type'} = $Default_ORF_type;
	}
	
	if (!defined($Ref_to_parameters->{'Sequence_type'}))  {
		print 'Warning: The sequence_type/stype parameter has been omited ! Default value ("' . $Default_Sequence_type . '") will be used...' . "\n\n";
		$Ref_to_parameters->{'Sequence_type'} = $Default_Sequence_type;
		
	} elsif ($Ref_to_parameters->{'Sequence_type'} !~ /^(Fasta|PseudoFasta|ALR|auto)$/i) {
		print 'Warning: The value (' . $Ref_to_parameters->{'Sequence_type'} . ') of the sequence_type/stype parameter given to this module is not valid ! Default value ("' . $Default_Sequence_type . '") will be used...' . "\n\n";
		$Ref_to_parameters->{'Sequence_type'} = $Default_Sequence_type;
	}
	
	# Output prefix
	if (!defined($Ref_to_parameters->{'Output_prefix'}))  {
		print 'Warning: The prefix name for output files has been omited ! A default name (derived from the name of the Fasta/PseudoFasta/ALR input file) will be used !' . "\n\n";
		#~ $Ref_to_parameters->{'Output_prefix'} = basename($Ref_to_parameters->{'Sequence_file'});
		$Ref_to_parameters->{'Output_prefix'} = fileparse($Ref_to_parameters->{'Sequence_file'}, qr/\.[^.]*$/);
	}
}

sub checkInputFile {
	
	# Recovers parameters
	my ($file, $type) = @_;
	
	if (! -e $file || -z $file) {
		print 'Fatal error: The ' . $type . ' file specified in the command line does not exist or is empty ! Please check file name, path and content !!' . "\n";
		print 'Selected file: ' . basename($file) . "\n";
		print 'Corresponding directory: ' . dirname($file) . "\n\n";
		exit(1);
	}
	
	return 0;
}

sub humanReadableDate {
	
	# Recovers parameters
	my $time = shift || time;
	
	# Split time string
	my ($seconde, $minute, $heure, $jour, $mois, $annee, $jour_semaine, $jour_annee, $heure_hiver_ou_ete) = localtime($time);
	$mois  += 1;
	$annee += 1900;

	# On rajoute 0 si le chiffre est compris entre 1 et 9
	foreach ( $seconde, $minute, $heure, $jour, $mois, $annee ) { s/^(\d)$/0$1/; }
	
	return "$jour-$mois-$annee" . '_' . "$heure:$minute:$seconde";
}
