#! /usr/bin/perl

###############################################################################
#                                                                             #
#    Copyright © 2012-2013 -- IRB/INSERM                                      #
#                            (Institut de Recherche en Biothérapie /          #
#                             Institut National de la Santé et de la          #
#                             Recherche Médicale)                             #
#                                                                             #
#  Auteurs/Authors:  Jerôme AUDOUX <jerome.audoux@univ-montp2.fr>             #
#                    Alban MANCHERON  <alban.mancheron@lirmm.fr>              #
#                    Nicolas PHILIPPE <nicolas.philippe@inserm.fr>            #
#                                                                             #
#  -------------------------------------------------------------------------  #
#                                                                             #
#  Ce fichier fait partie de la suite CracTools qui contient plusieurs pipeline# 
#  intégrés permettant de traiter les évênements biologiques présents dans du #
#  RNA-Seq. Les CracTools travaillent à partir d'un fichier SAM de CRAC et d'un# 
#  fichier d'annotation au format GFF3.                                       #
#                                                                             #
#  Ce logiciel est régi  par la licence CeCILL  soumise au droit français et  #
#  respectant les principes  de diffusion des logiciels libres.  Vous pouvez  #
#  utiliser, modifier et/ou redistribuer ce programme sous les conditions de  #
#  la licence CeCILL  telle que diffusée par le CEA,  le CNRS et l'INRIA sur  #
#  le site "http://www.cecill.info".                                          #
#                                                                             #
#  En contrepartie de l'accessibilité au code source et des droits de copie,  #
#  de modification et de redistribution accordés par cette licence, il n'est  #
#  offert aux utilisateurs qu'une garantie limitée.  Pour les mêmes raisons,  #
#  seule une responsabilité  restreinte pèse  sur l'auteur du programme,  le  #
#  titulaire des droits patrimoniaux et les concédants successifs.            #
#                                                                             #
#  À  cet égard  l'attention de  l'utilisateur est  attirée sur  les risques  #
#  associés  au chargement,  à  l'utilisation,  à  la modification  et/ou au  #
#  développement  et à la reproduction du  logiciel par  l'utilisateur étant  #
#  donné  sa spécificité  de logiciel libre,  qui peut le rendre  complexe à  #
#  manipuler et qui le réserve donc à des développeurs et des professionnels  #
#  avertis  possédant  des  connaissances  informatiques  approfondies.  Les  #
#  utilisateurs  sont donc  invités  à  charger  et  tester  l'adéquation du  #
#  logiciel  à leurs besoins  dans des conditions  permettant  d'assurer  la  #
#  sécurité de leurs systêmes et ou de leurs données et,  plus généralement,  #
#  à l'utiliser et l'exploiter dans les mêmes conditions de sécurité.         #
#                                                                             #
#  Le fait  que vous puissiez accéder  à cet en-tête signifie  que vous avez  #
#  pris connaissance  de la licence CeCILL,  et que vous en avez accepté les  #
#  termes.                                                                    #
#                                                                             #
#  -------------------------------------------------------------------------  #
#                                                                             #
#  This file is part of the CracTools which provide several integrated        #
#  pipeline to analyze biological events present in RNA-Seq data. CracTools   #
#  work on a SAM file generated by CRAC and an annotation file in GFF3 format.#
#                                                                             #
#  This software is governed by the CeCILL license under French law and       #
#  abiding by the rules of distribution of free software. You can use,        #
#  modify and/ or redistribute the software under the terms of the CeCILL     #
#  license as circulated by CEA, CNRS and INRIA at the following URL          #
#  "http://www.cecill.info".                                                  #
#                                                                             #
#  As a counterpart to the access to the source code and rights to copy,      #
#  modify and redistribute granted by the license, users are provided only    #
#  with a limited warranty and the software's author, the holder of the       #
#  economic rights, and the successive licensors have only limited            #
#  liability.                                                                 #
#                                                                             #
#  In this respect, the user's attention is drawn to the risks associated     #
#  with loading, using, modifying and/or developing or reproducing the        #
#  software by the user in light of its specific status of free software,     #
#  that may mean that it is complicated to manipulate, and that also          #
#  therefore means that it is reserved for developers and experienced         #
#  professionals having in-depth computer knowledge. Users are therefore      #
#  encouraged to load and test the software's suitability as regards their    #
#  requirements in conditions enabling the security of their systems and/or   #
#  data to be ensured and, more generally, to use and operate it in the same  #
#  conditions as regards security.                                            #
#                                                                             #
#  The fact that you are presently reading this means that you have had       #
#  knowledge of the CeCILL license and that you accept its terms.             #
#                                                                             #
###############################################################################

=head1 NAME

digitagCT - Post-processing tool for DGE tags mapped using CRAC software.

=head1 SYNOPSIS

  digitagCT --gff file.gff dge.sam [--sage sage.csv] [--rna-seq rna-seq.sam] [--tar tar.bed(.gz)]

  dge.sam: a file with mapping information for each tag in SAM standard format. 

=head1 DESCRIPTION

=head1 AUTHOR

Nicolas PHILIPPE <nicolas.philippe@inserm.fr>, Alban MANCHERON <alban.mancheron@lirmm.fr>
AND Jerome AUDOUX <jerome.audoux@univ-montp2.fr>

=cut

use strict;
use warnings;

# Generic perl modules
use Getopt::Long;                       # Get args from command line
use Pod::Usage;
use File::Basename;

# CracTools-core modules
use CracTools::SAMReader;
use CracTools::Output;
use CracTools::Config;

# CracTools-digitag modules
use CracTools::DigitagCT::Structure;
use CracTools::DigitagCT::Analyzer::Annotation;
use CracTools::DigitagCT::Analyzer::RNASeq;
use CracTools::DigitagCT::Analyzer::SAGE;
use CracTools::DigitagCT::Analyzer::Tiling;

# Version of this script; it is passed to the output header writer.
my $version = '1.1';

# Snapshot of the raw command line, taken before option parsing,
# so that the original arguments can be reported in the output header.
my @ARGV_copy = @ARGV;

# Input / output files
my $gff_file;          # GFF3 annotation file (--gff, or ANNOTATION_GFF from the configuration)
my $est_file;          # GFF3 file with EST annotations (--est)
my $dge_sam;           # DGE mapping file in SAM format (first positional argument)
my $sage_genie_file;   # SAGE csv file (--sage)
my $sage_library;      # SAGE library (--sage-lib)
my $tag_occ_file;      # Tag occurrences file (--tag-occ)
my @rna_seq_sams;      # RNA-Seq SAM files (--rna-seq)
my $tar_file;          # Tiling array bed file (--tar)
my $summary_file;      # Annotation summary destination (--summary)
my $config_file;       # Configuration file (--conf)

# Flags
my $stranded;          # --stranded
my $man;               # --man
my $help;              # --help


###############################################################################
#                              LOADING ARGUMENTS                              #
###############################################################################

=head1 OPTIONS

  --gff=file.gff                  Specify GFF3 file to perform annotation with. (or declare ANNOTATION_GFF in conf file) 
  --est=file.gff                  Specify GFF3 file with EST annotations. These annotations will be used if we don't find any matching annotation in the --gff annotation file.
  --sage=sage.csv                 Specify a csv file exported by TranscriRef database with all sage/DGE tag features.
  --rna-seq 1.sam [... N.sam]     Specify a list of SAM files generated by CRAC software from RNA-Seq reads.
  --tar=tar.bed                   Specify a bed file exported by UCSC Genome Browser with all tiling arrays features. This file can be gzipped.
  --tag-occ=file.tsv              Specify a TSV file with occurrences for each tag (Tag_sequence\tNb_occ).
  --stranded                      This option is used to specify if rna-seq data are stranded.
  --summary=summary.txt           Specify a file to print the annotation summary.
  --conf=CracTools.cfg            Specify a specific configuration file.
  --help                          Print this message.
  --man                           Open man page.

=cut


# Option specification handed to Getopt::Long: each option spec is followed
# by the variable that receives its value.
my @option_spec = (
  "gff|g=s"        => \$gff_file,
  "est=s"          => \$est_file,
  "sage=s"         => \$sage_genie_file,
  "sage-lib=s"     => \$sage_library,
  "rna-seq=s{1,}"  => \@rna_seq_sams,
  "stranded"       => \$stranded,
  "tar=s"          => \$tar_file,
  "tag-occ=s"      => \$tag_occ_file,
  "summary=s"      => \$summary_file,
  "conf=s"         => \$config_file,
  "man"            => \$man,
  "help"           => \$help,
);

# Show the usage message when the command line cannot be parsed.
GetOptions(@option_spec) or pod2usage(-verbose => 1);

if ($help) {
  pod2usage(-verbose => 1);
}
if ($man) {
  pod2usage(-verbose => 2);
}

# The DGE SAM file is the first argument left once options have been parsed.
($dge_sam) = @ARGV;

# Check mandatory args
unless (defined $dge_sam) {
  print STDERR "Missing input : DGE mapping file in SAM format.\n";
  pod2usage(-verbose => 0);
}

# Loading args from the configuration file
CracTools::Config::LoadConfig($config_file);

# The command line takes precedence: the configuration variable is only
# consulted when no --gff option was given.
if (!defined $gff_file) {
  $gff_file = CracTools::Config::getConfVar('ANNOTATION_GFF');
}


###############################################################################
#                               LOADING DGE FILE                              #
###############################################################################
print STDERR "> Loading DGE tags into memory.\n";

# Create the DGE structure used to store tag information and occurrences
my $DGEStruct = CracTools::DigitagCT::Structure->new();

# Read the DGE SAM file and keep only the lines for which
# isClassified('unique') is true.
my $sam_reader = CracTools::SAMReader->new($dge_sam);
my $sam_dge_it = $sam_reader->iterator();
while (my $line = $sam_dge_it->()) {
  if($line->isClassified('unique')) {
    $DGEStruct->addTag($line);
  }
}

# Optional tag occurrences file: one "tag<TAB>occurrences" record per line,
# lines starting with '#' are comments.
if(defined $tag_occ_file) {
  # Three-argument open: the file name can never be interpreted as an open mode.
  open(my $fh, '<', $tag_occ_file) or die("Cannot open $tag_occ_file: $!");
  while(my $record = <$fh>) {
    next if $record =~ /^#/;
    chomp $record;
    # Skip empty lines instead of registering an undefined tag.
    next if $record eq '';
    my($tag,$occ) = split(/\t/,$record);
    $DGEStruct->addGenericElement($tag,'occ',$occ);
  }
  close($fh) or die("Cannot close $tag_occ_file: $!");
}

###############################################################################
#                             CREATING ANALYZERS                              #
###############################################################################
# Analyzers are stored in the order in which their columns are written
# to the output: annotation, RNA-Seq file(s), SAGE, tiling.
my @analyzers = ();

# The annotation analyzer is always created.
my $annot_analyzer = CracTools::DigitagCT::Analyzer::Annotation->new(
  digitag_struct => $DGEStruct,
  gff_file       => $gff_file,
  est_file       => $est_file,
);
push @analyzers, $annot_analyzer;

# One RNA-Seq analyzer per SAM file given with --rna-seq, named after the
# base name of the file. Nothing is created when the list is empty.
for my $rna_seq_sam (@rna_seq_sams) {
  my $analyzer_name = basename($rna_seq_sam);
  my $rna_analyzer  = CracTools::DigitagCT::Analyzer::RNASeq->new(
    digitag_struct => $DGEStruct,
    rna_seq_sam    => $rna_seq_sam,
    name           => $analyzer_name,
  );
  push @analyzers, $rna_analyzer;
}

# SAGE analyzer, only when a --sage file was given.
my $sage_analyzer;
if (defined $sage_genie_file) {
  $sage_analyzer = CracTools::DigitagCT::Analyzer::SAGE->new(
    digitag_struct  => $DGEStruct,
    sage_genie_file => $sage_genie_file,
    sage_library    => $sage_library,
  );
  push @analyzers, $sage_analyzer;
}

# Tiling analyzer, only when a --tar file was given.
my $tiling_analyzer;
if (defined $tar_file) {
  $tiling_analyzer = CracTools::DigitagCT::Analyzer::Tiling->new(
    digitag_struct => $DGEStruct,
    tar_file       => $tar_file,
  );
  push @analyzers, $tiling_analyzer;
}

################################################################################
#                                FINAL OUTPUT                                  #
################################################################################
#print "> Output results in $output_file.\n" unless ($quiet || !defined $output_file);

# Column headers: the tag and its number of occurrences in the DGE library,
# followed by the headers of every analyzer, in analyzer order.
my @headers = ("tag","occ_dge_lib");
foreach my $analyzer (@analyzers) {
  push @headers,$analyzer->getHeaders();
}

# Header of the output: command-line arguments and script version, then a
# header line without any field, then the column names.
my $output = CracTools::Output->new();
$output->printHeaders(ARGS => \@ARGV_copy, VERSION => $version);
$output->printHeaderLine;
$output->printHeaderLine(@headers);

# One output line per tag stored in the DGE structure. Each line starts with
# the tag and its number of occurrences, followed by the fields returned by
# every analyzer (same order as the headers).
my $tag_it = $DGEStruct->iterator();
while (my $tag = $tag_it->()) {
  my @fields = ($tag,$DGEStruct->nbOccurences($tag));
  foreach my $analyzer (@analyzers) {
    push @fields,$analyzer->getOutput($tag);
  }
  $output->printLine(@fields);
}

# Write the annotation summary to the file given with --summary, if any.
if(defined $summary_file) {
  # Three-argument open with a lexical handle: the file name cannot alter the
  # open mode and the handle does not leak into the package namespace.
  open(my $summary_fh, '>', $summary_file) or die("Cannot open $summary_file: $!");
  print {$summary_fh} $annot_analyzer->getAnnotationSummary;
  # Buffered write errors only show up when the handle is closed.
  close($summary_fh) or die("Cannot close $summary_file: $!");
}
