/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.3 (MAY 2004)
 *
 * Copyright (C) 2000-2004 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* eqrna_sample.c
 * 
 * Creates alignments according to the different models
 *
 * ER, Sat May 22 18:38:35 CDT 2004 [ ST. Louis at home with Coro ]
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <time.h>

#include "funcs.h"
#include "evolfuncs.h"
#include "globals.h"
#include "squid.h"
#include "structs.h"
#include "version.h"


#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

/* Command-line options accepted by qrna_sample. The table is handed to
 * Getopt() in main() together with NOPTIONS. Each entry gives the option
 * name, a TRUE/FALSE flag (TRUE for every one-letter option, FALSE for the
 * two long options) and the kind of argument the option takes.
 */
static struct opt_s OPTIONS[] = {
  { "-a",       TRUE,  sqdARG_NONE   },   /* print alignment                    */
  { "-C",       TRUE,  sqdARG_NONE   },   /* sample from the COD model          */
  { "-c",       TRUE,  sqdARG_STRING },   /* RNA grammar (cfg) file             */
  { "-D",       TRUE,  sqdARG_STRING },   /* codon-codon frequency file         */
  { "-H",       TRUE,  sqdARG_STRING },   /* hexamer frequency file             */
  { "-h",       TRUE,  sqdARG_NONE   },   /* print help and exit                */
  { "-i",       TRUE,  sqdARG_STRING },   /* tinit                              */
  { "-j",       TRUE,  sqdARG_STRING },   /* tinc                               */
  { "-l",       TRUE,  sqdARG_STRING },   /* expected alignment length          */
  { "-m",       TRUE,  sqdARG_STRING },   /* tmax                               */
  { "-n",       TRUE,  sqdARG_NONE   },   /* sample from the null model         */
  { "-o",       TRUE,  sqdARG_STRING },   /* output file                        */
  { "-O",       TRUE,  sqdARG_STRING },   /* target base-frequency file         */
  { "-p",       TRUE,  sqdARG_STRING },   /* PAM file                           */
  { "-q",       TRUE,  sqdARG_STRING },   /* number of alignments               */
  { "-R",       TRUE,  sqdARG_NONE   },   /* sample from the RNA model          */
  { "-t",       TRUE,  sqdARG_NONE   },   /* print traceback                    */
  { "-v",       TRUE,  sqdARG_NONE   },   /* verbose                            */
  { "--approx", FALSE, sqdARG_NONE   },   /* approximate riboprobs              */
  { "--oldrna", FALSE, sqdARG_NONE   },   /* 2.0.2 RNA pair probabilities       */
};

/* Number of entries in OPTIONS. */
#define NOPTIONS (sizeof(OPTIONS) / sizeof(OPTIONS[0]))

/* Help text listing every option accepted by the parser in main().
 * It is printed by -h and appended to the error raised when positional
 * arguments are given. Keep it in sync with the OPTIONS table: -D, --approx
 * and --oldrna are parsed by main() and therefore documented here as well.
 */
static char usage[]  =
  "Usage: qrna_sample [-options] \n"
  "where options are:\n"
  "   -a               : print alignment \n"
  "   -C               : create a COD-like alignment (default is OTH)\n"
  "   -c <cfgfile>     : <cfgfile> to use to train the rna model (default = tRNA+rRNA)\n"
  "   -D <codonfile>   : include a file of codon-codon joint frequencies for the coding model\n"
  "   -H <Hexfile>     : include a file of Hexamer frequencies for the coding model\n"
  "   -h               : print short help and usage info\n"
  "   -i <tinit>       : minimum evolutionary time factor (default tinit=0)\n"
  "   -j <tinc>        : increments taken in the divergence time (default tinc=0.01)\n"
  "   -l <expected_l>  : expected length of aligments generated with the models. default is 150 nts\n"
  "   -m <tmax>        : maximum evolutionary time factor (default tmax=12)\n"
  "   -n               : create a null-like alignment (default is OTH)\n"
  "   -o <outfile>     : direct structure-annotated sequence to <outfile>\n"
  "   -O <targetfreq>  : change the overall base composition of the 3 models \n"
  "   -p <pamfile>     : <pamfile> to use (default = BLOSUM62)\n"
  "   -q <num>         : number of alingments to be created (default = 1)\n"
  "   -R               : create a RNA-like alignment (default is OTH)\n"
  "   -t               : print traceback\n"
  "   -v               : verbose debugging output\n"
  "   --approx         : use an approximation to calculate the RNA riboprobs (default is RIBOSUM)\n"
  "   --oldrna         : use the 2.0.2 method to calculate the rna pair probabilities\n";

/* Extra help text printed right after the usage message by -h.
 * Currently it holds nothing but a newline.
 */
static char experts[] = "\n";

/* Program name, printed as the first line of the -h output. */
static char banner[] = "qrna_sample";

/* Function: main()
 *
 * Purpose:  Entry point of qrna_sample. Parses the command line, builds the
 *           models once with EvolConstructModels_phase1(), and then, for
 *           every time factor tfactor = tinit, tinit+tinc, ... while
 *           tfactor <= tmax, completes the models with
 *           EvolConstructModels_phase2() and samples num_align alignments
 *           from the selected model, printing each of them to the output
 *           stream (stdout, or the file given with -o).
 *
 *           The model sampled from is chosen with the precedence
 *           null (-n) > COD (-C) > RNA (-R) > OTH (default).
 *
 * Args:     argc, argv - the command line. Only options are accepted;
 *                        any positional argument is an error.
 *
 * Returns:  EXIT_SUCCESS. Errors are reported through Die().
 */
int
main(int argc, char **argv)
{
  enum { NAMELEN = 80 };          /* size of the alignment-name buffer                      */

  SQINFO   sqinfoX;               /* info structures for seqX                               */
  SQINFO   sqinfoY;               /* info structures for seqY                               */
  int     *isegX;                 /* sequence X integer form (with gaps)                    */
  int     *isegY;                 /* sequence Y integer form (with gaps)                    */
  int      leg;                   /* length of a given alignment                            */
  int      Lexp;                  /* expected length of alignments generated                */
  int      Lmax;                  /* length the sequence buffers are first allocated with   */
  int      Lmaxrealloc;           /* length after the Simulate*Align() calls grew them      */

  char    *codonfile;             /* codon-codon joint frequencies                          */
  char    *hexapfile;             /* Hexamer joint frequencies                              */
  char    *pamfile;               /* PAM substitution matrix                                */
  char    *cfgfile;               /* RNA grammar file                                       */
  char    *ribofile;              /* RNA pair-to-pair probs                                 */
  char    *targetfreqfile;        /* background probabilities                               */

  struct   emodel_s    *emodel;   /* transition + emission probs 3 models + null model      */
  struct   rnascfg_s   *mx;       /* for RNA dp (only referenced by disabled code)          */
  struct   emitcodon_s *emitcodon;

  double                  ***cfg_node;             /* CFG node frequencies extracted from cfgfile   */
  double                   **hexa;                 /* Hexamer frequencies extracted from hexapfile  */
  double                    *codon_joint;          /* codon-codon joint probabilities               */
  struct psubs_s            *pam_star;
  struct psubs_s            *codprob_star;
  struct psubs_s            *mutpxy_star;
  struct psubs_s            *mut5pxy_star;
  struct psubs_s            *riboprob_star;
  struct psubs_s            *riboprobapprox_star;
  struct pnonsubs_s         *pair5prob_star;
  struct psubs_s            *mutpxy_rna_unpaired_star;
  struct psubs_s            *mut5pxy_rna_unpaired_star;
  struct psubs_s            *mut5pxy_rna_loop_star;
  struct psubs_s            *mutpxy_rna_paired_star;
  struct psubs_s            *mut5pxy_rna_paired_star;
  struct psubs_s            *riboprob;             /* riboprob_star or riboprobapprox_star (--approx) */

  int      alignment;             /* TRUE prints alignment                                  */
  int      add_codon;             /* TRUE adds codon-codon frequencies for coding model     */
  int      add_hexamer;           /* TRUE adds hexamer frequencies for coding model         */
  int      approx;                /* TRUE use approximation to calculate the RNA riboprobs (default RIBOSUM) */
  int      changefreqoverall;     /* TRUE change the baseline base composition of 3 models  */
  double  *targetfreq;            /* the new frequencies, not in use yet                    */
  int      cod;                   /* TRUE create a COD-like  alignment                      */
  int      oldrna;                /* TRUE use 2.0.2 method to calculate the rna pair probabilities */
  int      null;                  /* TRUE create a null-like  alignment                     */
  int      rna;                   /* TRUE create a RNA-like  alignment                      */
  double   tfactor;               /* evolutionary time factor                               */
  double   tinc, tinit, tmax;     /* time increment, min and max times considered           */
  int      logodds;               /* TRUE to work in logodds mode                           */
  int      modelsareon;           /* TRUE for eqrna.c FALSE for eqrna_sample.c              */
  int      traceback;             /* TRUE to traceback alignment                            */
  int      pedantic;              /* TRUE do some checks for evolutionary models to debug   */
  int      verbose;               /* TRUE to be extremely verbose to debug                  */

  char   *outfile;                /* where to send the output                               */
  FILE   *targetfp;               /* open target frequencies file                           */
  FILE   *ofp;                    /* open output file                                       */
  char   *optname;
  char   *optarg;
  char   *string_name;            /* name given to each sampled alignment                   */
  int     num, num_align;         /* alignment counter, alignments per time point           */
  int     optind;
  int     seed;

  double  first_pos;
  double  second_pos;
  double  third_pos;

  int     pairs;
  int     comp_pairs;
  int     noncomp_pairs;
  int     x;

  struct three_times_s times;

  /* Seed the random number generator from the clock.
   */
  seed = (int) time ((time_t *) NULL);
  sre_srandom(seed);

  /* Defaults
   */
  tinc               =  0.01;      /* time increments (if tinit < tmax)                 */
  tinit              =  0.00;      /* default minimum evolutionary time                 */
  tmax               = 12.00;      /* default maximum evolutionary time                 */
  add_codon          = FALSE;      /* TRUE  ==  adds codon-codon frequencies for coding */
  add_hexamer        = FALSE;      /* TRUE  ==  adds hexamer frequencies for coding     */
  alignment          = FALSE;      /* TRUE  ==  prints alignment                        */
  approx             = FALSE;      /* TRUE  ==  approximate the RNA riboprobs           */
  changefreqoverall  = FALSE;      /* TRUE  ==  change the base comp of 3 models        */
  cod                = FALSE;      /* TRUE  ==  create a COD-like  alignment            */
  modelsareon        = FALSE;      /* TRUE for eqrna.c FALSE for eqrna_sample.c         */
  null               = FALSE;      /* TRUE  ==  create a null-like  alignment           */
  oldrna             = FALSE;      /* TRUE  ==  2.0.2 version of the RNA riboprobs      */
  rna                = FALSE;      /* TRUE  ==  create a RNA-like  alignment            */
  logodds            = FALSE;      /* FALSE ==  no logodds                              */
  traceback          = FALSE;      /* TRUE  ==  traceback alignment                     */
  pedantic           = FALSE;      /* TRUE  ==  check your evolutionary models          */
  verbose            = FALSE;      /* TRUE  ==  for debugging                           */
  Lexp               = 150;        /* expected length of alignments generated           */
  num_align          = 1;          /* alignments sampled per time point                 */

  cfgfile        = NULL;
  hexapfile      = NULL;
  ribofile       = NULL;
  codonfile      = NULL;
  pamfile        = "BLOSUM62";
  outfile        = NULL;
  targetfreqfile = NULL;

  /* Parse command line
   */
  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
                &optind, &optname, &optarg))
    {
      if      (strcmp(optname, "-a") == 0)  alignment         = TRUE;
      else if (strcmp(optname, "-C") == 0)  cod               = TRUE;
      else if (strcmp(optname, "-c") == 0)  cfgfile           = optarg;
      else if (strcmp(optname, "-D") == 0)  codonfile         = optarg;
      else if (strcmp(optname, "-H") == 0)  hexapfile         = optarg;
      else if (strcmp(optname, "-i") == 0)  tinit             = atof(optarg);
      else if (strcmp(optname, "-j") == 0)  tinc              = atof(optarg);
      else if (strcmp(optname, "-l") == 0)  Lexp              = atoi(optarg);
      else if (strcmp(optname, "-m") == 0)  tmax              = atof(optarg);
      else if (strcmp(optname, "-n") == 0)  null              = TRUE;
      else if (strcmp(optname, "-o") == 0)  outfile           = optarg;
      else if (strcmp(optname, "-O") == 0) {changefreqoverall = TRUE; targetfreqfile = optarg; }
      else if (strcmp(optname, "-p") == 0)  pamfile           = optarg;
      else if (strcmp(optname, "-q") == 0)  num_align         = atoi(optarg);
      else if (strcmp(optname, "-R") == 0)  rna               = TRUE;
      else if (strcmp(optname, "-t") == 0)  traceback         = TRUE;
      else if (strcmp(optname, "-v") == 0)  verbose           = TRUE;
      else if (strcmp(optname, "--approx") == 0) approx       = TRUE;
      else if (strcmp(optname, "--oldrna") == 0) oldrna       = TRUE;
      else if (strcmp(optname, "-h") == 0)
        {
          puts(banner);
          printf("          %s (%s)", RELEASE, RELEASEDATE);
          printf(" using squid %s (%s)\n", squid_version, squid_date);
          puts(usage);
          puts(experts);
          exit(0);
        }
    }

  if (argc - optind != 0)
    Die("Incorrect number of command line arguments.\n%s\n", usage);

  /* The time loop below advances by tinc until tfactor exceeds tmax; a zero
   * or negative increment (including an unparsable -j, which atof() turns
   * into 0) would never terminate. Written as !(x > 0) so that NaN is
   * rejected too.
   */
  if (!(tinc > 0.0))
    Die("The time increment (-j) has to be positive.\n%s\n", usage);

  Lmax        = 1;       /* this does not have to be very large, it will get reallocated as needed */
  Lmaxrealloc = Lmax;    /* updated by the Simulate*Align() calls when they reallocate             */

  if (cfgfile == NULL) /* SCFG is not provided */
    cfgfile = FileConcat("", "mix_tied_linux.cfg");

  if (ribofile == NULL) /* RIBOPROB is not provided */
    ribofile = FileConcat("", "RIBOPROB85-60.SEP04.mat");

  /* Load the target frequencies, or set them all to zero when -O is not given
   */
  if (changefreqoverall) {
    if ((targetfp = fopen(targetfreqfile, "r")) == NULL &&
        (targetfp = EnvFileOpen(targetfreqfile, "QRNADB")) == NULL)
      Die("Failed to open targetfreq file %s", targetfreqfile);

    if (! ParseTargetFreqFile(targetfp, &targetfreq))
      Die("Failed to parse targetfreq file");

    fclose(targetfp);
  }
  else {
    targetfreq = (double *) MallocOrDie (sizeof(double) * 4);
    for (x = 0; x < 4; x ++) targetfreq[x] = 0.0;
  }

  if (verbose) {
    printf("Target frequencies\n");
    PrintVectorProbs(stdout, targetfreq, 4);
  }

  /* NOTE(review): add_codon is never switched on, not even when -D supplies
   * a codon file (codonfile is still handed to EvolConstructModels_phase1()).
   * Confirm whether -D is meant to set add_codon the way -H sets add_hexamer.
   */
  add_hexamer = (hexapfile != NULL) ? TRUE : FALSE;

  /* Open output file
   */
  ofp = stdout;
  if (outfile != NULL && (ofp = fopen(outfile, "w")) == NULL)
    Die("Failed to open output file %s", outfile);

  if (cod) emitcodon = AllocEmitCodon ();
  if (FALSE)  mx = AllocScfgRNA(leg, TRUE, FALSE);   /* disabled */

  /* The name buffer is handed to SimulateCODAlign()/SimulateRNAAlign()
   * before this function writes to it, so start it out as an empty string
   * rather than whatever the allocator returned.
   */
  string_name = (char *) MallocOrDie(sizeof(char) * NAMELEN);
  string_name[0] = '\0';

  /*
   * eqrna_sample.c
   *
   *
   *                            |   function: EvolConstructModels_phase1()
   *                            |                                      changefreq = FALSE
   *           IF (changefreq)  |
   *                            |
   *                            |   function: EvolConstructModels_phase2()
   *                            |                                      changefreq = TRUE
   *
   *
   *                            |   function: EvolConstructModels_phase1()
   *                            |                                      changefreq = FALSE
   *           ELSE             |
   *      set:  targetfreq=0.0  |
   *                            |   function: EvolConstructModels_phase2()
   *                            |                                      changefreq = TRUE [change background freqs to the
   *                            |                                                         single nt marginalizations of P(c1,c2|t).
   *                            |                                                         This is set in analyze_2_sequences().]
   *
   */
  EvolConstructModels_phase1(ofp, codonfile, hexapfile, pamfile, cfgfile, ribofile,
                             targetfreq, &emodel,
                             &cfg_node, &hexa, &codon_joint,
                             &pam_star, &codprob_star,
                             &mutpxy_star, &mut5pxy_star, &riboprob_star, &riboprobapprox_star,
                             &pair5prob_star,
                             &mutpxy_rna_unpaired_star, &mut5pxy_rna_unpaired_star,
                             &mut5pxy_rna_loop_star,
                             &mutpxy_rna_paired_star, &mut5pxy_rna_paired_star,
                             add_codon, add_hexamer, approx, modelsareon, FALSE, changefreqoverall, logodds, pedantic, verbose);

  /* select the model we are generating from
   */
  if      (null) emodel->which_model->null = TRUE;
  else if (cod)  emodel->which_model->cod  = TRUE;
  else if (rna)  emodel->which_model->rna  = TRUE;
  else           emodel->which_model->oth  = TRUE;

  /* Which riboprobs to evolve; this does not depend on the time point.
   */
  riboprob = approx ? riboprobapprox_star : riboprob_star;

  for (tfactor = tinit; tfactor <= tmax; tfactor += tinc) {
    /* Construct the models
     *    at a given evolutionary time "tfactor"
     */
    times.oth = tfactor;
    times.cod = tfactor;
    times.rna = tfactor;

    /* Use the base-composition, length and time to construct the models
     */
    EvolConstructModels_phase2(ofp, Lexp, cfg_node, hexa, codon_joint, targetfreq, targetfreq,
                               emodel, times,
                               pam_star, codprob_star,
                               mutpxy_star, mut5pxy_star, riboprob,
                               pair5prob_star,
                               mutpxy_rna_unpaired_star, mut5pxy_rna_unpaired_star,
                               mut5pxy_rna_loop_star,
                               mutpxy_rna_paired_star, mut5pxy_rna_paired_star,
                               add_codon, add_hexamer, TRUE, changefreqoverall, logodds, oldrna, pedantic, verbose);

    if (cod) FillEmitCodon(emodel->cod, emodel->null, emitcodon);
    if (FALSE) InsideRNASS(ofp, leg, emodel->rna->pi2, mx);   /* disabled */

    /* Sample num_align alignments at this time point
     */
    for (num = 0; num < num_align; num++) {

      Lmaxrealloc = Lmax;

      /* Allocate space for the sequences
       */
      AllocIntSeqs(Lmax, &isegX, &isegY);

      if (null) {
        /* NOTE(review): the buffers were allocated for Lmax positions while
         * Lexp positions are requested here, and Lmaxrealloc is not updated
         * on this path; confirm against SimulateNullAlign()/AllocIntSeqs().
         */
        snprintf (string_name, NAMELEN, "%s", "null");
        SimulateNullAlign(isegX, isegY, Lexp, Lexp, emodel->null);
      }
      else if (cod) {
        SimulateCODAlign(ofp, &sqinfoX, &isegX, &sqinfoY, &isegY, Lmax, 0, &Lmaxrealloc, emodel->cod, emitcodon, emodel->null,
                         traceback, alignment, string_name, &first_pos, &second_pos, &third_pos);
        snprintf (string_name, NAMELEN, "%s_[%.2f/%.2f/%.2f]", "cod", first_pos, second_pos, third_pos);
      }
      else if (rna) {
        SimulateRNAAlign(ofp, &sqinfoX, &isegX, &sqinfoY, &isegY, Lmax, 0, &Lmaxrealloc, emodel->rna,
                         traceback, alignment, string_name,
                         &pairs, &comp_pairs, &noncomp_pairs);
        snprintf (string_name, NAMELEN, "%s_[%d/%d/%d]", "rna", pairs, comp_pairs, noncomp_pairs);
      }
      else {
        snprintf (string_name, NAMELEN, "%s", "oth");
        SimulateOTHAlign(ofp, &sqinfoX, &isegX, &sqinfoY, &isegY, Lmax, 0, &Lmaxrealloc, emodel->oth, traceback, alignment, string_name);
      }

      CleanUpSeqs(isegX, isegY, Lmaxrealloc-1, Lmaxrealloc-1, &leg);
      PrintAlignIntSeqs(ofp, &sqinfoX, &sqinfoY, string_name, tfactor, 0, leg, isegX, isegY);

      free(isegX);
      free(isegY);
    }
  }

  /* Cleanup
   */
  EvolFreeModels(emodel);

  /* fclose() flushes the buffered output, so a failure here means the
   * alignments did not make it to the file.
   */
  if (outfile != NULL && fclose(ofp) != 0)
    Die("Failed to close output file %s", outfile);

  free (string_name);
  if (FALSE) FreeScfgRNA(mx, FALSE);   /* disabled */
  if (cod) FreeEmitCodon(emitcodon);

  free(targetfreq);
  free(codon_joint);
  FreeSubsProbs(pam_star);
  FreeSubsProbs(codprob_star);
  FreeSubsProbs(mutpxy_star);
  FreeSubsProbs(mut5pxy_star);
  FreeSubsProbs(riboprob_star);
  if (approx) FreeSubsProbs(riboprobapprox_star);
  FreeNonSubsProbs(pair5prob_star);
  FreeSubsProbs(mutpxy_rna_unpaired_star);
  FreeSubsProbs(mut5pxy_rna_unpaired_star);
  FreeSubsProbs(mut5pxy_rna_loop_star);
  FreeSubsProbs(mutpxy_rna_paired_star);
  FreeSubsProbs(mut5pxy_rna_paired_star);
  FreeSCFGNode(cfg_node);
  if (add_hexamer) free(hexa);

  return EXIT_SUCCESS;
}





