#!/usr/bin/perl
##-----------------------------------------------------------------------------
## PROGRAM: art_profiler_454
## VERSION: 0.9.1
## AUTHOR: Weichun Huang (whduke@gmail.com)
## DATE: last updated on  08/09/2012
## DESCRIPTION:
##	the program generates empirical 454 read profiles for ART 454 simulator
##	from 454 read data files in the fastq format 
##-----------------------------------------------------------------------------

use strict;
my $version="0.9.1";
if(@ARGV!=2 and @ARGV!=3){
       	print "DESCRIPTION:\n\tart_profiler_454 is a program for generating empirical 454 read profiles\n";
	print "\tfor ART 454 simulator from 454 read data files\n";
       	print "\tVersion: $version, Copyright (c) Weichun Huang 2012 (whduke\@gamil.com)\n"; 
	print "USAGE:\n\t$0 input_fastq_files_dir output_profile_dir [fastq_filename_extension]\n";
       	print "Examples:\n";
       	print "\t$0 454_dat_dir 454_profile_dir\n";
       	print "\t$0 454_dat_dir 454_profile_dir fastq\n";
       	print "NOTES:\n";
       	print "\t1)the default filename extension for fastq files is fq\n";
       	print "\t2)the program can read gzipped fastq data files with filename extension fq.gz\n";
       	exit;
}
my ($inDIR, $outDIR, $ext)=@ARGV;
if(@ARGV==2) { $ext="fq"; }

my %read_length;
my $with_fidx=0;
my (%qdist, %qdiff,%qall);

#foreach my $inDIR(@dirlst){
    opendir(DIR, $inDIR) or die "cannot open dir $inDIR\n";
    my @allf=readdir(DIR);
    my @fs=grep { /\.$ext$/ } @allf;
    if(@fs==0){
      @fs=grep { /\.$ext\.gz$/ } @allf;
    }
    foreach my $ff(@fs){
       $ff=~/(\S+)\.$ext/;
       my $name=$1; 

       open(FQ, "zcat  $inDIR/$ff|") if ($ff=~/\.gz$/);# or die "can't open $inDIR/$ff\n";
       open(FQ, "$inDIR/$ff") if ($ff=~/\.$ext$/);# or die "can't open $inDIR/$ff\n";
       my $fidx=$ff; $fidx=~s/\.gz$//;$fidx=~s/\.$ext$/\.idx/;
       open(FIDX, "$inDIR/$fidx") or die "can't open $inDIR/$fidx\n" if($with_fidx);

       while (<FQ>) {
           chomp;
	   my ($id, $ss, $qq, $len);
	   if (/^\@(\S+)/) {
	       $id = $1;
	       $_= <FQ>;
	       $_=~s/\s+$//;
	       $len = length;
	       $read_length{$len}+=1;
	       $ss = $_;

	       $_ = <FQ>;
	       $_ = <FQ>;
	       $_=~s/\s+$//;
	       my $qlen = length;
	       die("The number of bases != the number of quals.") if($len!=$qlen);
	       my @base=split //,$ss;
               my @qv;
	       my $i=0;
	       while (/(\S)/g) {
	          push @qv,ord($1)-33;
	       }
	       my @idx;
	       if($with_fidx){
		       my $fid=<FIDX>; $fid=~/^>(\S+)/; $fid=$1;
		       die "$fid!=$id not matched " if($fid ne  $id); 
		       $fid=<FIDX>; $fid=~s/\s+$//g; @idx=split /\t/,$fid;
	       }
	       else{
		       my $N=$base[0];
		       my $c=1;
		       foreach my $n (@base){
			       if($N eq $n){
				       push @idx, $c; 
			       }
			       else{
				       $N = $n;
				       $c++;
				       push @idx, $c; 
			       }
		       }
	       }
	       my $i=0;
	       while($i<@idx){
		       my $k=$idx[$i];
		       my @q;
		       my $cc=0;
		       while (($i<@idx) and ($k==$idx[$i])){
			       $cc+=1;
			       $i++;
		       }
		       $qdist{$cc}{$qv[$i-$cc]}+=1;
		       $qall{$cc}{0}{$qv[$i-$cc]}+=1;
		       foreach my $x(1..$cc-1){
			       my $d=$i-$cc+$x;
			       my @aa=@qv[$d-1,$d];
			       $qdiff{$cc}{$x}{$aa[0]}{$aa[1]}+=1;
			       $qall{$cc}{$x}{$aa[1]}+=1;
		       }
	       }
           }
       }
    }
#}

#the qual distribution of first postion for different len 
open (QPROFILE, ">$outDIR/qual_1st_profile") or die "Can't open $outDIR/qual_1st_profile\n";
print QPROFILE <<QPROFFORMAT;
##454 read profile for ART 454 simulator
##homopolymer-length dependent quality score distribution of the 1st-base of homopolymer runs 
##FORMAT
##lenth_of_homopolymer	quality_score_1	quality_score_2 ... 
##lenth_of_homopolymer	freq_of_score_1	freq_of_score_2 ...
QPROFFORMAT
my $p=0;
foreach my $len (sort {$a<=>$b} keys %qdist){
	#ensure no break in length
       	last if($len!=$p+1); $p=$len;
       	print QPROFILE $len;
       	my $count="";
       	foreach my $q (sort {$a<=>$b} keys %{$qdist{$len}}){
	       	print QPROFILE "\t$q";
	       	$count.=sprintf "\t%d", $qdist{$len}{$q};
       	}
       	printf QPROFILE "\n%d%s\n",$len,$count;
}
print QPROFILE "*this is the end of the profile";
close(QPROFILE);

#1st order Markov model of qual distribution for different len 
open (QMC, ">$outDIR/qual_mc_profile") or die "Can't open $outDIR/qual_mc_profile\n";
print QMC <<QMCFORMAT;
##454 read profile for ART 454 simulator
##the empirical transition distribution of the homopolymer-length dependent 1st order Markov model of quality scores
##FORMAT
##homopolymer_length	previous_base_position	previous_base_quality_score	next_base_quality_score_1	next_base_quality_score_2	...
##homopolymer_length	previous_base_position	previous_base_quality_score	freq_next_base_qual_score_1	freq_next_base_qual_score_2	...
QMCFORMAT
foreach my $len (sort {$a<=>$b} keys %qdiff){
	#ensure no break in length
       	last if($len>$p); 
  foreach my $x (sort {$a<=>$b} keys %{$qdiff{$len}}){
    foreach my $q (sort {$a<=>$b} keys %{$qdiff{$len}{$x}}){
      print QMC "$len\t$x\t$q";
      my $count="";
        foreach my $q2 (sort {$a<=>$b} keys %{$qdiff{$len}{$x}{$q}}){
          print QMC "\t$q2";
          $count.=sprintf "\t%d", $qdiff{$len}{$x}{$q}{$q2};
        }
      printf QMC "\n%d\t%d\t%d%s\n",$len,$x,$q,$count;
    }
  }
}
print QMC "*this is the end of the profile";
close(QMC);

#empirical distribution of read length  
open (FLEN, ">$outDIR/length_dist") or die "Can't open file $outDIR/length_dist\n";
print FLEN <<FLENFORMAT;
##454 read profile for ART 454 simulator
##the empirical distribution of 454 read length 
##FORMAT
##leng_1	leng_2	leng_3	...
##freq_1	freq_2	freq_3	...
FLENFORMAT
my $cc;
my $i=0;
foreach my $L (sort {$a<=>$b} keys %read_length){
       	if($i==0){
	       	print FLEN $L;
	       	$cc=$read_length{$L};
	       	$i=1;
	       	next;
       	}
       	print FLEN "\t$L";
       	$cc.="\t".$read_length{$L};
}
print FLEN "\n$cc\n";
print FLEN "*this is the end of the profile";
close(FLEN);

