/*
   License:
     This code is based upon the program "blosum.c" extracted from blimps-3.5.
     It is therefore covered by the blimps license,(see BLIMPS_LICENSE) and
     large parts of the code are (C) Copyright 1993-2001,Fred Hutchinson Cancer
     Research Center. (See code comments for more details). 

     The license appears to be yet another home grown open 
     source license, somewhat similar in spirit to the LGPL.
     
     <ftp://ncbi.nlm.nih.gov/repository/blocks/unix/blimps/>


   2002 Gavin E. Crooks <gec@compbio.berkeley.edu>
*/

/*=======================================================================
   doublet_blosum 
    
     Compute singlet and doublet substitution matrixes from blocks database.
     usage info: doublet_blosum -h 


==========================================================================*/

#include <stdlib.h>
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <time.h>

#include <unistd.h>

#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>

#include "dirichlet/dirichlet_ml.h"


#define VERSION           "$Id: doublet_blosum.c,v 1.11 2005/07/28 22:48:37 gec Exp $"

#define AAS (20) 
#define AAS_EXT (AAS+3)             /* AAS plus 'B', 'Z', 'X'  */
#define MAX_L  (16+1)       
//#define MAX_L  (2+1)               /* Temp. Change. CHANGE BACK! */       
#define YES                1
#define NO                 0

#define MAXSEQS	  	  (60000)   /* Max number of sequences to be analyzed */
#define MAX_MERGE_WIDTH   (55)      /* Max. width of merged blocks */
#define MINSTR            (0)	    /* Min. strength to count */
#define MAXSTR            (9999)    /* Max. strength to count */
#define DEFAULT_CLUSTER   (45)      /* Default clustering percentage */
#define DEFAULT_MAT_SCALE (0) 

#define EPSILON            (0.000000001)
#define NEG_INF            (-100)
#define MAXLINE            (240)    /* Max line length for ASCII file */

#define round(x) ((x >= 0.0) ? (int) (x+0.5) : (int) (x-0.5))


const double nats_to_bits = 1.442695041;

const char aa_codes[AAS_EXT] = "ARNDCQEGHILKMFPSTWYVBZX"; 
const int N=2;
const int D=3;
const int Q=5;
const int E=6;
const int B=20;
const int Z=21;
const int X=22;


struct block {				/* Block structure */
  char ac[10];
  int nseq, width, strength, nclus;
  int aa[MAXSEQS][MAX_MERGE_WIDTH];	/* aas for seq */
  double weight[MAXSEQS];		/* seq weights found in block */
  int cluster[MAXSEQS];		        /* cluster # for seq */
  int ncluster[MAXSEQS];		/* #seqs in same cluster */
  double totdiag, totoffd, wtot;
  double resampling_weight;
} Block;


/*********************************************************
   FUNCTION DECLARATIONS
*********************************************************/

void die(char *comment, ...);
void help(void);

int read_dat(FILE *fin, const int resampling);
void fill_block(FILE *fin);
void count_block(void);
void count_weight(void);
void count_cluster(void);
void count_position(void);
void cluster_seqs(void);

void count_init(void);
void count_singlet_cluster(void);
void count_doublet_cluster(void);
void calc_singlet(FILE *fout);
void calc_doublet(FILE *fout);
void save_singlet(FILE *fout, int mat_scale, int clustering);
void save_doublet(FILE *fout, int mat_scale, int clustering);
void save_countdata(FILE *fout);

int aachar_to_num(const char one_letter_code);



unsigned long TotPairs, TotSeqs, TotAas, TotWidth;
double FTotPairs, FTotWeight;
unsigned long TotBlk;			/* # blocks contributing to matrix */
double TotClump;			/* # clumps contributing to matrix */
unsigned long TotSeg;			/* # seqs contributing to matrix */
int MinStr, MaxStr, cluster;

double count[AAS][AAS];
double count2[MAX_L][AAS][AAS][AAS][AAS];
double sp[AAS][AAS];
double singlet[AAS_EXT][AAS_EXT];
double doublet[MAX_L][AAS_EXT][AAS_EXT][AAS_EXT][AAS_EXT];

double singlet_entropy;
double singlet_expected;

double doublet_entropy[MAX_L];
double doublet_expected[MAX_L];

unsigned long total_singlet_pairs =0;
unsigned long total_doublet_pairs[MAX_L];



void help( void ) {
  printf("Double Blosum\n");
  printf("  Extract singlet and doublet substitution matrixes from a blocks database.\n\n");
  printf("  Usage:   doublet_blosum < blocks_database >out\n");
  printf("  Options:\n");
  printf("  -h           Help: print this help message\n");
  printf("  -i filename  Read this blocks database (default: stdin)\n"); 
  printf("  -o filename  Output file. (default: stdout)\n"); 
  printf("  -m int       Minimum block strength (default: 0)\n");
  printf("  -M int       Maximum block strength (default: 9999)\n");
  printf("  -c int       Re-clustering percent identity. (Default: 45%%) \n");
  printf("  -s int       Matrix scale in 1/n bits.(Default: scale determined \n");
  printf("                   from singlet matrix relative entropy)\n");
  printf("  -r           Resample. Randomly reweight all blocks.\n");
  printf("  -d filename  Save raw count data \n");

  exit ( 0 );
}


int main(int argc, char * argv[]) {
  FILE * fin = stdin;
  FILE * fout = stdout;
  FILE * fcountdata = NULL;
  char * datfile = NULL;
  int resampling = NO;
  int totblk;
  int c;

  int mat_scale=DEFAULT_MAT_SCALE;
  MinStr = MINSTR;
  MaxStr = MAXSTR;
  cluster = DEFAULT_CLUSTER;		
  

  while ((c = getopt (argc, argv, "i:o:c:s:hm:M:d:")) != -1) {
    switch (c) {
    case 'i' :
      datfile = optarg;
      fin= fopen(optarg, "r");
      if(fin == NULL) die("Cannot open file %s for input.", optarg);
      break;
    case 'o' :
      fout = fopen(optarg, "w");
      if(fout == NULL) die("Cannot open file %s for output.", optarg);
      break;
    case 'd' :
      fcountdata = fopen(optarg, "w");
      if(fcountdata == NULL) die("Cannot open file %s for output.", optarg);
      break;
    case 'm' : /* Minimum block strength */
      MinStr = atoi(optarg);
      break;
    case 'M' : /* Maximum block strength */
      MaxStr = atoi(optarg);
      break;
    case 'c' : /* Re-clustering percent identity */
      cluster = atoi(optarg);
      break;
    case 's' : /* scale in 1/n bits */ 
      mat_scale = atoi(optarg);
      break;
    case 'r' :
      resampling = YES;
      break;
    case 'h' :
      help();
      break;
    default :
      die("Internal argument parsing error");
    }
  }

  fprintf(fout, "# Doublet Blosum: Singlet and Doublet substitution matrixes\n");
  fprintf(fout, "# (%s)\n",VERSION);
  if(datfile != NULL) fprintf(fout, "# Data file: %s\n", datfile);
  fprintf(fout, "# Minimum block strength=%d\n", MinStr); 
  fprintf(fout, "# Maximum block strength=%d\n", MaxStr);
  fprintf(fout, "# Re-clustering percentage = %d\n", cluster);
  if(resampling) fprintf(fout, "# Resampling...\n");
  fflush(fout);

  /*----------------Initialize--------------------------------------*/
  TotPairs = TotSeqs = TotWidth = TotAas = (unsigned long) 0;
  TotBlk = TotSeg = (unsigned long) 0;
  TotClump = FTotPairs = FTotWeight = 0.0;

  count_init(); 
 

  /* Read data */
  totblk = read_dat(fin, resampling);

  
  /* Calculate singlet and doublet matrixes */
  calc_singlet(fout);
  calc_doublet(fout);

  if (mat_scale == 0) {
    /* Determine the matrix scale from the relative entropy of the singlet matrix.*/
    mat_scale = round(2.0 / sqrt(singlet_entropy));
    if (mat_scale < 2) mat_scale = 2;
  } 


  /* Output results */
  fprintf(fout, "#\n# %d blocks processed", totblk);
  fprintf(fout,", %ld blocks contributed pairs to matrix\n", TotBlk);

  if (cluster >= 0){
    fprintf(fout, "# %f clumps contributed pairs to matrix (%f)\n",
	    TotClump, TotClump/FTotWeight);
    fprintf(fout, "# %ld segments contributed pairs to matrix (%f)\n",
	    TotSeg, (float) TotSeg/TotBlk);
  }

  fprintf(fout, "# %f total pairs, %f total weight\n", FTotPairs, FTotWeight);
  fprintf(fout, "# %ld total sequences, ", TotSeqs);
  fprintf(fout, " %ld total columns, ", TotWidth);
  fprintf(fout, " %ld total AAs\n", TotAas);
  fprintf(fout, "# %lu total pairs for singlet substituion matrix\n", total_singlet_pairs);
  fprintf(fout, "#\n#\n");

  save_singlet(fout, mat_scale, cluster);
  save_doublet(fout, mat_scale, cluster);
   
  if(fcountdata!=NULL)
    save_countdata(fcountdata);

  exit(0);
}  /* end of main */



void count_init() {
  int i,j,k,m,L;

  for(i=0; i<AAS; i++) 
    for(j=0;j<AAS; j++) { 
      count[i][j]=0.0;
      singlet[i][j]=0.0;
    }

  for(L=0;L<MAX_L;L++) 
    for(i=0; i<AAS; i++) 
      for(j=0;j<AAS; j++) 
	for(k=0;k<AAS;k++) 
	  for(m=0;m<AAS;m++) 
            count2[L][i][j][k][m]=0.0;
	    
  for(L=0;L<MAX_L;L++) 
    for(i=0; i<AAS_EXT; i++) 
      for(j=0;j<AAS_EXT; j++) 
	for(k=0;k<AAS_EXT;k++) 
	  for(m=0;m<AAS_EXT;m++)
	    doublet[L][i][j][k][m]=0.0;

  for(L=0; L<MAX_L; L++) total_doublet_pairs[L] =0;
}

void count_singlet_cluster(void) {
  int seq1, seq2, col;
  int aa1, aa2;
  double weight;

  /* Singlet count */
  for (seq1=0; seq1 < Block.nseq; seq1++) {           
    for (seq2=seq1+1; seq2 < Block.nseq; seq2++) {
      if(Block.cluster[seq1] == Block.cluster[seq2]) continue; 
      for (col=0; col < Block.width; col++) {
        aa1 = Block.aa[seq1][col];
        aa2 = Block.aa[seq2][col];
        if(aa1<0 || aa1>=AAS) continue;
        if(aa2<0 || aa2>=AAS) continue;
        
        total_singlet_pairs++;
        weight = 1.0 / (Block.ncluster[seq2]*Block.ncluster[seq1]) ;
        weight *= Block.resampling_weight;
        count[aa1][aa2] += weight;
      }
    }
  }
}


void count_doublet_cluster(void) {
  int L;
  int seq1, seq2, col;
  int aa1, aa2, aa1l, aa2l;
  double weight;
              
  /* Doublet count */
  for (seq1=0; seq1 < Block.nseq; seq1++) {           
    for (seq2=seq1+1; seq2 < Block.nseq; seq2++) {
      if(Block.cluster[seq1] == Block.cluster[seq2]) continue; 
      for (L=1; L < MAX_L; L++) {
        for (col=0; col+L < Block.width; col++) {
          aa1 = Block.aa[seq1][col];
          aa1l= Block.aa[seq1][col+L];          
          aa2 = Block.aa[seq2][col];
          aa2l = Block.aa[seq2][col+L];                    
          
          if(aa1<0  || aa1>=AAS) continue;
          if(aa1l<0 || aa1l>=AAS) continue;                         
          if(aa2<0  || aa2>=AAS) continue;
          if(aa2l<0 || aa2l>=AAS) continue;                         
          
          total_doublet_pairs[L]++;
          weight = 1.0 / (Block.ncluster[seq2]*Block.ncluster[seq1]);
          weight *= Block.resampling_weight;
          count2[L][aa1][aa1l][aa2][aa2l] += weight;
        }
      }
    }
  }       
}


/* Calculate the singlet substitution matrix from the singlet count. */
void calc_singlet(FILE *fout) {
  int i;
  int row, col;
  
  double entropy, expected;
  double score, odds;
  double total_weight=0.0;
  double aa_freq[AAS];     
  
  for(i=0;i<AAS;i++) aa_freq[i]=0.0;
  
  /* total_weight */
  for(row=0; row<AAS; row++) 
    for(col=0;col<AAS; col++) 
      total_weight += count[row][col];


  /* printf("# total_weight = %f",total_weight); */
  
  /* Amino acid frequency */
  for(row=0; row<AAS; row++) 
    for(col=0;col<AAS; col++) {
      aa_freq[col] += count[row][col];
      aa_freq[row] += count[row][col];
    }
  for(i=0;i<AAS;i++) aa_freq[i] /= (2.0*total_weight);
  
  for(row=0; row<AAS; row++) {
    for(col=0;col<AAS; col++) {
      sp[row][col] = (count[row][col] + count[col][row])
        / (2.0 * total_weight);
    }
  }
  
  /* Singlet, the log odds ratio */
  for(row=0; row<AAS; row++) {
    for(col=0;col<AAS; col++) {
      odds = sp[row][col]/(aa_freq[row]*aa_freq[col]);
      if(odds<EPSILON) {
        singlet[row][col] = NEG_INF;
      } else {
        singlet[row][col] = nats_to_bits * log(odds); /* bits */
      }
    }
  }
              
  /* Compute values for B (20) as the weighted average of N (2) and D (3) */
  for (col=0; col <= B; col++) {
    score = (aa_freq[N] * singlet[N][col] + aa_freq[D]*singlet[D][col])
      / ( aa_freq[N]+ aa_freq[D]);
    singlet[col][B] = score;
    singlet[B][col] = score;
  }
  
  /* Compute values for Z (21) as the weighted average of Q (5) and E (6) */
  for (col=0; col<=Z; col++) {
    score = (aa_freq[Q] * singlet[Q][col] + aa_freq[E]*singlet[E][col])
      / ( aa_freq[Q]+ aa_freq[E]);
    singlet[col][Z] = score;
    singlet[Z][col] = score;
  }
  
  /* Compute values for X (22) as the weighted average everything */
  for (col=0; col<=Z; col++) {
    score =0.0;
    for(row=0;row<AAS;row++) 
      score += aa_freq[row]*singlet[row][col];
    singlet[col][X] = score;
    singlet[X][col] = score;
  }

  /*
   Compute entropy & expected value in bits 
   entropy is the per symbol mutual entropy of correctly aligned sequences
   expected is the per symbol entropy of randomly aligned sequences
  */
  entropy =0.0;
  expected = 0.0;

  for(row=0; row<AAS; row++) {
    for(col=0;col<AAS; col++) {
      entropy += singlet[row][col] * sp[row][col];
      expected += singlet[row][col] * aa_freq[row] * aa_freq[col];
    }
  }
  
  singlet_entropy = entropy;
  singlet_expected = expected;

  /*
    fprintf(fout,"#\n#\n# SINGLET:  Entropy=%.4f bits, expected=%.4f bits\n", entropy, expected);
   */
}



/* Calculate the doublet substitution matrixs from the doublet count.*/
void calc_doublet(FILE *fout) {
  int a, i,j,k,m,L;

  double avg, weight, tot;

  int zero_count;         /* Entries in doublet matrix with no data */
  double data_weight;
  double pseudo_weight;
  double total_weight;

  double odds, entropy, expected;
  
  double mean_theta[AAS][AAS][AAS][AAS];
  double post[AAS][AAS][AAS][AAS];
  double aa_freq[AAS_EXT][AAS_EXT];     

  fprintf(fout, "#\n# Doublet:\n");
  fprintf(fout, "# length, total_pairs, data_weight,  pseudo_weight, zero_count, entropy, expected\n");

  for(L=1; L<MAX_L; L++) {

    /* Impose the underlying symetry on our noisy data. */
    for(i=0; i<AAS; i++) 
      for(j=0;j<AAS; j++)
	for(k=0;k<AAS;k++)
	  for(m=0;m<AAS;m++) {
	    count2[L][i][j][k][m]=
	      (count2[L][i][j][k][m] + count2[L][k][m][i][j])/2.0;
	    count2[L][k][m][i][j] = count2[L][i][j][k][m];
	  }

    data_weight = 0.0;
    zero_count = 0;
    for(i=0; i<AAS; i++) 
      for(j=0;j<AAS; j++)
	for(k=0;k<AAS;k++)
	  for(m=0;m<AAS;m++) {
	    data_weight += count2[L][i][j][k][m];
	    if( count2[L][i][j][k][m]==0.0) zero_count ++;
	  }


    /* Construct an unscaled Dirichlet prior from the marginals.
       (Using a Dirichlet prior is the same thing as using pseudocounts)
       The pseudocount is added to cope with holes in the data
    */
    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++)
	    mean_theta[i][j][k][m] = sp[i][k] * sp[j][m];
          
    /* The scale factor of the Dirichlet prior (The total pseudocount added
       to the real data) is chosen so as to maximise the probability of 
       observing the data. See dirichlet/dirichlet_ml.c
    */
    pseudo_weight = dirichlet_beta_ml( (AAS*AAS*AAS*AAS), (double *) count2[L], (double *) mean_theta, NULL);
    total_weight = data_weight + pseudo_weight;

    /* Calc posterier probability from Dirichlet Prior and Multinomial data */
    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++) {
	    post[i][j][k][m] = 
	      (count2[L][i][j][k][m] + pseudo_weight * mean_theta[i][j][k][m])
	      / total_weight;
	  }


    /* Amino acid pair frequency */
    for(i=0;i<AAS_EXT;i++) 
      for(j=0;j<AAS_EXT;j++) 
	aa_freq[i][j]=0.0;  

    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++) {
          aa_freq[i][j] += post[i][j][k][m];
          aa_freq[k][m] += post[i][j][k][m];
        }

    for(i=0;i<AAS;i++)
      for(j=0;j<AAS;j++) 
	aa_freq[i][j] *= 0.5;


    /* Doublet, the log odds ratio */
    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++) {
	    odds = post[i][j][k][m]/(aa_freq[i][j]*aa_freq[k][m]);
	    if(odds<EPSILON) {
	      doublet[L][i][j][k][m] = NEG_INF;
	    } else {
	      doublet[L][i][j][k][m] = nats_to_bits * log(odds); /* bits */
	    }
	  }
  
    /* Subtract the singlet scores */
    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++) {
	    doublet[L][i][j][k][m] -= singlet[i][k];
	    doublet[L][i][j][k][m] -= singlet[j][m];
	  }

    /* Calculate values for B, Z and X.
       B (20) as the weighted average of N (2) and D (3)
       Z (21) as the weighted average of Q (5) and E (6) 
       X (22) as the weighted average everything        */

    /* compute B, Z, and X columns of aa_freq */
    for(i=0; i <= B; i++) {
      aa_freq[i][B] = aa_freq[i][N] + aa_freq[i][D];
      aa_freq[B][i] = aa_freq[N][i] + aa_freq[D][i];
    }

    for(i=0; i <= Z; i++) {
      aa_freq[i][Z] = aa_freq[i][Q] + aa_freq[i][E];
      aa_freq[Z][i] = aa_freq[Q][i] + aa_freq[E][i];
    }

    for(i=0; i <= X; i++) {
      for(tot=0.0, a =0; a<AAS; a++) tot += aa_freq[i][a];
      aa_freq[i][X] = tot;

      for(tot=0.0, a =0; a<AAS; a++) tot += aa_freq[a][i];
      aa_freq[X][i] = tot;
    }

    /* printf("# TEST aa_freq[X][X]=%g (=? 1.0)\n", aa_freq[X][X]); */


    /* Compute values for B (20) as the weighted average of N (2) and D (3) */
    for(i=0; i<=B; i++) 
      for(j=0; j<=B; j++)
	for(k=0; k<=B; k++) {
	  avg = (aa_freq[N][i] * doublet[L][N][i][j][k] 
		   + aa_freq[D][i]*doublet[L][D][i][j][k])
	    / ( aa_freq[N][i]+ aa_freq[D][i]);
	  doublet[L][B][i][j][k] = avg;
	  doublet[L][j][k][B][i] = avg;

	  avg = (aa_freq[i][N] * doublet[L][i][N][j][k] 
		   + aa_freq[i][D]*doublet[L][i][D][j][k])
	    / ( aa_freq[i][N]+ aa_freq[i][D]);
	  doublet[L][i][B][j][k] = avg;
	  doublet[L][j][k][i][B] = avg;
	} 

    /* Compute values for Z (21) as the weighted average of Q (5) and E (6) */
    for(i=0; i<=Z; i++) 
      for(j=0; j<=Z; j++)
	for(k=0; k<=Z; k++) {
	  avg = (aa_freq[Q][i] * doublet[L][Q][i][j][k] 
		   + aa_freq[E][i]*doublet[L][E][i][j][k])
	    / ( aa_freq[Q][i]+ aa_freq[E][i]);
	  doublet[L][Z][i][j][k] = avg;
	  doublet[L][j][k][Z][i] = avg;

	  avg = (aa_freq[i][Q] * doublet[L][i][Q][j][k] 
		   + aa_freq[i][E]*doublet[L][i][E][j][k])
	    / ( aa_freq[i][Q]+ aa_freq[i][E]);
	  doublet[L][i][Z][j][k] = avg;
	  doublet[L][j][k][i][Z] = avg;
	} 
	  
    /* Compute values for X (22) as the weighted average everything */
    for(i=0; i<=X; i++) 
      for(j=0; j<=X; j++)
	for(k=0; k<=X; k++) {

	  avg=0.0;
	  weight=0.0;
	  for(a=0; a<AAS; a++) {
	    avg += aa_freq[a][i] * doublet[L][a][i][j][k];
	    weight += aa_freq[a][i];
	  }
	  avg /= weight;
	  doublet[L][X][i][j][k] = avg;
	  doublet[L][j][k][X][i] = avg;

	  avg=0.0;
	  weight=0.0;
	  for(a=0; a<AAS; a++) {
	    avg += aa_freq[i][a] * doublet[L][i][a][j][k];
	    weight += aa_freq[i][a];
	  }
	  avg /= weight;
	  doublet[L][i][X][j][k] = avg;
	  doublet[L][j][k][i][X] = avg;
	}



    /* compute entropy & expected value in bits 
       entropy is the per symbol mutual entropy of correctly aligned sequences
       expected is the per symbol entropy of randomly aligned sequences */

    entropy =0.0;
    expected = 0.0;

    for(i=0; i<AAS; i++) 
      for(j=0; j<AAS; j++)
	for(k=0; k<AAS; k++)
	  for(m=0; m<AAS; m++) {
	    entropy += doublet[L][i][j][k][m] * post[i][j][k][m];
	    expected += doublet[L][i][j][k][m] * aa_freq[i][j] * aa_freq[k][m];
	  }
  
    doublet_entropy[L] = entropy;
    doublet_expected[L] = expected;
  
    fprintf(fout, "# %d\t%lu\t%g\t%g\t%d\t%g\t%g\n", 
	    L, total_doublet_pairs[L], data_weight, pseudo_weight,
	    zero_count, entropy, expected);
  }
}

void save_singlet(FILE *fout, int mat_scale, int clustering) {
  int row, col;
  int int_score;

  fprintf(fout, "#  BLOSUM Clustered Singlet Scoring Matrix in 1/%d Bit Units\n", mat_scale);
  fprintf(fout, "#  Cluster Percentage: >= %d\n", clustering);
  fprintf(fout, "#  Entropy = % 8.4f, Expected = % 8.4f\n", singlet_entropy, singlet_expected);

  fprintf(fout, "    ");
  for (col=0; col<AAS_EXT; col++) 
    fprintf(fout, " %c  ", aa_codes[col]);
  fprintf(fout, "\n");
  
  for( row=0; row<AAS_EXT; row++) {
    fprintf(fout, " %c  ", aa_codes[row]);
    for( col=0; col<AAS_EXT; col++) {
      int_score = round( singlet[row][col] * mat_scale);
      fprintf(fout, "%3d ", int_score);
    }
    fprintf(fout, "\n");
  }
}



void save_doublet(FILE *fout, int mat_scale, int clustering) {
  int i,j,k,m,L;
  int int_score;
  fprintf(fout, "#\n");
  fprintf(fout, "#  BLOSUM Clustered Doublet Scoring Matrix in 1/%d Bit Units\n", mat_scale);
  fprintf(fout, "#  Cluster Percentage: >= %d\n", clustering);


  fprintf(fout, "        ");
  for(L=1;L<MAX_L;L++) 
    fprintf(fout, "%4d ", L);
  fprintf(fout,"\n");

  for(i=0;i<AAS_EXT; i++) {
    for(j=0;j<AAS_EXT; j++) {
      for(k=0;k<AAS_EXT; k++) { 
        for(m=0;m<AAS_EXT; m++) { 
            fprintf(fout, "%c%c %c%c   ", 
                    aa_codes[i],aa_codes[j],aa_codes[k],aa_codes[m]);
            for(L=1; L<MAX_L; L++) {
              int_score = round( doublet[L][i][j][k][m] * mat_scale);
              fprintf(fout, "%4d ", int_score);
            }
            fprintf(fout, "\n");
        }
      }
    }
  }
}



void save_countdata(FILE *fout) {
  int row, col;
  int i,j,k,m,L;

  fprintf(fout, "# Count Data\n\n");

  fprintf(fout, "# Singlet\n");
  for( row=0; row<AAS; row++) {
    for( col=0; col<AAS; col++) {
      fprintf(fout, "  %c%c\t%d\n", aa_codes[row], aa_codes[col], round(count[row][col]));
    }
  }


  fprintf(fout, "\n\n\n");
  fprintf(fout, "# Doublet\n");

  fprintf(fout, "      ");
  for(L=1;L<MAX_L;L++) 
    fprintf(fout, "%d\t", L);
  fprintf(fout,"\n");

  for(i=0;i<AAS; i++) {
    for(j=0;j<AAS; j++) {
      for(k=0;k<AAS; k++) { 
        for(m=0;m<AAS; m++) { 
            fprintf(fout, "%c%c %c%c \t", 
                    aa_codes[i],aa_codes[j],aa_codes[k],aa_codes[m]);
            for(L=1; L<MAX_L; L++) {
              fprintf(fout, "%d\t", round(count2[L][i][j][k][m]));
            }
            fprintf(fout, "\n");
        }
      }
    }
  }

  fprintf(fout, "\n");
}



/* 
 * Convert an Amino acid one letter code into an integer
 * between 0 (A, Alanine) and 22 (X, unkown amino acid).
 * B is changed to D, Z is changed to E, O and J are changed to X.
 * Returns -1 on failure.
 */
int aachar_to_num(const char one_letter_code) {
  int i;
  char c = toupper(one_letter_code);
  if( c=='B') c='D';
  if( c=='Z') c='E';
  if( c=='O') c='X';
  if( c=='J') c='X';

  for(i=0; i<AAS_EXT; i++)
    if(aa_codes[i] == c) return i;
  return -1;
}



/* Print an error message and exit with failure */
void die(char *comment, ...) {
 va_list argp;

 va_start(argp, comment);
         vfprintf(stderr, comment, argp);
 va_end(argp);
 fprintf(stderr, "\n");

 exit(2);
}




/* The code below is taken (almost) directly from blosum.c and motifj.h*/




/*   INDEX & INDEXCOL compute the sequential indices for the lower half of an
     nxn symmetric matrix given row & column coordinates.  Lower half has
     n(n-1)/2 entries; col=0,n-2 and row=col+1,n-1; col has n-col-1 rows.
     Index runs from 0 to n(n-1)/2 - 1 down columns with
     (index=0)==(col=0,row=1) and (index=n(n-1)/2-1)==(col=n-2,row=n-1).  */
#define INDEXCOL(n, col)    ( col*n - (col*(col+1))/2 )
#define INDEXCOLROW(n, col, row)  ( col*n - (col*(col+3))/2 - 1 + row )


/*-----------------------------------------------------------------------*/
/*    Structure for pairs of sequences.                                  */
/*     pair should be allocated as an array, & the number of the         */
/*     sequences forming the pair inferred from the array index.         */
/*-----------------------------------------------------------------------*/
struct pair {
	int score;		/* # of identities within trimmed block */
	int cluster;		/* cluster # for this pair */
};


/*****************************************************************
    Uses unix specific calls to produce a random number from the
    system clock.
    From comp.lang.c.moderated, Dan Pop <danpop@cernapo.cern.ch> 
*****************************************************************/

unsigned long int  get_ran_clock(void)
{
    return( (unsigned long int)clock() + (unsigned long int)time(NULL) );
}


/*=====================================================================
      Read the blocks database
=======================================================================*/
int read_dat(FILE *fdat, const int resampling) {
   int totblk;
   char line[MAXLINE], *ptr, *ptr1;

   const gsl_rng_type * rng_type;
   gsl_rng * rng;

   gsl_rng_env_setup();
   rng_type = gsl_rng_default;
   rng = gsl_rng_alloc(rng_type);
   gsl_rng_set(rng, get_ran_clock());



   totblk=0;
   /*SPSS*/
   /*   printf("\nBlock     diagonal off-diagonal total  strength\n"); */
   while (!feof(fdat) && fgets(line, MAXLINE, fdat) != NULL)
   {
      if (strncmp(line, "AC   ", 5) == 0)
      {
	 strncpy(Block.ac, line+5, 8); Block.ac[9] = '\0';
      }
      else if (strncmp(line, "BL   ", 5) == 0)
      {
	 Block.strength = 0;
	 ptr=strstr(line,"strength=");
	 if (ptr != NULL) {
           ptr1 = strtok(ptr, "="); ptr1=strtok(NULL, "\n\t\r");
           Block.strength = atoi(ptr1);
         }
	 else {
           Block.strength = MinStr;	/* use block if no strength field*/
         }

         if (Block.strength >= MinStr && Block.strength <= MaxStr) {
           fill_block(fdat);
           cluster_seqs();    /*  re-cluster sequences */
           
           if(resampling) {
             // Bayesian Bootstrap
             // Annals of Statistics (1981) v9 130-134
             // Draw weights from expoential with unit mean.
             // For large dimensional distribution this is the same
             // as resampling with Dirichlet weights. 
             Block.resampling_weight = 
               gsl_ran_exponential(rng, 1.0);             
           } else {
             Block.resampling_weight = 1.0;
           }

           count_cluster();		  /* clustering */
           count_singlet_cluster();
           count_doublet_cluster();
           
           totblk++;
	 }
      }
   }
   
   return(totblk);
}  /* end of read_dat */

/*====================================================================
     Fill up the block structure
=======================================================================*/
void fill_block(FILE * fdat) {
   int done, i, li, n, cluster, ncluster;
   char line[MAXLINE], *ptr;

   Block.nseq = Block.width = 0;
   Block.totdiag = Block.totoffd = Block.wtot = (double) 0;
   cluster = ncluster = 0;
   done=NO;
   while (!done && !feof(fdat) && fgets(line,MAXLINE,fdat) != NULL)
   {
      if (strlen(line) == 1)		/* blank line => new cluster */
      {
	 /*  Set #seqs in cluster to seqs in previous cluster */
	 if (ncluster > 0)
	   for (n=0; n<Block.nseq; n++)
	      if (Block.cluster[n] == cluster) Block.ncluster[n] = ncluster;
	 cluster++; ncluster = 0;
      }
      else if (strlen(line) > 1)
      {
	 if (strncmp(line, "//", 2) == 0) done=YES;
	 else if (strlen(line) > 20)
	 {
	    li=0;		/* skip over sequence name & offset */
            while (line[li] != ')') li++;
            li++;
	    while (line[li] == ' ') li++;  /* skip over spaces */
		/* assuming no spaces within block ! */
	    ptr=strtok(line+li, " \r\n\t");
	    for (i=0; i<strlen(ptr); i++)
	    {
	       Block.aa[Block.nseq][i] = aachar_to_num(ptr[i]);
	       if (Block.aa[Block.nseq][i] >= 0 &&
		   Block.aa[Block.nseq][i] < AAS)
	       {
		  TotAas++;
	       }
/*	       else
	       {
		  printf("\nBad char: %s %c", Block.ac, ptr[i]);
	       }
*/
	    }
            li += strlen(ptr);
            ptr = strtok(NULL, " \r\n\t");
            if (ptr != NULL)
            {
               Block.weight[Block.nseq] = atof(ptr);
               Block.wtot += Block.weight[Block.nseq];
            }
            else
               Block.weight[Block.nseq] = 0;
	    Block.cluster[Block.nseq] = cluster;
	    ncluster++;		/* # seqs in current cluster */
	    Block.width = i;
	    Block.nseq++;
	 }
      }
   }
   /*  Compute weights for the last cluster */
   if (ncluster > 0)
	   for (n=0; n<Block.nseq; n++)
	      if (Block.cluster[n] == cluster) Block.ncluster[n] = ncluster;
   TotSeqs += Block.nseq;
   TotWidth += Block.width;
}  /* end of fill_block */

/*===================================================================*/

/* Calculate some random bits of information. */
void count_cluster() {
   int seq1, seq2, col;
   double weight;

   for (col=0; col < Block.width; col++)
      for (seq1=0; seq1 < Block.nseq; seq1++)
	 for (seq2=seq1+1; seq2 < Block.nseq; seq2++)
	    if (Block.aa[seq1][col] >=0 && Block.aa[seq1][col] < AAS &&
		Block.aa[seq2][col] >=0 && Block.aa[seq2][col] < AAS &&
		Block.cluster[seq1] != Block.cluster[seq2])
	    {
		  weight = (double) 1.0 / Block.ncluster[seq2];
		  weight *= (double) 1.0 / Block.ncluster[seq1];
                  
		  FTotPairs += 1.0;
		  FTotWeight += weight;
		  if (Block.aa[seq1][col] ==  Block.aa[seq2][col])
			 Block.totdiag += weight;
		  else   Block.totoffd += weight;
	    }
   /*  SPSS */
   if (Block.totoffd > 0.0 || Block.totdiag > 0.0)
   {
     TotBlk++;
     TotClump += (Block.totdiag + Block.totoffd) * Block.nclus;
     TotSeg += Block.nseq;
     /*      printf("%s %9.2f %9.2f %9.2f %d\n", Block.ac,
             Block.totdiag,  Block.totoffd,
             Block.totdiag+Block.totoffd, Block.strength);
     */
   }
}  /* end of count_cluster */







/*======================================================================*/
/*    Cluster sequences in a block based on the number of               */
/*    identities within the block. Sets Block.cluster & Block.ncluster  */
/*    Sets Block.nclus = total number of clusters                       */
/*      1. Compute number of identities for each possible pair of seqs. */
/*         Results stored in lower half of matrix (pairs).              */
/*      2. Use clustering threshold % of # of AAs in trimmed block.     */
/*      3. Cluster recursively by traversing cols, rows of matrix.      */
/*UNIX NOTE:  Program aborts when running under UNIX at free(pairs),
   so use the fixed size declaration pairs & remove the malloc() &
   free() calls when compiling for UNIX                                 */
/*======================================================================*/
void cluster_seqs()
{
   int clus, npair, threshold, s1, s2, l1, l2, px, i, i1, i2;
   int nclus[MAXSEQS], minclus, oldclus;
   struct pair *pairs;
/*UNIX   struct pair pairs[MAXSEQS*(MAXSEQS-1)/2];  */

   npair = Block.nseq*(Block.nseq-1)/2;
   pairs = (struct pair *) malloc (npair * sizeof(struct pair));
   if (pairs == NULL)
   {
      fprintf(stderr, "\ncluster_seqs: Unable to allocate pair structure!\n");
      exit(-1);
   }
   threshold = (int) (cluster*(Block.width))/100;

/*    Compute scores for all possible pairs of sequences            */
   for (s1=0; s1<Block.nseq-1; s1++)   		/* col = 0, n-2     */
   {
      l1 = 0;
      for (s2=s1+1; s2<Block.nseq; s2++)	/* row = col+1, n-1 */
      {
	 l2 = 0;
	 px = INDEXCOLROW(Block.nseq, s1, s2);
	 pairs[px].score = 0;
	 pairs[px].cluster = -1;
	 for (i=0; i<=Block.width; i++)
	 {
	    i1 = l1+i;  i2 = l2+i;
	    if (i1 >= 0 && i1 < Block.width &&
		i2 >= 0 && i2 < Block.width &&
		Block.aa[s1][i1] == Block.aa[s2][i2])
		   pairs[px].score += 1;
	 }
      }  /* end of s2 */
   }  /* end of s1 */

/*  Print scores */
/*   printf("\nThreshold=%d", threshold);
   for (s2=1; s2<Block.nseq; s2++)
   {
      printf ("\n");
      for (s1=0; s1<s2; s1++)
      {
	 px = INDEXCOLROW(Block.nseq, s1, s2);
	 printf(" %.3d", pairs[px].score);
      }
    }
*/

/*-------Cluster if score exceeds threshold by scanning cols (s1) */
   for (s1=0; s1<Block.nseq; s1++)
   {
      Block.cluster[s1] = -1;			/* clear out old values */
      Block.ncluster[s1] = 1;
      nclus[s1] = 0;
   }
   clus = 0;        				/* cluster number */
   for (s1=0; s1<Block.nseq-1; s1++)   		/* col = 0, n-2     */
      for (s2=s1+1; s2<Block.nseq; s2++)	/* row = col+1, n-1 */
      {
	 px = INDEXCOLROW(Block.nseq, s1, s2);
	 if (pairs[px].score >= threshold)	/*  cluster this pair */
	 {
	    if (Block.cluster[s1] < 0)          /* s1 not yet clustered */
	    {
	       if (Block.cluster[s2] < 0)       /* new cluster */
	       {
		  Block.cluster[s1] = clus++;
		  Block.cluster[s2] = Block.cluster[s1];
	       }
	       else  				/* use s2's cluster  */
		  Block.cluster[s1] =  Block.cluster[s2];
	    }
	    /*  use s1's cluster if it has one and s2 doesn't */
	    else if (Block.cluster[s1] >= 0 && Block.cluster[s2] < 0)
	       Block.cluster[s2] = Block.cluster[s1];
	    /* merge the two clusters into the lower number */
	    else if (Block.cluster[s1] >= 0 && Block.cluster[s2] >= 0)
	    {
	       minclus = Block.cluster[s1]; oldclus = Block.cluster[s2];
	       if (Block.cluster[s2] < Block.cluster[s1])
	       {
		  minclus = Block.cluster[s2]; oldclus = Block.cluster[s1];
	       }
	       for (i1=0; i1<Block.nseq; i1++)
		 if (Block.cluster[i1] == oldclus)
		     Block.cluster[i1] = minclus;
	    }
	 }  /* end of if pairs */
      }  /* end of s2 */

   /*---  Set Block.ncluster, get rid of negative cluster numbers --*/
   for (s1=0; s1<Block.nseq; s1++)
      if (Block.ncluster[s1] >= 0)  nclus[Block.cluster[s1]]++;
   for (s1=0; s1<Block.nseq; s1++)
   {
      if (Block.cluster[s1] < 0)
      {
	  Block.cluster[s1] = clus++;
	  Block.ncluster[s1] = 1;
      }
      else
	  Block.ncluster[s1] = nclus[Block.cluster[s1]];
      
   }
   /*   Count the total number of clusters and put in Block.nclus,
        the numbers in Block.ncluster[] are arbitrary */
   Block.nclus = clus;		/* number of clumps */
   free(pairs);
}  /*  end of cluster_seqs */


