
/* 
  Copyright (c) 2003 Richard Green <ed@compbio.berkeley.edu>,
		     Univ. of California, Berkeley

  Permission is hereby granted, free of charge, to any person obtaining a 
  copy of this software and associated documentation files (the "Software"),
  to deal in the Software without restriction, including without limitation
  the rights to use, copy, modify, merge, publish, distribute, sublicense,
  and/or sell copies of the Software, and to permit persons to whom the
  Software is furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included
  in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
  THE SOFTWARE.

  (This is the MIT Open Source License, 
  http://www.opensource.org/licenses/mit-license.html)

*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <assert.h>

#include "cbt_evd.h"


// Uncomment to make MLH() print the current lambda, K, H and
// log-likelihood values after every outer iteration.
//#define DEBUG
// Iteration cap used by the outer MLH() loop and by both
// Newton-Raphson solvers below.
const int    MAXITER = 1000;
// MLH() declares convergence once the relative change in
// log-likelihood between two successive passes drops below this.
const double EPSILON = 0.000005;
const int    PC_ITER = 10; // Min. iterations before 
                           // pseudo-convergence check starts
/* Forward declarations.  Everything below is private to this file;
   cbt_evd() is the only externally visible entry point. */

/* Outer parameter-estimation loop. */
static int MLH ( const long int* scores, 
          const long int* lengths,
          size_t n,
          long int q,
          double* lambda_p, 
          double* K_p, 
          double* H_p );

/* Newton-Raphson solver for K and lambda, plus the terms it sums. */
static void 
newtonRhapsonKLambda( int* use_scores, const long int* scores, 
                      const long int* lengths, size_t n, long int q,
                      double* K_p, double* lambda_p, double H,
                      double K, double lambda, int iter );
static double findK ( int* use_scores, const long int* scores,
	       const long int* lengths, size_t n, long int q, double K, 
	       double H, double lambda );
static double gOfLambda ( int* use_scores, const long int* scores, const long int* lengths, 
		   size_t n, long int q, double K, 
		   double H, double lambda );
static double gPrimeOfLambda ( int* use_scores, const long int* scores, 
			const long int* lengths, size_t n, long int q, 
			double K, double H, double lambda );
static double findEi ( long int score, long int q, long int t,
		double K, double H, double lambda );
static double findFi ( long int score, long int q, long int t,
		double K, double H, double lambda );
static double findGi ( long int score, long int q, long int t,
		double K, double H, double lambda );
static double findNi ( long int q, long int t, double K, double H );
static double findLi ( long int q, long int t, double K, double H );

/* Modified Newton-Raphson solver for H, plus the terms it sums. */
static int modNewtonRhapsonH ( int* use_scores, const long int* scores,
			const long int* lengths, size_t n, long int q, 
			double K, double lambda, double* H_p, double H );
static double gOfH ( int* use_scores, const long int* scores, 
		const long int* lengths, size_t n, long int q, 
		double K, double H, double lambda );
static double gPrimeOfH ( int* use_scores, const long int* scores, 
		   const long int* lengths, size_t n, long int q, 
		   double K, double H, double lambda );
static double findAi ( long int q, long int t, double K, double H );
static double findBi ( long int score, long int q, long int t, 
		double K, double H, double lambda );
static double findCi ( long int q, long int t, double K, double H );

/* Starting values and the use_scores mask. */
static void initLambda ( const long int* scores, size_t n, double* lambda_p );
static double sampStdDev( const long int* scores, size_t n );
static double findAvg( const long int* scores, size_t n );
static void initUseScores ( int* use_scores, size_t n );

/* E-value computation, score masking and the log-likelihood. */
static void computeEValues ( const long int* scores,
                      const long int* lengths, 
		      size_t n, 
                      long int q, 
                      double* e_values, 
		      double K, 
                      double lambda,
                      double H );


static size_t maskScoresE ( double* e_values, int* use_scores, size_t n );
static void maskScoresN ( int* use_scores, const long int* scores, const long int* lengths,
		   size_t n, long int q, 
		   double K, double lambda, double H );
static double logLike ( int* use_scores, const long int* scores, const long int* lengths, 
		 size_t n, long int q, double K, 
		 double lambda, double H );
static void initK ( double* K_p );

//! @file Extreme Value Distribution

//! Fit an extreme value distribution to the raw scores, converting them
//! into e-values. Fit is performed using the Bailey MLH algorithm. Includes
//! length correction. 
//! Reference: Bailey and Gribskov, JCB 9:575
//! @param lib_size       number of input sequences
//! @param scores         raw scores
//! @param lengths        sequence lengths
//! @param query_length   length of the query sequence
//! @param lambda         pointer to lambda (to be computed)
//! @param K              pointer to K (to be computed)
//! @param H              pointer to H (to be computed)
//! @param e_values       pointer to array of evalues (to be computed)
//! @return error_code    Zero if the fit converged, non-zero otherwise
//! @author Richard E. Green
//! @author Gavin E. Crooks (Minor changes)

/* Public entry point, called from external code.  MLH() does the
   heavy lifting of estimating the EVD parameters and stores them
   through lambda, K and H; computeEValues() then assigns an e-value to
   every input score using those estimates.  The e-values are computed
   and returned regardless of the status MLH() reports, and that status
   is handed back to the caller unchanged. */
int 
cbt_evd( const size_t lib_size,   // number of input sequences
         const long int scores[], // pointer to array of raw scores
         const long int lengths[],// pointer to array of sequence lengths
         const long int query_length, // length of the query sequence
         double * lambda,         // pointer to lambda (to be computed)
         double * K,              // pointer to K (to be computed)
         double * H,              // pointer to H (to be computed)
         double e_values[])       // pointer to array of e-values (to be computed)
{
  const int fit_status = MLH( scores, lengths, lib_size, query_length,
                              lambda, K, H );

  /* Deliberately not skipped when the fit failed. */
  computeEValues( scores, lengths, lib_size, query_length, e_values,
                  *K, *lambda, *H );

  return fit_status;
}


/* Maximum likelihood parameter estimation algorithm.  Fig 1, p 579
   of Bailey and Gribskov, JCB 9:575
   Takes a pointer to array of scores, array of lengths, number of
   scores, length of query sequence, and pointers to where we can put
   the ML lambda, K and H that we find.

   Returns 0 when the log-likelihood converged (or pseudo-converged),
   1 when MAXITER passes ran without convergence, and 2 when the
   scratch arrays could not be allocated.  *lambda_p, *K_p and *H_p are
   written before anything can fail, so they always hold defined values
   on return (only the initial guesses in the allocation-failure case).
*/
int MLH ( const long int* scores,
          const long int* lengths,
          size_t n,
          long int q,
          double* lambda_p, 
          double* K_p,
          double* H_p ) {
  int* use_scores = NULL;   // use_scores[i] != 0 => scores[i] takes part
                            // in the fit on this pass
  double* e_values = NULL;  // e-values, refreshed on every pass
  double log_like = DBL_MAX; // Impossible initial value
  double last_log_like;
  size_t i;
  int delta_mask = 0; // Number of changes to the use_scores array on
                      // the last pass.  Not necessary to the
                      // algorithm, but informative for diagnosing
                      // pathological inputs.  If the algorithm is
                      // converging, this number should go down each
                      // time.
  int status = 1;     // 1 => did not converge

  // Starting guesses: lambda = 1 / sample std dev. of the scores,
  // H = 1, and initK()'s first guess at K.  Done before allocating so
  // the outputs are defined even on the early error return.
  initLambda( scores, n, lambda_p );
  *H_p = 1.0;
  initK( K_p );

  // make some room for the scratch arrays
  use_scores = calloc( n, sizeof *use_scores );
  e_values   = calloc( n, sizeof *e_values );
  if ( use_scores == NULL || e_values == NULL ) {
    status = 2;
    goto cleanup;
  }

  // lets use all the scores to begin with
  initUseScores( use_scores, n );

  for( i = 0; i < MAXITER; i++ ) {

    // use N-R to find the ML K and lambda for these scores assuming
    // EVD and the current value of H
    newtonRhapsonKLambda( use_scores, scores, lengths, n, q, 
                          K_p, lambda_p, *H_p, *K_p, *lambda_p, 1 );

    // use the modified N-R (with binary search to get close) to find
    // the ML H for the scores with the current values for lambda and K
    modNewtonRhapsonH( use_scores, scores, lengths, n, q, 
                       *K_p, *lambda_p, H_p, *H_p );
    assert( !( isnan( *H_p ) ) );

    // find the new log-likelihood for these scores under our new
    // parameters and if it's not too much different than the last
    // one, then we'll say we're done
    last_log_like = log_like;
    log_like = logLike( use_scores, scores, lengths, n, q, 
                        *K_p, *lambda_p, *H_p );

#ifdef DEBUG
    printf("l: %g K: %g H:%g lL:%g L:%g dm:%d\n",
           *lambda_p, *K_p, *H_p, 
           last_log_like, log_like, delta_mask);
#endif

    /* Check for convergence: relative change below EPSILON */
    if ( fabs( ( last_log_like - log_like ) / log_like )
         < EPSILON ) {
      status = 0;
      goto cleanup;
    }
    
    /* Check for pseudo-convergence (after PC_ITER passes, the
       log-likelihood has stopped increasing) */
    if ( i > PC_ITER && log_like <= last_log_like ) {
      status = 0;
      goto cleanup;
    }

    // recompute e-values for all the scores given the new K, lambda,
    // and H
    computeEValues( scores, lengths, n, q, e_values, 
                    *K_p, *lambda_p, *H_p );

    // now mask the scores (put a 0 in their use_scores[] space) if
    // their e-value is <= 1.0.  If it's greater, put a 1
    delta_mask = (int) maskScoresE( e_values, use_scores, n );
  }

  // falling out of the loop: never did converge on a good parameter
  // set, status is still 1

cleanup:
  free( use_scores );
  free( e_values );
  return status;
}

/* Newton-Raphson root search on g(lambda), the function built by
   gOfLambda() from the unmasked scores in scores[].  Each step
   re-derives K from the current lambda with findK(), evaluates g and
   its derivative, and moves lambda by -g/g'.  The search stops as soon
   as |g| (evaluated before the step) is below 0.0001, or once the step
   counter `iter` exceeds MAXITER.  Either way the latest K and lambda
   are stored through K_p and lambda_p.
*/
void newtonRhapsonKLambda( int* use_scores, const long int* scores, 
			   const long int* lengths, size_t n, long int q,
			   double* K_p, double* lambda_p, double H,
			   double K, double lambda, int iter ) {

  for ( ; iter <= MAXITER; iter++ ) {
    double g, g_slope;

    K       = findK( use_scores, scores, lengths, n, q, K, H, lambda );
    g       = gOfLambda( use_scores, scores, lengths, n, q,
                         K, H, lambda );
    g_slope = gPrimeOfLambda( use_scores, scores, lengths, n, q,
                              K, H, lambda );

    /* The step is taken even when g is already small enough, so the
       stored lambda is always the post-step value. */
    lambda = lambda - ( g / g_slope );

    if ( fabs( g ) < 0.0001 ) {
      break;
    }
  }

  *lambda_p = lambda;
  *K_p      = K;
}

/* Estimate K for the current lambda and H: n divided by the sum of
   findFi() over every unmasked score.  Note that the numerator is the
   total count n, not the number of unmasked scores.
   NOTE(review): if every score is masked the sum is 0 and the result
   is a division by zero in floating point -- confirm callers cope. */
double findK ( int* use_scores, const long int* scores,
	       const long int* lengths, size_t n, long int q, double K, 
	       double H, double lambda ) {
  double fi_sum = 0.0;
  size_t idx = 0;

  while ( idx < n ) {
    if ( use_scores[ idx ] != 0 ) {
      fi_sum += findFi( scores[ idx ], q, lengths[ idx ], K, H, lambda );
    }
    idx++;
  }

  return ( (double) n / fi_sum );
}

/* g(lambda), the function whose root newtonRhapsonKLambda() looks for:

       1/lambda - (sum of scores)/n + (sum of Ei)/(sum of Fi)

   where the sums run over every unmasked score (use_scores[i] != 0)
   and Ei / Fi come from findEi() / findFi().  The loop starts at
   index 0 so that the first score is included, matching findK(),
   gOfH() and logLike().
   NOTE(review): the divisor is the total count n, not the number of
   unmasked scores -- same convention as findK(); confirm intended. */
double gOfLambda ( int* use_scores, const long int* scores, const long int* lengths, 
		   size_t n, long int q, double K, 
		   double H, double lambda ) {
  size_t i;
  double d = 0.0;  /* sum of scores */
  double e = 0.0;  /* sum of Ei */
  double f = 0.0;  /* sum of Fi */

  for ( i = 0; i < n; i++ ) {
    if ( use_scores[ i ] ) {
      d += scores[ i ];
      e += findEi( scores[ i ], q, lengths[ i ], K, H, lambda );
      f += findFi( scores[ i ], q, lengths[ i ], K, H, lambda );
    }
  }
  return ( (1/lambda) - (d/n) + (e/f) );
}

/* Derivative of gOfLambda() with respect to lambda:

       -1/lambda^2 - (sum of Gi)/(sum of Fi) + ((sum of Ei)/(sum of Fi))^2

   with the sums taken over every unmasked score (use_scores[i] != 0).
   The loop starts at index 0 so that the first score is included,
   matching findK(), gOfH() and logLike(). */
double gPrimeOfLambda ( int* use_scores, const long int* scores, 
			const long int* lengths, size_t n, long int q, 
			double K, double H, double lambda ) {
  size_t i;
  double g = 0.0;  /* sum of Gi */
  double f = 0.0;  /* sum of Fi */
  double e = 0.0;  /* sum of Ei */

  for ( i = 0; i < n; i++ ) {
    if( use_scores[ i ] ) {
      g += findGi( scores[ i ], q, lengths[ i ], K, H, lambda );
      f += findFi( scores[ i ], q, lengths[ i ], K, H, lambda );
      e += findEi( scores[ i ], q, lengths[ i ], K, H, lambda );
    }
  }
  return ( (-1/(lambda*lambda)) - (g/f) + ((e/f)*(e/f)) );
}

/* Ei = Ni * score / exp(lambda * score), with Ni from findNi().
   A NaN result is replaced by 0. */
double findEi ( long int score, long int q, long int t,
		double K, double H, double lambda ) {
  const double weighted =
    findNi( q, t, K, H ) * score / exp( lambda * score );

  return isnan( weighted ) ? 0 : weighted;
}

/* Fi = Ni * (1 / exp(lambda * score)), with Ni from findNi(). */
double findFi ( long int score, long int q, long int t,
		double K, double H, double lambda ) {
  const double Ni    = findNi( q, t, K, H );
  const double decay = 1 / exp( lambda * score );

  return ( Ni * decay );
}

/* Gi = Ni * score^2 * (1 / exp(lambda * score)), with Ni from
   findNi().  The square is formed in integer (long) arithmetic before
   being multiplied in. */
double findGi ( long int score, long int q, long int t,
		double K, double H, double lambda ) {
  const double   Ni       = findNi( q, t, K, H );
  const long int score_sq = score * score;
  const double   decay    = 1 / exp( lambda * score );

  return ( Ni * score_sq * decay );
}

/* Ni = (q - l) * (t - l), where l is the length correction returned
   by findLi() for query length q and target length t. */
double findNi ( long int q, long int t, double K, double H ) {
  double corr = findLi( q, t, K, H );
  double q_rest, t_rest, product;

  /* The correction sometimes comes back as large as, or larger than,
     the query or the target.  Left alone that would drive a factor to
     zero or below, and logLike() takes the log of the product, which
     would then be nan.  So the correction is capped one below each
     length -- query first, then target -- which keeps both factors
     positive. */
  if ( corr >= q ) {
    corr = q - 1;
  }
  if ( corr >= t ) {
    corr = t - 1;
  }

  q_rest  = q - corr;
  t_rest  = t - corr;
  product = q_rest * t_rest;

  assert( !( isnan( product ) ) );
  return product;
}

double findLi ( long int q, long int t, double K, double H ) {
  double Li;
  Li = ( log( K * q * t ) / H );
  if ( isnan( Li ) ) {
    printf( "findLi got nan\n" );
    return 0.0;
  }
  else {
    return Li;
  }
}

/* Modified Newton-Raphson search for the H that zeroes gOfH() at a
   fixed K and lambda (Fig 2 of Bailey and Gribskov).

   On each of at most MAXITER passes:
     - if |g(H)| < 0.0001, H is accepted: stored in *H_p, return 1;
     - if g'(H) > 0, no Newton step is taken: H is doubled when
       g(H) > 0 and halved otherwise;
     - otherwise a Newton step H - g/g' is taken; if g at the new H is
       <= 0, the step is discarded in favour of half the previous H.
   Whenever H turns into NaN, *H_p is reset to 1.0 and 0 is returned.
   If the passes run out, the last H is stored (or 1.0 if it is NaN)
   and 0 is returned.
*/
int modNewtonRhapsonH ( int* use_scores, const long int* scores,
			const long int* lengths, size_t n, long int q, 
			double K, double lambda, double* H_p, double H ) {
  size_t pass;

  for ( pass = 0; pass < MAXITER; pass++ ) {
    const double prev_H = H;
    const double g  = gOfH( use_scores, scores, lengths, n, q,
                            K, H, lambda );
    const double dg = gPrimeOfH( use_scores, scores, lengths, n, q,
                                 K, H, lambda );

    if ( fabs( g ) < 0.0001 ) {
      *H_p = H;
      return 1;
    }

    if ( dg > 0 ) {
      /* Slope has the wrong sign for a Newton step: bisect/double. */
      H = ( g > 0 ) ? ( 2 * H ) : ( H / 2 );
      continue;
    }

    H = H - ( g / dg );
    if ( isnan( H ) ) {
      *H_p = 1.0;
      return 0;
    }

    if ( gOfH( use_scores, scores, lengths, n, q, K, H, lambda ) <= 0 ) {
      /* Newton step overshot; fall back to half the previous H. */
      H = prev_H / 2;
      if ( isnan( H ) ) {
        *H_p = 1.0;
        return 0;
      }
    }
  }

  // sometimes (rarely) H gets to be nan.  If this happens, reset it
  // to 1 for the next round
  *H_p = isnan( H ) ? 1.0 : H;
  return 0;
}

/* g(H), the function modNewtonRhapsonH() drives to zero: the sum of
   Ai * Bi * Ci over every unmasked score, with the factors coming from
   findAi(), findBi() and findCi(). */
double gOfH ( int* use_scores, const long int* scores, 
	      const long int* lengths, size_t n, long int q, 
	      double K, double H, double lambda ) {
  double total = 0.0;
  size_t k;

  for ( k = 0; k < n; k++ ) {
    double a_k, b_k, c_k;

    if ( !use_scores[ k ] ) {
      continue;
    }
    a_k = findAi( q, lengths[ k ], K, H );
    b_k = findBi( scores[ k ], q, lengths[ k ], K, H, lambda );
    c_k = findCi( q, lengths[ k ], K, H );
    total += ( a_k * b_k * c_k );
  }
  return total;
}

/* g'(H), used as the slope in modNewtonRhapsonH().  For every
   unmasked score it adds

       2*Bi*Ci^2 - (Ai*Ci/Ni)^2 - 2*Ai*Bi*Ci/H

   with Ai, Bi, Ci, Ni from findAi(), findBi(), findCi(), findNi(). */
double gPrimeOfH ( int* use_scores, const long int* scores, 
		   const long int* lengths, size_t n, long int q, 
		   double K, double H, double lambda ) {
  double total = 0.0;
  size_t k;

  for ( k = 0; k < n; k++ ) {
    double a_k, b_k, c_k, n_k, ratio;

    if ( !use_scores[ k ] ) {
      continue;
    }
    a_k = findAi( q, lengths[ k ], K, H );
    b_k = findBi( scores[ k ], q, lengths[ k ], K, H, lambda );
    c_k = findCi( q, lengths[ k ], K, H );
    n_k = findNi( q, lengths[ k ], K, H );

    ratio = a_k*c_k/n_k;
    total += ( (2*b_k*c_k*c_k) -
               ratio*ratio -
               (2*a_k*b_k*c_k/H) );
  }
  return total;
}

/* Ai = 2*Li - q - t, with Li from findLi(). */
double findAi ( long int q, long int t, double K, double H ) {
  const double Li = findLi( q, t, K, H );

  return ( (2*Li) - q - t );
}

/* Bi = 1/Ni - K / exp(lambda * score), with Ni from findNi(). */
double findBi ( long int score, long int q, long int t, 
		double K, double H, double lambda ) {
  const double inv_Ni = 1.0 / findNi( q, t, K, H );
  const double tail   = K / exp( lambda * score );

  return ( inv_Ni - tail );
}

/* Ci = -(Li / H), with Li from findLi(). */
double findCi ( long int q, long int t, double K, double H ) {
  const double Li = findLi( q, t, K, H );

  return ( -1.0 * ( Li / H ) );
}

/* First guess for lambda: the reciprocal of the sample standard
   deviation of all n scores, stored through lambda_p. */
void initLambda ( const long int* scores, size_t n, double* lambda_p ) {
  const double spread = sampStdDev( scores, n );

  *lambda_p = 1.0 / spread;
}

/* Sample standard deviation of the n scores: the square root of the
   summed squared deviations from findAvg() divided by (n - 1).
   NOTE(review): n - 1 is evaluated in size_t, so n == 0 wraps and
   n == 1 divides by zero -- callers presumably pass n >= 2; confirm. */
double sampStdDev( const long int* scores, size_t n ) {
  const double mean = findAvg( scores, n );
  double sq_dev_sum = 0;
  size_t idx = 0;

  while ( idx < n ) {
    const double dev = mean - scores[ idx ];
    sq_dev_sum += dev * dev;
    idx++;
  }

  return ( sqrt( sq_dev_sum / (n - 1) ) );
}

/* Arithmetic mean of the n scores, as a double.
   The sum and the division are both done in floating point: dividing
   a long total by the size_t count would truncate the mean to an
   integer and turn a negative total into a huge unsigned value.
   Returns 0.0 for an empty array rather than dividing by zero. */
double findAvg( const long int* scores, size_t n ) {
  double total = 0.0;
  size_t i;

  if ( n == 0 ) {
    return 0.0;
  }

  for ( i = 0; i < n; i++ ) {
    total += (double) scores[ i ];
  }
  return ( total / (double) n );
}

/* Mark all n entries of use_scores as "in use" (1). */
void initUseScores ( int* use_scores, size_t n ) {
  size_t remaining = n;

  while ( remaining > 0 ) {
    remaining--;
    use_scores[ remaining ] = 1;
  }
}

/* Go through the array of scores and compute an e-value for each using
   these values for K, lambda, and H:

       e_values[i] = n * ( 1 - exp( -K * Ni / exp( lambda * scores[i] ) ) )

   with Ni from findNi() for the query length q and lengths[i].

   1 - exp(x) is evaluated as -expm1(x).  For the very small |x| that
   highly significant scores produce, exp(x) rounds to exactly 1 and
   the plain subtraction would return 0; expm1 keeps the leading digits
   so such hits receive a small positive e-value instead. */
void computeEValues ( const long int* scores,
                      const long int* lengths, 
		      size_t n, 
                      long int q, 
                      double* e_values, 
		      double K, 
                      double lambda,
                      double H )
{
  size_t i;
  double term1, term2;
  for ( i = 0; i < n ; i++ ) {
    term1 = -1.0 * K * findNi( q, lengths[ i ], K, H );
    term2 = exp( lambda * scores[ i ] );
    e_values[ i ] = (double)n * ( -expm1( term1 / term2 ) );
  }
}

/* Bring use_scores in line with the current e-values: entry i becomes
   0 (masked) when e_values[i] <= 1 and 1 (unmasked) otherwise.
   Only entries currently holding exactly 1 are masked and only entries
   currently holding exactly 0 are unmasked.  Returns how many entries
   were changed. */
size_t maskScoresE ( double* e_values, int* use_scores, size_t n ) {
  size_t changed = 0;
  size_t idx;

  for ( idx = 0; idx < n; idx++ ) {
    /* 1 when the score should stay in the fit, 0 when it should be
       masked out. */
    const int wanted = ( e_values[ idx ] <= 1 ) ? 0 : 1;
    /* The only state this pass is allowed to flip away from. */
    const int opposite = 1 - wanted;

    if ( use_scores[ idx ] == opposite ) {
      use_scores[ idx ] = wanted;
      changed++;
    }
  }
  return changed;
}

/* Mask (set use_scores[i] to 0) every score whose findNi() value for
   query length q and lengths[i] is not positive.  Entries are never
   unmasked here.  The scores and lambda arguments are not read. */
void maskScoresN ( int* use_scores, const long int* scores, const long int* lengths,
		   size_t n, long int q, double K, double lambda, double H ) {
  size_t idx = 0;

  while ( idx < n ) {
    const double Ni = findNi( q, lengths[ idx ], K, H );

    if ( Ni <= 0 ) {
      use_scores[ idx ] = 0;
    }
    idx++;
  }
}

/* Log-likelihood of the unmasked scores under the given K, lambda and
   H:

       n * log( lambda * K )
         + sum over unmasked i of
             log( Ni ) - lambda * scores[i]
                       - K * Ni / exp( lambda * scores[i] )

   with Ni from findNi().  A term that evaluates to NaN is left out of
   the sum and its three components are dumped to stdout.  Ni and
   lambda * scores[i] are computed once per score and reused.
   NOTE(review): the leading factor is the total count n, not the
   number of unmasked scores -- confirm this is intended. */
double logLike ( int* use_scores, const long int* scores, const long int* lengths, 
		 size_t n, long int q, double K, 
		 double lambda, double H ) {
  size_t i;
  double sum = 0.0;

  for ( i = 0; i < n; i++ ) {
    double Ni, log_Ni, scaled_score, tail, term;

    if ( !use_scores[ i ] ) {
      continue;
    }

    Ni           = findNi( q, lengths[ i ], K, H );
    log_Ni       = log( Ni );
    scaled_score = lambda * scores[ i ];
    tail         = K * Ni / exp( scaled_score );
    term         = log_Ni - scaled_score - tail;

    if ( isnan( term ) ) {
      printf( "term got nan\n" );
      printf ( "%g\n", Ni );
      printf ( "%g\n", log_Ni );
      printf ( "%g\n", scaled_score );
      printf ( "%g\n", tail );
    }
    else {
      sum += term;
    }
  }
  return ( n * log( lambda * K ) + sum );
}
  
/* First guess for K: store 1.0 through K_p. */
void initK ( double* K_p ) {
  const double first_guess = 1.0;

  K_p[ 0 ] = first_guess;
}
