 /* 
  Copyright (c) 2003 Gavin E. Crooks <gec@compbio.berkeley.edu>
                     Richard Green <ed@compbio.berkeley.edu>,
		     Univ. of California, Berkeley

  Permission is hereby granted, free of charge, to any person obtaining a 
  copy of this software and associated documentation files (the "Software"),
  to deal in the Software without restriction, including without limitation
  the rights to use, copy, modify, merge, publish, distribute, sublicense,
  and/or sell copies of the Software, and to permit persons to whom the
  Software is furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included
  in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
  THE SOFTWARE.

  (This is the MIT Open Source License, 
  http://www.opensource.org/licenses/mit-license.html)
*/

//! @file
//! Computational Biology Toolkit - Biological Sequences
//! @author Gavin E. Crooks <gec@compbio.berkeley.edu>
//! @author Richard Green <ed@compbio.berkeley.edu>,



// $Id: cbt_seq.c,v 1.1 2003/08/20 22:03:42 gec Exp $


#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "cbt_util.h"
#include "cbt_seq.h"


//! One-letter codes of the 20 canonical amino acids,
//! ordered alphabetically by the full name of each residue.
const char cbt_alphabet_amino_acid_canonical[] =
  "ARNDCQEGHI" "LKMFPSTWYV";

//! One-letter codes of the 20 canonical amino acids in alphabetic
//! order of the code itself, followed by 'X' (unknown).
const char cbt_alphabet_amino_acid_alpha[] =
  "ACDEFGHIKLMNPQRSTVWY" "X";

//! The canonical alphabet followed by B, Z and X.
const char cbt_alphabet_amino_acid_iupac[] =
  "ARNDCQEGHILKMFPSTWYV" "BZX";

//! The canonical alphabet followed by B, Z, X, U, '*' and '-'.
const char cbt_alphabet_amino_acid_ext[] =
  "ARNDCQEGHILKMFPSTWYV" "BZX" "U*-";


/*********************************************************
 * Read, write and store fasta style sequences
 *********************************************************/

//! Allocate a new, empty sequence.
//! The id, comment and sym arrays are set to NULL. The alphabet buffer is
//! allocated here (256 symbols plus a terminating NUL) and zero filled.
//! @return The new sequence, to be released with cbt_seq_free()
//! @return NULL if memory could not be allocated
cbt_seq *  
cbt_seq_alloc() {
  cbt_seq * self = calloc(1, sizeof *self);
  if(self == NULL) return NULL;

  self->alphabet = calloc( 256+1, sizeof(char));
  if(self->alphabet == NULL) {
    // Do not hand back a half constructed object
    free(self);
    return NULL;
  }

  self->id = NULL;
  self->comment = NULL;
  self->sym = NULL;

  return self;
}

//! Free a sequence.
//! Member arrays (id, alphabet, comment, sym) are freed first.
//! Passing NULL is allowed and does nothing, mirroring free().
//! @param self  Sequence to release, or NULL
//! @return NULL, so that callers can write  seq = cbt_seq_free(seq);
cbt_seq *  
cbt_seq_free(cbt_seq * self) {
  if(self == NULL) return NULL;

  // free(NULL) is a no-op, so members that were never set need no guard
  free(self->id);
  free(self->comment);
  free(self->sym);
  free(self->alphabet);

  free(self);

  return NULL;
}



//! Parse one fasta formatted record from a stream into a sequence.
//! @verbatim
//! >sequence_id From_first_white_space_to_EOL_is_just_comments
//! thisisaseqthisisaseqthisisaseqthisisaseqthisisaseqthisisaseq
//! thisisaseqthisisaseqthisisaseqthisisaseqthisisaseqthis 
//! @endverbatim
//! Newlines and spaces inside the sequence data are skipped. Reading stops
//! at the next '>' (which is pushed back onto the stream so that the next
//! call starts at the following record) or at end of file.
//!
//! @param seq        A previously allocated sequence. Previous values are invalidated 
//!                   and/or freed. Its alphabet buffer must hold at least 257 
//!                   bytes, as allocated by cbt_seq_alloc().
//! @param stream     File to read
//! @param alphabet   A string of symbols used to convert the sequence ascii into indices.
//!                   e.g. "ARNDCQEGHILKMFPSTWYVBZX". Some common alphabets are defined
//!                   elsewhere in the file. If NULL, symbols are stored unencoded
//!                   and the sequence alphabet is set to the identity (ASCII) map.
//! @param ignore_case  If true, we convert sequence to upper case before encoding.
//! @return 0    (Sequence parsed without error)
//! @return 1    (Failure, reported through CBT_ERROR. The id and comment may
//!               each hold at most cbt_line_length_max-1 characters.)
//! @return EOF  (End of file. No sequence parsed)
int 
cbt_seq_read_fasta(cbt_seq * seq,
                   FILE * stream,
                   const char * alphabet,
                   const bool ignore_case
                   ) 
{
  int c,i;
  size_t length;
  size_t sym_len = 1024;
  char * tmp;
  const int err =1; // Generic error

  // Someone might try to reuse a seq
  free(seq->id);
  free(seq->comment);
  free(seq->sym);

  seq->id = calloc( cbt_line_length_max, sizeof(char));
  seq->comment = calloc( cbt_line_length_max, sizeof(char));
  seq->sym = calloc( sym_len, sizeof(char));
  // The old symbols are gone; never leave a stale length behind on failure
  seq->len = 0;
  if(seq->id == NULL || seq->comment == NULL || seq->sym==NULL) 
    return CBT_ERROR(err, "No memory");

  // Start to parse the stream
  c = fgetc(stream);
  if( c==EOF ) return EOF;

  if( c!='>' ) 
    return CBT_ERROR( err, "Expected '>'");

  // Read id: everything up to the first white space character
  length =0;
  while( !isspace(c=fgetc(stream))) {
    if (c==EOF) 
      return CBT_ERROR(err, "Premature EOF");
    // Check before writing, and keep one byte for the terminating NUL
    if(length+1 >= cbt_line_length_max) 
      return CBT_ERROR(err, "Sequence ID is too long!");
    seq->id[length] = (char) c;
    length +=1;
  }
  seq->id[length] = '\0';
  // Shrink to fit. If realloc fails the original buffer is still valid.
  tmp = realloc( seq->id, length+1);
  if(tmp != NULL) seq->id = tmp;


  // Read comment: the remainder of the header line, if any
  length =0;
  if(c!='\n') {
    while( (c=fgetc(stream)) !='\n') {
      if (c==EOF)
        return CBT_ERROR( err, "Premature EOF");
      // Check before writing, and keep one byte for the terminating NUL
      if(length+1 >= cbt_line_length_max)
        return CBT_ERROR(err,"Sequence comment is too long!");
      seq->comment[length] = (char) c;
      length +=1;
    }
  }
  seq->comment[length] = '\0';
  tmp = realloc( seq->comment, length+1);
  if(tmp != NULL) seq->comment = tmp;
  
  // Read sequence
  length=0;
  while( (c=fgetc(stream))!='>') {
    if (c=='\n' || c==' ') continue;
    if (c==EOF) break;

    // Grow the buffer *before* writing past its end
    if(length >= sym_len) {
      tmp = realloc(seq->sym, sym_len*2);
      if(tmp == NULL) 
        return CBT_ERROR(err, "No memory");
      seq->sym = tmp;
      sym_len *=2;
    }    
    seq->sym[length] = (char) c;
    length +=1;
  }

  // Shrink to fit. realloc(ptr, 0) may free the buffer, so an empty 
  // sequence keeps a one byte allocation.
  tmp = realloc(seq->sym, length > 0 ? length : 1);
  if(tmp != NULL) seq->sym = tmp;
  seq->len = length;

  // If this is not the last sequence, then we started to read
  // the id line of the next sequence. Put it back!
  if(c=='>') ungetc('>',stream);


  // Store the alphabet. Make a copy of our own.
  if(alphabet != NULL) {
    strncpy(seq->alphabet, alphabet, 256);
    // strncpy does not terminate when the source fills all 256 bytes
    seq->alphabet[256] = '\0';
    
    // Encode, and check symbols
    return cbt_encode(alphabet, ignore_case, seq->len, seq->sym);
  }

  // Default, ASCII alphabet: every code maps to itself
  for(i=0;i<256;i++) seq->alphabet[i] = (char) i;
  return 0; //Success
}


//! Encode a sequence in place and check its symbols.
//! Each character of vec is replaced by the index of that character in
//! alphabet. Only the first 255 characters of alphabet are used. If a 
//! character occurs more than once in alphabet, the last position wins.
//! @param alphabet     NUL terminated string of valid symbols
//! @param ignore_case  If true, each symbol is converted to upper case 
//!                     before it is looked up
//! @param len          Number of symbols in vec
//! @param vec          Symbols to encode; overwritten with indices. On 
//!                     failure the symbols before the offending one have
//!                     already been encoded.
//! @return  0 => Success
//! @return  1 => Failure (a symbol is not in the alphabet; reported 
//!               through CBT_ERROR)
int
cbt_encode(const char * alphabet, 
           const bool ignore_case,
           const size_t len,
           char         vec[len])
{
  // The table has one slot for every possible unsigned char value, 
  // so any byte is a valid index.
  enum { max_alpha_len = 255, map_len = 256 };
  int map[map_len];
  size_t alen, i;
  int encoded;

  // build map
  for(i=0; i<(size_t) map_len; i++)
    map[i] = -1;

  for(alen=0; alen<(size_t) max_alpha_len; alen++) {
    if(alphabet[alen] == '\0') break;
    // Index through unsigned char: plain char may be signed, and a 
    // negative value would land far outside the table.
    map[(unsigned char) alphabet[alen]] = (int) alen;
  }

  // Now do the translation
  for(i=0; i< len; i++) {
    unsigned char sym = (unsigned char) vec[i];

    // toupper() is only defined for unsigned char values and EOF
    if(ignore_case) sym = (unsigned char) toupper(sym);

    encoded = map[sym];
    if(encoded == -1 ) 
      return CBT_ERROR(1, "Invalid symbol");  
    
    vec[i] = (char) encoded;
  }
  
  return 0; // SUCCESS
}



//! Write a sequence to a stream in fasta format.
//! The header line is ">id comment". Each stored symbol is translated
//! back to a character through the sequence's alphabet, and a newline is
//! inserted after every cbt_line_length symbols.
//! @param self    Sequence to write; id, comment, alphabet and sym must be set
//! @param stream  Destination stream
//! @return 0  (Success)
//! @return 1  (A write to the stream failed)
int
cbt_seq_write_fasta(const cbt_seq * self, FILE * stream) {
  const size_t len = self->len;
  size_t pos;
  int col = 0;   // Symbols written on the current output line

  if(fprintf(stream, ">%s %s\n",self->id, self->comment) < 0) 
    return 1;

  for(pos=0; pos<len; pos++) {
    if(col==cbt_line_length) {
      if(fputc('\n', stream) == EOF) return 1;
      col=0;
    }
    // Index through unsigned char: plain char may be signed, and codes
    // of 128 and above would otherwise become negative indices.
    if(fputc(self->alphabet[ (unsigned char) self->sym[pos]], stream) == EOF)
      return 1;
    col++;
  }
  if(fputc('\n', stream) == EOF) return 1;

  return 0; // Success
}



















