#!/usr/local/bin/perl
#
# To run this program, follow these steps:
#
#  1) Change the first line of this program to point to the correct path 
#     for perl on your computer.
#  2) Perl 5.8.0 or higher (e.g. ActivePerl on a Windows machine) must be
#     installed on your computer.
#  3) You must have a set of contigs that were obtained from CAP3 run of
#     your raw sequences (the CAP3 alignment serves as your input file).
#  4) Run this program on command line (For Windows, open the command prompt;
#     For Mac OS or Linux, bring up the terminal window): change directory 
#     to the location of this program.  Then type the name of the program and 
#     hit "enter" to run.
#
# The output from the run should contain two files: 
# 
# (1) "find.snp.seq.txt": The consensus sequence from each contig which has 
#     been judged to contain 1 or more SNPs, with the SNPs marked by an N.
#
# (2) "find.snp.sta.txt": The statistics on the SNPs identified. This file
#     contains a list with the following 5 columns per row for each SNP:
#
#  1. The score (minor nucleotide or allele frequency x the number of
#     sequences containing the minor nucleotide or allele).
#  2. The contig number
#  3. The base position   
#  4. The minor nucleotide or allele frequency.
#  5. The number of sequences at that base in the contig.
#
# by "Danielle Gorbach" <dmbowen@iastate.edu>
#    "Zhiliang Hu" <zhu@iastate.edu>
# on Monday, Feb 16, 2009 14:15:25
########################################################
# Set your selection criteria here:
#
# Minimum number of minor allele-containing sequences required
# to detect a SNP.  Twice this value is also used as the minimum number
# of sequences required, both per contig (that count includes the
# consensus line) and per base position:
#
 $scoreCutOff = 4; # recommended = 4

# Minimum minor allele frequency you want to detect
# (used with score cut-off).  The program subtracts 0.001 before the
# comparison, so a frequency equal to this value still passes:
#
 $percentDiffCutOffWithScore = 0.1;

# If you also want to base your calculations on minor allele frequency 
# without a minimum number of minor alleles, set this value to a number
# between 0 and 0.5.  Leaving this value as 1 will ignore this cut-off.
# NOTE(review): the main loop only evaluates this cut-off for positions
# that have already passed the count thresholds derived from $scoreCutOff,
# so it does not bypass the minimum-count requirement -- confirm intent.
#
 $percentDiffCutOffAlone = 1; 

# Number of contigs in your data set
# (currently unused: the program processes contigs until the input ends)
#
# $contigNo = 15;

########################################################

# Ask for the CAP3 alignment file, load it, find the start of the detailed
# contig section, and open the two output files.
#
# Globals set here for the rest of the program:
#   @ESTlines  - every line of the input file
#   $lastLine  - number of lines in the input file
#   $linecount - index of the line following the
#                "DETAILED DISPLAY OF CONTIGS" marker
#   SEQ, STA   - output handles for find.snp.seq.txt / find.snp.sta.txt
print "\nAn input file is required, which is the CAP3 alignment output.\n\n";
print "What is the input file name?   ";
$file = <STDIN>;
defined($file) or die "No input file name was given!\n";
chomp($file);

# Strip surrounding whitespace ourselves: the three-argument open used
# below takes the name literally, whereas the old implicit two-argument
# open trimmed it (and also interpreted characters such as '>' and '|').
$file =~ s/^\s+//;
$file =~ s/\s+$//;

##############################################################
## Do not change below this line unless you know what you are doing
##############################################################

# Read the whole alignment file, one array element per line.
open(my $alignment, '<', $file)
    or die "Can't open the file '$file': $!\n";
@ESTlines = <$alignment>;
close($alignment);

# Decide where the first important line of the file is located
# (the one that tells the contig number right before the sequence information
# begins).  Lines before the marker are cleared because they are not needed.
# The scan is bounded by the file length so that an input without the marker
# stops here instead of looping forever.
$currentLine = 0;
$lastLine = scalar(@ESTlines);
while ($currentLine < $lastLine
       and $ESTlines[$currentLine] !~ /DETAILED DISPLAY OF CONTIGS/) {
    $ESTlines[$currentLine] = undef;
    $currentLine++;
}#while
$currentLine < $lastLine
    or die "'$file' does not contain a \"DETAILED DISPLAY OF CONTIGS\" section; "
         . "is it a CAP3 alignment output?\n";
$linecount = $currentLine + 1;

# Only create (and truncate) the result files once the input is known to be
# usable, so a mistyped file name does not wipe earlier results.
open(SEQ, '>', 'find.snp.seq.txt')
    or die "Can't write find.snp.seq.txt: $!\n";
open(STA, '>', 'find.snp.sta.txt')
    or die "Can't write find.snp.sta.txt: $!\n";

# Work through one contig at a time.
#
# For every contig the loop:
#   1. loads the contig's alignment lines into @EST (one row per line, one
#      cell per character) via getContig();
#   2. tags each row with the contig number (sequence/consensus rows) or 0
#      (ruler, separator and blank rows) and builds the row name;
#   3. joins the 60-column pieces of each sequence, which CAP3 prints in
#      consecutive blocks, into one long row;
#   4. trims and quality-filters the reads and counts, per alignment column,
#      the reads that differ from the consensus;
#   5. records columns that pass the cut-offs in @usefulInfo and, if any were
#      found, writes the consensus (SNPs marked with N) to SEQ.
#
# Layout assumed by the code: cell 0 is the first character of the line,
# cells 1-21 hold the rest of the name field and cells 22-81 are the 60
# alignment columns of a line.
#
# Globals read:    $linecount, $lastLine, $scoreCutOff,
#                  $percentDiffCutOffWithScore, $percentDiffCutOffAlone
# Globals written: @EST, $consensusRow (both also used by checkQuality),
#                  @usefulInfo, and the SEQ output handle.
for($contig=1; $linecount<$lastLine-1; $contig++){
    # Separate the characters in each line into separate elements of the array.
    getContig();

    # ---- Step 2: tag rows and build row names --------------------------
    # Rows whose first character is alphanumeric are sequence or consensus
    # rows and get the contig number in cell 0; every other row gets 0.
    for (my $i=0; $i<scalar(@EST); $i++){
        if ($EST[$i][0]=~/[a-zA-Z0-9]/){
            $EST[$i][0]=$contig;
            # Remove strand/marker characters from the name field, then keep
            # the name (cells 1-20) as a single string in cell 1.
            for (my $k=1; $k<22; $k++){
                $EST[$i][$k] =~ tr/|+-//d;
            }#for
            $EST[$i][1]=join('', map { defined $_ ? $_ : '' } @{$EST[$i]}[1..20]);
        }#if
        else {
            $EST[$i][0]=0;
        }#else
    }#for

    # ---- Step 3: concatenate sequences that are on multiple lines ------
    # $basePos is the cell where the NEXT 60-column block starts for rows of
    # the current block (82 while we are in the first block).  It moves on by
    # 60 each time a consensus line is passed.
    my $basePos=82;

    for (my $rowNum=1; $rowNum<scalar(@EST); $rowNum++){
        # If the sequence did not start early enough to be amongst the first
        # group, move its bases to the right so they line up with the
        # consensus sequence.
        if ($basePos>82 and $EST[$rowNum][0]>0){
            my $shift = $basePos-82;   # offset of the current block
            for (my $startPos=22; $startPos<82; $startPos++){
                $EST[$rowNum][$startPos+$shift]=$EST[$rowNum][$startPos];
                $EST[$rowNum][$startPos]='';
            }#for
        }#if

        if ($EST[$rowNum][0]>0) {
            my $countMatches=0;
            # Look for later pieces of the same sequence.
            # NOTE(review): the search starts 5 rows below the current row,
            # presumably to stay clear of the current block -- confirm.
            for (my $j=$rowNum+5; $j<scalar(@EST); $j++){
                # A later row belongs to this sequence when it is in the same
                # contig and its name is contained in this row's name.  The
                # name is compared as plain text (index), not as a regular
                # expression, so names containing '.', '(', '[' ... are safe.
                if ($EST[$rowNum][0] eq $EST[$j][0]
                    and index($EST[$rowNum][1], $EST[$j][1]) >= 0){
                    for (my $k=22; $k<82; $k++){
                        # Copy bases and gaps; other characters (padding,
                        # newline) only advance the position.
                        if ($EST[$j][$k]=~ /[ACGTN-]/){
                            $EST[$rowNum][$basePos]=$EST[$j][$k];
                        }#if
                        $basePos++;
                    }#for
                    $EST[$j][0]=0;   # piece consumed; ignore it from now on
                    $countMatches++;
                }#if
            }#for
            # Return to the start of the next block for the following rows.
            $basePos -= ($countMatches * 60);
        }#if
        # A consensus line (merged or not) ends a block.
        if ($EST[$rowNum][1]=~/onsensus/){
            $basePos += 60;
        }#if
    }#for

    # ---- Step 4a: count rows and find the consensus --------------------
    # Count the number of sequences for each contig to determine if any SNPs
    # can be discovered in that contig.  The consensus row is included in the
    # count, as it always has been.
    my @countDiffs;   # per column: trusted reads differing from consensus
    my @countSeqs;    # per column: reads counted at that column
    my $sequencesInContig=0;
    $consensusRow=undef;   # never reuse the row index of an earlier contig

    for (my $rowNum=0; $rowNum<scalar(@EST); $rowNum++){
        if ($EST[$rowNum][0] > 0){
            $sequencesInContig++;
            if ($EST[$rowNum][1]=~/onsensus/){
                $consensusRow=$rowNum;
            }#if
        }#if
    }#for

    # Without a consensus line there is nothing to compare against or print.
    next unless defined $consensusRow;

    # Scan as many columns as the longest row actually has, so contigs of any
    # length are handled completely.
    my $columnLimit=0;
    for my $row (@EST){
        my $width=scalar(@{$row});
        $columnLimit=$width if $width>$columnLimit;
    }#for

    # ---- Step 4b: filter reads and count differences -------------------
    my $minimumSequences = 2*$scoreCutOff;
    if ($sequencesInContig > $minimumSequences-1){
        # Ignore the first 10 alignment columns of the consensus.
        for (my $columnPos=22; $columnPos<32; $columnPos++){
            $EST[$consensusRow][$columnPos]=undef;
        }#for
        for (my $thisRow=1; $thisRow<scalar(@EST); $thisRow++){
            next unless $thisRow != $consensusRow and $EST[$thisRow][0]>0;

            my $NCount=0;
            my $SeqLength=0;
            for (my $column=22; $column<$columnLimit; $column++){
                if ($EST[$thisRow][$column]=~/N/){
                    $NCount++;
                    $SeqLength++;
                }#if
                elsif ($EST[$thisRow][$column]=~/[ACGT]/){
                    $SeqLength++;
                }#elsif
                if ($SeqLength < 11){ # Remove the first 10 bases of each sequence.
                    $EST[$thisRow][$column]=undef;
                    $NCount=0;        # Ns in the removed part do not count
                }#if
            }#for
            $SeqLength-=10; # Reset the sequence length value based on removing the first 10 bases.
            if ($SeqLength>0){
                # Drop reads in which more than 10% of the remaining bases are N.
                my $Quality = $NCount/$SeqLength;
                if ($Quality > 0.1){
                    $EST[$thisRow][0]=0;
                }#if
            }#if
            next unless $EST[$thisRow][0]>0;

            for (my $column=32; $column<$columnLimit; $column++){
                # Only columns where both consensus and read hold A/C/G/T.
                next unless $EST[$consensusRow][$column]=~/[ACGT]/
                        and $EST[$thisRow][$column]=~/[ACGT]/;
                # checkQuality returns 1 when the neighbourhood of the
                # difference also disagrees with the consensus; such reads
                # are not counted at this column at all.
                my $trueDiff=0;
                if ($EST[$thisRow][$column] ne $EST[$consensusRow][$column]){
                    $trueDiff = checkQuality($thisRow, $column);
                    if ($trueDiff==0){
                        $countDiffs[$column]++;
                    }#if
                }#if
                if ($trueDiff!=1){
                    $countSeqs[$column]++;
                }#if
            }#for
        }#for
    }#if

    # ---- Step 5: call SNPs ---------------------------------------------
    my $foundSNP=0;
    for (my $column=32; $column<$columnLimit; $column++){
        # Blank Ns already present in the consensus so that every N written
        # to the output marks a SNP.
        if ($EST[$consensusRow][$column]=~ /N/){
            $EST[$consensusRow][$column]=undef;
        }#if
        # The leading $countSeqs test also prevents a division by zero when
        # $scoreCutOff is set to 0.
        if ($countSeqs[$column]
            and $countDiffs[$column]>$scoreCutOff-1
            and $countSeqs[$column]>$minimumSequences-1){
            my $percentDiff = sprintf("%.3f",$countDiffs[$column]/$countSeqs[$column]);
            # If the consensus base is the minor allele, switch to the
            # minor-allele count and frequency.
            if ($percentDiff>0.5){
                $countDiffs[$column] = $countSeqs[$column]-$countDiffs[$column];
                $percentDiff = 1-$percentDiff;
            }#if
            my $score = $countDiffs[$column]; # this is equivalent to the number of sequences
                                              # multiplied by the minor allele frequency.
            if (($score > $scoreCutOff-1 and $percentDiff > $percentDiffCutOffWithScore-.001) or $percentDiff > $percentDiffCutOffAlone){
                # Record: score, contig, base position (alignment column,
                # counted from 1 at cell 22), frequency, sequences counted.
                push (@usefulInfo, join (" ",($score, $contig, $column-21, $percentDiff, $countSeqs[$column])));
                $EST[$consensusRow][$column]='N';
                $foundSNP=1;
            }#if
        }#if
    }#for

    # ---- Step 6: write the consensus of contigs that contain SNPs ------
    if ($foundSNP){
        print SEQ "\nContig_$contig\t";
        for (my $column=32; $column<$columnLimit; $column++){
            # Gap characters are left out of the printed sequence.
            if ($EST[$consensusRow][$column]!~ /-/){
                print SEQ "$EST[$consensusRow][$column]";
            }#if
        }#for
    }#if
}#for (program-wide)

#-- Print file header; optional
# print STA "Here are your potential SNPs ranked by score, where score combines
# the number of sequences considered and the minor allele frequency.\n"; 
# print STA "Score ContigNumber BasePosition MinorAlleleFrequency
# NumberOfSequences\n";

# Write one tab-separated line per SNP to the statistics file.  Each record
# in @usefulInfo is a space-separated string; records are written in the
# order in which they were found (no sorting is performed here).  Every line
# starts with a tab, as in earlier versions of this program.
for my $snpRecord (@usefulInfo){
    (my $line = $snpRecord) =~ s/ /\t/g;  #----Added by Zhiliang
    print STA "\t$line\n";
}#for

# Buffered write errors (disk full, ...) only show up at close, so check it.
close(STA) or die "Can't finish writing find.snp.sta.txt: $!\n";
close(SEQ) or die "Can't finish writing find.snp.seq.txt: $!\n";
exit 0;
#####----------- The End of the Program

#--------------------
# Subroutines

# getContig()
#
# Loads the lines of the next contig from the global @ESTlines into the
# global @EST, one row per line and one cell per character.
#
# The line at index $linecount is taken as the contig header; its '*'
# characters are removed before it is split.  The following lines are added
# until a line containing "Contig" is met or @ESTlines is exhausted.  Every
# consumed entry of @ESTlines is set to undef, and $linecount is left on the
# first line that was not consumed.  Takes no arguments; the return value is
# not meaningful.
sub getContig {
    # Header row: copy, strip the asterisks, split into characters.
    (my $headerLine = $ESTlines[$linecount]) =~ tr/*//d;
    @EST = ( [ split(//, $headerLine) ] );
    $ESTlines[$linecount] = undef;
    $linecount++;

    # Body rows: everything up to the next contig header or the end of input.
    until ($linecount >= scalar(@ESTlines)
           or $ESTlines[$linecount] =~ /Contig/) {
        my @cells = split(//, $ESTlines[$linecount]);
        push @EST, \@cells;
        $ESTlines[$linecount] = undef;
        $linecount++;
    }#until
}#getContig


# checkQuality($seqRow, $basePosition)
#
# Decides whether a difference between a read and the consensus can be
# trusted by comparing the neighbouring cells of the read with the
# consensus.
#
#   $seqRow       - row index of the read in the global @EST
#   $basePosition - column index of the difference being examined
#
# Uses the globals @EST and $consensusRow.
#
# Returns 1 as soon as any examined neighbouring cell differs from the
# consensus (the difference should not be trusted), otherwise 0.
#
# The scan first walks backward from $basePosition-1 until 16 positions in
# which neither the read nor the consensus has a gap have been compared.
# If the read ends first (a cell that is not A, C, G, T, N or '-', or the
# start of the row), the forward counter starts at "compared - 15", so the
# forward scan compares correspondingly more positions.  The forward scan
# stops after its quota or at the end of the read.
sub checkQuality {
    my ($seqRow, $basePosition) = @_;
    my $numChecked        = 0;
    my $forwardNumChecked = 0;

    # ---- backward ----
    for (my $currentBase = $basePosition-1; $numChecked < 16; $currentBase--) {
        # Never index below 0: a negative index would silently read cells
        # from the END of the row instead of stopping at its start.
        my $seqBase = $currentBase >= 0 ? $EST[$seqRow][$currentBase] : undef;
        if (defined $seqBase and $seqBase =~ /[ACGTN-]/) {
            my $consBase = $EST[$consensusRow][$currentBase];
            $consBase = '' unless defined $consBase;
            # Gap positions are compared but do not count toward the quota.
            if ($seqBase !~ /-/ and $consBase !~ /-/) {
                $numChecked++;
            }#if
            return 1 if $seqBase ne $consBase;
        }#if
        else {
            # We ran into the end of the sequence working backward: extend
            # the number of bases to compare in the forward direction and
            # stop the comparisons in this direction immediately.
            $forwardNumChecked = $numChecked - 15;
            $numChecked = 16;
        }#else
    }#for

    # ---- forward ----
    for (my $currentBase = $basePosition+1; $forwardNumChecked < 16; $currentBase++) {
        my $seqBase = $EST[$seqRow][$currentBase];
        if (defined $seqBase and $seqBase =~ /[ACGTN-]/) {
            my $consBase = $EST[$consensusRow][$currentBase];
            $consBase = '' unless defined $consBase;
            if ($seqBase !~ /-/ and $consBase !~ /-/) {
                $forwardNumChecked++;
            }#if
            return 1 if $seqBase ne $consBase;
        }#if
        else {
            # End of the read: nothing more to compare.
            $forwardNumChecked = 16;
        }#else
    }#for

    return 0;
}
##########
## END ##
########

