The CompBio Dude: More Jython and BioJava

Continuing the exploration of how it use BioJava with Jython I've written up an example that takes a Genpept file an does alignments (both NeedlemanWunsch and SmithWaterman) against every Coding feature in a Genebank file.

Things to note:

CDS sequences are kept in the Annotations of Features. If you read the straight sequence from the Genebank file, you will get the DNA of the chromosome.
The BLOSUM62 matrix is stored in a string, so no extra file needed to load the substitution scores
The 'PROTEIN-TERM' alphabet is the 20 amino acids and the TERMinal (stop) symbol. If you use the regular 'PROTEIN' alphabet BioJava will throw an exception when creating the BLOSUM62 Substitution matrix.


#!/usr/bin/env jython

import sys
from java.io import *
from java.util import *
from org.biojava.bio import *
from org.biojava.bio.seq import *
from org.biojava.bio.seq.io import *
from org.biojava.bio.alignment import NeedlemanWunsch
from org.biojava.bio.alignment import SequenceAlignment
from org.biojava.bio.alignment import SmithWaterman
from org.biojava.bio.alignment import SubstitutionMatrix
from org.biojava.bio.symbol import AlphabetManager
from org.biojava.bio.symbol import FiniteAlphabet


BLOSUM62 = """#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
"""

if __name__ == '__main__':
   alphabet = AlphabetManager.alphabetForName("PROTEIN-TERM")
   matrix = SubstitutionMatrix(alphabet, BLOSUM62, "BLOSUM62" )

   nw_aligner = NeedlemanWunsch(
       0,     #// match
       3,    #// replace
       2,      #// insert
       2,    #// delete
       1,      #// gapExtend
       matrix     #// SubstitutionMatrix
   )
   sw_aligner = SmithWaterman(
       0,     #// match
       3,    #// replace
       2,      #// insert
       2,    #// delete
       1,      #// gapExtend
       matrix     #// SubstitutionMatrix
   )
   br1 = BufferedReader(FileReader( sys.argv[1] ) )
   seqs1 = SeqIOTools.readGenpept(br1)
   while ( seqs1.hasNext() ):
       query = seqs1.nextSequence()
       ff = FeatureFilter.ByType("CDS")
       br2 = BufferedReader(FileReader( sys.argv[2]) )
       seqs2 = SeqIOTools.readGenbank(br2)
       while(seqs2.hasNext()):
           seq = seqs2.nextSequence()
           fh = seq.filter(ff)
           feats = fh.features()
           while ( feats.hasNext() ):
               feat = feats.next()
               fAnno = feat.getAnnotation()
               locus = fAnno.getProperty('locus_tag')
               target = ProteinTools.createProteinSequence( fAnno.getProperty('translation'), locus )
               nw_score = nw_aligner.pairwiseAlignment( query, target )
               sw_score = sw_aligner.pairwiseAlignment( query, target )
               print "%s - %s Needleman-Wunsch: %d Smith-Waterman: %d" % (query.getName(), target.getName(), nw_score, sw_score)

The CompBio Dude

Tuesday, July 28, 2009

More Jython and BioJava

No comments:

Blog Archive

Links