The CompBio Dude: July 2009

Continuing the exploration of how it use BioJava with Jython I've written up an example that takes a Genpept file an does alignments (both NeedlemanWunsch and SmithWaterman) against every Coding feature in a Genebank file.

Things to note:

CDS sequences are kept in the Annotations of Features. If you read the straight sequence from the Genebank file, you will get the DNA of the chromosome.
The BLOSUM62 matrix is stored in a string, so no extra file needed to load the substitution scores
The 'PROTEIN-TERM' alphabet is the 20 amino acids and the TERMinal (stop) symbol. If you use the regular 'PROTEIN' alphabet BioJava will throw an exception when creating the BLOSUM62 Substitution matrix.


#!/usr/bin/env jython

import sys
from java.io import *
from java.util import *
from org.biojava.bio import *
from org.biojava.bio.seq import *
from org.biojava.bio.seq.io import *
from org.biojava.bio.alignment import NeedlemanWunsch
from org.biojava.bio.alignment import SequenceAlignment
from org.biojava.bio.alignment import SmithWaterman
from org.biojava.bio.alignment import SubstitutionMatrix
from org.biojava.bio.symbol import AlphabetManager
from org.biojava.bio.symbol import FiniteAlphabet


BLOSUM62 = """#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
"""

if __name__ == '__main__':
   alphabet = AlphabetManager.alphabetForName("PROTEIN-TERM")
   matrix = SubstitutionMatrix(alphabet, BLOSUM62, "BLOSUM62" )

   nw_aligner = NeedlemanWunsch(
       0,     #// match
       3,    #// replace
       2,      #// insert
       2,    #// delete
       1,      #// gapExtend
       matrix     #// SubstitutionMatrix
   )
   sw_aligner = SmithWaterman(
       0,     #// match
       3,    #// replace
       2,      #// insert
       2,    #// delete
       1,      #// gapExtend
       matrix     #// SubstitutionMatrix
   )
   br1 = BufferedReader(FileReader( sys.argv[1] ) )
   seqs1 = SeqIOTools.readGenpept(br1)
   while ( seqs1.hasNext() ):
       query = seqs1.nextSequence()
       ff = FeatureFilter.ByType("CDS")
       br2 = BufferedReader(FileReader( sys.argv[2]) )
       seqs2 = SeqIOTools.readGenbank(br2)
       while(seqs2.hasNext()):
           seq = seqs2.nextSequence()
           fh = seq.filter(ff)
           feats = fh.features()
           while ( feats.hasNext() ):
               feat = feats.next()
               fAnno = feat.getAnnotation()
               locus = fAnno.getProperty('locus_tag')
               target = ProteinTools.createProteinSequence( fAnno.getProperty('translation'), locus )
               nw_score = nw_aligner.pairwiseAlignment( query, target )
               sw_score = sw_aligner.pairwiseAlignment( query, target )
               print "%s - %s Needleman-Wunsch: %d Smith-Waterman: %d" % (query.getName(), target.getName(), nw_score, sw_score)

Jython is an exciting project to implement a Python parser and environment in Java. One of it's more exciting aspects is the ability to directly call existing Java classes with no need to write an additional wrapper layer (as compared to the situation with CPython and existing C libraries). This means the complex libraries, like BioJava, can be easily utilized in a scripting environment.
The cookbook example of opening and reading a Genbank file can easily be translated into Jython:


#!/usr/bin/env jython

import sys
from java.io import *
from java.util import *
from org.biojava.bio import *
from org.biojava.bio.seq.db import *
from org.biojava.bio.seq.io import *
from org.biojava.bio.symbol import *

if __name__ == "__main__":
    br = BufferedReader( FileReader( sys.argv[1] ) )
    sequences = SeqIOTools.readGenbank(br)
    while sequences.hasNext():
        seq = sequences.nextSequence()
        print seq.seqString()

The CompBio Dude

Tuesday, July 28, 2009

More Jython and BioJava

BioJava and Jython

Wednesday, July 08, 2009

The complexity of Life

NIH Expands Human Microbiome Project

Blog Archive

Links