The CompBio Dude

More Jython and BioJava

2009-07-28T16:30:00.000-07:00

Continuing the exploration of how to use BioJava with Jython, I've written up an example that takes a Genpept file and does alignments (both NeedlemanWunsch and SmithWaterman) against every coding (CDS) feature in a Genbank file.

Things to note:

CDS sequences are kept in the Annotations of Features. If you read the straight sequence from the Genbank file, you will get the DNA of the chromosome.
The BLOSUM62 matrix is stored in a string, so no extra file needed to load the substitution scores
The 'PROTEIN-TERM' alphabet is the 20 amino acids and the TERMinal (stop) symbol. If you use the regular 'PROTEIN' alphabet BioJava will throw an exception when creating the BLOSUM62 Substitution matrix.


#!/usr/bin/env jython

import sys
from java.io import *
from java.util import *
from org.biojava.bio import *
from org.biojava.bio.seq import *
from org.biojava.bio.seq.io import *
from org.biojava.bio.alignment import NeedlemanWunsch
from org.biojava.bio.alignment import SequenceAlignment
from org.biojava.bio.alignment import SmithWaterman
from org.biojava.bio.alignment import SubstitutionMatrix
from org.biojava.bio.symbol import AlphabetManager
from org.biojava.bio.symbol import FiniteAlphabet


# BLOSUM62 scoring table in the matblas text layout, embedded as a string so
# the script does not have to load a separate matrix file.  The header row
# lists the 20 amino acids plus the B, Z, X and '*' columns.
BLOSUM62 = """#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
"""

if __name__ == '__main__':
    # Usage: <script> <queries.genpept> <genome.genbank>
    #
    # Aligns every protein in the Genpept file (argv[1]) against the
    # translation of every CDS feature in the Genbank file (argv[2]) and
    # prints one line per pair with both alignment scores.

    # The matrix text has a '*' row/column, so the alphabet that also
    # contains the terminator symbol is requested here.
    alphabet = AlphabetManager.alphabetForName("PROTEIN-TERM")
    matrix = SubstitutionMatrix(alphabet, BLOSUM62, "BLOSUM62")

    # Both aligners are built with the same costs.
    match_cost = 0
    replace_cost = 3
    insert_cost = 2
    delete_cost = 2
    gap_extend_cost = 1
    nw_aligner = NeedlemanWunsch(
        match_cost, replace_cost, insert_cost, delete_cost,
        gap_extend_cost, matrix
    )
    sw_aligner = SmithWaterman(
        match_cost, replace_cost, insert_cost, delete_cost,
        gap_extend_cost, matrix
    )

    # Read the CDS translations once instead of re-parsing the Genbank file
    # for every query; the reader is closed even if parsing fails.
    cds_filter = FeatureFilter.ByType("CDS")
    targets = []
    genbank_reader = BufferedReader(FileReader(sys.argv[2]))
    try:
        genbank_seqs = SeqIOTools.readGenbank(genbank_reader)
        while genbank_seqs.hasNext():
            seq = genbank_seqs.nextSequence()
            feats = seq.filter(cds_filter).features()
            while feats.hasNext():
                fAnno = feats.next().getAnnotation()
                # A CDS without a translation qualifier has no protein to
                # align against, so it is skipped rather than aborting.
                if not fAnno.containsProperty('translation'):
                    continue
                locus = fAnno.getProperty('locus_tag')
                targets.append(
                    ProteinTools.createProteinSequence(
                        fAnno.getProperty('translation'), locus
                    )
                )
    finally:
        genbank_reader.close()

    query_reader = BufferedReader(FileReader(sys.argv[1]))
    try:
        queries = SeqIOTools.readGenpept(query_reader)
        while queries.hasNext():
            query = queries.nextSequence()
            for target in targets:
                nw_score = nw_aligner.pairwiseAlignment(query, target)
                sw_score = sw_aligner.pairwiseAlignment(query, target)
                # Single parenthesised argument: prints the same text whether
                # print is a statement or a function.
                print("%s - %s Needleman-Wunsch: %d Smith-Waterman: %d" % (
                    query.getName(), target.getName(), nw_score, sw_score))
    finally:
        query_reader.close()

BioJava and Jython

2009-07-28T12:17:00.000-07:00

Jython is an exciting project to implement a Python parser and environment in Java. One of its more exciting aspects is the ability to directly call existing Java classes with no need to write an additional wrapper layer (as compared to the situation with CPython and existing C libraries). This means that complex libraries, like BioJava, can be easily utilized in a scripting environment.
The cookbook example of opening and reading a Genbank file can easily be translated into Jython:


#!/usr/bin/env jython

import sys
from java.io import *
from java.util import *
from org.biojava.bio import *
from org.biojava.bio.seq.db import *
from org.biojava.bio.seq.io import *
from org.biojava.bio.symbol import *

if __name__ == "__main__":
    # Usage: <script> <file.genbank>
    # Prints the sequence string of every entry in the Genbank file.
    br = BufferedReader(FileReader(sys.argv[1]))
    try:
        sequences = SeqIOTools.readGenbank(br)
        while sequences.hasNext():
            seq = sequences.nextSequence()
            # Single parenthesised argument: prints the same text whether
            # print is a statement or a function.
            print(seq.seqString())
    finally:
        # Release the file handle even when parsing raises.
        br.close()

The complexity of Life

2009-07-08T09:29:00.000-07:00

Three teams of scientists published three massive studies in Nature on the genes behind schizophrenia. They scanned thousands of people to find variants of genes that tended to show up more in people with schizophrenia than in those without it. And they found a heap of genes. There are thousands of different variants that each may raise your risk of schizophrenia by a tiny amount.

Via Discovery Magazine

NIH Expands Human Microbiome Project

2009-07-08T07:55:00.000-07:00

The Human Microbiome Project has awarded more than $42 million to expand its exploration of how the trillions of microscopic organisms that live in or on our bodies affect our health, the National Institutes of Health (NIH) announced today. (June 23rd 2009)

From the NIH

UNIPOP: A universal operon predictor for prokaryotic genomes

2009-02-24T10:05:00.000-08:00

The Journal of Bioinformatics and Computational Biology has published an article on a tool called UNIPOP. The operon prediction tool uses graph theory to figure out operons by mapping areas of chromosomes that experienced less shuffling between species. Because it's not a machine learning based method, there is no retraining that has to be done for different types of organisms. But it does require multiple related genomes to detect signals.

You can find the source code and results on their website. Input is the ptt files available from NCBI (for example the all.ptt.gz file for bacterial genomes).

Abstract:

Identification of operons at the genome scale of prokaryotic organisms represents a key step in deciphering of their transcriptional regulation machinery, biological pathways and networks. While numerous computational methods have been shown to be effective in predicting operons for well-studied organisms such as Escherichia coli (E. coli) K12 and Bacillus subtilis (B. subtilis) 168, these methods generally do not generalize well to genomes other than the ones used to train the methods because they rely heavily on organism-specific information. Several methods have been explored to address this problem through utilizing only genomic structural information conserved across multiple organisms, but they all suffer from the issue of low prediction sensitivity. In this paper, we report a novel operon prediction method that is applicable to any prokaryotic genome with accurate prediction accuracy. The key idea of the method is to predict operons through identification of conserved gene clusters across multiple genomes and through deriving a key parameter relevant to the distribution of intergenic distances in genomes. We have implemented this method using a graph-theoretic approach, called a maximum cardinality bipartite matching algorithm. Computational results have shown that our method has higher prediction sensitivity as well as specificity than any published method. We have carried out a preliminary study on operons unique to archaea and bacteria, respectively, and derived a number of interesting new insights about operons between these two kingdoms. The software and predicted operons of 365 prokaryotic genomes are available at http://csbl.bmb.uga.edu/~dongsheng/UNIPOP.

Nanotech based DNA sequencing to lower costs

2009-02-23T10:53:00.000-08:00

In the new issue of Nature, Oxford Nanopore describes their new DNA sequencing technology. It doesn't require fluorescent labeling, and achieves 99.8% accuracy. They say that this technology 'could' reduce costs and speed up sequencing (that is probably dependent on how the manufacturing details work out). They claim 50 base pair per second per pore, but I haven't yet seen what they estimate maximum read length would be, or how many pores per chip they expect to make. No word yet on time to market.

You can check out the corporate web site at http://nanoporetech.com/. They are pushing this technology as a possible solution to the personal genomes projects that have been springing up recently.

The technology is a combination of biochemical components and nanotechnology. You can find a nice overview animation at YouTube.

The full animation with no voice over:

Flash Molecular Biology Games

2009-02-11T12:00:00.000-08:00

Sometimes you need a little fun. If you are looking for a way to kill some time, but want to be able to justify it as 'research', here are some flash games you can find on the web for free.

Microbe Kombat

You move your mouse around to guide your microbial cells to proteins that can be eaten. Enemy cells inhabit the same space and compete for the limited food source. Cute graphics/Nice Music.
Presentation: A
Fun: B
Science: A

Microbe Arena

This is a simple round based game where you customize your character and let him fight. Basically you adjust a bunch of sliders and then hit the 'fight' button. Less of a game, and more of a toy.
Presentation: B
Fun: D
Science: C-

Microbe War

Not really biology based. You are a little ship that flies around and shoots a gun. You have a small armada that goes with you.
Presentation: B
Fun: B-
Science: F

Winner: Microbe Kombat

Is Biology the future of computing?

2009-02-11T08:00:00.000-08:00

Computer Science is a form of applied mathematics. All the etched silicon and electricity is simply the most convenient form to express those ideas, for the time being. Who's to say that in the future computers won't take more inspiration from biological sources? Speaking at the 2009 International Solid-State Circuits Conference, Intel Fellow Mark Bohr spoke about the possibility of integrated circuit design taking design cues from neuron design.

You can read more at Venture Beat

Large Scale Phylogenetic Rendering

2009-02-10T11:26:00.000-08:00

The New York Times is running an article, "Crunching the Data for the Tree of Life":

For years now researchers have sequenced DNA from thousands of species from jungles, tundras and museum drawers. They have used supercomputers to crunch the genetic data and have gleaned clues to how today’s diversity of species evolved over the past 450 million years. There’s just one problem. They have no way to visualize it...

The article mainly concerns itself with large scale phylogenetic analysis and rendering. The kind of rendering you would do with ATV or PAUP is for small sub branches of the total tree of life. This article analogizes the goals of these programs with Google Earth, programs that can quickly deal with large scale data sets.

One of the programs mentioned is the Paloverde program from UC Davis (the site mentioned in the paper seemed to be down, but you can find an Arizona mirror to download the program). They provide a compiled binary for Mac OSX.

Another program for this type of large scale rendering is Phylo3D, which works in conjunction with Walrus, a Java3D based graph rendering platform.

Careers in Computational Biology

2009-02-09T12:00:00.000-08:00

If you are thinking about a future working in the field of Bioinformatics and Computational Biology, Nature has an article on Careers in Systems Biology. They interviewed Malcolm Young, CEO of e-Therapeutics, and Hiroaki Kitano, Director of Sony Computer Science Laboratories, and President of the Systems Biology Institute, Tokyo, Japan.

Source

Science Daily: Cancer and Computational Biology

2009-02-05T10:00:00.000-08:00

Science Daily has a write up on a new paper that emphasizes the importance of mathematical modeling of cancer cells. The paper, On the Role of Cell Signaling Models in Cancer Research, seeks to "emphasize the role and importance of the careful mathematical/computational modeling of signaling networks for the understanding of aberrant signaling in cancer and for the development of targeted therapies".

From ScienceDaily

Biology exists in a physical world. That's a fact cancer researchers are beginning to recognize as they look to include concepts of physics and mathematics in their efforts to understand how cancer develops -- and how to stop it.

Complete Genomics

2009-02-01T11:56:00.000-08:00

Looks like we'll be seeing more human genome information in the near future. Complete Genomics Inc. plans to sequence 1000 genomes this year, and 20,000 in 2010. This research sounds similar to the Personal Genomes Project. It's a corporate effort, so individual researchers probably won't see the data ( visit NCBI if you want some free data ).
They plan to present the results of their initial analysis in February.

From the Newswire:

MOUNTAIN VIEW, Calif., Jan. 30 /PRNewswire/ -- Complete Genomics Inc., a newly launched, third-generation human genome sequencing company, today announced that it will release its sequencing data publically for the first time at the 10th annual Advances in Genome Biology and Technology (AGBT) meeting to be held in Marco Island, Fla., from Feb. 4-7.

Dr. Clifford Reid, chairman, president and CEO of Complete Genomics, will review the analysis results during his presentation titled: "Complete Genomics: Revolutionizing Human Genome Sequencing" on Thursday, Feb. 5, at 9:10 p.m. in the Islands Ballroom at the Marco Island Marriott Resort and Spa in Marco Island, Fla.

Google Talk: Current Issues in Computational Biology and Bioinformatics

2009-01-31T22:25:00.000-08:00

Gary Bader, an Assistant Professor at the Terrence Donnelly Centre for Cellular and Biomolecular Research (CCBR) at the University of Toronto, gave a talk on Bioinformatics and Computational Biology featured in the Google Talk Talks series.

It's an intro talk, aimed at introducing the ideas of bioinformatics to people with a computational background.

Protein Wikis

2009-01-30T12:28:00.000-08:00

Genome Biology has published an article: "Proteopedia - a scientific 'wiki' bridging the rift between three-dimensional structure and function of biomacromolecules"

New Media is making inroads to scientific publishing. The traditional model of scientific publishing is peer-review then publish, but the Wiki model is publish then peer-review. While it's not to the point that a Wiki-edit would show up on someone's publication list, Wiki style articles for high-throughput genomics/proteomics experiments make sense.
A protein structure used to be enough work and new research material to support an entire PhD dissertation, but now with high throughput protein structure determination pipelines like PSI, a more informal publishing structure is needed. Wikis offer the ability for scientific notes to be published for each of the catalogued protein structures, without having to do a publication for each and every one of the +50K protein structures stored in the Protein Data Bank.

There are a few different Protein Structure Wikis that are getting started:

Proteopedia : Seeks to annotate known protein structures with Biologically relevant information.

Topsan : Sub-project of the Protein Structure Initiative and The Joint Center of Structural Genomics. Used to annotate proteins generated in high throughput Protein structure determination pipeline. Many of the targets were originally selected in batches and have no known biological information.

PDBWiki : Seems more geared toward discussion of the characteristics of the models themselves (i.e. density maps and collision errors)

Amazing DNA animation

2009-01-30T10:44:00.000-08:00

From IO9

Drew Berry of the Walter and Eliza Hall Institute of Medical Research has produced a rather amazing video detailing the life of DNA. From nucleosome wrapping, to DNA replication, to Amino acid production, the animations provide a rather amazing view of the molecular processes behind life.

Online searches and drug combinations

2009-01-13T12:09:00.000-08:00

Ars Technica has a nice writeup on a PLoS Computational Biology paper, "Search Algorithms as a Framework for the Optimization of Drug Combinations".

The abstract:

Combination therapies are often needed for effective clinical outcomes in the management of complex diseases, but presently they are generally based on empirical clinical experience. Here we suggest a novel application of search algorithms—originally developed for digital communication—modified to optimize combinations of therapeutic interventions. In biological experiments measuring the restoration of the decline with age in heart function and exercise capacity in Drosophila melanogaster, we found that search algorithms correctly identified optimal combinations of four drugs using only one-third of the tests performed in a fully factorial search. In experiments identifying combinations of three doses of up to six drugs for selective killing of human cancer cells, search algorithms resulted in a highly significant enrichment of selective combinations compared with random searches. In simulations using a network model of cell death, we found that the search algorithms identified the optimal combinations of 6–9 interventions in 80–90% of tests, compared with 15–30% for an equivalent random search. These findings suggest that modified search algorithms from information theory have the potential to enhance the discovery of novel therapeutic drug combinations. This report also helps to frame a biomedical problem that will benefit from an interdisciplinary effort and suggests a general strategy for its solution.

Personal Genomes

2009-01-12T13:23:00.000-08:00

The price of SNP analysis based on DNA microarrays has fallen to the point to make a $400 test commercially viable. The New York Times ran an article about a reporter getting his personal genome analyzed. One of the themes that some have picked up on is that currently this type of analysis is still a very nascent technology. Correlations between specific SNP markers and particular diseases have been suggested in literature, but we are still a long way from true understanding. The recent call to arms for improvements in Systems Biology research illustrates how much is left to be done.
The Personal Genome Project seeks to fully sequence the exomic content of 100,000 personal genomes and provide the data openly with correlated medical histories. Preliminary information on the first ten subjects has been released. And while the actual sequencing data is not downloadable off the web site, they do encourage you to contact them if you are interested in research collaborations.

Nova recently aired a show about the project:

Top 25 most Dangerous Programming Errors

2009-01-12T12:52:00.000-08:00

The SANS has published a list of the top 25 most dangerous programming errors. What does this have to do with computational biology? From my observation, it seems that software in compbio labs goes by the following time-line:

1) Research and Develop a new technique
2) Setup web-server before paper goes to print
3) Profit!!! (writing more grants)

The software is written during Research and Development of a new analytical technique. This means very little software design goes into its development. Once there is a working technique, a paper is written and a web server to provide the tool is setup. The web service is mostly just advertising for the paper and helps argue the point that the lab is worthy of more grant money. Very little time is actually spent on proper software engineering, and even less time on security analysis. This could turn a lot of compbio labs into rather soft targets for hackers.

Via Information week

Systems Biology is important, starting.... Now

2009-01-09T15:39:00.000-08:00

Systems Biology is apparently important, at least that is what 110 scientists from Europe are saying. Science Daily and Genome Web Daily News are reporting that scientists from the European Science Foundation have published a report entitled, "Advancing Systems Biology for Medical Applications" (SSA LSSG-CT-2006-037673). This paper stresses the importance of developing systems biology techniques for improving medicine.

For the layman, Systems Biology refers to the system of biochemical interactions, both the core components and the complex network of reactions that occur between them. Its genesis occurred around the time that the human genome project was completing and the total estimate of protein coding genes was rapidly plummeting from initial expectations. Given the complexity of the human body, predictions reached up to 150K. But after all was said and done, estimates pegged the number at around 20K. Only twenty thousand genes to make a human, and it takes 41 000 genes to make rice.

If the complexity didn't come from the total number of genes, then it came from the complex network of those genes interacting.

For the cynic, basically nothing has changed. Everyone already knew that systems biology was important and the next step in understanding complex organisms. All this is about is reminding politicians that if they want results, they better be willing to sign some rather large checks.

Grep'ing Green Genes by TaxonID

2009-01-08T16:16:00.000-08:00

16s RNA is a component in the prokaryotic ribosomal system. It is necessary for survival, so it is very well conserved in prokaryotic genomes. It also has some 'hyper variable' regions that tend to mutate as a species evolves. Because of these two reasons it is a good marker for phylogenetic mapping. Green Genes is a project to provide a comprehensive database of sampled 16s sequences. Sometimes you want to start from a NCBI Taxon ID, for example from E. Coli, which has the NCBI Taxon ID code of 562 and obtain a list of associated 16s RNA sequences.

Start by obtaining a copy of the Green Genes database at http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz

Assuming we have a list of taxon codes in a file 'taxon.list'

gunzip -c greengenes16SrRNAgenes.txt.gz | ./green_genes_taxon_grep.py taxon.list

'green_genes_taxon_grep.py' code:


#!/usr/bin/python


import sys
import re
import string


def get_fasta(title, seq, width=60):
    """Format a sequence as a FASTA record.

    title -- text placed after '>' on the header line
    seq   -- sequence string, wrapped onto lines of at most `width` characters
    width -- maximum number of characters per sequence line (default 60)

    Returns the record as one string in which every line, including the
    last, ends with a newline.  No empty sequence line is produced when the
    sequence length is an exact multiple of `width` (or when it is empty).
    """
    lines = [">%s\n" % title]
    # Stop at len(seq), not len(seq) + 1: the extra step would slice an empty
    # string and append a blank line whenever the length divides evenly.
    for start in range(0, len(seq), width):
        lines.append("%s\n" % seq[start:start + width])
    return "".join(lines)


# Collect the NCBI taxon IDs of interest, one per line of the file named by
# argv[1].  Blank lines are skipped: an empty ID would otherwise match every
# record whose ncbi_tax_id field is empty.
taxon_list = set()
taxon_file = open(sys.argv[1])
try:
    for a in taxon_file:
        wanted_id = a.rstrip()
        if wanted_id:
            taxon_list.add(wanted_id)
finally:
    taxon_file.close()

# The Green Genes records are streamed from standard input.
green_genes = sys.stdin

re_begin = re.compile(r'^BEGIN')
re_end = re.compile(r'^END')
re_seq = re.compile(r'aligned_seq=(.*)')
re_ncbi_gi = re.compile(r'ncbi_gi=(.*)')
re_dot = re.compile(r'[\.\-]')
re_taxon_id = re.compile(r'^ncbi_tax_id=(.*)')
re_name = re.compile(r'^prokMSAname=(.*)')
re_msa_id = re.compile(r'^prokMSA_id=(.*)')
re_ncbi_acc = re.compile(r'^ncbi_acc_w_ver=(.*)')

# Field patterns in priority order: the first pattern that matches a line
# claims it, so the order decides which field an ambiguous line feeds.
field_patterns = (
    ("seq", re_seq),
    ("ncbi_gi", re_ncbi_gi),
    ("name", re_name),
    ("msa_id", re_msa_id),
    ("ncbi_acc", re_ncbi_acc),
    ("taxon_id", re_taxon_id),
)

report = 0   # set to 1 once the current record's taxon ID is a wanted one
record = {}  # fields seen so far in the current BEGIN..END record
for a in green_genes:
    line = a.rstrip()
    if re_begin.search(line):
        # Start a fresh record so that nothing leaks in from the previous one.
        report = 0
        record = {}
    elif re_end.search(line):
        if report:
            title_str = "%s %s %s" % (
                record.get("msa_id", ""),
                record.get("ncbi_acc", ""),
                record.get("name", ""),
            )
            print(get_fasta(title_str, record.get("seq", "")))
    else:
        for field, pattern in field_patterns:
            match = pattern.search(line)
            if match is None:
                continue
            value = match.group(1)
            if field == "seq":
                # Drop the '.' and '-' alignment padding from the sequence.
                value = re_dot.sub("", value)
            elif field == "taxon_id" and value in taxon_list:
                report = 1
            record[field] = value
            break

Coding Organisms

2009-01-08T12:00:00.000-08:00

Drew Endy, from MIT and OpenWetware, is featured on ForaTV, giving a talk on designing organisms. We've previously mentioned his talks on genetic design.

Linux Watch: Open Discovery

2009-01-07T20:43:00.000-08:00

Open Discovery is a Fedora 9 derived USB based distribution with open source Bioinformatics tools pre-installed.

It's nice that they are bundling all of this in a USB bootable distribution for all those Bioinformaticians that prefer Windows. However, I'm curious why they chose to go for a whole new distribution rather than simply creating a new YUM repository, like RPM Fusion, that can be added to an existing standard Fedora install.

If you are interested Open Discovery includes:

Via Bioinformatics.org

Homegrown Molecular Biology

2009-01-07T14:37:00.000-08:00

From Yahoo News:

Using homemade lab equipment and the wealth of scientific knowledge available online, these hobbyists are trying to create new life forms through genetic engineering — a field long dominated by Ph.D.s toiling in university and corporate laboratories.

What a negative view of science.... I wouldn't call what I do toiling. Of course I work at a computer terminal, not in the wet lab.

Weird to think that in the same year that the Nobel Prize is given out for the work done on Green Fluorescent Protein (GFP), you can use it for home projects.

But if you are interested in setting up a DNA lab like the one mentioned in the article, check out the projects mentioned in 'Make Magazine'

Volume 07

HMMER 3.0 Alpha Incoming

2009-01-07T10:34:00.000-08:00

The Eddy lab's blog, Cryptogenomicon, has posted a note about the incoming HMMER 3.0 Alpha. Sounds like they're hoping for a "won’t explode and kill you" alpha, but with claims like "HMMER is now about as fast as BLAST", this may be an alpha you want to get in on.

The alpha drops Monday January 12th, 2009.

Python Blast SAX example

2008-04-14T10:39:00.000-07:00

For the times when you have very large XML formatted Blast output files (use the -m7 option at run time), you can use a SAX python parser to read out the files.

There are two main methods for dealing with XML data, SAX and DOM. DOM parses the XML file into a tree based hierarchy. And while the tree based representation of the data is a very good way to organize the various facets of the data, it does require that you load the entire file into memory. If the file is very large, say a 1GB file from blasting one set of proteins against another, then loading the file into memory can become problematic. SAX on the other hand, treats an XML file as a stream of information. As it reads the file, it comes across opening and closing tags, and uses those to identify the 'state' the parser is in using callbacks. Below is an example Python code that looks for tags in a blast file to identify which data is being read. The BlastHandler class contains a set of callbacks 'startElement', 'characters', and 'endElement'. Using these callbacks the program prints out the ID codes of the query and hit (an ID is assumed to be the first set of non-space characters after the '>' sign in the fasta file). It then prints out this information for every hit that it encounters in the file.
Also note the use of the 'BlankEntityHandler' class as the entity handler. XML files will often refer to external schema files that describe their format. If the default entity handler is left intact, then that schema may be downloaded as the file is parsed, causing extra bandwidth usage and computational time. We just return null data to bypass this process.


#!/usr/bin/python

import sys
import re
import xml.sax.saxutils
import xml.sax.xmlreader
import cStringIO


# Compiled once at import time instead of on every call.
_FIRST_TOKEN_RE = re.compile(r'\S+')


def get_def_id(a):
    """Return the ID part of a definition line.

    The ID is the first run of non-whitespace characters in `a`; leading
    whitespace is ignored.  An empty or all-whitespace string yields "".
    """
    match = _FIRST_TOKEN_RE.search(a)
    if match is None:
        return ""
    return match.group(0)

class BlankEntityHandler(xml.sax.handler.EntityResolver):
    """Entity resolver that answers every external entity with empty content.

    Installing it on a parser means a referenced external DTD/schema is
    never fetched; the parser is handed an empty stream instead.
    """

    def resolveEntity(self, publicId, systemId):
        """Return an empty readable byte stream, whatever entity is asked for.

        publicId, systemId -- identifiers of the requested entity (ignored)
        """
        # Imported locally so the block does not depend on the Python-2-only
        # cStringIO module; io.BytesIO exists on Python 2.6+ and 3.x.
        import io
        return io.BytesIO()

class BlastHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that reports HSPs from BLAST XML output.

    The text of a handful of elements (query definition and length, hit
    definition, HSP e-value and alignment length) is captured as the
    document streams by.  When an <Hsp> element closes and its alignment
    length is at least `min_coverage` times the query length, one line is
    printed: query ID, hit ID, e-value, alignment length, query length.
    """

    # Element name -> (attribute that receives its text, converter or None).
    _CAPTURED = {
        "Iteration_query-def": ("curQueryDef", None),
        "Iteration_query-len": ("curQueryLen", int),
        "BlastOutput_query-def": ("curQueryDef", None),
        "BlastOutput_query-len": ("curQueryLen", int),
        "Hit_def": ("curHitDef", None),
        "Hsp_evalue": ("curEvalue", None),
        "Hsp_align-len": ("curAlignLen", int),
    }

    def __init__(self, min_coverage=0.6):
        """Create a handler.

        min_coverage -- minimum alignment length, as a fraction of the
                        query length, for an HSP to be printed (default 0.6)
        """
        xml.sax.handler.ContentHandler.__init__(self)
        self.min_coverage = min_coverage
        self.inRead = 0    # 1 while inside an element whose text is captured
        self.buffer = ""   # text accumulated for the element being captured
        # Values captured so far; None/"" until the element has been seen.
        self.curQueryDef = ""
        self.curQueryLen = None
        self.curHitDef = ""
        self.curEvalue = None
        self.curAlignLen = None

    def startElement(self, name, attributes):
        """Begin capturing text when a tracked element opens."""
        if name == "Hsp":
            # Forget the previous HSP's values so they cannot be reported
            # again for an HSP that lacks them.
            self.curEvalue = None
            self.curAlignLen = None
        elif name in self._CAPTURED:
            self.inRead = 1
            self.buffer = ""

    def characters(self, data):
        """Accumulate character data; it may arrive in several chunks."""
        if self.inRead:
            self.buffer += data

    def endElement(self, name):
        """Store a captured value, or print the HSP when <Hsp> closes."""
        if name in self._CAPTURED:
            attr_name, convert = self._CAPTURED[name]
            self.inRead = 0
            value = self.buffer
            if convert is not None:
                value = convert(value)
            setattr(self, attr_name, value)
        elif name == "Hsp":
            # Without both lengths the coverage test cannot be evaluated.
            if self.curAlignLen is None or self.curQueryLen is None:
                return
            if self.curAlignLen >= float(self.curQueryLen) * self.min_coverage:
                # Single parenthesised argument: prints the same text whether
                # print is a statement or a function.
                print("%s %s %s %s %s" % (
                    get_def_id(self.curQueryDef),
                    get_def_id(self.curHitDef),
                    self.curEvalue,
                    self.curAlignLen,
                    self.curQueryLen))

# Stream the BLAST XML file named by argv[1] through the handler; external
# entities are answered by BlankEntityHandler rather than the default resolver.
handler = BlastHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.setEntityResolver(BlankEntityHandler())
parser.parse(sys.argv[1])