tag:blogger.com,1999:blog-307448902024-03-13T09:46:58.300-07:00The CompBio DudeBiology from the Computational Point of ViewCompbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.comBlogger37125tag:blogger.com,1999:blog-30744890.post-36198415611548874132009-07-28T16:30:00.000-07:002009-07-28T16:46:58.172-07:00More Jython and BioJava<a href="http://compbiodude.blogspot.com/2009/07/biojava-and-jython.html">Continuing</a> the exploration of how to use BioJava with Jython I've written up an example that takes a <a href="http://www.ncbi.nlm.nih.gov/protein/29348497">Genpept file</a> and does alignments (both NeedlemanWunsch and SmithWaterman) against every Coding feature in a <a href="http://www.ncbi.nlm.nih.gov/nuccore/29345410">GenBank file</a>.<br /><br /><span class="fullpost"><br /><br />Things to note:<br /><ul><br /><li>CDS sequences are kept in the Annotations of Features. If you read the straight sequence from the GenBank file, you will get the DNA of the chromosome.<br /></li><li>The BLOSUM62 matrix is stored in a string, so no extra file is needed to load the substitution scores<br /></li><li>The 'PROTEIN-TERM' alphabet is the 20 amino acids and the TERMinal (stop) symbol. 
If you use the regular 'PROTEIN' alphabet BioJava will throw an exception when creating the BLOSUM62 Substitution matrix.<br /></li></ul><br /><br /><pre><br />#!/usr/bin/env jython<br /><br />import sys<br />from java.io import *<br />from java.util import *<br />from org.biojava.bio import *<br />from org.biojava.bio.seq import *<br />from org.biojava.bio.seq.io import *<br />from org.biojava.bio.alignment import NeedlemanWunsch<br />from org.biojava.bio.alignment import SequenceAlignment<br />from org.biojava.bio.alignment import SmithWaterman<br />from org.biojava.bio.alignment import SubstitutionMatrix<br />from org.biojava.bio.symbol import AlphabetManager<br />from org.biojava.bio.symbol import FiniteAlphabet<br /><br /><br />BLOSUM62 = """# Matrix made by matblas from blosum62.iij<br /># * column uses minimum score<br /># BLOSUM Clustered Scoring Matrix in 1/2 Bit Units<br /># Blocks Database = /data/blocks_5.0/blocks.dat<br /># Cluster Percentage: >= 62<br /># Entropy = 0.6979, Expected = -0.5209<br /> A R N D C Q E G H I L K M F P S T W Y V B Z X *<br />A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4<br />R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4<br />N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4<br />D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4<br />C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4<br />Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4<br />E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4<br />G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4<br />H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4<br />I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4<br />L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4<br />K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4<br />M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 
-3 -1 -1 -4<br />F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4<br />P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4<br />S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4<br />T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4<br />W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4<br />Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4<br />V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4<br />B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4<br />Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4<br />X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4<br />* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1<br />"""<br /><br />if __name__ == '__main__':<br /> alphabet = AlphabetManager.alphabetForName("PROTEIN-TERM")<br /> matrix = SubstitutionMatrix(alphabet, BLOSUM62, "BLOSUM62" )<br /><br /> nw_aligner = NeedlemanWunsch(<br /> 0, #// match<br /> 3, #// replace<br /> 2, #// insert<br /> 2, #// delete<br /> 1, #// gapExtend<br /> matrix #// SubstitutionMatrix<br /> )<br /> sw_aligner = SmithWaterman(<br /> 0, #// match<br /> 3, #// replace<br /> 2, #// insert<br /> 2, #// delete<br /> 1, #// gapExtend<br /> matrix #// SubstitutionMatrix<br /> )<br /> br1 = BufferedReader(FileReader( sys.argv[1] ) )<br /> seqs1 = SeqIOTools.readGenpept(br1)<br /> while ( seqs1.hasNext() ):<br /> query = seqs1.nextSequence()<br /> ff = FeatureFilter.ByType("CDS")<br /> br2 = BufferedReader(FileReader( sys.argv[2]) )<br /> seqs2 = SeqIOTools.readGenbank(br2)<br /> while(seqs2.hasNext()):<br /> seq = seqs2.nextSequence()<br /> fh = seq.filter(ff)<br /> feats = fh.features()<br /> while ( feats.hasNext() ):<br /> feat = feats.next()<br /> fAnno = feat.getAnnotation()<br /> locus = fAnno.getProperty('locus_tag')<br /> target = ProteinTools.createProteinSequence( 
fAnno.getProperty('translation'), locus )<br /> nw_score = nw_aligner.pairwiseAlignment( query, target )<br /> sw_score = sw_aligner.pairwiseAlignment( query, target )<br /> print "%s - %s Needleman-Wunsch: %d Smith-Waterman: %d" % (query.getName(), target.getName(), nw_score, sw_score)<br /><br /></pre><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-85221843667389159052009-07-28T12:17:00.000-07:002009-07-28T12:26:38.715-07:00BioJava and Jython<a href="http://www.jython.org/">Jython</a> is an exciting project to implement a Python parser and environment in Java. One of it's more exciting aspects is the ability to directly call existing Java classes with no need to write an additional wrapper layer (as compared to the situation with CPython and existing C libraries). This means the complex libraries, like <a href="http://biojava.org/">BioJava</a>, can be easily utilized in a scripting environment. <br />The cookbook example of opening and reading a <a href="http://biojava.org/wiki/BioJava:Cookbook:SeqIO:ReadGES#Reading_GenBank">Genbank file</a> can easily be translated into Jython:<br /><pre><br />#!/usr/bin/env jython<br /><br />import sys<br />from java.io import *<br />from java.util import *<br />from org.biojava.bio import *<br />from org.biojava.bio.seq.db import *<br />from org.biojava.bio.seq.io import *<br />from org.biojava.bio.symbol import *<br /><br />if __name__ == "__main__":<br /> br = BufferedReader( FileReader( sys.argv[1] ) )<br /> sequences = SeqIOTools.readGenbank(br)<br /> while sequences.hasNext():<br /> seq = sequences.nextSequence()<br /> print seq.seqString()<br /></pre>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-44163102760611707292009-07-08T09:29:00.000-07:002009-07-08T09:31:16.421-07:00The complexity of Life<blockquote>Three teams of scientists published three massive 
studies in Nature on the genes behind schizophrenia. They scanned thousands of people to find variants of genes that tended to show up more in people with schizophrenia than in those without it. And they found a heap of genes. There are thousands of different variants that each may raise your risk of schizophrenia by a tiny amount.</blockquote><br /><br />Via <a href="http://blogs.discovermagazine.com/loom/2009/07/08/microcosm-week-dreaming-of-a-complete-solution-to-life/">Discover Magazine</a>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-32635451254827383952009-07-08T07:55:00.000-07:002009-07-08T07:58:44.515-07:00NIH Expands Human Microbiome Project<blockquote>The Human Microbiome Project has awarded more than $42 million to expand its exploration of how the trillions of microscopic organisms that live in or on our bodies affect our health, the National Institutes of Health (NIH) announced today. (June 23rd 2009)</blockquote><br /><br />From the <a href="http://www.nih.gov/news/health/jun2009/nhgri-23.htm">NIH</a>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-88365983103377496342009-02-24T10:05:00.000-08:002009-02-24T10:32:47.616-08:00UNIPOP: A universal operon predictor for prokaryotic genomesThe Journal of Bioinformatics and Computational Biology has published an <a href="http://www.ncbi.nlm.nih.gov/pubmed/19226658?dopt=Abstract">article</a> on a tool called UNIPOP. The <a href="http://en.wikipedia.org/wiki/Operon">operon</a> prediction tool uses graph theory to figure out operons by mapping areas of chromosomes that experienced less shuffling between species. Because it's not a machine learning based method, there is no retraining that has to be done for different types of organisms. 
But it does require multiple related genomes to detect signals.<br /><br />You can find the source code and results on <a href="http://csbl.bmb.uga.edu/~dongsheng/UNIPOP/">their website</a>. Input is the ptt files available from NCBI (for example the <a href="ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/all.ptt.tar.gz">all.ptt.tar.gz</a> file for bacterial genomes).<br /><br /><span class="fullpost"><br />Abstract:<br /><span class="Apple-style-span" style="font-size: x-small;"><blockquote>Identification of operons at the genome scale of prokaryotic organisms represents a key step in deciphering of their transcriptional regulation machinery, biological pathways and networks. While numerous computational methods have been shown to be effective in predicting operons for well-studied organisms such as Escherichia coli (E. coli) K12 and Bacillus subtilis (B. subtilis) 168, these methods generally do not generalize well to genomes other than the ones used to train the methods because they rely heavily on organism-specific information. Several methods have been explored to address this problem through utilizing only genomic structural information conserved across multiple organisms, but they all suffer from the issue of low prediction sensitivity. In this paper, we report a novel operon prediction method that is applicable to any prokaryotic genome with accurate prediction accuracy. The key idea of the method is to predict operons through identification of conserved gene clusters across multiple genomes and through deriving a key parameter relevant to the distribution of intergenic distances in genomes. We have implemented this method using a graph-theoretic approach, called a maximum cardinality bipartite matching algorithm. Computational results have shown that our method has higher prediction sensitivity as well as specificity than any published method. 
We have carried out a preliminary study on operons unique to archaea and bacteria, respectively, and derived a number of interesting new insights about operons between these two kingdoms. The software and predicted operons of 365 prokaryotic genomes are available at http://csbl.bmb.uga.edu/~dongsheng/UNIPOP.<br /></blockquote></span><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-14584998656831734022009-02-23T10:53:00.000-08:002009-02-23T11:17:11.483-08:00Nanotech based DNA sequencing to lower costsIn the new issue of <a href="http://www.nature.com/nnano/journal/vaop/ncurrent/abs/nnano.2009.12.html">Nature</a>, Oxford Nanopore describes their new DNA sequencing technology. It doesn't require fluorescent labeling, and achieves 99.8% accuracy. They say that this technology 'could' reduce costs and speed up sequencing (that is probably dependent on how the manufacturing details work out). They claim 50 base pair per second per pore, but I haven't yet seen what they estimate maximum read length would be, or how many pores per chip they expect to make. No word yet on time to market.<br /><br />You can check out the corporate web site at <a href="http://nanoporetech.com/">http://nanoporetech.com/</a>. They are pushing this technology as a possible solution to the <a href="http://compbiodude.blogspot.com/2009/01/complete-genomics.html">personal genomes projects</a> that have been springing up recently.<br /><br />The technology is a combination of biochemical components and nanotechnology. 
You can find a nice overview animation at <a href="http://www.youtube.com/watch?v=HbjAMJehSlg">YouTube</a>.<br /><br /><object width="480" height="295"><param name="movie" value="http://www.youtube.com/v/HbjAMJehSlg&hl=en&fs=1"></param><param name="allowFullScreen" value="true"></param><param name="allowscriptaccess" value="always"></param><embed src="http://www.youtube.com/v/HbjAMJehSlg&hl=en&fs=1" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="295"></embed></object><br /><br />The full animation with no voice over:<br /><br /><object width="480" height="295"><param name="movie" value="http://www.youtube.com/v/Aiw4j_l8GVs&hl=en&fs=1"></param><param name="allowFullScreen" value="true"></param><param name="allowscriptaccess" value="always"></param><embed src="http://www.youtube.com/v/Aiw4j_l8GVs&hl=en&fs=1" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="295"></embed></object>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-44338306492450020302009-02-11T12:00:00.000-08:002009-02-11T12:00:00.852-08:00Flash Molecular Biology GamesSometimes you need a little fun. If you are looking for a way to kill some time, but want to be able to justify it as 'research', here are some flash games you can find on the web for free.<br /><br /><span style="font-weight: bold;font-size:130%;" ><a href="http://www.herointeractive.com/mk/">Microbe Kombat</a></span><br /><blockquote>You move your mouse around to guide you microbial cells to proteins that can be eaten. Enemy cells inhabit the same space and compete for the limited food source. 
Cute graphics/Nice Music.<br />Presentation: A<br />Fun: B<br />Science: A<br /></blockquote><br /><span style="font-size:130%;"><a style="font-weight: bold;" href="http://spore.strategyplanet.gamespy.com/microbe.html">Microbe Arena</a></span><br /><blockquote>This is a simple round based game where you customize your character and let him fight. Basically you adjust a bunch of sliders and then hit the 'fight' button. Less of a game, and more of a toy.<br />Presentation: B<br />Fun: D<br />Science: C-<br /></blockquote><br /><span style="font-size:130%;"><a style="font-weight: bold;" href="http://www.sploder.com/games/members/Jim423/play/microbe-war-2/">Microbe War</a></span><br /><blockquote>Not really biology based. You are a little ship that flies around and shoots a gun. You have a small armada that goes with you.<br />Presentation: B<br />Fun: B-<br />Science: F<br /></blockquote><span style="font-weight: bold;font-size:180%;" ><br /><br />Winner: Microbe Kombat</span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-39351650584241722422009-02-11T08:00:00.000-08:002009-02-11T08:00:00.658-08:00Is Biology the future of computing?Computer Science is a form of applied mathematics. All the etched silicon and electricity is simply the most convenient form to express those ideas, for the time being. Who's to say that in the future that computers won't take more inspiration from biological sources. 
Speaking at the 2009 International Solid-State Circuits Conference, Intel Fellow Mark Bohr spoke about the possibility of integrated circuit design taking design cues from neuron design.<br /><br />You can read more at <a href="http://venturebeat.com/2009/02/09/intels-mark-bohr-look-to-biology-for-the-future-of-computing/">Venture Beat</a>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-82992967096520897522009-02-10T11:26:00.000-08:002009-02-10T12:05:30.276-08:00Large Scale Phylogenetic Rendering<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://loco.biosci.arizona.edu/paloverde/rbclCircle.jpg"><img style="float:left; margin:0 10px 10px 0;cursor:pointer; cursor:hand;width: 150px; height: 150px;" src="http://loco.biosci.arizona.edu/paloverde/rbclCircle.jpg" border="0" alt="" /></a><br />The New York Times is running an article, "<a href="http://www.nytimes.com/2009/02/10/science/10tree.html?partner=permalink&exprod=permalink">Crunching the Data for the Tree of Life</a>":<br /><blockquote><span class="Apple-style-span" style="font-style: italic;">For years now researchers have sequenced DNA from thousands of species from jungles, tundras and museum drawers. They have used supercomputers to crunch the genetic data and have gleaned clues to how today’s diversity of species evolved over the past 450 million years. There’s just one problem. They have no way to visualize it...</span></blockquote><br />The article mainly concerns itself with large scale phylogenetic analysis and rendering. The kind of rendering you would do with ATV or PAUP is for small sub branches of the total tree of life. 
This article analogizes the goals of these programs with Google Earth, programs that can quickly deal with large scale data sets.<br /><br />One of the programs mentioned is the <a href="http://bioinformatics.oxfordjournals.org/cgi/content/full/22/8/1004">Paloverde</a> program from <a href="http://ginger.ucdavis.edu/paloverde/paloverde.html">UC Davis</a> (The site mentioned in the paper seemed to be down, but you can find an <a href="http://loco.biosci.arizona.edu/paloverde/paloverde.html">Arizona mirror</a> to download the program). They provide a compiled binary for Mac OSX.<br /><br />Another program for this type of large scale rendering is <a href="http://digitised.info/content/view/15/42/">Phlyo3D</a> that works in conjunction with <a href="http://www.caida.org/tools/visualization/walrus/">Walrus</a>, a Java3D based graph rendering platform.Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-25095773156166761732009-02-09T12:00:00.000-08:002009-02-09T12:00:00.460-08:00Careers in Computational BiologyIf you are thinking about a future working in the field of Bioinformatics and Computational Biology, Nature has an article on <a href="http://www.nature.com/nrd/journal/v8/n2/full/nrd2820.html">Careers in Systems Biology</a>. They interviewed Malcolm Young, CEO of e-Therapeutics, and Hiroaki Kitano, Director of Sony Computer Science Laboratories, and President of the Systems Biology Institute, Tokyo, Japan. <div><br /></div><div><a href="http://www.nature.com/nrd/journal/v8/n2/full/nrd2820.html">Source</a></div>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-42246174559416568812009-02-05T10:00:00.000-08:002009-02-05T10:16:06.038-08:00Science Daily: Cancer and Computational BiologyScience Daily has a write up on a new paper that emphasizes the importance of mathematical modeling of cancer cells. 
The paper, <a href="http://cancerres.aacrjournals.org/cgi/content/abstract/69/2/400?maxtoshow=&HITS=10&hits=10&RESULTFORMAT=&author1=Ventura%2C+Alejandra&searchid=1&FIRSTINDEX=0&resourcetype=HWCIT">On the Role of Cell Signaling Models in Cancer Research</a>, seeks to "<span style="font-style: italic;">emphasize the role and</span><sup style="font-style: italic;"> </sup><span style="font-style: italic;">importance of the careful mathematical/computational modeling</span><sup style="font-style: italic;"> </sup><span style="font-style: italic;">of signaling networks for the understanding of aberrant signaling</span><sup style="font-style: italic;"> </sup><span style="font-style: italic;">in cancer and for the development of targeted therapies</span>".<br /><br />From <a href="http://www.sciencedaily.com/releases/2009/01/090130104256.htm">ScienceDaily</a><br /><blockquote><span style="font-size:85%;"><br />Biology exists in a physical world. That's a fact cancer researchers are beginning to recognize as they look to include concepts of physics and mathematics in their efforts to understand how cancer develops -- and how to stop it.</span></blockquote>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-63605619916743113082009-02-01T11:56:00.000-08:002009-02-01T12:11:18.632-08:00Complete GenomicsLooks like we'll be seeing more human genome information in the near future. <a href="http://www.completegenomics.com/">Complete Genomics Inc.</a> plans to sequence 1000 genomes this year, and 20,000 in 2010. This research sounds similar to the <a href="http://compbiodude.blogspot.com/2009/01/personal-genomes.html">Personal Genomes Project</a>. It's a corporate effort, so individual researchers probably won't see the data ( visit <a href="ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/">NCBI if you want some free data</a> ). 
<br />They plan to present the results of their initial analysis in February.<br /><br />From the <a href="http://news.prnewswire.com/DisplayReleaseContent.aspx?ACCT=104&STORY=/www/story/01-30-2009/0004963357&EDATE=">Newswire</a>:<br /> <div id="uc_template_div_storycontent" class="releaseContent" style="padding-left: 5px;"><storycontent> <p> </p> <p><location></location></p><blockquote><p><span style="font-size:85%;"><location>MOUNTAIN VIEW, Calif.</location>, <chron>Jan. 30</chron><b> </b>/PRNewswire/<b> -- </b><a href="http://www.completegenomics.com">Complete Genomics Inc.</a>, a newly launched, third-generation human genome sequencing company, today announced that it will release its sequencing data publically for the first time at the 10th annual Advances in Genome Biology and Technology (AGBT) meeting to be held in <location>Marco Island, Fla.</location>, from <chron>Feb. 4-7</chron>.</span></p> <p><span style="font-size:85%;">Dr. <person>Clifford Reid</person>, chairman, president and CEO of Complete Genomics, will review the analysis results during his presentation titled: "Complete Genomics: Revolutionizing Human Genome Sequencing" on <chron>Thursday, Feb. 
5</chron>, at <chron>9:10 p.m.</chron> in the Islands Ballroom at the Marco Island Marriott Resort and Spa in <location>Marco Island, Fla.</location></span></p></blockquote><p><location></location></p></storycontent></div>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-59852531799850539232009-01-31T22:25:00.000-08:002009-01-31T22:33:56.910-08:00Google Talk: Current Issues in Computational Biology and Bioinformatics<span>Gary Bader, an Assistant Professor at the Terrence Donnelly Centre for Cellular and Biomolecular Research (CCBR) at the University of Toronto, gave a talk on Bioinformatics and Computational Biology featured in the Google Talk Talks series.</span><br /><br />It's an intro talk, aimed at introducing the ideas of bioinformatics to people with a computational background.<br /><br /><object width="425" height="344"><param name="movie" value="http://www.youtube.com/v/bVhOntMCmnQ&hl=en&fs=1"></param><param name="allowFullScreen" value="true"></param><param name="allowscriptaccess" value="always"></param><embed src="http://www.youtube.com/v/bVhOntMCmnQ&hl=en&fs=1" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="425" height="344"></embed></object>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-33069442225745339442009-01-30T12:28:00.000-08:002009-01-30T13:25:37.577-08:00Protein WikisGenome Biology has published an article: <a href="http://genomebiology.com/2008/9/8/R121">"<em>Proteopedia </em>- a scientific 'wiki' bridging the rift between three-dimensional structure and function of biomacromolecules"</a><br /><br /><br />New Media is making inroads to scientific publishing. The traditional model of scientific publishing is peer-review then publish, but the Wiki model is publish then peer-review. 
While it's not to the point that a Wiki-edit would show up on someone's publication list, Wiki style articles for high-throughput genomics/proteomics experiments make sense.<br />A protein structure used to be enough work and new research material to support an entire PhD dissertation, but now with high throughput protein structure determination pipelines like <a href="http://www.nigms.nih.gov/Initiatives/PSI/">PSI</a>, a more informal publishing structure is needed. Wikis offer the ability for scientific notes to be published for each of the catalogued protein structures, without having to do a publication for each and every one of the 50K+ protein structures stored in the <a href="http://www.rcsb.org/pdb/home/home.do">Protein Data Bank</a>.<br /><br />There are a few different Protein Structure Wikis that are getting started:<br /><ul><br /><li><a href="http://www.proteopedia.org/">Proteopedia</a> : Seeks to annotate known protein structures with biologically relevant information.</li><br /><li><a href="http://www.topsan.org/">Topsan</a> : Sub-project of the <a href="http://www.nigms.nih.gov/Initiatives/PSI/">Protein Structure Initiative</a> and <a href="http://www.jcsg.org/">The Joint Center of Structural Genomics</a>. Used to annotate proteins generated in a high throughput protein structure determination pipeline. 
Many of the targets were originally selected in batches and have no known biological information.</li><br /><li><a href="http://pdbwiki.org/index.php/Main_Page">PDBWiki</a> : Seems more geared toward discussion of the characteristics of the models themselves (i.e. density maps and collision errors)</li><br /></ul>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-92196695180930018132009-01-30T10:44:00.000-08:002009-01-30T10:51:14.892-08:00Amazing DNA animationFrom <a href="http://io9.com/5142583/the-most-awesome-science-video-about-dna-ever-made">IO9</a><br /><br /><a href="http://features.cgsociety.org/story_custom.php?story_id=3195">Drew Berry</a> of the <a href="http://www.wehi.edu.au/">Walter and Eliza Hall Institute of Medical Research</a> has produced a rather amazing video detailing the life of DNA. From <a href="http://en.wikipedia.org/wiki/Nucleosome" title="Nucleosome">nucleosome</a> wrapping, to DNA replication, to Amino acid production, the animations provide a rather amazing view of the molecular processes behind life.<br /><br /><br /><object height="344" width="425"><param name="movie" value="http://www.youtube.com/v/4PKjF7OumYo&color1=0xb1b1b1&color2=0xcfcfcf&hl=en&feature=player_embedded&fs=1"><param name="allowFullScreen" value="true"><embed src="http://www.youtube.com/v/4PKjF7OumYo&color1=0xb1b1b1&color2=0xcfcfcf&hl=en&feature=player_embedded&fs=1" type="application/x-shockwave-flash" allowfullscreen="true" height="344" width="425"></embed></object>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-27192918386510255462009-01-13T12:09:00.000-08:002009-01-13T13:45:29.900-08:00Online searches and drug combinationsArs Technica has a <a href="http://arstechnica.com/news.ars/post/20090113-borrowing-from-online-search-for-better-drug-combinations.html">nice writeup</a> on a PLoS Computational Biology 
paper, "<a href="http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1000249">Search Algorithms as a Framework for the Optimization of Drug Combinations</a>".<br /><br /><span class="fullpost"><br />The abstract:<br /><span style="font-size:85%;"></span><blockquote><span style="font-size:85%;">Combination therapies are often needed for effective clinical outcomes in the management of complex diseases, but presently they are generally based on empirical clinical experience. Here we suggest a novel application of search algorithms—originally developed for digital communication—modified to optimize combinations of therapeutic interventions. In biological experiments measuring the restoration of the decline with age in heart function and exercise capacity in <i>Drosophila melanogaster</i>, we found that search algorithms correctly identified optimal combinations of four drugs using only one-third of the tests performed in a fully factorial search. In experiments identifying combinations of three doses of up to six drugs for selective killing of human cancer cells, search algorithms resulted in a highly significant enrichment of selective combinations compared with random searches. In simulations using a network model of cell death, we found that the search algorithms identified the optimal combinations of 6–9 interventions in 80–90% of tests, compared with 15–30% for an equivalent random search. These findings suggest that modified search algorithms from information theory have the potential to enhance the discovery of novel therapeutic drug combinations. 
This report also helps to frame a biomedical problem that will benefit from an interdisciplinary effort and suggests a general strategy for its solution.</span></blockquote><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-85893501177008539682009-01-12T13:23:00.000-08:002009-01-13T14:11:44.207-08:00Personal GenomesThe price of <a href="http://en.wikipedia.org/wiki/Single_nucleotide_polymorphism">SNP</a> analysis based on <a href="http://en.wikipedia.org/wiki/DNA_microarray">DNA microarrays</a> has fallen to the point of making a $400 test <a href="https://www.23andme.com/">commercially viable</a>. The New York Times ran an article about a reporter getting his <a href="http://www.nytimes.com/2009/01/11/magazine/11Genome-t.html?partner=permalink&exprod=permalink">personal genome analyzed</a>. One of the themes that <a href="http://scienceblogs.com/geneticfuture/2009/01/steven_pinker_and_the_diversif.php?utm_source=sbhomepage&utm_medium=link&utm_content=channellink">some have picked up on</a> is that currently this type of analysis is still a very nascent technology. Correlations between specific SNP markers and particular diseases have been <a href="http://en.wikipedia.org/wiki/Apolipoprotein_E">suggested in literature</a>, but we are still a long way from true understanding. The <a href="http://compbiodude.blogspot.com/2009/01/systems-biology-is-important-starting.html">recent call to arms</a> for improvements in Systems Biology research illustrates how much is left to be done.<br />The <a href="http://www.personalgenomes.org/">Personal Genome Project</a> seeks to fully <a href="http://www.newscientist.com/article/dn15001-thousands-volunteer-to-expose-dna-secrets-to-the-world.html?DCMP=ILC-hmts&nsref=news1_head_dn15001">sequence the exomic content of 100,000 personal genomes</a> and provide the data openly with correlated medical histories. 
Preliminary information on the first ten subjects <a href="http://scienceblogs.com/geneticfuture/2008/10/personal_genome_project_releas.php">has been released</a>. And while the actual sequencing data is not downloadable off the web site, they do encourage you to <a href="http://www.personalgenomes.org/contact.html">contact them</a> if you are interested in research collaborations.<br /><br /><span class="fullpost"><br />Nova recently aired a show about the project:<br /><object height="344" width="425"><param name="movie" value="http://www.youtube.com/v/MjevOzSHZTU&hl=en&fs=1"><param name="allowFullScreen" value="true"><param name="allowscriptaccess" value="always"><embed src="http://www.youtube.com/v/MjevOzSHZTU&hl=en&fs=1" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" height="344" width="425"></embed></object><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-53939741619847492802009-01-12T12:52:00.000-08:002009-01-12T13:27:30.859-08:00Top 25 most Dangerous Programming ErrorsThe SANS has published a list of the <a href="http://www.sans.org/top25errors/">top 25 most dangerous programming errors</a>. What does this have to do with computational biology? From my observation, it seems that software in compbio labs goes by the following time-line:<br /><br />1) Research and Develop a new technique<br />2) Set up web-server before paper goes to print<br />3) Profit!!! (writing more grants)<br /><br />The software is written during Research and Development of a new analytical technique. This means very little software design goes into its development. Once there is a working technique, a paper is written and a web server to provide the tool is set up. The web service is mostly just advertising for the paper and helps argue the point that the lab is worthy of more grant money. 
Very little time is actually spent on proper software engineering, and even less time on security analysis. This could turn a lot of compbio labs into rather soft targets for hackers.<br /><br />Via <a href="http://www.informationweek.com/blog/main/archives/2009/01/more_than_codin.html">InformationWeek</a>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-92196795964202403852009-01-09T15:39:00.000-08:002009-01-09T16:02:14.041-08:00Systems Biology is important, starting.... NowSystems Biology is apparently important, at least that is what 110 scientists from Europe are saying. <a href="http://www.sciencedaily.com/releases/2009/01/090106083834.htm">Science Daily</a> and <a href="http://www.genomeweb.com/issues/news/151680-1.html">Genome Web Daily News</a> are reporting that scientists from the European Science Foundation have published a report entitled, "Advancing Systems Biology for Medical Applications" (SSA LSSG-CT-2006-037673). This paper stresses the importance of developing systems biology techniques for improving medicine.<br /><br /><span class="fullpost"><br />For the layman, <a href="http://en.wikipedia.org/wiki/Systems_biology">Systems Biology</a> refers to the system of biochemical interactions, both the core components and the complex network of reactions that occur between them. Its genesis occurred around the time that the human genome project was completing and the total estimate of protein coding genes was rapidly plummeting from initial expectations. Given the complexity of the human body, predictions reached up to 150K. But after all was <a href="http://www.ornl.gov/sci/techresources/Human_Genome/faq/genenumber.shtml">said and done</a>, estimates pegged the number at around 20K. 
Only twenty thousand genes to make a human, and it takes <a href="http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VS4-4N0PG3D-2&_user=10&_rdoc=1&_fmt=&_orig=search&_sort=d&view=c&_acct=C000050221&_version=1&_urlVersion=0&_userid=10&md5=b3baf4d1c0a78a7a8aac9a13b7522c95">41 000 genes to make rice</a>. <br /><br />If the complexity didn't come from the total number of genes, then it came from the complex network of those genes interacting.<br /><br />For the cynic, basically nothing has changed. Everyone already knew that systems biology was important and the next step in understanding complex organisms. All this is about is reminding politicians that if they want results, they better be willing to sign some rather large checks.<br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-37686699439423221502009-01-08T16:16:00.000-08:002009-01-08T16:20:20.755-08:00Grep'ing Green Genes by TaxonID<a href="http://en.wikipedia.org/wiki/16S_ribosomal_RNA">16s RNA</a> is a component in the prokaryotic ribosomal system. It is necessary for survival, so it is very well conserved in prokaryotic genomes. It also has some 'hyper variable' regions that tend to mutate as a species evolves. Because of these two reasons it is a good marker for <a href="http://en.wikipedia.org/wiki/Phylogenetic">phylogenetic mapping</a>. <a href="http://greengenes.lbl.gov/">Green Genes</a> is a project to provide a comprehensive database of sampled 16s sequences. Sometimes you want to start from a NCBI Taxon ID, for example from <a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&name=Escherichia+coli&lvl=0&srchmode=1">E. 
Coli</a>, which has the NCBI Taxon ID code of 562 and obtain a list of associated 16s RNA sequences.<br /><br /><span class="fullpost"><br />Start by obtaining a copy of the Green Genes database at <a href="http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz">http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz</a> <br /><br /><br />Assuming we have a list of taxon codes in a file 'taxon.list'<br /><br />gunzip -c greengenes16SrRNAgenes.txt.gz | ./green_genes_taxon_grep.py taxon.list<br /><br />'green_genes_taxon_grep.py' code:<br /><pre><br />#!/usr/bin/python<br /><br /><br />import sys<br />import re<br />import string<br /><br /><br />def get_fasta(title, seq):<br /> out_str = ">%s\n" % title<br /> for i in (range(0, len(seq)+1, 60)):<br /> out_str += "%s\n" % seq[i:i+60]<br /> return out_str<br /><br /><br />taxon_list = {}<br />taxon_file = open( sys.argv[1] )<br />for a in taxon_file.xreadlines():<br /> taxon_list[ string.rstrip(a) ] = 1<br />taxon_file.close()<br /><br />file = sys.stdin<br /><br />re_begin = re.compile(r'^BEGIN')<br />re_end = re.compile(r'^END')<br />re_seq = re.compile(r'aligned_seq=(.*)')<br />re_ncbi_gi = re.compile(r'ncbi_gi=(.*)')<br />re_dot = re.compile(r'[\.\-]')<br />re_taxon_id = re.compile( r'^ncbi_tax_id=(.*)' )<br />re_name = re.compile(r'^prokMSAname=(.*)')<br />re_msa_id = re.compile(r'^prokMSA_id=(.*)')<br />re_ncbi_acc = re.compile(r'^ncbi_acc_w_ver=(.*)')<br />report = 0<br />for a in file.xreadlines():<br /><br /> if ( re_begin.search( a ) ):<br /> report = 0<br /> elif ( re_end.search( a ) ):<br /> if report:<br /> title_str = "%s %s %s" % (cur_msa_id, cur_ncbi_acc, cur_name)<br /> print get_fasta( title_str, cur_seq )<br /> elif ( re_seq.search( a ) ):<br /> cur_seq = re_seq.search( string.rstrip(a) ).group(1)<br /> cur_seq = re_dot.sub("", cur_seq )<br /> elif ( re_ncbi_gi.search( a ) ):<br /> cur_ncbi_gi = re_ncbi_gi.search( 
string.rstrip(a) ).group(1)<br /> elif ( re_name.search(a) ):<br /> cur_name = re_name.search( string.rstrip(a) ).group(1)<br /> elif ( re_msa_id.search( a ) ):<br /> cur_msa_id = re_msa_id.search( string.rstrip(a) ).group(1)<br /> elif ( re_ncbi_acc.search( a ) ):<br /> cur_ncbi_acc = re_ncbi_acc.search( string.rstrip(a) ).group(1)<br /> elif ( re_taxon_id.search( string.rstrip(a) ) ):<br /> taxon_id = re_taxon_id.search( string.rstrip(a) ).group(1)<br /> if taxon_list.has_key( taxon_id ):<br /> report = 1<br /> cur_taxon = taxon_id<br /></pre><br /><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-58376587969583746462009-01-08T12:00:00.000-08:002009-01-08T13:27:09.780-08:00Coding OrganismsDrew Endy, from MIT and <a href="http://openwetware.org/">OpenWetware</a>, is featured on <a href="http://fora.tv/2008/11/17/Designing_and_Building_Organisms_Simple_As_Writing_Code">ForaTV</a>, giving a talk on designing organisms. 
We've previously <a href="http://compbiodude.blogspot.com/2008/01/hacking-you-dna.html">mentioned</a> his talks on <a href="http://io9.com/342030/the-dos-and-donts-of-hacking-your-dna">genetic design</a>.<br /><br /><br /><object classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,0,0" height="264" width="400"><param name="flashvars" value="webhost=fora.tv&clipid=8468&cliptype=clip"><param name="allowScriptAccess" value="always"><param name="allowFullScreen" value="true"><param name="movie" value="http://fora.tv/embedded_player"><embed flashvars="webhost=fora.tv&clipid=8468&cliptype=clip" src="http://fora.tv/embedded_player" allowscriptaccess="always" allowfullscreen="true" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer" height="264" width="400"></embed></object>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-22769292184224610942009-01-07T20:43:00.000-08:002009-01-08T13:28:37.002-08:00Linux Watch: Open Discovery<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://tbn0.google.com/images?q=tbn:zRU0W-fq3m3ByM:http://www.file-extensions.org/imgs/company-logo/891/linux-online-inc.jpg"><img style="margin: 0pt 10px 10px 0pt; float: left; cursor: pointer; width: 102px; height: 122px;" src="http://tbn0.google.com/images?q=tbn:zRU0W-fq3m3ByM:http://www.file-extensions.org/imgs/company-logo/891/linux-online-inc.jpg" alt="" border="0" /></a><br /><a href="http://www.opendiscovery.org.in/">Open Discovery</a> is a Fedora 9 derived USB based distribution with open source Bioinformatics tools pre-installed.<br /><br />It's nice that they are bundling all of this in a USB bootable distribution for all those Bioinformaticians that prefer Windows. 
However, I'm curious why they chose to go for a whole new distribution rather than simply creating a new YUM repository, like <a href="http://rpmfusion.org/">RPM Fusion</a>, that can be added to an existing standard Fedora install.<br /><br /><span class="fullpost"><br />If you are interested, Open Discovery includes:<br /><ul><li><span style="font-size:85%;"><a href="http://emboss.sourceforge.net/">EMBOSS</a><br /></span></li><li><a href="http://www.clustal.org/"><span style="font-size:85%;"></span><span style="font-size:85%;">ClustalX</a> </span></li><li><a href="http://www.jalview.org/"><span style="font-size:85%;"></span><span style="font-size:85%;">Jalview</a> </span></li><li><span style="font-size:85%;"> <a href="http://www.bioinformatics.org/sms2">Sequence Manipulation Suite(SMS2)</a> </span></li><li><a href="http://cmgm.stanford.edu/phylip"><span style="font-size:85%;"></span><span style="font-size:85%;">Phylip</a> </span></li><li><span style="font-size:85%;"><a href="http://hmmer.janelia.org/">HMMER</a><br /></span></li><li><a href="http://genome.ucsc.edu/"><span style="font-size:85%;"></span><span style="font-size:85%;">BLAT</a> </span></li><li><span style="font-size:85%;"><a href="http://www.sanger.ac.uk/Software/Alfresco">Alfresco</a><br /></span></li><li><span style="font-size:85%;"><a href="http://www.sacs.ucsf.edu/Documentation/mfold">Mfold</a><br /></span></li><li><a href="http://pymol.sourceforge.net/"><span style="font-size:85%;"></span><span style="font-size:85%;">Pymol</a> </span></li><li><a href="http://spdbv.vital-it.ch/"><span style="font-size:85%;"></span><span style="font-size:85%;">SwissPdb Viewer</a> </span></li><li><span style="font-size:85%;"><a href="http://www.ncbi.nlm.nih.gov/Structure/CN3D/cn3d.shtml">Cn3D</a><br /></span></li><li><span style="font-size:85%;"><a href="http://www.umass.edu/microbio/rasmol/index2.htm">Rasmol</a><br /></span></li><li><a 
href="http://wiki.c2b2.columbia.edu/honiglab_public/index.php/Software:Jackal_General_Description"><span style="font-size:85%;"></span><span style="font-size:85%;">Jackal</a> </span></li><li><span style="font-size:85%;"> <a href="http://swissmodel.expasy.org/SWISS-MODEL.html">Swiss model</a><br /></span></li><li><a href="http://www.gromacs.org/"><span style="font-size:85%;"></span><span style="font-size:85%;">Gromacs</a> </span></li><li><span style="font-size:85%;"><a href="http://www.ks.uiuc.edu/Research/namd/">NAMD</a> </span></li><li><span style="font-size:85%;"><a href="http://autodock.scripps.edu/">AutoDock</a> </span></li><li><span style="font-size:85%;"><a href="http://bioinfo3d.cs.tau.ac.il/PatchDock">PatchDock</a><br /></span></li><li><a href="http://www.bionmr.ualberta.ca/bds/software/stc/html-4.3/stc.html"><span style="font-size:85%;"></span><span style="font-size:85%;">STC</a> </span></li><li><span style="font-size:85%;"><a href="http://openbabel.org/">Open Babel</a><br /></span></li></ul> <p><span style="font-size:85%;"><a href="http://openbabel.org/"></a></span></p><span style="font-size:85%;"><br />Via <a href="http://www.bioinformatics.org/forums/forum.php?forum_id=7020">Bioinformatics.org</a></span><br /><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-85365642723593494902009-01-07T14:37:00.000-08:002009-01-07T16:03:55.833-08:00Homegrown Molecular BiologyFrom <a href="http://news.yahoo.com/s/ap/20081225/ap_on_sc/do_it_yourself_dna">Yahoo News</a>:<br /><br /><blockquote style="font-style: italic;">Using homemade lab equipment and the wealth of scientific knowledge available online, these hobbyists are trying to create new life forms through <span style="border-bottom: 1px dashed rgb(0, 102, 204); background: transparent none repeat scroll 0% 0%; cursor: pointer; -moz-background-clip: -moz-initial; -moz-background-origin: -moz-initial; 
-moz-background-inline-policy: -moz-initial;" class="yshortcuts" id="lw_1230249007_1">genetic engineering</span> — a field long dominated by Ph.D.s toiling in university and corporate laboratories.</blockquote><br />What a negative view of science.... I wouldn't call what I do toiling. Of course I work at a computer terminal, not in the wet lab.<br /><br />Weird to think that in the same year that the Nobel Prize was given out for the work done for <a href="http://nobelprize.org/nobel_prizes/chemistry/laureates/2008/press.html">Green Fluorescent Protein</a> (GFP), you can use it for home projects.<br /><br />But if you are interested in setting up a DNA lab like the one mentioned in the article, check out the projects mentioned in 'Make Magazine'<br /><br /><table style="margin: 10px 0pt;" border="1" bordercolor="#000000" cellpadding="0" cellspacing="0"><tbody><tr><td><table background="http://www.make-digital.com/make/vol07/include/icons/nav_bg.gif" border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr height="35" valign="middle"><td align="left"><a href="http://www.make-digital.com/make/vol07/" title="View Volume 07" target="_blank"><img style="margin-left: 5px; margin-right: 5px;" src="http://www.make-digital.com/make/vol07/include/icons/navbar_logo.gif" border="0" height="28" /></a></td><td id="topBar" align="right"><span style=";font-family:Comic Sans MS,Arial,Helvetica;font-size:78%;" >Look Inside >> </span></td></tr></tbody></table><table align="center" border="0" cellpadding="0" cellspacing="0" width="240"><tbody><tr id="snippetThumbs" align="center"><td align="right"><a href="http://www.make-digital.com/make/vol07/?pg=70" target="_blank" onclick="name='w'+Math.round(Math.random()*(1000));w=screen.width-10;h=screen.height-40;window.open('http://www.make-digital.com/make/vol07/?pg=70',name,'toolbar=no,menubar=no,resizable=yes,scrollbars=yes,left=0,top=0,width='+w+'height='+h);return false;" title="View Magazine"><img 
src="http://www.make-digital.com/tcprojects/oreilly/make/inbox/33760/imgpages/tn/makevol7_0070.png" border="0" /></a></td><td align="left"><a href="http://www.make-digital.com/make/vol07/?pg=70" target="_blank" onclick="name='w'+Math.round(Math.random()*(1000));w=screen.width-10;h=screen.height-40;window.open('http://www.make-digital.com/make/vol07/?pg=70',name,'toolbar=no,menubar=no,resizable=yes,scrollbars=yes,left=0,top=0,width='+w+'height='+h);return false;" title=" View Magazine"><img src="http://www.make-digital.com/tcprojects/oreilly/make/inbox/33760/imgpages/tn/makevol7_0071.png" border="0" /></a></td></tr></tbody></table><table background="http://www.make-digital.com/make/vol07/include/icons/nav_bg.gif" border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr height="28" valign="middle"><td id="bottomBar" align="center"><span style=";font-family:Comic Sans MS,Arial,Helvetica;font-size:78%;" >Volume 07</span></td></tr></tbody></table></td></tr></tbody></table>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-14030672230903810172009-01-07T10:34:00.000-08:002009-01-07T13:10:21.059-08:00HMMER 3.0 Alpha Incoming<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://selab.janelia.org/images/hmmer_logo.jpg"><img style="margin: 0pt 10px 10px 0pt; float: left; cursor: pointer; width: 100px; height: 100px;" src="http://selab.janelia.org/images/hmmer_logo.jpg" alt="" border="0" /></a><br />The <a href="http://selab.janelia.org/">Eddy's lab</a> blog <a href="http://selab.janelia.org/people/eddys/blog" title="Cryptogenomicon home">Cryptogenomicon</a> has posted a note about the <a href="http://selab.janelia.org/people/eddys/blog/?p=33">incoming HMMER 3.0 Alpha </a>. 
Sounds like they're hoping for a "won’t explode and kill you" alpha, but with claims like<a href="http://selab.janelia.org/people/eddys/blog/?p=6#more-6"><span style="font-style: italic;"> "</span><strong style="font-weight: normal; font-style: italic;">HMMER is now about as fast as BLAST".</strong></a> This may be an alpha you want to get in on.<br /><br />The alpha drops Monday January 12th, 2009.<br /><a href="http://selab.janelia.org/people/eddys/blog/?p=6#more-6"><strong style="font-weight: normal; font-style: italic;"></strong></a>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com0tag:blogger.com,1999:blog-30744890.post-39437852440639151742008-04-14T10:39:00.000-07:002009-01-08T12:06:24.132-08:00Python Blast SAX exampleFor the times when you have very large XML formatted Blast output files (use the -m7 option at run time), you can use a SAX python parser to read out the files. <br /><br /><span class="fullpost"><br />There are two main methods for dealing with XML data, SAX and DOM. DOM parses the XML file into a tree based hierarchy. And while the tree based representation of the data is a very good way to organize the various facets of the data, it does require that you load the entire file into memory. If the file is very large, say a 1GB file from blasting one set of proteins against another, then loading the file into memory can become problematic. SAX on the other hand, treats an XML file as a stream of information. As it reads the file, it comes across opening and closing tags, and uses those to identify the 'state' the parser is in using callbacks. Below is example Python code that looks for tags in a blast file to identify which data is being read. The BlastHandler class contains a set of callbacks 'startElement', 'characters', and 'endElement'. 
Using these callbacks the program prints out the ID codes of the query and hit (an ID is assumed to be the first set of non-space characters after the '>' sign in the fasta file). It then prints out this information for every hit that it encounters in the file.<br />Also note the use of the 'BlankEntityHandler' class as the entity handler. XML files will often refer to external schema files that describe their format. If the default entity handler is left intact, then that schema may be downloaded as the file is parsed, causing extra bandwidth usage and computational time. We just return null data to bypass this process.<br /><br /><br /><pre><br />#!/usr/bin/python<br /><br />import sys<br />import re<br />import xml.sax.saxutils<br />import xml.sax.xmlreader<br />import cStringIO<br /><br /><br />def get_def_id(a):<br />return (re.compile(r'\s+').split(a))[0]<br /><br />class BlankEntityHandler( xml.sax.handler.EntityResolver):<br />def resolveEntity( self, publicId, systemId):<br /> return cStringIO.StringIO()<br /><br />class BlastHandler(xml.sax.handler.ContentHandler):<br />def __init__(self):<br /> self.inRead = 0<br /> self.buffer = ""<br /><br />def startElement(self, name, attributes):<br /> if name == "Hsp":<br /> ""<br /> elif name == "Iteration_query-def":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "Iteration_query-len":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "BlastOutput_query-def":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "BlastOutput_query-len":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "Hit_def":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "Hsp_evalue":<br /> self.inRead = 1<br /> self.buffer = ""<br /> elif name == "Hsp_align-len":<br /> self.inRead = 1<br /> self.buffer = ""<br /><br />def characters(self, data):<br /> if self.inRead:<br /> self.buffer += data<br /><br />def endElement(self, name):<br /> if name == 
"Iteration_query-def":<br /> self.inRead = 0<br /> self.curQueryDef = self.buffer<br /> elif name == "Iteration_query-len":<br /> self.inRead = 0<br /> self.curQueryLen = int(self.buffer)<br /> elif name == "BlastOutput_query-def":<br /> self.inRead = 0<br /> self.curQueryDef = self.buffer<br /> elif name == "BlastOutput_query-len":<br /> self.inRead = 0<br /> self.curQueryLen = int(self.buffer)<br /> elif name == "Hit_def":<br /> self.inRead = 0<br /> self.curHitDef = self.buffer<br /> elif name == "Hsp_evalue":<br /> self.inRead = 0<br /> self.curEvalue = self.buffer<br /> elif name == "Hsp_align-len":<br /> self.inRead = 0<br /> self.curAlignLen = int(self.buffer)<br /> elif name == "Hsp":<br /> if ( self.curAlignLen >= float(self.curQueryLen) * 0.6 ):<br /> print get_def_id(self.curQueryDef),\<br /> get_def_id(self.curHitDef), \<br /> self.curEvalue, \<br /> self.curAlignLen, \<br /> self.curQueryLen<br /><br />parser = xml.sax.make_parser( )<br />parser.setEntityResolver( BlankEntityHandler() )<br />handler = BlastHandler( )<br />parser.setContentHandler(handler)<br />parser.parse( sys.argv[1] )<br /><br /></pre><br /><br /></span>Compbio Dudehttp://www.blogger.com/profile/04059892623805508123noreply@blogger.com1