The CompBio Dude: Grep'ing Green Genes by TaxonID

16s RNA is a component in the prokaryotic ribosomal system. It is necessary for survival, so it is very well conserved in prokaryotic genomes. It also has some 'hyper variable' regions that tend to mutate as a species evolves. Because of these two reasons it is a good marker for phylogenetic mapping. Green Genes is a project to provide a comprehensive database of sampled 16s sequences. Sometimes you want to start from a NCBI Taxon ID, for example from E. Coli, which has the NCBI Taxon ID code of 562 and obtain a list of associated 16s RNA sequences.

Start by obtaining a copy of the Green Genes database at http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz

Assuming we have a list of taxon codes in a file 'taxon.list'

gunzip -c greengenes16SrRNAgenes.txt.gz | ./green_genes_taxon_grep.py taxon.list

'green_genes_taxon_grep.py' code:


#!/usr/bin/python


import sys
import re
import string


def get_fasta(title, seq):
 out_str = ">%s\n" % title
 for i in (range(0, len(seq)+1, 60)):
  out_str += "%s\n" % seq[i:i+60]
 return out_str


taxon_list = {}
taxon_file = open( sys.argv[1] )
for a in taxon_file.xreadlines():
 taxon_list[ string.rstrip(a) ] = 1
taxon_file.close()

file = sys.stdin

re_begin   = re.compile(r'^BEGIN')
re_end     = re.compile(r'^END')
re_seq     = re.compile(r'aligned_seq=(.*)')
re_ncbi_gi = re.compile(r'ncbi_gi=(.*)')
re_dot     = re.compile(r'[\.\-]')
re_taxon_id = re.compile( r'^ncbi_tax_id=(.*)' )
re_name     = re.compile(r'^prokMSAname=(.*)')
re_msa_id   = re.compile(r'^prokMSA_id=(.*)')
re_ncbi_acc = re.compile(r'^ncbi_acc_w_ver=(.*)')
report = 0
for a in file.xreadlines():

 if ( re_begin.search( a ) ):
  report = 0
 elif ( re_end.search( a ) ):
  if report:
   title_str = "%s %s %s" % (cur_msa_id, cur_ncbi_acc, cur_name)
   print get_fasta( title_str, cur_seq )
 elif ( re_seq.search( a ) ):
  cur_seq = re_seq.search( string.rstrip(a) ).group(1)
  cur_seq = re_dot.sub("", cur_seq )
 elif ( re_ncbi_gi.search( a ) ):
  cur_ncbi_gi = re_ncbi_gi.search( string.rstrip(a) ).group(1)
 elif ( re_name.search(a) ):
  cur_name = re_name.search( string.rstrip(a) ).group(1)
 elif ( re_msa_id.search( a ) ):
  cur_msa_id = re_msa_id.search( string.rstrip(a) ).group(1)
 elif ( re_ncbi_acc.search( a ) ):
  cur_ncbi_acc = re_ncbi_acc.search( string.rstrip(a) ).group(1)
 elif ( re_taxon_id.search( string.rstrip(a) ) ):
  taxon_id = re_taxon_id.search( string.rstrip(a) ).group(1)
  if taxon_list.has_key( taxon_id ):
   report = 1
   cur_taxon = taxon_id

The CompBio Dude

Thursday, January 08, 2009

Grep'ing Green Genes by TaxonID

No comments:

Blog Archive

Links