Start by obtaining a copy of the Green Genes database at http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz
Assuming we have a list of taxon codes in a file 'taxon.list'
gunzip -c greengenes16SrRNAgenes.txt.gz | ./green_genes_taxon_grep.py taxon.list
'green_genes_taxon_grep.py' code:
#!/usr/bin/python
import sys
import re
import string
def get_fasta(title, seq):
out_str = ">%s\n" % title
for i in (range(0, len(seq)+1, 60)):
out_str += "%s\n" % seq[i:i+60]
return out_str
taxon_list = {}
taxon_file = open( sys.argv[1] )
for a in taxon_file.xreadlines():
taxon_list[ string.rstrip(a) ] = 1
taxon_file.close()
file = sys.stdin
re_begin = re.compile(r'^BEGIN')
re_end = re.compile(r'^END')
re_seq = re.compile(r'aligned_seq=(.*)')
re_ncbi_gi = re.compile(r'ncbi_gi=(.*)')
re_dot = re.compile(r'[\.\-]')
re_taxon_id = re.compile( r'^ncbi_tax_id=(.*)' )
re_name = re.compile(r'^prokMSAname=(.*)')
re_msa_id = re.compile(r'^prokMSA_id=(.*)')
re_ncbi_acc = re.compile(r'^ncbi_acc_w_ver=(.*)')
report = 0
for a in file.xreadlines():
if ( re_begin.search( a ) ):
report = 0
elif ( re_end.search( a ) ):
if report:
title_str = "%s %s %s" % (cur_msa_id, cur_ncbi_acc, cur_name)
print get_fasta( title_str, cur_seq )
elif ( re_seq.search( a ) ):
cur_seq = re_seq.search( string.rstrip(a) ).group(1)
cur_seq = re_dot.sub("", cur_seq )
elif ( re_ncbi_gi.search( a ) ):
cur_ncbi_gi = re_ncbi_gi.search( string.rstrip(a) ).group(1)
elif ( re_name.search(a) ):
cur_name = re_name.search( string.rstrip(a) ).group(1)
elif ( re_msa_id.search( a ) ):
cur_msa_id = re_msa_id.search( string.rstrip(a) ).group(1)
elif ( re_ncbi_acc.search( a ) ):
cur_ncbi_acc = re_ncbi_acc.search( string.rstrip(a) ).group(1)
elif ( re_taxon_id.search( string.rstrip(a) ) ):
taxon_id = re_taxon_id.search( string.rstrip(a) ).group(1)
if taxon_list.has_key( taxon_id ):
report = 1
cur_taxon = taxon_id
...Read more
No comments:
Post a Comment