-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieve_essential_genbanks.py
81 lines (57 loc) · 2.52 KB
/
retrieve_essential_genbanks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import numpy as np
import pandas as pd
# path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER/ftp.ncbi.nih.gov/refseq/release/bacteria/genbank')
path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER_RELEASE69/genbank')
path_nucs = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER_RELEASE69/nucfasta')
print "Loading original databases and description file ..."
# summary file parsing ...
summary_fname = "env_catalog_compgenome.dat"
summary = pd.read_csv(os.path.join(path,summary_fname))
# genbank DB ...
gbdb_fname = os.path.join(path,'genbank.idx')
gbdb = SeqIO.index_db(gbdb_fname)
# nucleotides fasta files ...
fndb_fname = os.path.join(path_nucs,'nucfasta.idx')
fndb = SeqIO.index_db(fndb_fname,alphabet=IUPAC.IUPACAmbiguousDNA(),key_function=(lambda name: name.split('|')[3]))
# gbdb and fndb keys must be the same ...
print "Original databases are loaded ..."
print
print "Output the reduced genbank with the required entries only ..."
gb_small_fname = "tmp_gbank.gb"
with open(os.path.join(path,gb_small_fname),"w") as fp:
for idx in summary['GenomicID']:
fp.write(gbdb.get_raw(idx))
# index in-memory style the smaller subset of the genbank database ...
gbdb_small = SeqIO.index(os.path.join(path,gb_small_fname),"genbank")
# records = SeqIO.to_dict(SeqIO.parse("Quality/example.fastq", "fastq"))
# maybe to_dict would be faster, as we are parsing all of the records anyways ...
# maybe having another tmp file for nucfasta records would make things faster
# try it! sometime. So far, this takes ~40 mins
print "tmp file is written and indexed ..."
print
print "Matching nucleotide sequences from fasta with corresponding SeqRecords in genbank ..."
sequences = []
for idx in summary['GenomicID']:
seqrec = gbdb_small[idx]
nuc_seq = fndb[idx].seq
# next thing should never happen in practice ...
if ( len(seqrec.seq)!=len(nuc_seq) ):
print "Genome length doesn't match between fasta and genbank for %s record"%idx
print "Proceed anyways ..."
# give the genbank, its actual nucleotide sequence ...
seqrec.seq = nuc_seq
sequences.append(seqrec)
print "All set"
print
print "Writing final genbank ..."
# size_seqs = sys.getsizeof(sequences)
# print "All interesting sequences are in memory! %d byte of interesting sequences."%size_seqs
print "Writing all of that to the condensed.gb file"
with open("condensed.raw.gb","w") as fp:
SeqIO.write(sequences,fp,"genbank")