-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathchain_map.py
executable file
·285 lines (242 loc) · 9.9 KB
/
chain_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/usr/bin/env python
from optparse import OptionParser
from bx.intervals.intersection import Interval, IntervalTree
import gzip, pdb, sys
import gff
################################################################################
# chain_map.py
#
# Map annotations in gff format to a new genome via a chain file.
#
# Assuming that features with the same group attribute are all on the same
# strand and can be treated as one big feature.
#
# The chain files are messy and contain overlapping alignment blocks, so
# I'm separating each chain. To further filter the output, provide a net
# file, and I'll only map to chains in the net.
################################################################################
################################################################################
# main
################################################################################
def main():
usage = 'usage: %prog [options] <chain file> <gff file>'
parser = OptionParser(usage)
parser.add_option('-k', dest='gtf_key', help='Group based on the given gtf key [Default: %default]')
parser.add_option('-m', dest='merge_t', type='int', default=40, help='Minimum distance between alignment blocks to merge into a single exon [Default: %default]')
parser.add_option('-n', dest='net_file', help='Net file to filter the chains considered')
(options,args) = parser.parse_args()
if len(args) != 2:
parser.error(usage)
else:
chain_file = args[0]
gff_file = args[1]
# build interval tree's of gff annotations
# map intervals to annotations
chr_features, interval_map = gff_intervals(gff_file, options.gtf_key)
# filter by net chains
if options.net_file:
net_chains = get_net_chains(options.net_file)
else:
net_chains = None
# process chain file
if chain_file[-3:] == '.gz':
chain_in = gzip.open(chain_file)
else:
chain_in = open(chain_file)
# header
line = chain_in.readline()
while line[:2] == '##':
line = chain_in.readline()
# map features
mapped_features = {}
chain_lines = []
while line:
if line.startswith('chain'):
if chain_lines and (not net_chains or chain_id in net_chains):
process_chain(chr_features, interval_map, mapped_features, chain_lines)
chain_lines = [line]
chain_id = line.split()[-1]
elif line.rstrip(): # skip blank lines
chain_lines.append(line)
line = chain_in.readline()
chain_in.close()
if chain_lines and (not net_chains or chain_id in net_chains):
process_chain(chr_features, interval_map, mapped_features, chain_lines)
# merge mapped features as gff lines
for feature_id in mapped_features:
for chain_id in mapped_features[feature_id]:
# sort features
mapped_features[feature_id][chain_id].sort(feature_cmp)
# make first line
(qchrom,qstart,qend,qstrand) = mapped_features[feature_id][chain_id][0]
gff_cols = [[qchrom, 'MyTransMap', 'feature', qstart, qend, '.', qstrand, '.', 'chain_id %s; %s'%(chain_id,feature_id)]]
# make the rest of the lines
for (qchrom,qstart,qend,qstrand) in mapped_features[feature_id][chain_id][1:]:
if gff_cols[-1][0] == qchrom and gff_cols[-1][6] == qstrand and gff_cols[-1][4] + options.merge_t >= qstart:
# large overlaps unexpected
ovl_start = max(gff_cols[-1][3], qstart)
ovl_end = min(gff_cols[-1][4], qend)
if (ovl_end-ovl_start+1) > 50:
print >> sys.stderr, 'Large overlap between features - %s' % feature_id
# merge w/ prior
gff_cols[-1][4] = qend
else:
# add new
gff_cols.append([qchrom, 'MyTransMap', 'feature', qstart, qend, '.', qstrand, '.', 'chain_id %s; %s'%(chain_id,feature_id)])
# print lines
for cols in gff_cols:
print '\t'.join([str(c) for c in cols])
################################################################################
# get_net_chains
#
# Save all of the chains used in nets to a set.
################################################################################
def get_net_chains(net_file):
net_chains = set()
if net_file[-3:] == '.gz':
net_in = gzip.open(net_file)
else:
net_in = open(net_file)
line = net_in.readline()
while line:
a = line.split()
if a[0] == 'fill':
for i in range(len(a)):
if a[i] == 'id':
net_chains.add(a[i+1])
break
line = net_in.readline()
return net_chains
################################################################################
# gff_intervals
#
# Build interval tree's of gff annotations and map intervals to annotations.
################################################################################
def gff_intervals(gff_file, gtf_key):
chr_features = {}
interval_map = {}
for line in open(gff_file):
a = line.split('\t')
a[-1] = a[-1].rstrip()
chrom = a[0]
start = int(a[3])
end = int(a[4])
strand = a[6]
if gtf_key:
feature_id = gff.gtf_kv(a[8]).get(gtf_key,a[8])
else:
feature_id = a[8]
chr_features.setdefault(chrom,IntervalTree()).insert_interval(Interval(start,end))
interval_map.setdefault(chrom,{}).setdefault((start,end),[]).append((feature_id,strand))
return chr_features, interval_map
################################################################################
# feature_cmp
#
# Sort mapped features before merging.
################################################################################
def feature_cmp(x, y):
# chrom
if x[0] < y[0]:
return -1
elif x[0] > y[0]:
return 1
else:
# strand
if x[3] < y[3]:
return -1
elif x[3] > y[3]:
return 1
else:
# start
if x[1] < y[1]:
return -1
elif x[1] > y[1]:
return 1
else:
# end
if x[2] < y[2]:
return -1
else:
return 1
################################################################################
# process_chain
#
# Map the annotation features described in chr_features and interval_map
# from the reference to query genomes described by the alignment chain in
# chain_lines and save the mappings to mapped_features.
################################################################################
def process_chain(chr_features, interval_map, mapped_features, chain_lines):
a = chain_lines[0].split()
rchrom = a[2]
rstrand = a[4]
rstart = int(a[5])
rend = int(a[6])
qchrom = a[7]
qstrand = a[9]
if qstrand == '+':
qstart = int(a[10])
qend = int(a[11])
else:
qstart = int(a[8]) - int(a[11])
qend = int(a[8]) - int(a[10])
chain_id = a[-1]
map_dir = (rstrand == qstrand)
# to follow along w/ alignments in the chain
align_rstart = rstart
if map_dir:
align_qstart = qstart
else:
align_qend = qend
if rchrom in chr_features:
for line in chain_lines[1:]:
# get alignment info
a = line.split()
align_size = int(a[0])
if len(a) > 1:
rgap = int(a[1])
qgap = int(a[2])
else:
rgap = 0
qgap = 0
align_rend = align_rstart + align_size
if map_dir:
align_qend = align_qstart + align_size
else:
align_qstart = align_qend - align_size
# search for overlapping feature intervals
for feature_interval in chr_features[rchrom].find(align_rstart, align_rend):
# can only map the overlapping portion
map_rstart = max(align_rstart, feature_interval.start)
map_rend = min(align_rend, feature_interval.end)
# map
if map_dir:
map_qstart = align_qstart + (map_rstart - align_rstart)
map_qend = align_qstart + (map_rend - align_rstart)
else:
map_qstart = align_qstart + (align_rend - map_rend)
map_qend = align_qstart + (align_rend - map_rstart)
#log_cols = [chain_id, align_rstart, align_rend, align_qstart, align_qend, feature_interval.start, feature_interval.end, map_rstart, map_rend, map_qstart, map_qend]
#print >> sys.stderr, '\t'.join([str(c) for c in log_cols])
# apply to all features w/ this interval
for (feature_id,map_rstrand) in interval_map[rchrom][(feature_interval.start,feature_interval.end)]:
map_qstrand = map_rstrand
if not map_dir:
# flip
if map_qstrand == '+':
map_qstrand = '-'
else:
map_qstrand = '+'
# add 1 to start to convert 0-based to 1-based
mapped_features.setdefault(feature_id,{}).setdefault(chain_id,[]).append((qchrom,map_qstart+1,map_qend,map_qstrand))
# get past the gap
align_rstart = align_rend + rgap
if map_dir:
align_qstart = align_qend + qgap
else:
align_qend = align_qstart - qgap
################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
#pdb.runcall(main)