-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path08.lift.sh
executable file
·162 lines (124 loc) · 4.46 KB
/
08.lift.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/bin/bash
set -o pipefail
set -u # error if reference uninitialized variable
shopt -s nullglob
# lift annotations from v2 to v3b genome assemblies
# configuration
pfx=$(echo $0 | sed -e "s/^.*\///" -e "s/\..*$//")
chrs="PT"
# input files
ann="/gen/0061/2015cgp/scripts/15.DCARv2.sequintable"
inseqv2="/vcru_share_s/carrot/LNRQ01/data/130.DHv2_~.fna.gz" # substitute ~ with chromosome
inseqv3="/vcru_share_s/carrot/dcarv3c/data/DCARv3c_~.fna.gz"
# output files
outdir="$pfx.outputdir"
# Initialization
if [ ! -d "$outdir" ]; then
echo "Creating directory \"$outdir\" $(date)"
mkdir "$outdir"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
fi
cd "$outdir"
for chr in PT MT ; do
echo "+++ Processing chromosome \"$chr\" $(date)"
v2=$(echo $inseqv2 | sed -e "s/~/$chr/")
if [ ! -s "$v2" ]; then echo "Missing input file \"$v2\"" ; exit 1; fi
v3=$(echo $inseqv3 | sed -e "s/~/$chr/")
if [ ! -s "$v3" ]; then echo "Missing input file \"$v3\"" ; exit 1; fi
sequin="$pfx.$chr.v2.sequin"
inbed="$pfx.$chr.v2.bed"
outbed="$pfx.$chr.v3.bed"
outsequin="$pfx.$chr.v3.sequin"
if [ ! -s "$sequin" ]; then
echo "+++ Creating V2 sequin file for $chr"
bb.fastagrep --infile="$ann" --outfile="$sequin" --grep="Feature DCARv2_$chr"
fi
#set -x
set -e
dir1=1.$chr.split
dir2=2.$chr.lift
dir3=3.$chr.psl
dir4=4.$chr.liftup
dir5=5.$chr.chain_raw
dir6=6.$chr.chain_split
dir7=7.$chr.net
dir8=8.$chr.over
mkdir -p $dir1
mkdir -p $dir2
mkdir -p $dir3
mkdir -p $dir4
mkdir -p $dir5
# 6 made by chainSplit
mkdir -p $dir7
mkdir -p $dir8
# uncompress fasta files temporarily
echo "+++ Step 1 $chr Uncompress fasta files"
zcat "$v2" > $pfx.$chr.old.tmp
zcat "$v3" > $pfx.$chr.new.tmp
ls -ltr $pfx.$chr.*.tmp
# split the new assembly sequences into 3K chunks and make lift files
echo "+++ Step 2 $chr Split new assembly"
faSplit size $pfx.$chr.new.tmp 3000 ./$dir1/$chr.split -lift=./$dir2/$chr.lft -oneFile
ls -ltr $dir1/
ls -ltr $dir2/
# run blat, subject is old assembly, query is new assembly
echo "+++ Step 3 $chr BLAT"
blat "$pfx.$chr.old.tmp" ./$dir1/$chr.split.fa -t=dna -q=dna -tileSize=12 \
-fastMap -minIdentity=95 -noHead -minScore=100 ./$dir3/$chr.psl
ls -ltr $dir3/
# for plastid, we can ignore reverse matches due to the inverted repeat
if [ "$chr" == "PT" ]; then
mv ./$dir3/$chr.psl ./$dir3/$chr.psl.full
grep -v ' - ' ./$dir3/$chr.psl.full > ./$dir3/$chr.psl
fi
# Change coordinates of .psl files to parent coordinate system
echo "+++ Step 4 $chr Change coordinates"
liftUp -pslQ ./$dir4/$chr.liftup.psl ./$dir2/$chr.lft warn ./$dir3/$chr.psl
ls -ltr $dir4/
# Make chain files
echo "+++ Step 5 $chr Made chain files"
axtChain -linearGap=medium -faQ -faT -psl ./$dir4/$chr.liftup.psl \
./$pfx.$chr.old.tmp ./$pfx.$chr.new.tmp ./$dir5/$chr.chain
ls -ltr $dir5/
# Merge and sort chain files
echo "+++ Step 6a $chr Merged chain files"
chainMergeSort ./$dir5/*.chain | chainSplit $dir6 stdin
ls -ltr $dir6/
echo "+++ Step 6b $chr FASTA lengths"
faSize $pfx.$chr.old.tmp -detailed > old.$chr.chr_length.txt
faSize $pfx.$chr.new.tmp -detailed > new.$chr.chr_length.txt
ls -ltr *chr_length.txt
# Make alignment nets from chain files
for i in ./$dir6/*.chain
do
echo "+++ Step 7 $chr Process i=\"$i\""
tag=${i/\.\/$dir6\//}
chainNet $i ./old.$chr.chr_length.txt ./new.$chr.chr_length.txt ./$dir7/$tag.net /dev/null
done
# Create liftOver chain file
for i in ./$dir6/*.chain
do
echo "+++ Step 8 $chr Process i=\"$i\""
tag=${i/\.\/$dir6\//}
netChainSubset ./$dir7/$tag.net $i ./$dir8/$tag.chain
done
echo "+++ Step 9 $chr chain file"
cat ./$dir8/*.chain > $chr.old_new.over.chain
ls -ltr $chr.old_new.over.chain
# convert sequin to bed
/gen/0081/scripts/06.sequintobed.pl "$sequin" "$inbed"
# Make bed file to report converted coordinates. We can give the coordinates of
# our query regions (based on old assembly) in the $inbed file and liftOver
# will report the converted coordinates in $outbed file.
liftOver "$inbed" ./$chr.old_new.over.chain "$outbed" $chr.unMapped
# convert bed back to sequin
/gen/0081/scripts/06.sequintobed.pl "$outbed" "$outsequin"
# remove temporary uncompressed fasta files
rm -f $pfx.$chr.{old,new}.tmp
echo "+++ Finished chromosome \"$chr\" $(date)"
done # for chr
cd -
cp -puv --no-preserve=owner "$0" notebook/
echo "$0 Done $(date)"
exit 0
#eof