-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path31.centromere.location.sh
executable file
·148 lines (120 loc) · 4.6 KB
/
31.centromere.location.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/bin/bash
set -o pipefail
set -u # error if reference uninitialized variable
shopt -s nullglob
# Find the centromere motif in the V3 genome assembly
# configuration
pfx=$(echo $0 | sed -e "s/^.*\///" -e "s/\..*$//")
# page 44 Nov. 26, 2014
marinamotif="CCAACTTCAAACGAGTCAAGAATGAAGCTACAAGTTGTTT"
# page 44 Nov. 28, 2014
consensusmotif="TCAACTTCAAACGAGTCTGGAATGAAGCTACAAGTTGTTT
CCAACTTCAATCTAGYCAATAATGAAGCWACAAGTTGTTT
CCRGGCTCAAACGAGTCAAGAATGAAACTACAAGTTGATT
CYAATTTYAAAC AACCGAAAATGAAGCTACAAGTTGTTT"
# master assembly
mastergff='/vcru_share_s/carrot/dcarv3d/data/DCARv3d.master.gff3'
# chromosome lengths (from $mastergff)
chrlen=(0 56320272 59175025 64558952 51152908 47556597 40885772 41844801 33368437 45850561)
# configuration of bin size for this analysis
binsize=250000
# input files
v3fasta="/vcru_share_s/carrot/dcarv3d/data/DCARv3d.fna"
v3db="/vcru_share_s/blastdbs/DCARv3d"
# output files
querymarina="$pfx.marinamotif.fna"
outmarina="$pfx.marinamotif.tsv"
queryconsensus="$pfx.consensusmotif.fna"
outconsensus="$pfx.consensusmotif.tsv"
# store two copies of the motif sequences so that there is no edge effect
if [ ! -s "$querymarina" ] ; then
echo "Creating motif fasta \"$querymarina\" $(date)"
echo ">cent1
$marinamotif
$marinamotif" > "$querymarina"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
fi
if [ ! -s "$queryconsensus" ] ; then
echo "Creating motif fasta \"$queryconsensus\" $(date)"
echo ">cent1
$consensusmotif
$consensusmotif" > "$queryconsensus"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
fi
if [ ! -s "$outmarina" ] ; then
echo "Blast Marina $(date)"
blastn \
-db "$v3db" \
-query "$querymarina" \
-out "$outmarina" \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs qcovhsp" \
-dust no \
-max_target_seqs 50000
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
fi
if [ ! -s "$outconsensus" ] ; then
echo "Blast consensus $(date)"
blastn \
-db "$v3db" \
-query "$queryconsensus" \
-out "$outconsensus" \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs qcovhsp" \
-dust no \
-max_target_seqs 50000
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
fi
for chr in {1..9} ; do
echo "Summarizing chromosome $chr $(date)"
# Length of current chromosome, defined in config section
len=${chrlen[$chr]}
# Create file of bins based on chromosome length
perl - $chr $len $binsize <<' __EOP__'
use strict;
use warnings;
my $chr = shift;
my $len = shift;
my $binsize = shift; # defined in parent bash script config section
my $i = 0;
open (my $OUTF, '>', 'bins.bed');
while ($i < $len)
{
my $e = $i + $binsize;
if ($e > $len) { $e = $len; }
print $OUTF join("\t", "DCARv3_Chr$chr", $i, $e), "\n";
$i += $binsize;
}
__EOP__
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
# Extract blast hit data just for this chromosome, convert to bed coordinate system
cat "$outconsensus" \
| perl -lanF/"\t"/ -e 'if ($F[8]>$F[9]){($F[8],$F[9])=($F[9],$F[8])} print join ("\t", $F[1], $F[8]-1, $F[9])' \
| grep "^DCARv3_Chr$chr" \
| sort-bed - \
> "$pfx.tmp.1"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
bedtools coverage -a bins.bed -b "$pfx.tmp.1" > "$pfx.DCARv3_Chr$chr.tsv"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
# preprocess for genome plotting. Convert fraction coverage (0-1) to percent (0-100)
echo "Chr Pos Value" > "$pfx.tmp.2"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
cat "$pfx.DCARv3_Chr$chr.tsv" \
| perl -lanF/"\t"/ -e 'print join ("\t", $F[0], $F[1], $F[6]*100)' \
>> "$pfx.tmp.2"
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
bb.genomeplotter \
--infile="$pfx.tmp.2" \
--outfile="notebook/$pfx.DCARv3_Chr$chr.png" \
--mastergff="$mastergff" \
--chr="DCARv3_Chr$chr" \
--bars --filled \
--legend="DCARv3 Chr$chr" \
--nolabels \
--binsize=$binsize \
--ymax=100
r=$?; if [ $r != 0 ]; then echo "Error code $r line $LINENO script $0, halting"; exit $r; fi
rm -f bins.bed $pfx.tmp.1 $pfx.tmp.2
done
cp -puv --no-preserve=owner "$0" notebook/
echo "$0 Done $(date)"
exit 0
#eof