-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlr_benchmark.sh
executable file
·131 lines (110 loc) · 6.26 KB
/
lr_benchmark.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env bash
# lr_benchmark.sh: Run a benchmark for vg long read mapping
# Meant to be run on UCSC Courtyard/Plaza
set -ex
set -o pipefail
# Here we use : and := to set variables to default values if not present in the environment.
# You can set these in the environment to override them and I don't have to write a CLI option parser.
# See https://stackoverflow.com/a/28085062
# Where should output go?
: "${OUT_DIR:="./lr_benchmark"}"
echo "Writing to ${OUT_DIR}"
mkdir -p "${OUT_DIR}"
# Adam Novak's simulated reads, loosely following Stephen Hwang's method.
# Annotated with GRCh38.chr1 style path names.
# Also available in 100 and 10000 read versions
# On GRCh38, sample HG00741:
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG00741/HG00741-sim-r9.5-acc0.95-1000.gam
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/hifi/HG00741/HG00741-sim-hifi-1000.gam
# On CHM13, sample HG002:
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG002/HG002-sim-r9.5-acc0.95-1000.gam
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/hifi/HG002/HG002-sim-hifi-1000.gam
# Note that not all of these reads have true positions on CHM13 due to a lack of alignment for some HG002 regions!
: "${INPUT_READ_PATH:=/public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG002/HG002-sim-r9.5-acc0.95-1000.gam}"
# For GRCh38 mapping:
# An HPRC graph, linked to /public/groups/cgl/graph-genomes/xhchang/hprc_graph/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.giraffe.gbz
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.giraffe.gbz
# Its indexes
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.dist
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.min
# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.xg
# For CHM13 mapping:
# An HPRC graph, with .min, .dist, .gbwt, and .xg files at least
: "${INPUT_GRAPH_BASE_URL:=https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-chm13}"
# Where it is/will be stored on disk locally, along with a .gbz
: "${INPUT_GRAPH_BASE_PATH:=/public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/hprc-v1.0-mc-chm13}"
# Find the indexes
: "${INPUT_DIST_PATH:=${INPUT_GRAPH_BASE_PATH}.dist}"
: "${INPUT_MIN_PATH:=${INPUT_GRAPH_BASE_PATH}.min}"
: "${INPUT_XG_PATH:=${INPUT_GRAPH_BASE_PATH}.xg}"
: "${INPUT_GBZ_PATH:=${INPUT_GRAPH_BASE_PATH}.gbz}"
# Download the files that are probably there.
if [[ ! -e "${INPUT_DIST_PATH}" ]] ; then
wget "${INPUT_GRAPH_BASE_URL}.dist" -O "${INPUT_DIST_PATH}.tmp"
mv "${INPUT_DIST_PATH}.tmp" "${INPUT_DIST_PATH}"
fi
if [[ ! -e "${INPUT_MIN_PATH}" ]] ; then
wget "${INPUT_GRAPH_BASE_URL}.min" -O "${INPUT_MIN_PATH}.tmp"
mv "${INPUT_MIN_PATH}.tmp" "${INPUT_MIN_PATH}"
fi
if [[ ! -e "${INPUT_XG_PATH}" ]] ; then
wget "${INPUT_GRAPH_BASE_URL}.xg" -O "${INPUT_XG_PATH}.tmp"
mv "${INPUT_XG_PATH}.tmp" "${INPUT_XG_PATH}"
fi
if [[ ! -e "${INPUT_GBZ_PATH}" ]] ; then
# Make GBZ from other files if not there
# For which we need the GBWT
: "${INPUT_GBWT_PATH:=${INPUT_GRAPH_BASE_PATH}.gbwt}"
if [[ ! -e "${INPUT_GBWT_PATH}" ]] ; then
wget "${INPUT_GRAPH_BASE_URL}.gbwt" -O "${INPUT_GBWT_PATH}.tmp"
mv "${INPUT_GBWT_PATH}.tmp" "${INPUT_GBWT_PATH}"
fi
time vg gbwt -x "${INPUT_XG_PATH}" "${INPUT_GBWT_PATH}" --gbz-format -g "${INPUT_GBZ_PATH}.tmp"
mv "${INPUT_GBZ_PATH}.tmp" "${INPUT_GBZ_PATH}"
fi
if [[ "${WORK_DIR}" == "" ]] ; then
# Make a work directory
WORK_DIR="$(mktemp -d)"
CLEAN_WORK_DIR=1
else
# Let the user send one in in the environment.
mkdir -p "${WORK_DIR}"
CLEAN_WORK_DIR=0
fi
echo "Working in ${WORK_DIR}"
if [[ ! -e "${WORK_DIR}/possible.txt" ]] ; then
# Get the list of all reads with ref positions set, and which are thus possible to get right
vg view -aj "${INPUT_READ_PATH}" | jq -r 'select(.refpos) | .name' > "${WORK_DIR}/possible.txt.tmp"
mv "${WORK_DIR}/possible.txt.tmp" "${WORK_DIR}/possible.txt"
fi
if [[ ! -e "${WORK_DIR}/annotated.gam" ]] ; then
# Map reads using correctness tracking.
# Make sure to apply multi-position annotation which Giraffe won't do.
/usr/bin/time -v vg giraffe -G "${INPUT_READ_PATH}" -t 16 -B 8 --align-from-chains -Z "${INPUT_GBZ_PATH}" -d "${INPUT_DIST_PATH}" -m "${INPUT_MIN_PATH}" -x "${INPUT_XG_PATH}" --track-provenance --track-correctness --progress | /usr/bin/time -v vg annotate -x "${INPUT_XG_PATH}" -a - --multi-position -l 100 >"${WORK_DIR}/annotated.gam.tmp"
mv "${WORK_DIR}/annotated.gam.tmp" "${WORK_DIR}/annotated.gam"
fi
# Compute general stats
vg stats -a "${WORK_DIR}/annotated.gam" >"${OUT_DIR}/stats.txt"
if [[ ! -e "${WORK_DIR}/benchmark.tsv" ]] ; then
# See if reads get close enough to be correct
# TODO: vg gamcompare announces a correctness count, which we should save
vg gamcompare -r 200 "${WORK_DIR}/annotated.gam" "${INPUT_READ_PATH}" --aligner lrgiraffe --tsv >"${WORK_DIR}/benchmark.tsv.tmp"
mv "${WORK_DIR}/benchmark.tsv.tmp" "${WORK_DIR}/benchmark.tsv"
fi
# Make a QQ plot
scripts/plot-qq.R "${WORK_DIR}/benchmark.tsv" "${OUT_DIR}/qq.png"
# Compute a correctness rate
TOTAL_READS="$(cat "${WORK_DIR}/benchmark.tsv" | tail -n +2 | wc -l)"
POSSIBLE_READS="$(cat "${WORK_DIR}/possible.txt" | wc -l)"
CORRECT_READS="$(cat "${WORK_DIR}/benchmark.tsv" | tail -n +2 | grep "^1" | wc -l)"
CORRECT_FRACTION_TOTAL="$(echo "${CORRECT_READS}/${TOTAL_READS}" | bc -l)"
CORRECT_FRACTION_POSSIBLE="$(echo "${CORRECT_READS}/${POSSIBLE_READS}" | bc -l)"
echo "Correct reads: ${CORRECT_READS}" >"${OUT_DIR}/results.txt"
echo "Total reads: ${TOTAL_READS}" >>"${OUT_DIR}/results.txt"
echo "Reads with truth positions: ${POSSIBLE_READS}" >>"${OUT_DIR}/results.txt"
echo "Correct fraction: ${CORRECT_FRACTION_TOTAL} of all reads, ${CORRECT_FRACTION_POSSIBLE} of reads with truth positions" >>"${OUT_DIR}/results.txt"
cat "${OUT_DIR}/results.txt"
if [[ "${CLEAN_WORK_DIR}" == "1" ]] ; then
# Clean up the work directory
rm -Rf "${WORK_DIR}"
fi