-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpickaxe.job
475 lines (440 loc) · 15.1 KB
/
pickaxe.job
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
#!/bin/bash
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=__EMAIL__
#SBATCH --time=__WALLTIME__
#SBATCH -N 1 # Ensure all cores are on one machine
#SBATCH -c __CPUS__
#SBATCH --output=__OUTPUTFILE__
#SBATCH --job-name=__NAME__
# # #SBATCH --mem=__MEM__g
#set -eu
set -o pipefail
module purge
module load cutadapt/1.18
module load prinseq/0.20.4
module load bowtie2/2.3.3 samtools/1.14
# Other modules are loaded when needed because there were conflicts with modules
# get software versions
SVFILE="software_versions.tsv"
rm -f $SVFILE
bowtie2 --version | perl -ne 'if (/version (\S+)/) { print "bowtie2\t$1\n"; exit 0 }' >> $SVFILE
echo -e "cutadapt\t"$(cutadapt --version) >> $SVFILE
prinseq-lite.pl -version | tr ' ' '\t' >> $SVFILE
RepeatMasker -v | cut -f 1,3 -d ' ' | tr ' ' '\t' >> $SVFILE
samtools --version | head -n1 | tr ' ' '\t' >> $SVFILE
# VARIABLES
AID=__AID__
AIDTYPE=__AIDTYPE__
THREADS=__CPUS__
BTHREADS=$((THREADS*2))
ASSEMBLY=assembly
ASSEMBLER=__ASSEMBLER__ # supported assemblers: clc_assembler, megahit
SEQLENGTH=500
ANNOTCONF=__ANNOTATERCONF__
OK=__OKFILE__
NEGREFLIST=__NEGREFLIST__
TEMP_SAM=bowtie.sam
UNMAPPED_FQ=unmapped.fq
ADAPTERSET=__ADAPTERSET__
ADAPTEROPTIONS=__ADAPTEROPTIONS__
SKIP_RM=__SKIP_RM__
REMOTETAX=__REMOTETAX__
EXITSUBX=__EXITSUBX__
EXITASSEMBLY=__EXITASSEMBLY__
EXITRM=__EXITRM__
EXITENT=__EXITENT__
TRIM_LEFT=__TRIMLEFT__
TRIM_RIGHT=__TRIMRIGHT__
SKIPMESSAGE="Skipping step since it has already been performed"
OKFILEDIR="okfiles/"
echo -e "\nPipeline starting: $(date)" && s=$(date +%s)
echo SLURM_JOBID: $SLURM_JOBID
mkdir -p $OKFILEDIR
##################################################################################################
# FUNCTIONS
# function to report success or failure of job step
function status_report () {
if [[ $1 != 0 ]] ; then
echo "ERROR: Step $2 failed with error code $1"
exit $1
fi
}
# function to report the date for each job step
function date_report () {
echo -en "\nStep $1 $2 [starting: $(date) "
now=$(date +%s)
printf "(%s sec)]\n" $(echo $((now-s)))
}
# function to for normal exit
function exitOK {
touch ${OK}
echo -e "\nPipeline end: $(date)" && e=$(date +%s)
echo Pipelinetime:$((e-s))
echo Pipeline ended with message: $1
exit;
}
##################################################################################################
# MAIN CODE
# Check if any work needs done
if [[ -e ${OK} ]]; then
echo $OK exists therefore no work needs done...exiting.
exit;
fi
# Step - obtain BAM file or skip step if input is fastq or a local BAM file
STEP_NUM=0
STEP="downloading_data"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report "$STEP_NUM" "Downloading data"
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ $AIDTYPE = bam ]]; then
echo "Doing nothing for this step since input is BAM"
elif [[ $AIDTYPE = srr ]]; then
echo "Downloading SRR fastq with fastq-dump"
module load sra-toolkit/2.9.2
fastq-dump --split-3 "$AID"
elif [[ $AIDTYPE = fastq ]]; then
echo "Doing nothing for this step since input is Fastq"
else
echo "AID is not a supported input"
fi
else
echo $SKIPMESSAGE
fi
status_report "$?" "$STEP_NUM"
touch "$STEP_OKFILE"
# Step - extract unmapped reads from BAM file
STEP_NUM=$((STEP_NUM+1))
STEP="extract_unmapped_reads"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report "$STEP_NUM" "Getting unmapped reads"
TEMP_OUT=temp.fastq
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ $AIDTYPE = fastq || $AIDTYPE = srr ]]; then
echo "Concatenating fastq files"
# Treat the fastq seqs as the "unmapped_reads". Concatenate fastq files into $TEMP_OUT.
FQFILES=("$AID"*)
SUFFIX="${FQFILES[0]##*.}"
for f in "${FQFILES[@]}"; do
case "$SUFFIX" in
bz2) bzip2 -dc "$f" >> "$TEMP_OUT"
echo "bzip2 -dc $f"
;;
gz) gunzip -c "$f" >> "$TEMP_OUT"
echo "gunzip -c $f"
;;
*) cat "$f" >> "$TEMP_OUT"
echo "cat $f"
;;
esac
done
else
echo "samtools fastq"
if [[ ${AID##*.} = bam ]]; then
TEMP_IN=${AID}
elif [[ $AIDTYPE = srr ]]; then
TEMP_IN=$AID.bam
else
for file in $(ls ${AID}/*bam*); do
suffix=${file##*.}
if [[ ${suffix} = bam || ${suffix} = bam_HOLD_QC_PENDING ]]; then
TEMP_IN=$file
break
fi
done
fi
samtools fastq -f 4 "$TEMP_IN" > "$TEMP_OUT"
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
# Step - samtools flagstat stats
STEP_NUM=$((STEP_NUM+1))
STEP="samtools_stats"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Samtools flagstat stats (skipped for fastq input)"
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ $AIDTYPE = aid || $AIDTYPE = bam ]]; then
samtools flagstat "$TEMP_IN"
if [[ $AIDTYPE == "aid" || $AIDTYPE = srr ]]; then
rm -rf "$AID" "$AID".gto "$TEMP_IN"
fi
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
# Step - preprocess fastq
STEP_NUM=$((STEP_NUM+1))
STEP="preprocess_fastq"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Preprocessing - cutadapt and prinseq"
TEMP_IN="$TEMP_OUT" # temp.fastq
TEMP_OUT="${TEMP_IN%.*}".preprocessed.fastq # temp.preprocessed.fastq
if [[ ! -e "$STEP_OKFILE" ]]; then
# hardcoding defaults QUALITYCUTOFF = 15; ENTROPYCUTOFF = 60; and LENGTHCUTOFF = 30
preprocessing_parallel.sh "$TEMP_IN" "$ADAPTERSET" 15 60 30 "$THREADS" "$TRIM_LEFT" "$TRIM_RIGHT" "$ADAPTEROPTIONS"
if [[ ! -s $TEMP_OUT ]]; then
exitOK "no quality sequences found"
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
rm -f "$TEMP_IN"
touch "$STEP_OKFILE"
# Step - SUBTRACTION
STEP_NUM=$((STEP_NUM+1))
STEP="subx_pipeline"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Subtraction pipeline"
TEMP_IN="$TEMP_OUT"
if [[ ! -e "$STEP_OKFILE" ]]; then
subxpipeline.pl -f $TEMP_IN -t $THREADS -d __SUBTRACTION__
status_report ${?} ${STEP_NUM}
if [[ ! -s "$UNMAPPED_FQ" ]]; then
exitOK "no unmapped reads after subtraction pipeline"
fi
if [[ $EXITSUBX == 1 ]]; then
gzip -f "$UNMAPPED_FQ"
touch "$STEP_OKFILE"
exitOK "exiting after subtraction step"
fi
else
echo "$SKIPMESSAGE"
fi
touch "$STEP_OKFILE"
# Step - Map to VRS
STEP_NUM=$((STEP_NUM+1))
STEP="mapping_to_reference_sequences"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "VRS mapping"
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ -e $UNMAPPED_FQ ]]; then
bowtie2 --no-unal -p $THREADS -x __VIRUSINDEX__ -U $UNMAPPED_FQ | samtools view -b -S - | samtools sort - -o kv.vrs.hg19.bam
elif [[ -e "$UNMAPPED_FQ.gz" ]]; then
echo "Found unmapped reads as $UNMAPPED_FQ.gz...gunzipping file"
gunzip "$UNMAPPED_FQ.gz"
bowtie2 --no-unal -p $THREADS -x __VIRUSINDEX__ -U $UNMAPPED_FQ | samtools view -b -S - | samtools sort - -o kv.vrs.hg19.bam
else
echo "No unmapped reads found either as $UNMAPPED_FQ or $UNMAPPED_FQ.gz...exiting"
exit 102
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
# Step - assemble reads
STEP_NUM=$((STEP_NUM+1))
STEP="assembling_reads"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Assembling reads"
TEMP_IN=${UNMAPPED_FQ}
TEMP_OUT=${ASSEMBLY}.fa
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ ! -e "$TEMP_IN" ]]; then
if [[ -e "$TEMP_IN".gz ]]; then
gunzip "$TEMP_IN".gz
else
echo "$TEMP_IN is missing. $TEMP_IN.gz could not be found either. Maybe it was manually deleted. Please make sure a file called $TEMP_IN (containing unmapped reads for assembly) exists in the Pickaxe output folder. Exiting now."
exit 100
fi
fi
if [[ $ASSEMBLER == "clc_assembler" ]]; then
# if hash clc_assembler 2>/dev/null; then
module load clc-assembly-cell/5.1.1
clc_assembler | perl -ne 'if (/Version: (\S+)/) { print "clc_assembler\t$1\n" }' >> $SVFILE
echo "Running clc_assembler "$(clc_assembler | grep Version)
clc_assembler -q ${TEMP_IN} -o ${TEMP_OUT} --cpus ${THREADS}
status_report ${?} clc_assembler
if [[ ! -s ${TEMP_OUT} ]]; then
gzip -f "$UNMAPPED_FQ"
exitOK "no assembled sequences were generated"
fi
elif [[ $ASSEMBLER == "megahit" ]]; then
# elif hash megahit 2>/dev/null; then
module unload cutadapt/1.18
module load megahit/1.2.9
megahit -v | tr ' ' '\t' >> $SVFILE
echo "Running megahit Version: "$(megahit -v)
megahit -r ${TEMP_IN}
status_report ${?} megahit
if [[ ! -s megahit_out/final.contigs.fa ]]; then
gzip -f "$UNMAPPED_FQ"
exitOK "no assembled sequences were generated"
else
mv -v megahit_out/final.contigs.fa $TEMP_OUT
fi
module unload megahit/1.2.9
else
echo "Supported assemblers not found"
exit 101
fi
ncontigs=$(grep -c '^>' "$TEMP_OUT")
echo Raw contigs: "$ncontigs"
else
echo "$SKIPMESSAGE"
fi
touch "$STEP_OKFILE"
# Step - relative abundance
STEP_NUM=$((STEP_NUM+1))
STEP="relative_abundance"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Relative abundance"
TEMP_IN="$TEMP_OUT" # assembly.fa
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ ! -e "$TEMP_IN" ]]; then
echo "$TEMP_IN is missing. It could have been removed by a previous Pickaxe run or manually deleted. Please make sure a file called $TEMP_IN exists in the Pickaxe output folder. Exiting now."
exit 100
fi
relabundance.pl -c "$TEMP_IN" -r "$UNMAPPED_FQ" > "$ASSEMBLY".ra.tsv
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
if [[ -e "$UNMAPPED_FQ" ]]; then
gzip -f "$UNMAPPED_FQ" # unmapped.fq -> unmapped.fq.gz
fi
# delete unnecessary files created by relabundance.pl
rm -f "$UNMAPPED_FQ".bam *.bt2
if [[ $EXITASSEMBLY == 1 ]]; then
exitOK "exiting after assembly and relabundance"
fi
# Step - Repeat Masker
STEP_NUM=$((STEP_NUM+1))
STEP="repeat_masker"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Repeat Masker"
TEMP_IN=${TEMP_OUT} # assembly.fa
RM_OUT=${TEMP_IN}.masked # assembly.fa.masked
TEMP_OUT=${TEMP_IN%%.*}.RM.fa # assembly.RM.fa
rm -rf RM_* # remove old repeatmasker temporary directories
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ $SKIP_RM == 0 ]]; then # run RM
RepeatMasker -pa ${THREADS} -xsmall ${TEMP_IN}
status_report ${?} RepeatMasker
if [[ ! -s ${RM_OUT} ]]; then # happens when there are no masked sequences
mv -v "$TEMP_IN" "$TEMP_OUT"
else
mv -v ${RM_OUT} ${TEMP_OUT}
fi
else # skip RM
echo "Skipping RepeatMasker step because --skip_repeatmasker in effect"
mv -v "$TEMP_IN" "$TEMP_OUT" # move assembly.fa -> assembly.RM.fa
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
rm -f "$TEMP_IN"* # removes assembly.fa, assembly.fa.alert, assembly.fa.cat, assembly.fa.out, assembly.fa.tbl
if [[ $EXITRM == 1 ]]; then
exitOK "exiting after RepeatMaster"
fi
# Step - Repeat Sorter
STEP_NUM=$((STEP_NUM+1))
STEP="repeat_sorter"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Repeat Sorter"
TEMP_IN=${TEMP_OUT} # assembly.RM.fa
TEMP_OUT=${TEMP_IN}.good.fa # assembly.RM.fa.good.fa
if [[ ! -e "$STEP_OKFILE" ]]; then
RepeatSorter.pl -f ${TEMP_IN} -m tcga > ${TEMP_OUT}
if [[ ! -s ${TEMP_OUT} ]]; then
exitOK "no good sequences generated by Repeat Sorter"
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
# do not delete the input file to this step since they are the assembled sequences that I want to keep
# Step - Getting seqs >= 500bp
STEP_NUM=$((STEP_NUM+1))
STEP="filter_short_contigs"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Getting sequences >= ${SEQLENGTH}bp in length"
TEMP_IN=${TEMP_OUT} # assembly.RM.fa.good.fa
TEMP_OUT=${TEMP_IN}.${SEQLENGTH}bp.fa # assembly.RM.fa.good.fa.500bp.fa
if [[ ! -e "$STEP_OKFILE" ]]; then
getseqs_minlength.pl -f ${TEMP_IN} -l ${SEQLENGTH} > ${TEMP_OUT}
status_report ${?} getseqs_minlength.pl
if [[ ! -s ${TEMP_OUT} ]]; then # TEMP_OUT is 0 bytes if there are no seqs >= 500bp
exitOK "no ${SEQLENGTH}bp sequences found"
fi
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
rm -f "$TEMP_IN"
# Step - Removing low entropy sequences
STEP_NUM=$((STEP_NUM+1))
STEP="filter_low_entropy_contigs"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Removing low entropy sequences"
TEMP_IN=${TEMP_OUT} # assembly.RM.fa.good.fa.500bp.fa
TEMP_OUT=${TEMP_IN}.ent.fa # assembly.RM.fa.good.fa.500bp.ent.fa
if [[ ! -e "$STEP_OKFILE" ]]; then
entropy.pl -m 65 -f ${TEMP_IN} > ${TEMP_OUT}
status_report ${?} entropy.pl
if [[ ! -s ${TEMP_OUT} ]]; then
exitOK "no good entropy sequences found"
fi
echo $(grep -c '^>' "$TEMP_OUT") high entropy sequences remaining.
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
rm -f ${TEMP_IN}
if [[ $EXITENT == 1 ]]; then
exitOK "exiting after filtering Low Entropy contigs"
fi
# Step - Annotator
STEP_NUM=$((STEP_NUM+1))
STEP="annotater"
STEP_OKFILE="$OKFILEDIR""$STEP_NUM"_"$STEP".OK
date_report ${STEP_NUM} "Annotating "$(grep -c '^>' "$TEMP_OUT")" sequences"
module purge
module load blast+/2.13.0
blastn -version | perl -ne 'if (/blastn: (\S+)/) { print "blast+\t$1\n" }' >> $SVFILE
grep -q rapsearch $ANNOTCONF # check if rapsearch is needed
if [[ $? -eq 0 ]]; then
echo "Loading module rapsearch/2.24"
module load rapsearch/2.24
rapsearch 2>&1 | perl -ne 'if(/rapsearch (v\S+?):/) { print "rapsearch\t$1\n" }' >> $SVFILE
fi
grep -q diamond $ANNOTCONF # check if diamond is needed
if [[ $? -eq 0 ]]; then
echo "Loading module diamond/2.0.15"
module load diamond/2.0.15
diamond version | perl -pe 's/ version /\t/' >> $SVFILE
fi
TEMP_IN=${TEMP_OUT}
if [[ ! -e "$STEP_OKFILE" ]]; then
if [[ ! -e "$TEMP_IN" ]]; then # check for input sequence file
ANN_BE_FILE="annotator/ann.wTax.BE.report.txt"
if [[ -e "$ANN_BE_FILE" ]]; then
cut -f 1,2 annotator/ann.wTax.BE.report.txt | sed 's/^/>/' | tr '\t' '\n' > "$TEMP_IN"
else
echo "The expected input sequence file, $TEMP_IN, is missing. Annotater $ANN_BE_FILE could not be found either. Please make sure a file called $TEMP_IN (containing sequences for annotation) exists in the Pickaxe output folder. Exiting now."
exit 100
fi
fi
if [[ $REMOTETAX == 1 ]]; then REMOTETAX="-remotetax"; else REMOTETAX= ; fi
Reann.pl -file ${TEMP_IN} -config ${ANNOTCONF} -num_threads ${THREADS} -evalue 1e-5 -tax "$REMOTETAX"
else
echo "$SKIPMESSAGE"
fi
status_report ${?} ${STEP_NUM}
touch "$STEP_OKFILE"
cp ${ANNOTCONF} annotator
rm -f ${TEMP_IN}
STEP_NUM=$((STEP_NUM+1))
date_report ${STEP_NUM} "Pickaxe finished"
exitOK "all steps finished"