-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSEQUEL_process.awk
96 lines (67 loc) · 1.63 KB
/
SEQUEL_process.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
function get_library_flag(str,srMax){
gsub(/[[:digit:]]+(L|B)/,"",str);
gsub(/[[:digit:]]+A/,"A",str);
gsub(/A+/,"A",str);
if(length(gensub(/[^A]+/,"","g",str))>=2 && str!~/S[[:digit:]]+S/){
gsub(/S/,"",str);
sub(/^[^A]*A/,"",str);
sub(/A[^A]*$/,"",str);
gsub(/A/,",",str);
split(str,sr,",");
asort(sr);
if(sr[1]>=(srMax*0.7)){return 1}
}
return 0;
}
BEGIN{}
{
#- Compute polymerase (ZMW) read length
prLen=0;
prStr=$3
sub(/[[:alpha:]]$/,"",prStr);
gsub(/[[:alpha:]]/,"_",prStr);
prCnt=split(prStr,p,"_");
for(i=1;i<=prCnt;++i){prLen+=p[i];}
#-
#- Count HQ regions, compute total HQ sequence, and get longest HQ segment
hqTot=0;
hqMax=0;
hqStr=gensub(/[[:digit:]]+L/,"","g",$3);
sub(/H$/,"",hqStr);
hqCnt=split(hqStr,h,"H");
if(hqCnt>0){
asort(h);
hqMax=h[hqCnt];
for(i=1;i<=hqCnt;++i){hqTot+=h[i];}
}
#-
#- Count subreads and get longest subread
srTot=0;
srMax=0;
srStr=gensub(/[[:digit:]]+[ABFL]/,"","g",$4);
sub(/S$/,"",srStr);
srCnt=split(srStr,s,"S");
if(srCnt>0){
asort(s);
srMax=s[srCnt];
for(i=1;i<=srCnt;++i){srTot+=s[i];}
}
#-
#- Count adapters
aCnt=length(gensub(/[^A]+/,"","g",$4));
#-
#- Count barcodes
bCnt=length(gensub(/[^B]+/,"","g",$4));
#-
#- Count filtered reads
fCnt=length(gensub(/[^F]+/,"","g",$4));
#-
#- Check whether read can be used to estimate library size distribution
lFlg=0;
if(srCnt>=1 && fCnt==0 && aCnt>=2){
lFlg=get_library_flag($4,srMax);
}
#-
printf "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\n",$1,$2,prLen,hqTot,hqMax,srTot,srMax,hqCnt,srCnt,aCnt,bCnt,fCnt,lFlg,$4;
}
END{}