-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVennDiagrams.Rmd
136 lines (103 loc) · 4.72 KB
/
VennDiagrams.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
---
title: "VennDiagrams"
author: "Nikeisha Caruana"
date: "21 February 2017"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library(VennDiagram)
library(splitstackshape)
library(stringr)
library(dplyr)
```
Venn Diagram for Maxquant vs PROTXML glycoproteins
```{r}
#read in files
glyco_MQ <- read.csv("raw_data/Glycosylated_MQ.csv", sep='\t', header = FALSE)
glyco_PROT <- read.csv("raw_data/Glycosylated_PROTXML.csv", sep='\t', header = TRUE)
#split Maxquant to compare proteins
#glyco_MQ <- cSplit(glyco_MQ, "V1", ";","long")
#extract protein IDs
glyco_MQ <- str_extract(glyco_MQ$V1,"[A-Z]*_[A-Z]{2}[0-9]{0,6}_c[0-9]{1,2}_g[0-9]{1}_i[0-9]{1}")
#make venn diagram
vd<-venn.diagram(list(Maxquant=(glyco_MQ),PROTXML=(glyco_PROT$x)),NULL,main="Identified glycosylated proteins",cat.just,
fill = c("slateblue2", "turquoise"), fontfamily='sans', cat.fontfamily= "sans", main.fontfamily = "sans",
cat.dist = c(0.03, 0.03),cat.pos = c(-14, 14))
grid.draw(vd)
```
Venn Diagram for PNGase F vs non-PNGase F Proteins in MQ
```{r}
#read in data
MQ_data = read.table("raw_data/proteinGroups_PNGaseFSL.txt",sep="\t",header=TRUE, row.names = 6)
iBAQdata = MQ_data[c(68,69,70,71,72,73,74,75)]
iBAQdata = subset(iBAQdata, !(iBAQ.1_160309183003 == 0 & iBAQ.2 == 0 & iBAQ.3 == 0 & iBAQ.4 == 0 & iBAQ.1P == 0 & iBAQ.2P == 0 & iBAQ.3P == 0 & iBAQ.4P == 0))
#subset iBAQ data
MQdata_Psub <- subset(iBAQdata, iBAQ.1P > 0 | iBAQ.2P > 0 | iBAQ.3P > 0 | iBAQ.4P > 0)
MQdata_Psub2 <- subset(MQdata_Psub, iBAQ.1_160309183003 == 0 & iBAQ.2 == 0 & iBAQ.3 == 0 & iBAQ.4 == 0)
MQdata_NPsub <- subset(iBAQdata, iBAQ.1_160309183003 > 0 | iBAQ.2 > 0 | iBAQ.3 > 0 | iBAQ.4 > 0)
MQdata_NPsub2 <- subset(MQdata_NPsub, iBAQ.1P == 0 & iBAQ.2P == 0 & iBAQ.3P == 0 & iBAQ.4P == 0 )
```
```{r}
#makes sure the row names are their own column called ID
editiBAQdata<- cbind(ID = rownames(iBAQdata), iBAQdata)
row.names(editiBAQdata) <- NULL
editNP<- cbind(ID = rownames(MQdata_NPsub2), MQdata_NPsub2)
row.names(editNP) <- NULL
editP<- cbind(ID = rownames(MQdata_Psub2), MQdata_Psub2)
row.names(editP) <- NULL
```
```{r}
#anti merge the whole iBAQ data with the subset of NP to get rid of it from the data
merged <- iBAQdata[!(editiBAQdata$ID %in% editNP$ID),]
merged<- cbind(ID = rownames(merged), merged)
row.names(merged) <- NULL
#antimerge the P and iBAQ data -> so now you only have the iBAQ data that doesnt have NP or P alone (i.e the middle of the diagram)
merged <- merged[!(merged$ID %in% editP$ID),]
```
```{r}
#add the iBAQ data and the NP data then the same with the P data
datasetNP <- rbind(merged,editNP)
datasetNP <- filter(datasetNP, grepl("TRINITY",ID))
datasetP <- rbind(merged,editP)
datasetP <- filter(datasetP, grepl("TRINITY",ID))
#make venn diagram
vd<-venn.diagram(list(NonPNGaseF=(datasetNP$ID),PNGaseF=(datasetP$ID)),NULL,main="Proteins in S. lineolata slime",cat.just,fill = c("cornflowerblue", "seagreen3"), fontfamily='sans', cat.fontfamily= "sans", main.fontfamily = "sans", cat.dist = c(0.03, 0.03),cat.pos = c(-14, 14))
grid.draw(vd)
```
Venn Diagram for PNGase F vs non-PNGase F Proteins in the TPP
```{r}
#read in data
TPPP_data = read.table("raw_data/observed_peptides_PNGaseF.gff3",sep="\t",header=FALSE)
TPPNP_data = read.table("raw_data/observed_peptideNPs.gff3",sep="\t",header=FALSE)
datasetTPPP <- TPPP_data$V1
datasetTPPP <- as.data.frame(datasetTPPP)
datasetTPPP <- datasetTPPP %>% filter(!grepl("decoy",datasetTPPP)) %>% filter(!grepl("sp",datasetTPPP))
datasetTPPP <- unique(datasetTPPP)
datasetTPPNP <- TPPNP_data$V1
datasetTPPNP <- as.data.frame(datasetTPPNP)
datasetTPPNP <- datasetTPPNP %>% filter(!grepl("decoy",datasetTPPNP)) %>% filter(!grepl("sp",datasetTPPNP))
datasetTPPNP <-unique(datasetTPPNP)
```
```{r}
#make venn diagram
vd<-venn.diagram(list(NonPNGaseF=(datasetTPPNP$datasetNP),PNGaseF=(datasetTPPP$datasetP)),NULL,main="Proteins identified by the Trans-Proteomic Pipeline in S. lineolata slime",cat.just,fill = c("cornflowerblue", "seagreen3"), fontfamily='sans', cat.fontfamily= "sans", main.fontfamily = "sans", cat.dist = c(0.03, 0.03),cat.pos = c(-14, 14))
grid.draw(vd)
```
```{r checking overall number of proteins PNGase F and Non-PNGase in both search engines}
#pngasef
MQPNGase <- datasetP$ID
TPPPNGase <- datasetTPPP$datasetTPPP
MQPNGase <- str_match(MQPNGase,"TRINITY_DN[0-9]*_c[0-9]*_g[0-9]*_i[0-9]")
MQPNGase <- na.omit(MQPNGase)
intersect <- intersect(TPPPNGase, MQPNGase)
#nonpngasef
MQNPNGase <- datasetNP$ID
TPPNPNGase <- datasetTPPNP$datasetTPPNP
MQNPNGase <- str_match(MQNPNGase,"TRINITY_DN[0-9]*_c[0-9]*_g[0-9]*_i[0-9]")
MQNPNGase <- na.omit(MQNPNGase)
intersectNP <- intersect(TPPNPNGase, MQNPNGase)
#onlyinTPPP
TPPPonly <- setdiff(TPPPNGase, MQPNGase)
```