Commit updated release

syngenta · Apr 15, 2024 · 817ac8e · 817ac8e
1 parent c4abd32
commit 817ac8e
Show file tree

Hide file tree

Showing 22 changed files with 405 additions and 2,365 deletions.
diff --git a/Example.R b/Example.R
@@ -0,0 +1,194 @@
+# This code and the used models were generated using R version 4.1.3 and the 
+# packages tidyverse (version 2.0.0), gpboost (version 1.2.1), and
+# SHAPforxgboost (version 0.1.3).
+# To install these package versions on your system, you can run the commented 
+# code below.
+
+# install.packages("devtools")
+# 
+# require(devtools)
+# install_version("tidyverse", version = "2.0.0", repos = "http://cran.us.r-project.org")
+# install_version("gpboost", version = "1.2.1", repos = "http://cran.us.r-project.org")
+# install_version("SHAPforxgboost", version = "0.1.3", repos = "http://cran.us.r-project.org")
+
+#### Load libraries ####
+
+library(tidyverse)
+library(gpboost)
+library(SHAPforxgboost)
+
+#### Example model predictions ####
+
+# Here we try to reproduce the metrics for the model for fish with DEB parameter
+# values available. 
+# Note that models are named according to the subgroup they are fit for, i.e, 
+# "Fish with DEB", "Fish without DEB", "Invertebrate with DEB", and 
+# "Invertebrate without DEB". All vectors and lists are named accordingly. 
+# As an example, we will analyze data and model for "Fish with DEB". 
+# However, by simply changing the name variable below, you can perform
+# analysis on the other subgroups.
+
+name <- "Fish with DEB"
+# name <- "Fish without DEB"
+# name <- "Invertebrate with DEB"
+# name <- "Invertebrate without DEB"
+
+### Load model
+
+model <- gpb.load(paste0(str_replace_all(name, " ", "_"),".json"))
+
+### Load training and test data
+
+training_data <- readRDS("training_data.rds")[[name]]
+
+test_data <- readRDS("test_data.rds")[[name]]
+
+# For transparency, training and test data feature a variable 'Latin name',
+# i.e., the name of the involved species. This variable is not required for 
+# predictions, so we drop it here.
+
+training_data <- training_data %>%
+  select(-'Latin name')
+
+test_data <- test_data %>%
+  select(-'Latin name')
+
+### Make predictions
+
+## Create tables with actual vs. predicted target values
+
+pred_training_data <- tibble(actual = training_data$`Effect value`, 
+                             pred = predict(model, data = 
+                                              training_data %>% 
+                                              select(-id, -`Effect value`) %>% 
+                                              as.matrix(), 
+                                            group_data_pred = 
+                                              training_data$id, 
+                                            predict_var = TRUE, pred_latent = 
+                                              FALSE)[["response_mean"]])
+
+pred_test_data <- tibble(actual = test_data$`Effect value`, 
+                             pred = predict(model, data = 
+                                              test_data %>% 
+                                              select(-id, -`Effect value`) %>% 
+                                              as.matrix(), 
+                                            group_data_pred = 
+                                              test_data$id, 
+                                            predict_var = TRUE, pred_latent = 
+                                              FALSE)[["response_mean"]])
+
+# Important note: if you want to make predictions with new data, you need to
+# feed the model an id that is different from all ids it was trained on.
+# Could use, e.g., something like 
+# id <- max(training_data$id) + 1
+
+## Calculate RMSE and RSQ values
+
+rmse_training <- yardstick::rmse(pred_training_data, pred, actual) %>%
+  pull(.estimate)
+rsq_training <- yardstick::rsq(pred_training_data, pred, actual) %>%
+  pull(.estimate)
+
+rmse_test <- yardstick::rmse(pred_test_data, pred, actual) %>%
+  pull(.estimate)
+rsq_test <- yardstick::rsq(pred_test_data, pred, actual) %>%
+  pull(.estimate)
+
+# Gives you the results presented in Table 9 of the publication.
+
+
+#### Applicability domain #### 
+
+# We will apply the respective applicability domain to the test data.
+# Let's create a helper function first.
+
+is_in_AD <- function(data, model_name) {
+
+  # prepare dataset 
+
+  const <- readRDS("constant_features.rds")[[model_name]]
+  data_testing <- data %>% 
+    select(-c(1,2, all_of(const))) %>%
+    as_tibble()
+
+  data_testing <- as_tibble(scale(data_testing))
+
+  SHAP_names <- readRDS("SHAP_names.rds")[[model_name]]
+
+  data_testing_depr <- data_testing %>% select(any_of(names(SHAP_names)))
+
+  # apply weights
+
+  weighting <- readRDS("SHAP_weights.rds")[[model_name]]
+
+  data_testing_depr2 <- data_testing_depr %>% 
+    mutate(across(everything(), ~ . * pull(weighting[1, cur_column()])))
+
+  # apply PCA and calculate distances
+
+  pcs <- readRDS("PCAs.rds")[[model_name]]
+
+  eigs <- pcs$sdev^2
+  cum_sum <- cumsum(eigs) / sum(eigs)
+  num_comp <- sum(cum_sum <= 0.99) + 1
+
+  pca_means <- colMeans(pcs$x)
+
+  pcs_pred <- stats::predict(pcs, data_testing_depr2)
+  pcs_pred <- pcs_pred[, 1:num_comp, drop = FALSE]
+
+  diffs_pred <- sweep(pcs_pred, 2, pca_means)
+  sq_diff_pred <- diffs_pred^2
+  dists_pred <- apply(sq_diff_pred, 1, function(x) sqrt(sum(x)))
+
+  # create logical vector is_in_AD
+
+  cutoff_dist <- readRDS("cutoff_dists.rds")[[model_name]]
+
+  is_in_AD <- dists_pred < cutoff_dist
+
+  is_in_AD
+
+}
+
+# Test the function with the data for "Fish with DEB"; change variable name
+# above if you want to test another dataset.
+
+AD_vector <- is_in_AD(test_data, name)
+
+# This gives a vector with TRUE where samples are within the applicability 
+# domain and FALSE where they are outside.
+
+# We could now apply this vector for indexing either to the whole dataset and
+# then do predictions. But we already have the predictions on the whole dataset
+# (object 'pred_test_data'). So we can just index this and then recalculate 
+# RMSE and RSQ.
+
+pred_test_in_AD <- pred_test_data[AD_vector, ]
+
+rmse_test_in_AD <- yardstick::rmse(pred_test_in_AD, pred, actual) %>%
+  pull(.estimate)
+rsq_test_in_AD <- yardstick::rsq(pred_test_in_AD, pred, actual) %>%
+  pull(.estimate)
+
+# This gives the values presented in Table A3 of the publication.
+
+
+#### Local SHAP predictions ####
+
+# If you are interested in how one of our models came up with a prediction,
+# you can use a local SHAP value breakdown.
+# Let's do this on the first sample of the test data as an example.
+# Note that X_train is an unlucky naming convention within the shap.values
+# function. Also new data can be analysed here.
+
+SHAP <- shap.values(xgb_model = model, X_train = test_data[1, ] %>%
+                      select(-id, -`Effect value`) %>% as.matrix())
+
+# To access the model bias (like an intercept), you can run
+
+SHAP$BIAS0
+
+# For accessing the SHAP scores, run
+
+SHAP$mean_shap_score %>% enframe()
diff --git a/Fish_with_DEB.json b/Fish_with_DEB.json
diff --git a/Fish_without_DEB.json b/Fish_without_DEB.json
diff --git a/Invertebrate_with_DEB.json b/Invertebrate_with_DEB.json
diff --git a/Invertebrate_without_DEB.json b/Invertebrate_without_DEB.json
diff --git a/PCAs.rds b/PCAs.rds
diff --git a/README.md b/README.md
@@ -1,25 +1,19 @@
-# Bio-QSAR
+# Bio-QSARs 2.0
 
-This repository includes data, code, and models from the publication: [Physiological variables in machine learning QSARs allow for both cross-chemical and cross-species predictions](https://www.sciencedirect.com/science/article/pii/S0147651323007546)
+This repository includes data, code, and models from the publication: [Bio-QSARs 2.0: Unlocking a new level of predictive power for machine learning-based ecotoxicity predictions by exploiting chemical and biological information](https://www.sciencedirect.com/science/article/pii/S0160412024001934)
 
 ## Freshwater fish and invertebrate models
 
-These Bio-QSAR models allow the prediction of toxicity in freshwater fish and invertebrates. For information on development, use, and limitations, please see our associated publication.
+The second-generation Bio-QSAR models provided here allow the prediction of toxicity in freshwater fish and invertebrates. For information on development, use, and limitations, please see our associated publication.
 
-Models were built using R version 4.1.3 and the packages tidyverse (version 1.3.1), caret (version 6.0.91), and ranger (version 0.13.1).
+Models were built using R version 4.1.3 and the packages tidyverse (version 2.0.0), gpboost (version 1.2.1), and SHAPforxgboost (version 0.1.3).
 
-The script [model_example.R](model_example.R) includes examples for fish and invertebrates.
+The script [Example.R](Example.R) includes examples on how to to make predictions with the models, how to apply the respective applicability domain, and how to analyse predictions locally using SHAP.
 
-## Algorithmic approach for multicollinearity correction
+## Updated algorithm for multicollinearity correction
 
-Here, we provide an R version of an approach to correct datasets for multicollinearity that was recently presented in a [blog post](https://towardsdatascience.com/are-you-dropping-too-many-correlated-features-d1c96654abe6) by Brian Pietracatella. This approach is deemed to prevent the drop of too many variables and thus loose an unnecessarily large amount of information, while still eliminating multicollinearity. For more information, see our associated publication.
+Additionally, we provide an updated R version of an algorithmic approach to correct datasets for multicollinearity that was presented in a [blog post](https://towardsdatascience.com/are-you-dropping-too-many-correlated-features-d1c96654abe6) by Brian Pietracatella. This approach is deemed to prevent the drop of too many variables and thus loose an unnecessarily large amount of information, while still eliminating multicollinearity. For more information, see our associated publication.
 
-The function was built using R version 4.1.3 and the packages tidyverse (version 1.3.1) and caret (version 6.0.91).
+The function was built using R version 4.1.3 and the packages tidyverse (version 2.0.0) and caret (version 6.0.94).
 
-The script [multicoll_example.R](multicoll_example.R) includes an example.
-
-The function returns a list with four elements:
-* result$drop :  the features to drop according to the algorithm
-* result$caret_before : the features to drop according to the caret function findCorrelation()
-* result$caret_after : result of a second run of findCorrelation() after result$drop features have been dropped from the dataset; should be empty
-* result$saved : features that have been saved by the algorithm
+The function [multicoll_sol.R](multicoll_sol.R) now allows for missing data running on pairwise complete observations. An example of usage can be found [here](https://github.com/syngenta/bio-qsar/blob/main/multicoll_example.R).
diff --git a/SHAP_names.rds b/SHAP_names.rds
diff --git a/SHAP_weights.rds b/SHAP_weights.rds
diff --git a/constant_features.rds b/constant_features.rds
diff --git a/cutoff_dists.rds b/cutoff_dists.rds
diff --git a/example_data_fish.csv b/example_data_fish.csv