-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
22 changed files
with
405 additions
and
2,365 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
# This code and the used models were generated using R version 4.1.3 and the | ||
# packages tidyverse (version 2.0.0), gpboost (version 1.2.1), and | ||
# SHAPforxgboost (version 0.1.3). | ||
# To install these package versions on your system, you can run the commented | ||
# code below. | ||
|
||
# install.packages("devtools") | ||
# | ||
# require(devtools) | ||
# install_version("tidyverse", version = "2.0.0", repos = "http://cran.us.r-project.org") | ||
# install_version("gpboost", version = "1.2.1", repos = "http://cran.us.r-project.org") | ||
# install_version("SHAPforxgboost", version = "0.1.3", repos = "http://cran.us.r-project.org") | ||
|
||
#### Load libraries #### | ||
|
||
library(tidyverse) | ||
library(gpboost) | ||
library(SHAPforxgboost) | ||
|
||
#### Example model predictions #### | ||
|
||
# Here we try to reproduce the metrics for the model for fish with DEB parameter | ||
# values available. | ||
# Note that models are named according to the subgroup they are fit for, i.e, | ||
# "Fish with DEB", "Fish without DEB", "Invertebrate with DEB", and | ||
# "Invertebrate without DEB". All vectors and lists are named accordingly. | ||
# As an example, we will analyze data and model for "Fish with DEB". | ||
# However, by simply changing the name variable below, you can perform | ||
# analysis on the other subgroups. | ||
|
||
name <- "Fish with DEB" | ||
# name <- "Fish without DEB" | ||
# name <- "Invertebrate with DEB" | ||
# name <- "Invertebrate without DEB" | ||
|
||
### Load model | ||
|
||
model <- gpb.load(paste0(str_replace_all(name, " ", "_"),".json")) | ||
|
||
### Load training and test data | ||
|
||
training_data <- readRDS("training_data.rds")[[name]] | ||
|
||
test_data <- readRDS("test_data.rds")[[name]] | ||
|
||
# For transparency, training and test data feature a variable 'Latin name', | ||
# i.e., the name of the involved species. This variable is not required for | ||
# predictions, so we drop it here. | ||
|
||
training_data <- training_data %>% | ||
select(-'Latin name') | ||
|
||
test_data <- test_data %>% | ||
select(-'Latin name') | ||
|
||
### Make predictions | ||
|
||
## Create tables with actual vs. predicted target values | ||
|
||
pred_training_data <- tibble(actual = training_data$`Effect value`, | ||
pred = predict(model, data = | ||
training_data %>% | ||
select(-id, -`Effect value`) %>% | ||
as.matrix(), | ||
group_data_pred = | ||
training_data$id, | ||
predict_var = TRUE, pred_latent = | ||
FALSE)[["response_mean"]]) | ||
|
||
pred_test_data <- tibble(actual = test_data$`Effect value`, | ||
pred = predict(model, data = | ||
test_data %>% | ||
select(-id, -`Effect value`) %>% | ||
as.matrix(), | ||
group_data_pred = | ||
test_data$id, | ||
predict_var = TRUE, pred_latent = | ||
FALSE)[["response_mean"]]) | ||
|
||
# Important note: if you want to make predictions with new data, you need to | ||
# feed the model an id that is different from all ids it was trained on. | ||
# Could use, e.g., something like | ||
# id <- max(training_data$id) + 1 | ||
|
||
## Calculate RMSE and RSQ values | ||
|
||
rmse_training <- yardstick::rmse(pred_training_data, pred, actual) %>% | ||
pull(.estimate) | ||
rsq_training <- yardstick::rsq(pred_training_data, pred, actual) %>% | ||
pull(.estimate) | ||
|
||
rmse_test <- yardstick::rmse(pred_test_data, pred, actual) %>% | ||
pull(.estimate) | ||
rsq_test <- yardstick::rsq(pred_test_data, pred, actual) %>% | ||
pull(.estimate) | ||
|
||
# Gives you the results presented in Table 9 of the publication. | ||
|
||
|
||
#### Applicability domain #### | ||
|
||
# We will apply the respective applicability domain to the test data. | ||
# Let's create a helper function first. | ||
|
||
is_in_AD <- function(data, model_name) { | ||
|
||
# prepare dataset | ||
|
||
const <- readRDS("constant_features.rds")[[model_name]] | ||
data_testing <- data %>% | ||
select(-c(1,2, all_of(const))) %>% | ||
as_tibble() | ||
|
||
data_testing <- as_tibble(scale(data_testing)) | ||
|
||
SHAP_names <- readRDS("SHAP_names.rds")[[model_name]] | ||
|
||
data_testing_depr <- data_testing %>% select(any_of(names(SHAP_names))) | ||
|
||
# apply weights | ||
|
||
weighting <- readRDS("SHAP_weights.rds")[[model_name]] | ||
|
||
data_testing_depr2 <- data_testing_depr %>% | ||
mutate(across(everything(), ~ . * pull(weighting[1, cur_column()]))) | ||
|
||
# apply PCA and calculate distances | ||
|
||
pcs <- readRDS("PCAs.rds")[[model_name]] | ||
|
||
eigs <- pcs$sdev^2 | ||
cum_sum <- cumsum(eigs) / sum(eigs) | ||
num_comp <- sum(cum_sum <= 0.99) + 1 | ||
|
||
pca_means <- colMeans(pcs$x) | ||
|
||
pcs_pred <- stats::predict(pcs, data_testing_depr2) | ||
pcs_pred <- pcs_pred[, 1:num_comp, drop = FALSE] | ||
|
||
diffs_pred <- sweep(pcs_pred, 2, pca_means) | ||
sq_diff_pred <- diffs_pred^2 | ||
dists_pred <- apply(sq_diff_pred, 1, function(x) sqrt(sum(x))) | ||
|
||
# create logical vector is_in_AD | ||
|
||
cutoff_dist <- readRDS("cutoff_dists.rds")[[model_name]] | ||
|
||
is_in_AD <- dists_pred < cutoff_dist | ||
|
||
is_in_AD | ||
|
||
} | ||
|
||
# Test the function with the data for "Fish with DEB"; change variable name | ||
# above if you want to test another dataset. | ||
|
||
AD_vector <- is_in_AD(test_data, name) | ||
|
||
# This gives a vector with TRUE where samples are within the applicability | ||
# domain and FALSE where they are outside. | ||
|
||
# We could now apply this vector for indexing either to the whole dataset and | ||
# then do predictions. But we already have the predictions on the whole dataset | ||
# (object 'pred_test_data'). So we can just index this and then recalculate | ||
# RMSE and RSQ. | ||
|
||
pred_test_in_AD <- pred_test_data[AD_vector, ] | ||
|
||
rmse_test_in_AD <- yardstick::rmse(pred_test_in_AD, pred, actual) %>% | ||
pull(.estimate) | ||
rsq_test_in_AD <- yardstick::rsq(pred_test_in_AD, pred, actual) %>% | ||
pull(.estimate) | ||
|
||
# This gives the values presented in Table A3 of the publication. | ||
|
||
|
||
#### Local SHAP predictions #### | ||
|
||
# If you are interested in how one of our models came up with a prediction, | ||
# you can use a local SHAP value breakdown. | ||
# Let's do this on the first sample of the test data as an example. | ||
# Note that X_train is an unlucky naming convention within the shap.values | ||
# function. Also new data can be analysed here. | ||
|
||
SHAP <- shap.values(xgb_model = model, X_train = test_data[1, ] %>% | ||
select(-id, -`Effect value`) %>% as.matrix()) | ||
|
||
# To access the model bias (like an intercept), you can run | ||
|
||
SHAP$BIAS0 | ||
|
||
# For accessing the SHAP scores, run | ||
|
||
SHAP$mean_shap_score %>% enframe() |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,19 @@ | ||
# Bio-QSAR | ||
# Bio-QSARs 2.0 | ||
|
||
This repository includes data, code, and models from the publication: [Physiological variables in machine learning QSARs allow for both cross-chemical and cross-species predictions](https://www.sciencedirect.com/science/article/pii/S0147651323007546) | ||
This repository includes data, code, and models from the publication: [Bio-QSARs 2.0: Unlocking a new level of predictive power for machine learning-based ecotoxicity predictions by exploiting chemical and biological information](https://www.sciencedirect.com/science/article/pii/S0160412024001934) | ||
|
||
## Freshwater fish and invertebrate models | ||
|
||
These Bio-QSAR models allow the prediction of toxicity in freshwater fish and invertebrates. For information on development, use, and limitations, please see our associated publication. | ||
The second-generation Bio-QSAR models provided here allow the prediction of toxicity in freshwater fish and invertebrates. For information on development, use, and limitations, please see our associated publication. | ||
|
||
Models were built using R version 4.1.3 and the packages tidyverse (version 1.3.1), caret (version 6.0.91), and ranger (version 0.13.1). | ||
Models were built using R version 4.1.3 and the packages tidyverse (version 2.0.0), gpboost (version 1.2.1), and SHAPforxgboost (version 0.1.3). | ||
|
||
The script [model_example.R](model_example.R) includes examples for fish and invertebrates. | ||
The script [Example.R](Example.R) includes examples on how to to make predictions with the models, how to apply the respective applicability domain, and how to analyse predictions locally using SHAP. | ||
|
||
## Algorithmic approach for multicollinearity correction | ||
## Updated algorithm for multicollinearity correction | ||
|
||
Here, we provide an R version of an approach to correct datasets for multicollinearity that was recently presented in a [blog post](https://towardsdatascience.com/are-you-dropping-too-many-correlated-features-d1c96654abe6) by Brian Pietracatella. This approach is deemed to prevent the drop of too many variables and thus loose an unnecessarily large amount of information, while still eliminating multicollinearity. For more information, see our associated publication. | ||
Additionally, we provide an updated R version of an algorithmic approach to correct datasets for multicollinearity that was presented in a [blog post](https://towardsdatascience.com/are-you-dropping-too-many-correlated-features-d1c96654abe6) by Brian Pietracatella. This approach is deemed to prevent the drop of too many variables and thus loose an unnecessarily large amount of information, while still eliminating multicollinearity. For more information, see our associated publication. | ||
|
||
The function was built using R version 4.1.3 and the packages tidyverse (version 1.3.1) and caret (version 6.0.91). | ||
The function was built using R version 4.1.3 and the packages tidyverse (version 2.0.0) and caret (version 6.0.94). | ||
|
||
The script [multicoll_example.R](multicoll_example.R) includes an example. | ||
|
||
The function returns a list with four elements: | ||
* result$drop : the features to drop according to the algorithm | ||
* result$caret_before : the features to drop according to the caret function findCorrelation() | ||
* result$caret_after : result of a second run of findCorrelation() after result$drop features have been dropped from the dataset; should be empty | ||
* result$saved : features that have been saved by the algorithm | ||
The function [multicoll_sol.R](multicoll_sol.R) now allows for missing data running on pairwise complete observations. An example of usage can be found [here](https://github.com/syngenta/bio-qsar/blob/main/multicoll_example.R). |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.