Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate compounds by looking for the "symptoms" of OpenBabel issues #104

Merged
merged 12 commits into from
May 9, 2024
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ biocViews:
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# volcalc (development version)

* adds a `validate = TRUE` option to `calc_vol()` and `get_fx_groups()` that returns `NA`s when there are suspected errors in parsing SMILES or .mol files. This is unfortunately not available on Windows due to differences in the windows version of `ChemmineOB`

# volcalc 2.1.2

* There is no release for this version as it was a rejected CRAN submission.
Expand Down
11 changes: 7 additions & 4 deletions R/calc_vol.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
#' follows: `"clean"` (clean atmosphere, default) -2, 0, 2; `"polluted"`
#' (polluted atmosphere) 0, 2, 4; `"soil"` 4, 6, 8. For more information about
#' these thresholds see Meredith et al. (2023) and Donahue et al. (2006).
#' @param validate logical; if `TRUE` (default), results are checked for
#' possible errors in parsing by Open Babel and `NA`s are returned if possible
#' errors are found. Setting to `FALSE` bypasses these checks—use at your own
#' risk! Validation is not available on Windows. See **Details** of
#' [get_fx_groups()] for more information.
#' @param return_fx_groups When `TRUE`, the returned tibble includes functional
#' group counts.
#' @param return_calc_steps When `TRUE`, the returned tibble includes
Expand Down Expand Up @@ -55,6 +60,7 @@ calc_vol <-
from = c("mol_path", "smiles"),
method = c("meredith", "simpol1"),
environment = c("clean", "polluted", "soil"),
validate = TRUE,
return_fx_groups = FALSE,
return_calc_steps = FALSE) {

Expand All @@ -78,17 +84,14 @@ calc_vol <-
)

if(from == "mol_path") {
#TODO: validate mol files??
compound_sdf_list <- lapply(input, ChemmineR::read.SDFset)
}

if(from == "smiles") {
compound_sdf_list <- lapply(input, ChemmineR::smiles2sdf)
}
#TODO use ChemmineR::validSDF() to check for any invalid inputs
# Warn and return NA for any that aren't valid??
fx_groups_df_list <-
lapply(compound_sdf_list, get_fx_groups)
lapply(compound_sdf_list, get_fx_groups, validate = validate)
names(fx_groups_df_list) <- input
fx_groups_df <-
#adds column for input named "mol_path" or "smiles"
Expand Down
51 changes: 45 additions & 6 deletions R/get_fx_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@
#'
#' @param compound_sdf a [ChemmineR::SDFset] object returned by
#' [ChemmineR::read.SDFset()] or [ChemmineR::smiles2sdf()], for example.
#' @param validate logical; if `TRUE` (default), results are checked for
#' possible errors in parsing by Open Babel and `NA`s are returned if possible
#' errors are found. Setting to `FALSE` bypasses these checks—use at your own
#' risk! Validation is not available on Windows. See **Details** for more
#' information.
#'
#' @details It is unfortunately difficult to capture errors and warnings
#' produced by the command line tool OpenBabel used by `ChemmineOB`, a
#' dependency of `volcalc`. These errors and warnings are printed to the R
#' console, but they are *not* R errors and do not stop code from running and
#' producing potentially incorrect data. `validate = TRUE` checks the output
#' of certain OpenBabel procedures for the *symptoms* of these errors, namely
#' missing values for InChI and molecular formula. Unfortunately, since InChI
#' generation is not available with the Windows version of `ChemmineOB`, this
#' validation step cannot be performed on Windows and `validate = TRUE` will
#' simply print a warning that can be silenced by setting `validate = FALSE`.
#'
#'
#' @returns A tibble with columns of basic compound info and functional group
#' counts.
Expand All @@ -21,7 +38,13 @@
#' get_fx_groups(sdf)
#'
#' @export
get_fx_groups <- function(compound_sdf) {
get_fx_groups <- function(compound_sdf, validate = TRUE) {

if(Sys.info()[["sysname"]] == "Windows" & isTRUE(validate)) {
warning("`validate = TRUE` is not available on Windows.
Set `validate = FALSE` to silence this warning.")
validate <- FALSE
}

# For now at least, this code only works with SDFset objects that contain
# single molecules.
Expand All @@ -30,6 +53,8 @@ get_fx_groups <- function(compound_sdf) {
stop("SDFset objects must contain a single molecule only")
}

#get basic properties
prop_ob <- ChemmineR::propOB(compound_sdf)
chem_groups <- ChemmineR::groups(compound_sdf,
groups = "fctgroup",
type = "countMA")
Expand Down Expand Up @@ -81,13 +106,13 @@ get_fx_groups <- function(compound_sdf) {
nitro_pattern <- "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]"
hydroxyl_aromatic_pattern <- "[OX2H]c"
nitrate_pattern <- "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]"

#TODO need patterns for amines that don't pick up amides
amine_primary_pattern <- "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4]"
amine_secondary_pattern <- "[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4]"
amine_tertiary_pattern <- "[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4]"
amine_aromatic_pattern <- "[NX3;!$(NO)]c"

amide_primary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]"
amide_secondary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]"
amide_tertiary_pattern <-
Expand Down Expand Up @@ -117,11 +142,11 @@ get_fx_groups <- function(compound_sdf) {

fx_groups_df <-
dplyr::tibble(
formula = ChemmineR::propOB(compound_sdf)$formula,
formula = prop_ob$formula,
#TODO should name be moved to `calc_vol`? `formula` also?
name = ChemmineR::propOB(compound_sdf)$title,
name = prop_ob$title,
exact_mass = ChemmineR::exactMassOB(compound_sdf),
molecular_weight = ChemmineR::propOB(compound_sdf)$MW
molecular_weight = prop_ob$MW
) %>%
dplyr::mutate(
carbons = atoms[["C"]] %||% 0L,
Expand Down Expand Up @@ -205,6 +230,20 @@ get_fx_groups <- function(compound_sdf) {
#TODO should this be moved to `calc_vol?`. It's only relevant when from = "mol_path"
dplyr::mutate(name = ifelse(.data$name == "", NA_character_, .data$name))

if (isTRUE(validate)) {
# when SDFs have problems, the results of propOB() usually don't have InChI or formula
# TODO: possibly add `| isFALSE(ChemmineR::validSDF(compound_sdf))`, although I've never seen this catch any problems in practice.
if (prop_ob$InChI == "" | prop_ob$formula == "") {
fx_groups_df <- fx_groups_df %>%
dplyr::mutate(dplyr::across(-"name", \(x) magrittr::set_class(NA, class(x))))
warning("Possible OpenBabel errors detected and only NAs returned.
Run with `validate = FALSE` to ignore this.")
}
}

#return
fx_groups_df
}

#for mocking in tests. see: https://blog.r-hub.io/2024/03/21/mocking-new-take/ and ?testthat::local_mocked_bindings()
Sys.info <- NULL
2 changes: 2 additions & 0 deletions R/mol_example.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
#'
#' @details File names are the KEGG identifiers. Compound names are as follows:
#' - C00031: D-Glucose
#' - C00157: Phosphatidylcholine
#' - C08491: (-)-Jasmonic acid
#' - C16181: beta-2,3,4,5,6-Pentachlorocyclohexanol
#' - C16286: Geosmin
#' - C16521: Isoprene

#' @returns File paths to installed example .mol files.
#' @export
#'
Expand Down
51 changes: 51 additions & 0 deletions inst/extdata/C00157.mol
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Phosphatidylcholine


22 21 0 0 0 0 0 0 0 0999 V2000
22.3185 -17.9800 0.0000 P 0 0 3 0 0 0 0 0 0 0 0 0
20.9987 -17.9800 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
23.6441 -17.9800 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
22.3126 -19.3173 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0
22.3243 -16.6602 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
19.6787 -17.9800 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
24.7888 -17.3260 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
19.6671 -16.1522 0.0000 C 0 0 3 0 0 0 0 0 0 0 0 0
25.9334 -17.9800 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
18.1137 -16.1813 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
19.6787 -14.4409 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.0840 -17.3260 0.0000 N 0 3 3 0 0 0 0 0 0 0 0 0
16.7879 -16.1813 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
20.9987 -14.4409 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
28.8301 -17.8574 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.0840 -16.0062 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.4227 -18.6048 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
16.7763 -14.8555 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
15.4681 -16.1813 0.0000 R 0 0 0 0 0 0 0 0 0 0 0 0
22.3185 -14.4409 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
22.3126 -13.1211 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
23.6441 -14.4409 0.0000 R 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0
1 3 1 0 0 0
1 4 1 0 0 0
1 5 2 0 0 0
2 6 1 0 0 0
3 7 1 0 0 0
6 8 1 0 0 0
7 9 1 0 0 0
8 10 1 0 0 0
8 11 1 0 0 0
9 12 1 0 0 0
10 13 1 0 0 0
11 14 1 0 0 0
12 15 1 0 0 0
12 16 1 0 0 0
12 17 1 0 0 0
13 18 2 0 0 0
13 19 1 0 0 0
14 20 1 0 0 0
20 21 2 0 0 0
20 22 1 0 0 0
M CHG 2 4 -1 12 1
M END


9 changes: 8 additions & 1 deletion man/calc_vol.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 19 additions & 1 deletion man/get_fx_groups.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/mol_example.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/volcalc-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 51 additions & 0 deletions tests/testthat/data/C00157.mol
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Phosphatidylcholine


22 21 0 0 0 0 0 0 0 0999 V2000
22.3185 -17.9800 0.0000 P 0 0 3 0 0 0 0 0 0 0 0 0
20.9987 -17.9800 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
23.6441 -17.9800 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
22.3126 -19.3173 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0
22.3243 -16.6602 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
19.6787 -17.9800 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
24.7888 -17.3260 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
19.6671 -16.1522 0.0000 C 0 0 3 0 0 0 0 0 0 0 0 0
25.9334 -17.9800 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
18.1137 -16.1813 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
19.6787 -14.4409 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.0840 -17.3260 0.0000 N 0 3 3 0 0 0 0 0 0 0 0 0
16.7879 -16.1813 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
20.9987 -14.4409 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
28.8301 -17.8574 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.0840 -16.0062 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
27.4227 -18.6048 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
16.7763 -14.8555 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
15.4681 -16.1813 0.0000 R 0 0 0 0 0 0 0 0 0 0 0 0
22.3185 -14.4409 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
22.3126 -13.1211 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
23.6441 -14.4409 0.0000 R 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0
1 3 1 0 0 0
1 4 1 0 0 0
1 5 2 0 0 0
2 6 1 0 0 0
3 7 1 0 0 0
6 8 1 0 0 0
7 9 1 0 0 0
8 10 1 0 0 0
8 11 1 0 0 0
9 12 1 0 0 0
10 13 1 0 0 0
11 14 1 0 0 0
12 15 1 0 0 0
12 16 1 0 0 0
12 17 1 0 0 0
13 18 2 0 0 0
13 19 1 0 0 0
14 20 1 0 0 0
20 21 2 0 0 0
20 22 1 0 0 0
M CHG 2 4 -1 12 1
M END


36 changes: 36 additions & 0 deletions tests/testthat/test-get_fx_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,41 @@ test_that("SMARTS strings are correct", {
)
})

test_that("validate = TRUE works", {
skip_on_os("windows") #validation not available on Windows because InChI aren't created on Windows
sdf_bad_header <- ChemmineR::read.SDFset(test_path("data/C16181_malformed_header.mol"))
sdf_r_group <- ChemmineR::read.SDFset(test_path("data/C00157.mol"))
expect_true(
all(is.na(
suppressWarnings(get_fx_groups(sdf_bad_header, validate = TRUE)) %>%
dplyr::select(-name)
))
)
expect_true(
all(is.na(
suppressWarnings(get_fx_groups(sdf_r_group, validate = TRUE)) %>%
dplyr::select(-name)
))
)
expect_warning(get_fx_groups(sdf_r_group, validate = TRUE),
"Possible OpenBabel errors detected and only NAs returned.
Run with `validate = FALSE` to ignore this.")
expect_equal(
get_fx_groups(sdf_r_group, validate = FALSE)$carbons,
10
)
})

test_that("validate = TRUE doesn't change correct results", {
from_smiles <- ChemmineR::smiles2sdf("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)O")
expect_equal(get_fx_groups(from_smiles, validate = TRUE),
get_fx_groups(from_smiles, validate = FALSE))
})

test_that("validate = TRUE warns on Windows and doesn't do anything", {
local_mocked_bindings(Sys.info = function(...) c(sysname = "Windows"))
from_smiles <- ChemmineR::smiles2sdf("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)O")
expect_warning(get_fx_groups(from_smiles, validate = TRUE),
"`validate = TRUE` is not available on Windows.
Set `validate = FALSE` to silence this warning.")
})
Loading
Loading