Skip to content

Commit

Permalink
Merge pull request #84 from Meredith-Lab/get_mol_kegg
Browse files Browse the repository at this point in the history
Improvements to get_mol_kegg()
  • Loading branch information
Aariq authored Oct 30, 2023
2 parents 5c438d6 + 7d0b331 commit 174a0ed
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 41 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ Imports:
ChemmineR,
dplyr,
fs,
glue,
httr2,
KEGGREST,
magrittr,
purrr,
stringr,
tibble,
tidyr,
Expand Down
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

* It is now possible to supply input to `calc_vol()` as a vector of SMILES strings with `from = "smiles"`
* Users can now choose from RVI thresholds for non-volatile, low, moderate, and high volatility for clean atmosphere, polluted atmosphere, or soil using the `environment` parameter of `calc_vol()`
* Chagnes to the output of `get_fx_groups()`: `mass` column renamed to `molecular_weight` and addition of an `exact_mass` column
* Changes to the output of `get_fx_groups()`: `mass` column renamed to `molecular_weight` and addition of an `exact_mass` column
* The `pathway_ids` argument of `get_mol_kegg()` now also accepts pathway *module* IDs (e.g. "M00082")
* `get_mol_kegg()` got a significant speed improvement (#84)
* By default, `get_mol_kegg()` now skips downloading a .mol file if it is already present in `dir` (override with `force = TRUE`)

# volcalc 2.0.0

Expand Down
114 changes: 77 additions & 37 deletions R/get_mol_kegg.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ utils::globalVariables(".data")
#'
#' @param compound_ids character vector of KEGG compound IDs---5 digits
#' prepended with a "C".
#' @param pathway_ids character vector of KEGG pathway IDs---5 digits prepended
#' with "map".
#' @param pathway_ids character vector of KEGG pathway or pathway module IDs---5
#' digits prepended with "map" or "M", respectively.
#' @param dir path to a folder to save .mol files in. Folder will be created if
#' it does not already exist
#' @param force logical; by default (`FALSE`), .mol files will not be downloaded
#' if they are found in `dir`. Set this to `TRUE` to download and overwrite
#' existing files.
#'
#' @returns a tibble with the columns `compound_ids`, `pathway_ids` (if used),
#' and `mol_paths` (paths to downloaded .mol files)
Expand All @@ -20,9 +23,7 @@ utils::globalVariables(".data")
#' get_mol_kegg(compound_ids = c("C16181", "C06074"), dir = tempdir())
#' get_mol_kegg(pathway_ids = "map00253", dir = tempdir())
#' }
get_mol_kegg <- function(compound_ids, pathway_ids, dir){

#TODO: implement redownload=FALSE arg?
get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){

if(missing(dir)) stop("`dir` is required")
if ((missing(compound_ids) & missing(pathway_ids)) |
Expand All @@ -42,7 +43,7 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir){
}
# if pathways are provided
if (!missing(pathway_ids)) {
if (!all(stringr::str_detect(pathway_ids, "^[m][a][p][:digit:]{5}$"))) {
if (!all(stringr::str_detect(pathway_ids, "^(map|M)\\d{5}$"))) {
stop("Some pathway_ids are not in the correct KEGG format")
}
fs::dir_create(dir, pathway_ids)
Expand All @@ -52,42 +53,39 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir){
tibble::enframe(compound_ids_list, name = "pathway_id", value = "compound_id") %>%
tidyr::unnest(tidyselect::everything()) %>%
dplyr::mutate(mol_path = fs::path(dir, .data$pathway_id, .data$compound_id, ext = "mol"))

}

# Download mols
.get_mol_kegg <- function(compound_id) {
#TODO I think the KEGG API can handle up to 10 requests at once separated by
#"+". Unfortunately all the mol files come out in a single textfile and
#would need parsing to separate. Could speed things up by reducing API
#calls, but would require additional code.
mol <- KEGGREST::keggGet(compound_id, option = "mol")

# Adds title to mol file because it is used later on by get_fx_groups()
names <- KEGGREST::keggGet(compound_id)[[1]]$NAME
# Only use the first name and remove separator
title <- stringr::str_remove(names[1], ";")
# add title line to mol file
mol_clean <- paste0(title, "\n\n\n", gsub(">.*", "", mol))
mol_clean
if(isFALSE(force)) {
to_dl <- out_tbl$compound_id[!fs::file_exists(out_tbl$mol_path)]
out_paths <- out_tbl$mol_path[!fs::file_exists(out_tbl$mol_path)]
} else {
to_dl <- out_tbl$compound_id
out_paths <- out_tbl$mol_path
}
mols <- lapply(out_tbl$compound_id, .get_mol_kegg)

# write mol files
.write_mol <- function(mol_clean, file_path) {
utils::write.table(
mol_clean,
file = file_path,
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
if (length(to_dl) == 0) {
#if nothing to download, return early
return(out_tbl)
} else {

# Download mols
mols <- dl_mol_kegg(to_dl)

# write mol files
.write_mol <- function(mol_clean, file_path) {
utils::write.table(
mol_clean,
file = file_path,
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
}

mapply(.write_mol, mol_clean = mols, file_path = out_paths)

return(out_tbl)
}

mapply(.write_mol, mol_clean = mols, file_path = out_tbl$mol_path)

#return
out_tbl
}


Expand All @@ -112,5 +110,47 @@ keggGetCompounds <- function(pathway){
stringr::str_split_1("\n") %>%
stringr::str_extract("(?<=cpd:).*")
out[!is.na(out)]

}

# Download .mol text for a set of KEGG compounds and prepend a title line.
#
# compound_ids: character vector of KEGG compound IDs.
#
# Returns an unnamed list with one string per element of `compound_ids`, in the
# same order: the compound's first KEGG NAME, three newlines, then the mol text
# with everything from the first ">" onwards removed. Returns an empty list
# when `compound_ids` is empty.
#
# Errors if the number of mol blocks or names that come back from KEGG does
# not match the number of IDs requested, because titles and mol blocks are
# paired by position and a missing entry would otherwise shift them onto the
# wrong compound.
dl_mol_kegg <- function(compound_ids) {
  # Nothing to request: return early rather than sending an empty query.
  if (length(compound_ids) == 0) {
    return(list())
  }

  # Split compound_ids into groups of at most 10 so that each keggGet() call
  # stays within the number of entries the KEGG API accepts per request.
  compound_id_list <- split_to_list(compound_ids, max_len = 10)

  # Map over the groups, then collapse to a single string to simplify the
  # wrangling code below.
  raw <-
    purrr::map(compound_id_list, function(x) KEGGREST::keggGet(x, option = "mol")) %>%
    purrr::list_c() %>%
    glue::glue_collapse()

  # Split into one piece per compound, cutting just after each "$$$$"
  # delimiter. `n` caps the number of pieces so that any trailing text stays
  # attached to the last mol block instead of becoming an extra piece.
  mols <- stringr::str_split(raw, "(?<=\\${4})", n = length(compound_ids)) %>%
    unlist() %>%
    stringr::str_trim(side = "left")

  # Fewer (or blank) pieces than IDs means KEGG did not return a mol block for
  # every compound; positional pairing below would then be wrong.
  if (length(mols) != length(compound_ids) || any(!nzchar(mols))) {
    stop(
      "KEGG returned ", sum(nzchar(mols)), " mol file(s) for ",
      length(compound_ids), " compound ID(s); cannot match them to IDs",
      call. = FALSE
    )
  }

  # Adds title to mol file because it is used later on by get_fx_groups()
  titles <- purrr::map(compound_id_list, function(x) { # for every group of IDs
    KEGGREST::keggGet(x) %>%
      purrr::map_chr(function(names) { # for every ID
        purrr::pluck(names, "NAME", 1) %>% # get first element of NAME
          stringr::str_remove(";")
      })
  }) %>% unlist()

  if (length(titles) != length(compound_ids)) {
    stop(
      "KEGG returned ", length(titles), " name(s) for ",
      length(compound_ids), " compound ID(s); cannot match them to IDs",
      call. = FALSE
    )
  }

  purrr::map2(mols, titles, function(mol, title) {
    paste0(title, "\n\n\n", gsub(">.*", "", mol))
  })
}



# Break a vector into an unnamed list of chunks.
#
# x:       vector to split.
# max_len: length above which `x` is split; `x` is returned as a single chunk
#          when it is no longer than this.
#
# When splitting is needed, ceiling(length(x) / max_len) chunks are formed by
# binning element positions with cut(), which yields chunks of roughly equal
# size rather than filling each chunk to max_len.
split_to_list <- function(x, max_len = 10) {
  # Short enough already: wrap in a list so callers can always iterate.
  if (length(x) <= max_len) {
    return(list(x))
  }

  n_groups <- ceiling(length(x) / max_len)
  position_bin <- cut(seq_along(x), breaks = n_groups)
  chunks <- split(x, f = position_bin)

  # Drop the interval labels that split() takes from the factor levels.
  unname(chunks)
}
10 changes: 7 additions & 3 deletions man/get_mol_kegg.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions tests/testthat/test-get_mol_kegg.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test_that("get_mol_kegg returns tibble", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
out <- get_mol_kegg(compound_ids = "C16181", dir = dir)
expect_s3_class(out, "tbl_df")
Expand All @@ -10,6 +11,7 @@ test_that("get_mol_kegg returns tibble", {
test_that("get_mol_kegg writes files", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
out <- get_mol_kegg(compound_ids = "C16181", dir = dir)
expect_true(fs::file_exists(fs::path(out$mol_path)))
Expand All @@ -18,6 +20,7 @@ test_that("get_mol_kegg writes files", {
test_that("get_mol_kegg errors unless one of compound_id or pathway_id", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
expect_error(get_mol_kegg(dir = dir),
"One of `compound_id` or `pathway_id` are required")
Expand All @@ -30,6 +33,7 @@ test_that("get_mol_kegg errors unless one of compound_id or pathway_id", {
test_that("get_mol_kegg checks ID format", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
expect_error(
get_mol_kegg(compound_ids = "hello", dir = dir),
Expand All @@ -52,6 +56,7 @@ test_that("get_mol_kegg checks ID format", {
test_that("get_mol_kegg dl correct compound", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
out <- get_mol_kegg(compound_ids = "C00083", dir = dir)
expect_equal(readLines(out$mol_path) %>% head(1), "Malonyl-CoA")
Expand All @@ -60,6 +65,7 @@ test_that("get_mol_kegg dl correct compound", {
test_that("get_mol_kegg works with pathways", {
skip_if_offline()
skip_on_cran()

dir <- withr::local_tempdir()
out <- get_mol_kegg(pathway_ids = c("map00253", "map00232"), dir = dir)
expect_equal(nrow(out), 43)
Expand All @@ -70,6 +76,7 @@ test_that(".mol files are correctly formed", {
skip_if_offline()
skip_on_cran()
skip_on_os("windows") # really_capture_error() errors on windows

dir <- withr::local_tempdir()
out <- get_mol_kegg(compound_ids = "C00083", dir = dir)
sdf <- ChemmineR::read.SDFset(out$mol_path)
Expand All @@ -82,6 +89,7 @@ test_that(".mol files are correctly formed", {

test_that("file downloads with correct counts block", {
skip_on_cran()
skip_if_offline()

dir <- withr::local_tempdir()
out <- get_mol_kegg(compound_ids = "C16181", dir = dir)
Expand All @@ -92,3 +100,13 @@ test_that("file downloads with correct counts block", {
)
})

test_that("works with pathway modules", {
  skip_on_cran()
  skip_if_offline()

  # "M00082" is a pathway *module* ID (an "M" prefix rather than "map")
  mol_dir <- withr::local_tempdir()
  result <- get_mol_kegg(pathway_ids = "M00082", dir = mol_dir)

  expect_equal(nrow(result), 5)
  # every row should point at a .mol file that was actually written
  expect_true(all(file.exists(result$mol_path)))
})

0 comments on commit 174a0ed

Please sign in to comment.