From 518fd6f6a20c8f21f2bebbb457446ff1075597f2 Mon Sep 17 00:00:00 2001 From: dipterix Date: Sat, 6 Aug 2022 09:55:15 -0400 Subject: [PATCH] Ready for CRAN --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 7 +++ R/class_filearray.R | 7 ++- R/load.R | 129 ++++++++++++++++++++++++++++++++++++++++++++ adhoc/rchk.sh | 4 +- cran-comments.md | 18 +------ man/filearray.Rd | 41 ++++++++++++++ 8 files changed, 189 insertions(+), 22 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 772c4ad..60b9a76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: filearray Type: Package Title: File-Backed Array for Out-of-Memory Computation -Version: 0.1.3.9001 +Version: 0.1.4 Language: en-US Encoding: UTF-8 License: LGPL-3 -URL: http://dipterix.org/filearray/, https://github.com/dipterix/filearray +URL: https://dipterix.org/filearray/, https://github.com/dipterix/filearray BugReports: https://github.com/dipterix/filearray/issues Authors@R: c( person( diff --git a/NAMESPACE b/NAMESPACE index 19033f6..e58797d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(filearray_bind) export(filearray_checkload) export(filearray_create) export(filearray_load) +export(filearray_load_or_create) export(filearray_threads) export(fmap) export(fmap2) diff --git a/NEWS.md b/NEWS.md index 0aaf85c..1bb844d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # filearray (development version) +# filearray 0.1.4 + +* Fixed a bug when allocated memory is one byte shorter than requested. The bug would crash R when triggered in certain cases. +* Removed limit to the maximum number of partitions when writing. The previous implementation creates and opens related file descriptors all at once before writing. This setup will raise errors when the number of connections reaches a certain limit, often defined by the operating system. This update only opens the connection on demand. 
The performance might be impacted when writing to disk, but in return, the program will be more robust. +* Fixed `subset` function environment not resolved correctly when using formula +* Added `filearray_load_or_create` as an alternative to `filearray_checkload` that automatically replaces existing obsolete array files if the headers, dimensions, or data types don't match. Also `on_missing` argument is provided to allow array initialization if a new array is created. + # filearray 0.1.3 * Automatically detect whether symbolic-link works and show warnings diff --git a/R/class_filearray.R b/R/class_filearray.R index a17decf..a7b83ba 100644 --- a/R/class_filearray.R +++ b/R/class_filearray.R @@ -208,13 +208,16 @@ setRefClass( } return(default) }, - set_header = function(key, value){ + set_header = function(key, value, save = TRUE){ force(value) if(key %in% RESERVED_HEADERS){ stop("Key `", key, "` is preserved and should be read-only or altered via other methods.") } .self$.header[[key]] <- value - .self$.save_header() + if( save ) { + .self$.save_header() + } + invisible(value) }, header_signature = function(include_path = TRUE){ header_sig <- digest::digest(.self$.header, algo = "sha256") diff --git a/R/load.R b/R/load.R index 1c24484..ee3de8e 100644 --- a/R/load.R +++ b/R/load.R @@ -47,6 +47,9 @@ guess_partition <- function(dim, elem_size){ #' @param symlink_ok whether arrays with symbolic-link partitions can pass #' the test; this is usually used on bound arrays with symbolic-links; see #' \code{\link{filearray_bind}}; +#' @param verbose whether to print out some debug messages +#' @param on_missing function to handle file array (such as initialization) +#' when a new array is created; must take only one argument, the array object #' @return A \code{\link{FileArray-class}} instance. 
#' #' @details The file arrays partition out-of-memory array objects and store them @@ -107,6 +110,28 @@ guess_partition <- function(dim, elem_size){ #' } #' #' #' +#' # check-load, and create a new array if fail +#' x <- filearray_load_or_create( +#' filebase = filebase, dimension = c(200, 30, 8), +#' verbose = TRUE, signature = "henry" +#' ) +#' x$get_header("signature") +#' +#' # check-load with initialization +#' x <- filearray_load_or_create( +#' filebase = filebase, +#' dimension = c(3, 4, 5), +#' verbose = TRUE, mode = "readonly", +#' on_missing = function(array) { +#' array[] <- seq_len(60) +#' } +#' ) +#' +#' x[1:3,1,1] +#' +#' # Clean up +#' unlink(filebase, recursive = TRUE) +#' NULL #' @rdname filearray @@ -190,3 +215,107 @@ filearray_checkload <- function( } return(arr) }
+
+# Load the array at `filebase` if it passes every check; otherwise wipe the path and create a new one.
+#' @rdname filearray
+#' @export
+filearray_load_or_create <- function(
+    filebase, dimension, on_missing = NULL, type = NA,
+    ..., mode = c("readonly", "readwrite"), symlink_ok = TRUE,
+    initialize = FALSE, partition_size = NA, verbose = FALSE
+) {
+  mode <- match.arg(mode)
+  filebase <- normalizePath(filebase, mustWork = FALSE, winslash = "/")
+  if(length(filebase) != 1 || grepl("(^|^[A-Za-z]:)/$", filebase)) {
+    stop("Invalid filebase to store a file array.")
+  }
+
+  # NOTE(review): the check below admits zero-length margins although the message says "positive" - confirm intent.
+  dimension <- as.integer(dimension)
+  if(length(dimension) < 2 || any(is.na(dimension) | dimension < 0)) {
+    stop("Incorrect dimension for a file array: `dimension` must a valid positive integer vector with length of two or above.")
+  }
+
+  if(!is.null(on_missing)) {
+    if(!is.function(on_missing) || !length(formals(on_missing))) {
+      stop("`filearray_load_or_create`: `on_missing` must be a function with one argument (i.e. the file array)")
+    }
+  }
+
+  # Arguments in `...` become header key-value pairs; validate their *names* (not their values).
+  additional_headers <- list(...)
+  add_header_names <- names(additional_headers)
+  if(length(additional_headers)) {
+    if(!length(add_header_names) || "" %in% trimws(add_header_names)) {
+      stop("`filearray_load_or_create`: additional parameters must be named.")
+    }
+  }
+
+  # `expr` is evaluated in this function's frame, so a `type` assigned there is seen by the error handler.
+  arr <- tryCatch(
+    expr = {
+      # try to load existing array
+      arr <- filearray_checkload(
+        filebase = filebase, mode = mode,
+        symlink_ok = symlink_ok, ...
+      )
+
+      # If no error raised, the array has been loaded
+
+      if(!is.na(type)) {
+        if(!identical(arr$type(), type)) {
+          stop("`filearray_load_or_create`: Requested array type does not match with existing array.")
+        }
+      } else {
+        # in case the array needs to be reconstructed, assuming the type
+        # remain the same
+        type <- arr$type()
+      }
+
+      # Now check the dimension
+      arr_dim <- as.integer(arr$dimension())
+      if(!identical(arr_dim, dimension)) {
+        stop("`filearray_load_or_create`: Requested dimension does not match with existing array.")
+      }
+
+      arr
+    },
+    error = function(e) {
+      if(verbose) {
+        message("`filearray_load_or_create`: cannot load the existing file array: ", conditionMessage(e), "\nTrying creating a new one. If the array already exists, its file path will be removed.")
+      }
+      if(file.exists(filebase)) {
+        unlink(filebase, recursive = TRUE, force = TRUE)
+      }
+      pdir <- dirname(filebase)
+      if(!dir.exists(pdir)) {
+        dir.create(pdir, showWarnings = FALSE, recursive = TRUE)
+      }
+      # create the array
+      if(is.na(type)) { type <- 'double' }
+      arr <- filearray_create(
+        filebase = filebase,
+        dimension = dimension,
+        type = type,
+        partition_size = partition_size,
+        initialize = initialize
+      )
+      # run on_missing if the function exists
+      if(is.function(on_missing)) {
+        arr$.mode <- "readwrite"
+        on_missing(arr)
+      }
+      # seal the header
+      for(nm in add_header_names) {
+        arr$set_header(key = nm, value = additional_headers[[nm]], save = FALSE)
+      }
+      arr$.save_header()
+      arr
+    }
+  )
+
+  # set mode
+  arr$.mode <- mode
+  arr
+}
+
diff --git a/adhoc/rchk.sh b/adhoc/rchk.sh index 307fdb6..6b0ad90 100644 --- a/adhoc/rchk.sh +++ b/adhoc/rchk.sh @@ -1,2 +1,2 @@ -rm "$HOME/Dropbox/projects/filearray_0.1.0.9000.tar.gz" -docker run -v "$HOME/Dropbox/projects":/projects rchk "/projects/filearray_0.1.0.9000.tar.gz" +# rm "$HOME/Dropbox/projects/filearray_0.1.3.9001.tar.gz" +docker run -v "$HOME/Dropbox/projects":/projects kalibera/rchk:latest "/projects/filearray_0.1.3.9001.tar.gz" diff --git a/cran-comments.md b/cran-comments.md index 3f99da9..458029b 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,5 +1,5 @@ ## Dev environment -* osx (ARM), R 4.1.2 +* osx (ARM), R 4.2.1 ## Test environments * osx (x64, github-action), R-release @@ -9,19 +9,5 @@ ## R CMD check results -On `release` and `devel` +On `oldrelease`, `release` and `devel` 0 errors | 0 warnings | 0 notes - -On R-4.0 -0 errors | 1 warning | 0 notes - -``` -Codoc mismatches from documentation object 'apply': -apply - Code: function(X, MARGIN, FUN, ...) 
- Docs: function(X, MARGIN, FUN, ..., simplify = TRUE) - Argument names in docs not in code: - simplify -``` - -This is because `simplify` was added to `apply` function since R-4.1. diff --git a/man/filearray.Rd b/man/filearray.Rd index 3845e6c..2718907 100644 --- a/man/filearray.Rd +++ b/man/filearray.Rd @@ -5,6 +5,7 @@ \alias{filearray_create} \alias{filearray_load} \alias{filearray_checkload} +\alias{filearray_load_or_create} \title{Create or load existing file arrays} \usage{ filearray_create( @@ -24,6 +25,19 @@ filearray_checkload( ..., symlink_ok = TRUE ) + +filearray_load_or_create( + filebase, + dimension, + on_missing = NULL, + type = NA, + ..., + mode = c("readonly", "readwrite"), + symlink_ok = TRUE, + initialize = FALSE, + partition_size = NA, + verbose = FALSE +) } \arguments{ \item{filebase}{a directory path to store arrays in the local file @@ -51,6 +65,11 @@ reserved for future compatibility.} \item{symlink_ok}{whether arrays with symbolic-link partitions can pass the test; this is usually used on bound arrays with symbolic-links; see \code{\link{filearray_bind}};} + +\item{on_missing}{function to handle file array (such as initialization) +when a new array is created; must take only one argument, the array object} + +\item{verbose}{whether to print out some debug messages} } \value{ A \code{\link{FileArray-class}} instance. @@ -117,6 +136,28 @@ filearray_checkload(filebase, signature = "jerry") } +# check-load, and create a new array if fail +x <- filearray_load_or_create( + filebase = filebase, dimension = c(200, 30, 8), + verbose = TRUE, signature = "henry" +) +x$get_header("signature") + +# check-load with initialization +x <- filearray_load_or_create( + filebase = filebase, + dimension = c(3, 4, 5), + verbose = TRUE, mode = "readonly", + on_missing = function(array) { + array[] <- seq_len(60) + } +) + +x[1:3,1,1] + +# Clean up +unlink(filebase, recursive = TRUE) + } \author{ Zhengjia Wang