diff --git a/R/store.R b/R/store.R index 63f1e0e..bae36dc 100644 --- a/R/store.R +++ b/R/store.R @@ -21,6 +21,11 @@ file_store <- R6Class( "file_store", public=list( + db = NULL, + initialize = function() { + self$db <- storr::storr_environment() + }, + exists=function(filename) { file.exists(filename) }, @@ -36,7 +41,31 @@ file_store <- R6Class( get_hash=function(filename) { if (self$exists(filename)) { - hash_files(filename, named=FALSE) + ## The approach here is to try and compute the hash of a file + ## only once per session (if the file appears not to have + ## changed). + ## + ## Every time that the hash is computed we save into an + ## emphemeral (environment) storr the hash, mtime and size of + ## the file. + ## + ## On subsequent hash requests if mtime and size are the same, + ## then we will assume that the file is not changed and return + ## the previously computed hash. + ## + ## Extending this to store the information persistently (e.g., + ## in the main storr, or in another alongside the existing + ## one) would not be a great problem. + info <- file_info(filename) + if (self$db$exists(filename)) { + dat <- self$db$get(filename) + if (identical(info, dat[names(info)])) { + return(dat$hash) + } + } + info$hash <- hash_files(filename, named = FALSE) + self$db$set(filename, info) + info$hash } else { stop(sprintf("file %s not found in file store", filename)) } @@ -64,7 +93,12 @@ file_store <- R6Class( dir.create(path_out, showWarnings=FALSE, recursive=TRUE) file_copy(file_in, path_out) } - )) + )) + +file_info <- function(filename) { + info <- file.info(filename, extra_cols = FALSE) + list(mtime = info$mtime, size = info$size) +} ##' @importFrom R6 R6Class store <- R6Class( diff --git a/tests/testthat/test-store.R b/tests/testthat/test-store.R index 2c6f24a..4e71554 100644 --- a/tests/testthat/test-store.R +++ b/tests/testthat/test-store.R @@ -17,3 +17,66 @@ test_that("file store", { st$del(file) expect_false(st$exists(file)) }) + +test_that("caching file store", { + lava <- function(...) stop("Can't touch this") + + filename <- tempfile() + bytes <- as.raw(sample(0:255, 1000, TRUE)) + writeBin(bytes, filename) + h <- digest::digest(bytes, serialize = FALSE) + + st <- file_store$new() + expect_false(st$db$exists(filename)) + + expect_equal(st$get_hash(filename), + unname(tools::md5sum(filename))) + expect_true(st$db$exists(filename)) + expect_equal(st$db$get(filename)$size, 1000) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) + expect_equal(st$db$get(filename)$hash, h) + + ## Does not call back to the underlying hash function: + with_mock("hash_files" = lava, + expect_equal(st$get_hash(filename), h)) + + ## Let's change the mtime and confirm changes. The time resolution + ## here is about 1s so this is slightly awkward. A better way here + ## might be to use `touch(1)` to directly change the mtime. I don't + ## know how that would work on windows though. + mtime <- st$db$get(filename)$mtime + repeat { + writeBin(bytes, filename) + if (file.mtime(filename) > mtime) { + break + } + Sys.sleep(.1) + } + + ## This *will* call out to the hash function: + with_mock("hash_files" = lava, + expect_error(st$get_hash(filename), "Can't touch this")) + expect_equal(st$get_hash(filename), h) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) + expect_equal(st$db$get(filename)$size, 1000) + + ## Then, we could change the file *size* and force rehashing. + writeBin(bytes[-1], filename) + h2 <- digest::digest(bytes[-1], serialize = FALSE) + with_mock("hash_files" = lava, + expect_error(st$get_hash(filename), "Can't touch this")) + expect_equal(st$get_hash(filename), h2) + expect_equal(st$db$get(filename)$size, 999) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) +}) + +test_that("store persists across remake invokations", { + cleanup() + make() + obj <- remake() + expect_true(obj$store$files$db$exists("data.csv")) + make("clean") + expect_true(obj$store$files$db$exists("data.csv")) + make("purge") + expect_true(obj$store$files$db$exists("data.csv")) +})