From c240efc52a9893755f53d96cdf08f025046fe63a Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Wed, 19 Oct 2016 17:51:12 +0100 Subject: [PATCH 1/2] Try to cache file hashes Within a session, this should avoid recomputing file hashes where the file size and mtime have not changed. This exploits remake's internal caching, which is not tested in this commit (and this entirely lacks any sort of integration test). For #110 --- R/store.R | 38 +++++++++++++++++++++++++-- tests/testthat/test-store.R | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/R/store.R b/R/store.R index 63f1e0e..bae36dc 100644 --- a/R/store.R +++ b/R/store.R @@ -21,6 +21,11 @@ file_store <- R6Class( "file_store", public=list( + db = NULL, + initialize = function() { + self$db <- storr::storr_environment() + }, + exists=function(filename) { file.exists(filename) }, @@ -36,7 +41,31 @@ file_store <- R6Class( get_hash=function(filename) { if (self$exists(filename)) { - hash_files(filename, named=FALSE) + ## The approach here is to try and compute the hash of a file + ## only once per session (if the file appears not to have + ## changed). + ## + ## Every time that the hash is computed we save into an + ## emphemeral (environment) storr the hash, mtime and size of + ## the file. + ## + ## On subsequent hash requests if mtime and size are the same, + ## then we will assume that the file is not changed and return + ## the previously computed hash. + ## + ## Extending this to store the information persistently (e.g., + ## in the main storr, or in another alongside the existing + ## one) would not be a great problem. + info <- file_info(filename) + if (self$db$exists(filename)) { + dat <- self$db$get(filename) + if (identical(info, dat[names(info)])) { + return(dat$hash) + } + } + info$hash <- hash_files(filename, named = FALSE) + self$db$set(filename, info) + info$hash } else { stop(sprintf("file %s not found in file store", filename)) } @@ -64,7 +93,12 @@ file_store <- R6Class( dir.create(path_out, showWarnings=FALSE, recursive=TRUE) file_copy(file_in, path_out) } - )) + )) + +file_info <- function(filename) { + info <- file.info(filename, extra_cols = FALSE) + list(mtime = info$mtime, size = info$size) +} ##' @importFrom R6 R6Class store <- R6Class( diff --git a/tests/testthat/test-store.R b/tests/testthat/test-store.R index 2c6f24a..6bf6940 100644 --- a/tests/testthat/test-store.R +++ b/tests/testthat/test-store.R @@ -17,3 +17,55 @@ test_that("file store", { st$del(file) expect_false(st$exists(file)) }) + +test_that("caching file store", { + lava <- function(...) stop("Can't touch this") + + filename <- tempfile() + bytes <- as.raw(sample(0:255, 1000, TRUE)) + writeBin(bytes, filename) + h <- digest::digest(bytes, serialize = FALSE) + + st <- file_store$new() + expect_false(st$db$exists(filename)) + + expect_equal(st$get_hash(filename), + unname(tools::md5sum(filename))) + expect_true(st$db$exists(filename)) + expect_equal(st$db$get(filename)$size, 1000) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) + expect_equal(st$db$get(filename)$hash, h) + + ## Does not call back to the underlying hash function: + with_mock("hash_files" = lava, + expect_equal(st$get_hash(filename), h)) + + ## Let's change the mtime and confirm changes. The time resolution + ## here is about 1s so this is slightly awkward. A better way here + ## might be to use `touch(1)` to directly change the mtime. I don't + ## know how that would work on windows though. + mtime <- st$db$get(filename)$mtime + repeat { + writeBin(bytes, filename) + if (file.mtime(filename) > mtime) { + break + } + Sys.sleep(.1) + } + + ## This *will* call out to the hash function: + with_mock("hash_files" = lava, + expect_error(st$get_hash(filename), "Can't touch this")) + expect_equal(st$get_hash(filename), h) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) + expect_equal(st$db$get(filename)$size, 1000) + + ## Then, we could change the file *size* and force rehashing. + writeBin(bytes[-1], filename) + h2 <- digest::digest(bytes[-1], serialize = FALSE) + with_mock("hash_files" = lava, + expect_error(st$get_hash(filename), "Can't touch this")) + expect_equal(st$get_hash(filename), h2) + expect_equal(st$db$get(filename)$size, 999) + expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) +}) From c7032132909657c54d8bc4947bb46e31a0c1ec81 Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Wed, 19 Oct 2016 18:08:21 +0100 Subject: [PATCH 2/2] Check that the file info cache survives remake reinvokcation --- tests/testthat/test-store.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/testthat/test-store.R b/tests/testthat/test-store.R index 6bf6940..4e71554 100644 --- a/tests/testthat/test-store.R +++ b/tests/testthat/test-store.R @@ -69,3 +69,14 @@ test_that("caching file store", { expect_equal(st$db$get(filename)$size, 999) expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) }) + +test_that("store persists across remake invokations", { + cleanup() + make() + obj <- remake() + expect_true(obj$store$files$db$exists("data.csv")) + make("clean") + expect_true(obj$store$files$db$exists("data.csv")) + make("purge") + expect_true(obj$store$files$db$exists("data.csv")) +})