-
Notifications
You must be signed in to change notification settings - Fork 31
Try to cache file hashes #133
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,11 @@ | |
| file_store <- R6Class( | ||
| "file_store", | ||
| public=list( | ||
| db = NULL, | ||
| initialize = function() { | ||
| self$db <- storr::storr_environment() | ||
| }, | ||
|
|
||
| exists=function(filename) { | ||
| file.exists(filename) | ||
| }, | ||
|
|
@@ -36,7 +41,31 @@ file_store <- R6Class( | |
|
|
||
| get_hash=function(filename) { | ||
| if (self$exists(filename)) { | ||
| hash_files(filename, named=FALSE) | ||
| ## The approach here is to try and compute the hash of a file | ||
| ## only once per session (if the file appears not to have | ||
| ## changed). | ||
| ## | ||
| ## Every time that the hash is computed we save into an | ||
| ## emphemeral (environment) storr the hash, mtime and size of | ||
| ## the file. | ||
| ## | ||
| ## On subsequent hash requests if mtime and size are the same, | ||
| ## then we will assume that the file is not changed and return | ||
| ## the previously computed hash. | ||
| ## | ||
| ## Extending this to store the information persistently (e.g., | ||
| ## in the main storr, or in another alongside the existing | ||
| ## one) would not be a great problem. | ||
| info <- file_info(filename) | ||
| if (self$db$exists(filename)) { | ||
| dat <- self$db$get(filename) | ||
| if (identical(info, dat[names(info)])) { | ||
| return(dat$hash) | ||
| } | ||
| } | ||
| info$hash <- hash_files(filename, named = FALSE) | ||
| self$db$set(filename, info) | ||
| info$hash | ||
| } else { | ||
| stop(sprintf("file %s not found in file store", filename)) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Handling abnormal conditions first saves one indent, but that's a matter of style. |
||
| } | ||
|
|
@@ -64,7 +93,12 @@ file_store <- R6Class( | |
| dir.create(path_out, showWarnings=FALSE, recursive=TRUE) | ||
| file_copy(file_in, path_out) | ||
| } | ||
| )) | ||
| )) | ||
|
|
||
| file_info <- function(filename) { | ||
| info <- file.info(filename, extra_cols = FALSE) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this function support len(filename) != 1? How about renaming to file_mtime_and_size() or similar? |
||
| list(mtime = info$mtime, size = info$size) | ||
| } | ||
|
|
||
| ##' @importFrom R6 R6Class | ||
| store <- R6Class( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,3 +17,66 @@ test_that("file store", { | |
| st$del(file) | ||
| expect_false(st$exists(file)) | ||
| }) | ||
|
|
||
| test_that("caching file store", { | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about splitting the test so that each test_that() call tests only one aspect of the behavior?
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that would be better style. It can get a bit tedious though as there's so much side effect heavy bits here (remake is basically all side effect) |
||
| lava <- function(...) stop("Can't touch this") | ||
|
|
||
| filename <- tempfile() | ||
| bytes <- as.raw(sample(0:255, 1000, TRUE)) | ||
| writeBin(bytes, filename) | ||
| h <- digest::digest(bytes, serialize = FALSE) | ||
|
|
||
| st <- file_store$new() | ||
| expect_false(st$db$exists(filename)) | ||
|
|
||
| expect_equal(st$get_hash(filename), | ||
| unname(tools::md5sum(filename))) | ||
| expect_true(st$db$exists(filename)) | ||
| expect_equal(st$db$get(filename)$size, 1000) | ||
| expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) | ||
| expect_equal(st$db$get(filename)$hash, h) | ||
|
|
||
| ## Does not call back to the underlying hash function: | ||
| with_mock("hash_files" = lava, | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to set "remake::hash_files" so that tests work in R CMD check. (Don't know why.) |
||
| expect_equal(st$get_hash(filename), h)) | ||
|
|
||
| ## Let's change the mtime and confirm changes. The time resolution | ||
| ## here is about 1s so this is slightly awkward. A better way here | ||
| ## might be to use `touch(1)` to directly change the mtime. I don't | ||
| ## know how that would work on windows though. | ||
| mtime <- st$db$get(filename)$mtime | ||
| repeat { | ||
| writeBin(bytes, filename) | ||
| if (file.mtime(filename) > mtime) { | ||
| break | ||
| } | ||
| Sys.sleep(.1) | ||
| } | ||
|
|
||
| ## This *will* call out to the hash function: | ||
| with_mock("hash_files" = lava, | ||
| expect_error(st$get_hash(filename), "Can't touch this")) | ||
| expect_equal(st$get_hash(filename), h) | ||
| expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) | ||
| expect_equal(st$db$get(filename)$size, 1000) | ||
|
|
||
| ## Then, we could change the file *size* and force rehashing. | ||
| writeBin(bytes[-1], filename) | ||
| h2 <- digest::digest(bytes[-1], serialize = FALSE) | ||
| with_mock("hash_files" = lava, | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above. |
||
| expect_error(st$get_hash(filename), "Can't touch this")) | ||
| expect_equal(st$get_hash(filename), h2) | ||
| expect_equal(st$db$get(filename)$size, 999) | ||
| expect_equal(st$db$get(filename)$mtime, file.mtime(filename)) | ||
| }) | ||
|
|
||
| test_that("store persists across remake invokations", { | ||
| cleanup() | ||
| make() | ||
| obj <- remake() | ||
| expect_true(obj$store$files$db$exists("data.csv")) | ||
| make("clean") | ||
| expect_true(obj$store$files$db$exists("data.csv")) | ||
| make("purge") | ||
| expect_true(obj$store$files$db$exists("data.csv")) | ||
| }) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would inserting the whole object at once provide better encapsulation?