Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions R/store.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
file_store <- R6Class(
"file_store",
public=list(
db = NULL,
initialize = function() {
self$db <- storr::storr_environment()
},

exists=function(filename) {
file.exists(filename)
},
Expand All @@ -36,7 +41,31 @@ file_store <- R6Class(

get_hash=function(filename) {
if (self$exists(filename)) {
hash_files(filename, named=FALSE)
## The approach here is to try and compute the hash of a file
## only once per session (if the file appears not to have
## changed).
##
## Every time that the hash is computed we save into an
## emphemeral (environment) storr the hash, mtime and size of
## the file.
##
## On subsequent hash requests if mtime and size are the same,
## then we will assume that the file is not changed and return
## the previously computed hash.
##
## Extending this to store the information persistently (e.g.,
## in the main storr, or in another alongside the existing
## one) would not be a great problem.
info <- file_info(filename)
if (self$db$exists(filename)) {
dat <- self$db$get(filename)
if (identical(info, dat[names(info)])) {
return(dat$hash)
}
}
info$hash <- hash_files(filename, named = FALSE)
self$db$set(filename, info)
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would inserting the whole object at once provide better encapsulation?

self$db$set(filename, list(info = info))

info$hash
} else {
stop(sprintf("file %s not found in file store", filename))
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handling abnormal conditions first saves one indent, but that's a matter of style.

}
Expand Down Expand Up @@ -64,7 +93,12 @@ file_store <- R6Class(
dir.create(path_out, showWarnings=FALSE, recursive=TRUE)
file_copy(file_in, path_out)
}
))
))

file_info <- function(filename) {
info <- file.info(filename, extra_cols = FALSE)
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this function support len(filename) != 1? How about renaming to file_mtime_and_size() or similar?

list(mtime = info$mtime, size = info$size)
}

##' @importFrom R6 R6Class
store <- R6Class(
Expand Down
63 changes: 63 additions & 0 deletions tests/testthat/test-store.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,66 @@ test_that("file store", {
st$del(file)
expect_false(st$exists(file))
})

test_that("caching file store", {
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about splitting the test so that each test_that() call tests only one aspect of the behavior?

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that would be better style. It can get a bit tedious though as there's so much side effect heavy bits here (remake is basically all side effect)

lava <- function(...) stop("Can't touch this")

filename <- tempfile()
bytes <- as.raw(sample(0:255, 1000, TRUE))
writeBin(bytes, filename)
h <- digest::digest(bytes, serialize = FALSE)

st <- file_store$new()
expect_false(st$db$exists(filename))

expect_equal(st$get_hash(filename),
unname(tools::md5sum(filename)))
expect_true(st$db$exists(filename))
expect_equal(st$db$get(filename)$size, 1000)
expect_equal(st$db$get(filename)$mtime, file.mtime(filename))
expect_equal(st$db$get(filename)$hash, h)

## Does not call back to the underlying hash function:
with_mock("hash_files" = lava,
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to set "remake::hash_files" so that tests work in R CMD check. (Don't know why.)

expect_equal(st$get_hash(filename), h))

## Let's change the mtime and confirm changes. The time resolution
## here is about 1s so this is slightly awkward. A better way here
## might be to use `touch(1)` to directly change the mtime. I don't
## know how that would work on windows though.
mtime <- st$db$get(filename)$mtime
repeat {
writeBin(bytes, filename)
if (file.mtime(filename) > mtime) {
break
}
Sys.sleep(.1)
}

## This *will* call out to the hash function:
with_mock("hash_files" = lava,
expect_error(st$get_hash(filename), "Can't touch this"))
expect_equal(st$get_hash(filename), h)
expect_equal(st$db$get(filename)$mtime, file.mtime(filename))
expect_equal(st$db$get(filename)$size, 1000)

## Then, we could change the file *size* and force rehashing.
writeBin(bytes[-1], filename)
h2 <- digest::digest(bytes[-1], serialize = FALSE)
with_mock("hash_files" = lava,
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above.

expect_error(st$get_hash(filename), "Can't touch this"))
expect_equal(st$get_hash(filename), h2)
expect_equal(st$db$get(filename)$size, 999)
expect_equal(st$db$get(filename)$mtime, file.mtime(filename))
})

test_that("store persists across remake invokations", {
cleanup()
make()
obj <- remake()
expect_true(obj$store$files$db$exists("data.csv"))
make("clean")
expect_true(obj$store$files$db$exists("data.csv"))
make("purge")
expect_true(obj$store$files$db$exists("data.csv"))
})