Title: | File Cacher |
---|---|
Description: | The main functions in this package are with_cache() and cached_read(). The former is a simple way to cache an R object into a file on disk, using 'cachem'. The latter is a wrapper around any standard read function, but caches both the output and the file list info. If the input file list info hasn't changed, the cache is used; otherwise, the original files are re-read. This can save time if the original operation requires reading from many files, and/or involves lots of processing. |
Authors: | Or Gadish [aut, cre, cph] |
Maintainer: | Or Gadish <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.2.9 |
Built: | 2025-03-09 05:42:11 UTC |
Source: | https://github.com/orgadish/filecacher |
Reads data and saves it to a local file for easier management and re-reading.
By default, also saves the file info to determine whether the cache
is valid, or whether the contents need to be updated because the files
have been modified. To skip this, or force reading from scratch, set
`skip_file_info = TRUE` or `force = TRUE`, respectively.
If updating is called for, all the files are re-read.
`cached_read_csv()` is a convenience function using a csv read function
based on `read_type`.
cached_read( files, label, read_fn, cache = NULL, type = NULL, force = FALSE, skip_file_info = FALSE ) cached_read_csv( files, label, read_type = NULL, cache = NULL, type = NULL, skip_file_info = FALSE, force = FALSE )
cached_read( files, label, read_fn, cache = NULL, type = NULL, force = FALSE, skip_file_info = FALSE ) cached_read_csv( files, label, read_type = NULL, cache = NULL, type = NULL, skip_file_info = FALSE, force = FALSE )
files |
A file or files to read with `read_fn`. |
label |
A string to use as the name of the file to cache. |
read_fn |
A function which takes file(s) as its first parameter and
reads them. To use a single-input read function such as
|
cache |
One of the following:
|
type |
A string describing the type of cache.
Must be |
force |
If |
skip_file_info |
Whether to skip saving and/or checking the file info. Use this when just querying the file system (without opening files) is slow. |
read_type |
Type of csv read function to use. One of:
|
The result of `read_fn(files)`.
See `vectorize_reader()` to convert a single-input read function into a
multiple-input function.
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A function that logs when it's called. read_csv_log <- function(files) { message("Reading from file ...") return(vectorize_reader(read.csv)(files, stringsAsFactors = TRUE)) } # `iris` data frame separated into multiple subset files. iris_files <- system.file("extdata", package = "filecacher") |> list.files(pattern = "_only[.]csv$", full.names = TRUE) # 1) First time, the message is shown. iris_files |> cached_read("mtcars", read_csv_log, cache = tf) |> all.equal(iris) # 2) Second time, no message is shown since the data is pulled from cache. iris_files |> cached_read("mtcars", read_csv_log, cache = tf) |> all.equal(iris) # 3) If desired, reloading can be forced using `force = TRUE`. iris_files |> cached_read("mtcars", read_csv_log, cache = tf, force = TRUE) |> all.equal(iris) unlink(tf, recursive = TRUE)
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A function that logs when it's called. read_csv_log <- function(files) { message("Reading from file ...") return(vectorize_reader(read.csv)(files, stringsAsFactors = TRUE)) } # `iris` data frame separated into multiple subset files. iris_files <- system.file("extdata", package = "filecacher") |> list.files(pattern = "_only[.]csv$", full.names = TRUE) # 1) First time, the message is shown. iris_files |> cached_read("mtcars", read_csv_log, cache = tf) |> all.equal(iris) # 2) Second time, no message is shown since the data is pulled from cache. iris_files |> cached_read("mtcars", read_csv_log, cache = tf) |> all.equal(iris) # 3) If desired, reloading can be forced using `force = TRUE`. iris_files |> cached_read("mtcars", read_csv_log, cache = tf, force = TRUE) |> all.equal(iris) unlink(tf, recursive = TRUE)
Similar to `dplyr::all_equal(x, y, ignore_row_order = TRUE)`,
which is now deprecated.
If either argument is not a `data.frame`, it returns `FALSE`
rather than raising an error.
dfs_equal(target, current)
dfs_equal(target, current)
target |
R object. |
current |
other R object, to be compared with `target`. |
Gets or creates a `cachem` object for use with other functions.
file_cache(cache = NULL, type = NULL, ext_prefix = "cache_")
file_cache(cache = NULL, type = NULL, ext_prefix = "cache_")
cache |
The path to an existing directory to use for caching. If Advanced: if an existing |
type |
A string describing the type of cache.
Must be |
ext_prefix |
The prefix to use with the file extension, e.g. "cache_csv", instead of "csv". |
A `cachem::cache_disk()` object.
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A dummy function that logs when it's called. get_df <- function() { message("Getting df ...") return(mtcars) } # Use the resulting object in `with_cache()`. # 1) The first time, the message is printed. # 2) The second time, the object is pulled from the cache, with no message. all.equal(with_cache(get_df(), "df", cache = tf), mtcars) all.equal(with_cache(get_df(), "df", cache = tf), mtcars) # `with_cache` is designed to be compatible with piping. get_df() |> with_cache("df", cache = tf) |> all.equal(mtcars) # Advanced: If desired, the `cachem` object methods can be used directly. cache <- file_cache(tf) cache$get("df") |> # Get objects previously cached using `with_cache`. all.equal(mtcars) cache$set("df2", mtcars) # Set objects using `$set`. cache$get("df2") |> all.equal(mtcars) unlink(tf, recursive = TRUE)
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A dummy function that logs when it's called. get_df <- function() { message("Getting df ...") return(mtcars) } # Use the resulting object in `with_cache()`. # 1) The first time, the message is printed. # 2) The second time, the object is pulled from the cache, with no message. all.equal(with_cache(get_df(), "df", cache = tf), mtcars) all.equal(with_cache(get_df(), "df", cache = tf), mtcars) # `with_cache` is designed to be compatible with piping. get_df() |> with_cache("df", cache = tf) |> all.equal(mtcars) # Advanced: If desired, the `cachem` object methods can be used directly. cache <- file_cache(tf) cache$get("df") |> # Get objects previously cached using `with_cache`. all.equal(mtcars) cache$set("df2", mtcars) # Set objects using `$set`. cache$get("df2") |> all.equal(mtcars) unlink(tf, recursive = TRUE)
Check whether two function objects have the same text definition.
fns_equal(x, y)
fns_equal(x, y)
x |
First function to compare. |
y |
Second function to compare. |
Logical
Read functions are vectorized.
get_csv_fns(type = NULL)
get_csv_fns(type = NULL)
type |
Type of csv read/write functions to get.
If |
List of read/write functions.
Get the first CSV Read function installed
get_csv_read_fn(read_type = NULL)
get_csv_read_fn(read_type = NULL)
read_type |
Type of csv read function to use. One of:
|
Function that reads multiple paths to CSVs.
Uses `file.info()` to get `size` and `mtime`.
get_file_info(path)
get_file_info(path)
path |
A character vector of one or more paths. |
Generate cache parameters from preexisting shorthand types.
interpret_cache_type(type, ext_prefix = "cache_")
interpret_cache_type(type, ext_prefix = "cache_")
type |
A string describing the type of cache.
Must be |
ext_prefix |
The prefix to use with the file extension, e.g. "cache_csv", instead of "csv". |
List of `read_fn`, `write_fn`, and `extension` for use with
`cachem::cache_disk()`.
The resulting vectorized read function still takes all the arguments of the original function.
Uses `purrr::list_rbind()` to bind the data frames, which generates
a data frame with a superset of the columns from all the files,
filling `NA` where data was not present.
vectorize_reader(read_fn, file_path_to = NULL)
vectorize_reader(read_fn, file_path_to = NULL)
read_fn |
The read function to vectorize. The first argument must be the files to read. |
file_path_to |
A string, which if provided, is the name of the column
containing the file paths in the result. See 'names_to' in
`purrr::list_rbind()`. |
A version of `read_fn` that can read multiple paths.
# Convert iris$Species to character to simplify comparison. iris_chr <- iris iris_chr$Species <- as.character(iris$Species) # `iris` data frame separated into multiple subset files. iris_files <- system.file("extdata", package = "filecacher") |> list.files(pattern = "_only[.]csv$", full.names = TRUE) try(read.csv(iris_files)) vectorize_reader(read.csv)( iris_files, stringsAsFactors = TRUE ) |> all.equal(iris) if (rlang::is_installed("arrow")) { try(arrow::read_csv_arrow(iris_files)) vectorize_reader(arrow::read_csv_arrow)( iris_files ) |> as.data.frame() |> all.equal(iris_chr) } if (rlang::is_installed("data.table")) { try(data.table::fread(iris_files)) vectorize_reader(data.table::fread)( iris_files, stringsAsFactors = TRUE ) |> as.data.frame() |> all.equal(iris) }
# Convert iris$Species to character to simplify comparison. iris_chr <- iris iris_chr$Species <- as.character(iris$Species) # `iris` data frame separated into multiple subset files. iris_files <- system.file("extdata", package = "filecacher") |> list.files(pattern = "_only[.]csv$", full.names = TRUE) try(read.csv(iris_files)) vectorize_reader(read.csv)( iris_files, stringsAsFactors = TRUE ) |> all.equal(iris) if (rlang::is_installed("arrow")) { try(arrow::read_csv_arrow(iris_files)) vectorize_reader(arrow::read_csv_arrow)( iris_files ) |> as.data.frame() |> all.equal(iris_chr) } if (rlang::is_installed("data.table")) { try(data.table::fread(iris_files)) vectorize_reader(data.table::fread)( iris_files, stringsAsFactors = TRUE ) |> as.data.frame() |> all.equal(iris) }
If the cache exists, the object is retrieved from the cache. Otherwise, it is evaluated and stored for subsequent retrieval.
Use `force = TRUE` to ensure the object is evaluated and stored
anew in the cache.
The object evaluated must be compatible with the cache type.
For example, a cache type of 'csv' or 'parquet' requires a
`data.frame` or similar type.
with_cache(x, label, cache = NULL, type = NULL, force = FALSE)
with_cache(x, label, cache = NULL, type = NULL, force = FALSE)
x |
The object to store in the cache. Must be compatible with the cache type. |
label |
A string to use as the name of the file to cache. |
cache |
One of the following:
|
type |
A string describing the type of cache.
Must be |
force |
If |
The value of `x`.
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A dummy function that logs when it's called. get_df <- function() { message("Getting df ...") return(mtcars) } # Use the resulting object in `with_cache()`. # 1) The first time, the message is printed. # 2) The second time, the object is pulled from the cache, with no message. all.equal(with_cache(get_df(), "df", cache = tf), mtcars) all.equal(with_cache(get_df(), "df", cache = tf), mtcars) # `with_cache` is designed to be compatible with piping. get_df() |> with_cache("df", cache = tf) |> all.equal(mtcars) # Advanced: If desired, the `cachem` object methods can be used directly. cache <- file_cache(tf) cache$get("df") |> # Get objects previously cached using `with_cache`. all.equal(mtcars) cache$set("df2", mtcars) # Set objects using `$set`. cache$get("df2") |> all.equal(mtcars) unlink(tf, recursive = TRUE)
# Create a temporary directory for the cache. tf <- tempfile() dir.create(tf) # A dummy function that logs when it's called. get_df <- function() { message("Getting df ...") return(mtcars) } # Use the resulting object in `with_cache()`. # 1) The first time, the message is printed. # 2) The second time, the object is pulled from the cache, with no message. all.equal(with_cache(get_df(), "df", cache = tf), mtcars) all.equal(with_cache(get_df(), "df", cache = tf), mtcars) # `with_cache` is designed to be compatible with piping. get_df() |> with_cache("df", cache = tf) |> all.equal(mtcars) # Advanced: If desired, the `cachem` object methods can be used directly. cache <- file_cache(tf) cache$get("df") |> # Get objects previously cached using `with_cache`. all.equal(mtcars) cache$set("df2", mtcars) # Set objects using `$set`. cache$get("df2") |> all.equal(mtcars) unlink(tf, recursive = TRUE)