Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7745fd3
clv.time: Accept observation.end when setting periods
pschil Aug 11, 2025
765d8e5
clv.time: Tests for set.sample.periods(observation.end)
pschil Aug 11, 2025
3f99675
observation.end: Docu
pschil Aug 13, 2025
3bb12e7
summary.clv.data: Print start and end of periods
pschil Aug 13, 2025
6caf825
clv.data: Implement observation.end, incl input checks
pschil Aug 13, 2025
375b301
Fix tests: clv.time produces new error message
pschil Aug 13, 2025
4d4e847
clvdata(): Remove redundant check
pschil Aug 13, 2025
a195f7e
Move check to clv.time
pschil Aug 13, 2025
0da6c76
clv.time: Adapt tests
pschil Aug 13, 2025
d687fe0
clvdata: Update error message
pschil Aug 13, 2025
9ffcd3a
Fix docu: parameter name
pschil Aug 13, 2025
ad4248f
clvdata s3: Add observation.end
pschil Aug 14, 2025
2b953a2
Improve clvdata docu
pschil Aug 14, 2025
c71e795
Fix tests
pschil Aug 14, 2025
5e41a78
clv.time: Add back test
pschil Aug 14, 2025
8ad3bef
Tests: clvdata(), clv.data S3
pschil Aug 14, 2025
39d7f7c
Tests: Fix setdyncov tests
pschil Aug 14, 2025
d946c9d
Tests: SetDynCov with observation.end
pschil Aug 14, 2025
9b6b62c
Tests: Dyncov runability with observation.end
pschil Aug 16, 2025
c4df0ad
Tests: observation.end moves prediction period
pschil Aug 27, 2025
f39295f
Fix test
pschil Aug 28, 2025
ab7c7cc
Fix docu
pschil Aug 28, 2025
0bcb260
Rename `observation.end` -> `data.end`
pschil Aug 28, 2025
9e7c5a6
Improve docu
pschil Aug 28, 2025
bc85fcd
Merge branch 'development' into feature-clvdata-observationend
pschil Aug 28, 2025
cfd3a22
tracking plot: Plot empty dates
pschil Aug 30, 2025
e260abb
Fix tests: Plot data can be NA now
pschil Aug 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion R/all_generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,10 @@ setGeneric("clv.data.create.bootstrapping.data", def = function(clv.data, ids){
#'
#' @export
as.clv.data <- function(x,
                        date.format="ymd",
                        time.unit="weeks",
                        estimation.split = NULL,
                        data.end = NULL,
                        name.id="Id", name.date="Date", name.price="Price",
                        ...){
  # S3 generic: dispatches on the class of `x` to the matching
  # `as.clv.data.<class>` method. Every formal is declared exactly once;
  # R rejects a function definition that repeats a formal argument name.
  UseMethod("as.clv.data", x)
}
Expand Down
14 changes: 14 additions & 0 deletions R/class_clv_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,21 @@ clv.data.make.descriptives <- function(clv.data, ids){
dt.interp <- clv.data.mean.interpurchase.times(clv.data=clv.data, dt.transactions = dt.data)
dt.num.trans.by.cust <- dt.data[, .N, by="Id"]

tp.period.start <- switch(
sample.name,
Estimation=clv.time@timepoint.estimation.start,
Holdout=clv.time@timepoint.holdout.start,
Total=clv.time@timepoint.estimation.start)

tp.period.end <- switch(
sample.name,
Estimation=clv.time@timepoint.estimation.end,
Holdout=clv.time@timepoint.holdout.end,
Total=clv.time@timepoint.holdout.end)

l.desc <- list(
"Period Start" = clv.time.format.timepoint(clv.time=clv.time, timepoint=tp.period.start),
"Period End" = clv.time.format.timepoint(clv.time=clv.time, timepoint=tp.period.end),
"Number of customers" = if(sample.name=="Total"){nrow(dt.num.trans.by.cust)}else{"-"},
"First Transaction in period" = clv.time.format.timepoint(clv.time=clv.time, timepoint=dt.data[, min(Date)]),
"Last Transaction in period" = clv.time.format.timepoint(clv.time=clv.time, timepoint=dt.data[, max(Date)]),
Expand Down
35 changes: 27 additions & 8 deletions R/class_clv_time.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,24 @@ clv.time.has.holdout <- function(clv.time){

# set.sample.periods ------------------------------------------------------------------------
#' @importFrom lubridate period
clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last.transaction, user.estimation.end){
clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last.transaction, user.estimation.end, user.data.end){

tp.estimation.start <- tp.first.transaction

if(is.null(user.data.end)){
tp.data.end <- tp.last.transaction
}else{
tp.data.end <- clv.time.convert.user.input.to.timepoint(
clv.time=clv.time,
user.timepoint=user.data.end)

# Data end may not be before last transaction
if(tp.data.end < tp.last.transaction){
stop("The given data.end may not be before the last recorded transaction!")
}
}


if(!is.null(user.estimation.end)){
# specific end

Expand All @@ -98,24 +112,29 @@ clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last.
user.timepoint=user.estimation.end)
}


# Before the last transaction to ensure there is at least 1 transaction in the holdout period.
# Needed additionally to holdout >=2 periods
if(tp.estimation.end >= tp.last.transaction)
stop("Parameter estimation.split needs to indicate a point before the last transaction!", call. = FALSE)

# Need to be 2 periods because otherwise for days, holdout can be not on estimation.end but still be of length zero
# ie 2 periods to still have 1 as holdout
if(tp.estimation.end > tp.last.transaction-clv.time.number.timeunits.to.timeperiod(clv.time, 2L))
stop("Parameter estimation.split needs to indicate a point at least 2 periods before the last transaction!", call. = FALSE)
if(tp.estimation.end > tp.data.end - clv.time.number.timeunits.to.timeperiod(clv.time, 2L))
stop("Parameter estimation.split needs to indicate a point in time such that it yields a holdout period of at least 2 time.units!", call. = FALSE)

# + 1 day is the same for all because most fine-grained change that Date can do
tp.holdout.start <- tp.estimation.end + clv.time.epsilon(clv.time=clv.time)
tp.holdout.end <- tp.last.transaction
tp.holdout.end <- tp.data.end
holdout.period.in.tu <- clv.time.interval.in.number.tu(clv.time,
interv=interval(start = tp.holdout.start,
end = tp.holdout.end))
}else{
# NULL: no specific end - until end of data (last transaction)
# **TODO: last transaction or full period where last transaction is in?
# NULL: no specific end - until data end

# tp.holdout.start and .end HAVE to be end of estimation period as this is used elsewhere!
# tp.holdout.start/.end HAVE to be the end of the estimation period as this is used elsewhere!
# ie to ensure prediction.end (with clv.time.get.prediction.table) finds correct end if user gives NULL
tp.estimation.end <- tp.last.transaction
tp.estimation.end <- tp.data.end
tp.holdout.start <- tp.estimation.end
tp.holdout.end <- tp.estimation.end
holdout.period.in.tu <- 0
Expand Down
25 changes: 25 additions & 0 deletions R/f_clvdata_inputchecks.R
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,31 @@ check_userinput_datanocov_estimationsplit <- function(estimation.split, date.for
return(c())
}

#' @importFrom lubridate is.POSIXt is.Date parse_date_time
check_userinput_datanocov_dataend <- function(data.end, date.format){
  # Input check for the user-given `data.end`.
  #
  # data.end:    NULL, or a single character / Date / POSIXt value.
  # date.format: passed as `orders` to parse_date_time() when `data.end`
  #              is a character.
  #
  # Returns c() (i.e. NULL) if the input is acceptable, otherwise a single
  # character string describing the first problem found.

  # May be NULL
  if(is.null(data.end)){
    return(c())
  }

  if(length(data.end) != 1){
    return("data.end must contain exactly one single element!")
  }

  if(anyNA(data.end)){
    return("data.end may not contain any NAs!")
  }

  # `data.end` is of length 1 here, so use the scalar, short-circuiting `&&`
  # rather than the elementwise `&` in the condition.
  if(!is.character(data.end) && !is.Date(data.end) && !is.POSIXt(data.end)){
    return("data.end needs to either of type character or date-like (Date or POSIXt)")
  }

  # Characters have to be parseable with the given date.format
  if(is.character(data.end) &&
     anyNA(parse_date_time(x=data.end, quiet=TRUE, orders=date.format))){
    return("Please provide a valid data.end to that can be converted with the given date.format!")
  }

  return(c())
}


#' @importFrom lubridate is.POSIXct
check_userinput_datanocov_datatransactions <- function(data.transactions.dt, has.spending){
Expand Down
32 changes: 23 additions & 9 deletions R/f_interface_clvdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,17 @@
#' (i.e., "2010-06-17") is indicated with \code{"ymd"}. Other combinations such as \code{"dmy"}, \code{"dym"},
#' \code{"ymd HMS"}, or \code{"HMS dmy"} are possible as well.
#'
#' \code{data.end} A point in time beyond the last purchase at which the data should fictionally end.
#' It defines the total time frame in which customers could be observed: The combined estimation and holdout periods.
#' For example, when the last recorded transaction was on "2000-12-29" but customers were actually observed until "2000-12-31".
#' Using \code{data.end="2000-12-31"} without holdout period,
#' the estimation period will be until "2000-12-31" and the prediction period will start on "2001-01-01".
#' Required to be on or after the last recorded transaction.
#'
#' \code{estimation.split} May be specified as either the number of periods since the first transaction or the timepoint
#' (either as character, Date, or POSIXct) at which the estimation period ends. The indicated timepoint itself will be part of the estimation sample.
#' (either as character, Date, or POSIXct) at which the estimation period ends.
#' Required to be before the last transaction.
#' The indicated timepoint itself will be part of the estimation sample.
#' If no value is provided or set to \code{NULL}, the whole dataset will be used for fitting the model (no holdout sample).
#'
#' @details ## Aggregation of Transactions
Expand Down Expand Up @@ -84,6 +93,15 @@
#' time.unit = "w",
#' estimation.split = "1997-10-15")
#'
#' # Extend data fictionally until 31st Dec 1998
#' # In this case, this only moves the holdout period and has no effect on the
#' # estimation.
#' clv.data.cdnow <- clvdata(data.transactions = cdnow,
#' date.format="ymd",
#' time.unit = "w",
#' data.end = "1998-12-31",
#' estimation.split = "1997-10-15")
#'
#' # summary of the transaction data
#' summary(clv.data.cdnow)
#'
Expand Down Expand Up @@ -112,7 +130,7 @@
#'
#'
#' @export
clvdata <- function(data.transactions, date.format, time.unit, estimation.split=NULL, name.id="Id", name.date="Date", name.price="Price"){
clvdata <- function(data.transactions, date.format, time.unit, estimation.split=NULL, data.end=NULL, name.id="Id", name.date="Date", name.price="Price"){
# silence CRAN notes
Date <- Price <- Id <- x <- previous <- date.first.actual.trans <- NULL

Expand All @@ -136,6 +154,7 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split=

err.msg <- c(err.msg, .check_userinput_charactervec(char=date.format, var.name = "date.format", n=1))
err.msg <- c(err.msg, check_userinput_datanocov_estimationsplit(estimation.split=estimation.split, date.format=date.format))
err.msg <- c(err.msg, check_userinput_datanocov_dataend(data.end=data.end, date.format=date.format))
check_err_msg(err.msg)


Expand Down Expand Up @@ -208,14 +227,9 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split=
clv.t <- clv.time.set.sample.periods(clv.time = clv.t,
tp.first.transaction = tp.first.transaction,
tp.last.transaction = tp.last.transaction,
user.data.end = data.end,
user.estimation.end = estimation.split)

if(clv.t@timepoint.estimation.end > dt.trans[, max(Date)])
stop("Parameter estimation.split needs to indicate a point in the data!", call. = FALSE)

if(clv.t@estimation.period.in.tu < 1)
stop("Parameter estimation.split needs to be at least 1 time.unit after the start!", call. = FALSE)


# Check if the estimation.split is valid ----------------------------------------
# - estimation period long enough
Expand All @@ -229,7 +243,7 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split=
everyones.first.trans <- dt.trans[, list(date.first.actual.trans = min(Date)), by="Id"]
date.last.first.trans <- everyones.first.trans[, max(date.first.actual.trans)]
if(clv.t@timepoint.estimation.end < date.last.first.trans)
stop("The estimation split is too short! Not all customers of this cohort had their first actual transaction until the specified estimation.split!", call. = F)
stop("The estimation period is too short! Not all customers had their first transaction until the end of the estimation period!", call. = FALSE)



Expand Down
8 changes: 6 additions & 2 deletions R/f_s3generics_clvdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,10 @@ subset.clv.data <- function(x,
#' @rdname as.clv.data
#' @export
as.clv.data.data.frame <- function(x,
date.format="ymd", time.unit="weeks",
date.format="ymd",
time.unit="weeks",
estimation.split = NULL,
data.end = NULL,
name.id="Id", name.date="Date", name.price="Price",
...){
return(clvdata(data.transactions = x,
Expand All @@ -304,8 +306,10 @@ as.clv.data.data.frame <- function(x,
#' @rdname as.clv.data
#' @export
as.clv.data.data.table <- function(x,
date.format="ymd", time.unit="weeks",
date.format="ymd",
time.unit="weeks",
estimation.split = NULL,
data.end = NULL,
name.id="Id", name.date="Date", name.price="Price",
...){
return(clvdata(data.transactions = x,
Expand Down
26 changes: 21 additions & 5 deletions R/f_s3generics_clvdata_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
#' \item{variable}{"tracking": The number of actual repeat transactions in the period that ends at \code{period.until}.\cr
#' "timings": Coordinate (x or y) for which to use the value in this row for.}
#' \item{value}{"timings": Date or numeric (stored as string) \cr
#' "tracking": numeric}
#' "tracking": numeric, may be \code{NA} if no repeat-transactions were recorded in this period}
#'
#'
#' @examples
Expand Down Expand Up @@ -310,10 +310,26 @@ clv.data.plot.tracking <- function(x, prediction.end, cumulative, plot, verbose,
dt.dates.expectation[dt.repeat.trans, (label.transactions) := get(label.transactions), on="period.until"]
dt.plot <- melt(dt.dates.expectation, id.vars="period.until")

# last period often has NA as it marks the full span of the period
dt.plot <- dt.plot[!is.na(value)]

# data.table does not print when returned because it is returned directly after last [:=]
# The last period usually is set to NA because the data does not reach to the end of it.
# The last period has to be a full period because of the expectation plot.
# At the same time, the transaction data often ends before the last period (is only a partial period).
# This leads to a much lower number of transactions recorded in the last period
# and a noticeable, hard-to-explain drop at the end.
# Periods for which there are no transactions contain 0 not NA. Only the last
# period may contain NA.
# We remove it to not have it in the data and not raise a warning when plotting.
#
# Since introducing `data.end`, we no longer remove NAs as now there can be many
# periods without transactions and these should be shown (plotted) and known (returned data).
# Instead `geom_line(na.rm=T)` is used to remove them during plotting.
# Returning them helps users who want to create their own plots to plot the
# correct range (total time span of data).
# Alternative: Drop NA but set x-axis scale until holdout.end using
# `+ xlim(c(x@clv.time@timepoint.estimation.start, x@clv.time@timepoint.holdout.end))`
#
# dt.plot <- dt.plot[!is.na(value)]

# # data.table does not print when returned because it is returned directly after last [:=]
# " if a := is used inside a function with no DT[] before the end of the function, then the next
# time DT or print(DT) is typed at the prompt, nothing will be printed. A repeated DT or print(DT)
# will print. To avoid this: include a DT[] after the last := in your function."
Expand Down
12 changes: 9 additions & 3 deletions R/f_s3generics_clvfittedtransactions_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
#' \item{period.until}{The timepoint that marks the end (up until and including) of the period to which the data in this row refers.}
#' \item{variable}{Type of variable that 'value' refers to. Either "model name" or "Actual" (if \code{transactions=TRUE}).}
#' \item{value}{Depending on variable either (Actual) the actual number of repeat transactions in the period that ends at \code{period.until},
#' or the unconditional expectation for the period that ends on \code{period.until} ("model name").}
#' or the unconditional expectation for the period that ends on \code{period.until} ("model name"). Actuals may be \code{NA} if no transaction was recorded.}
#'
#' For the PMF plot:
#' \item{num.transactions}{The number of repeat transactions in the estimation period (as ordered factor).}
Expand Down Expand Up @@ -241,7 +241,7 @@ clv.controlflow.plot.tracking.base <- function(dt.plot, clv.data, color.mapping,
# Plotting order
dt.plot[, variable := factor(variable, levels=names(color.mapping), ordered = TRUE)]

p <- ggplot(data = dt.plot, aes(x=period.until, y=value, colour=variable)) + geom_line()
p <- ggplot(data = dt.plot, aes(x=period.until, y=value, colour=variable)) + geom_line(na.rm = TRUE)

# Add holdout line if there is a holdout period
if(clv.data.has.holdout(clv.data)){
Expand Down Expand Up @@ -382,7 +382,13 @@ clv.fitted.transactions.plot.tracking.get.data <- function(x, prediction.end, cu
dt.plot <- melt(dt.dates.expectation, id.vars='period.until')

# last period often has NA as it marks the full span of the period
dt.plot <- dt.plot[!is.na(value)]
# The last period usually was NA because it was only a partial period.
# See explanations in `clv.data.plot.tracking`.
# dt.plot <- dt.plot[!is.na(value)]
# Since introducing `data.end`, many periods can be NA. The NAs are now removed
# during plotting (`geom_line(na.rm=T)`). For consistency with plot(clvdata),
# the returned data also keeps the NA.

return(dt.plot)
}

Expand Down
1 change: 1 addition & 0 deletions man-roxygen/template_params_clvdata.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @param date.format Character string that indicates the format of the date variable in the data used. See details.
#' @param time.unit What time unit defines a period. May be abbreviated, capitalization is ignored. See details.
#' @param data.end The fictional end of the data, after the last recorded transaction in \code{<%=name_param_trans%>}. See details.
#' @param estimation.split Indicates the length of the estimation period. See details.
#' @param name.id Column name of the customer id in \code{<%=name_param_trans%>}.
#' @param name.date Column name of the transaction date in \code{<%=name_param_trans%>}.
Expand Down
2 changes: 2 additions & 0 deletions man-roxygen/template_summary_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#' be limited to a subset of customers.
#' \describe{
#' \item{\code{Number of customers}}{Count of individual customers.}
#' \item{\code{Period Start}}{Start of the indicated period.}
#' \item{\code{Period End}}{End of the indicated period.}
#' \item{\code{First Transaction in period}}{Time point of the first transaction occurring in the indicated period.}
#' \item{\code{Last Transaction in period}}{Time point of the last transaction occurring in the indicated period.}
#' \item{\code{Total # Transactions}}{Count of transactions occurring in the indicated period.}
Expand Down
5 changes: 5 additions & 0 deletions man/as.clv.data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading