diff --git a/R/all_generics.R b/R/all_generics.R index 51eb2e86..ed8c56c9 100644 --- a/R/all_generics.R +++ b/R/all_generics.R @@ -245,8 +245,10 @@ setGeneric("clv.data.create.bootstrapping.data", def = function(clv.data, ids){ #' #' @export as.clv.data <- function(x, - date.format="ymd", time.unit="weeks", + date.format="ymd", + time.unit="weeks", estimation.split = NULL, + data.end = NULL, name.id="Id", name.date="Date", name.price="Price", ...){ UseMethod("as.clv.data", x) diff --git a/R/class_clv_data.R b/R/class_clv_data.R index d130740a..79519299 100644 --- a/R/class_clv_data.R +++ b/R/class_clv_data.R @@ -253,7 +253,21 @@ clv.data.make.descriptives <- function(clv.data, ids){ dt.interp <- clv.data.mean.interpurchase.times(clv.data=clv.data, dt.transactions = dt.data) dt.num.trans.by.cust <- dt.data[, .N, by="Id"] + tp.period.start <- switch( + sample.name, + Estimation=clv.time@timepoint.estimation.start, + Holdout=clv.time@timepoint.holdout.start, + Total=clv.time@timepoint.estimation.start) + + tp.period.end <- switch( + sample.name, + Estimation=clv.time@timepoint.estimation.end, + Holdout=clv.time@timepoint.holdout.end, + Total=clv.time@timepoint.holdout.end) + l.desc <- list( + "Period Start" = clv.time.format.timepoint(clv.time=clv.time, timepoint=tp.period.start), + "Period End" = clv.time.format.timepoint(clv.time=clv.time, timepoint=tp.period.end), "Number of customers" = if(sample.name=="Total"){nrow(dt.num.trans.by.cust)}else{"-"}, "First Transaction in period" = clv.time.format.timepoint(clv.time=clv.time, timepoint=dt.data[, min(Date)]), "Last Transaction in period" = clv.time.format.timepoint(clv.time=clv.time, timepoint=dt.data[, max(Date)]), diff --git a/R/class_clv_time.R b/R/class_clv_time.R index 6a32a3a5..204be747 100644 --- a/R/class_clv_time.R +++ b/R/class_clv_time.R @@ -75,10 +75,24 @@ clv.time.has.holdout <- function(clv.time){ # set.sample.periods ------------------------------------------------------------------------ #' @importFrom lubridate period -clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last.transaction, user.estimation.end){ +clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last.transaction, user.estimation.end, user.data.end){ tp.estimation.start <- tp.first.transaction + if(is.null(user.data.end)){ + tp.data.end <- tp.last.transaction + }else{ + tp.data.end <- clv.time.convert.user.input.to.timepoint( + clv.time=clv.time, + user.timepoint=user.data.end) + + # Data end may not be before last transaction + if(tp.data.end < tp.last.transaction){ + stop("The given data.end may not be before the last recorded transaction!") + } + } + + if(!is.null(user.estimation.end)){ # specific end @@ -98,24 +112,29 @@ clv.time.set.sample.periods <- function(clv.time, tp.first.transaction, tp.last. user.timepoint=user.estimation.end) } + + # Before the last transaction to ensure there is at least 1 transaction in the holdout period. + # Needed additionally to holdout >=2 periods + if(tp.estimation.end >= tp.last.transaction) + stop("Parameter estimation.split needs to indicate a point before the last transaction!", call. = FALSE) + # Need to be 2 periods because otherwise for days, holdout can be not on estimation.end but still be of length zero # ie 2 periods to still have 1 as holdout - if(tp.estimation.end > tp.last.transaction-clv.time.number.timeunits.to.timeperiod(clv.time, 2L)) - stop("Parameter estimation.split needs to indicate a point at least 2 periods before the last transaction!", call. = FALSE) + if(tp.estimation.end > tp.data.end - clv.time.number.timeunits.to.timeperiod(clv.time, 2L)) + stop("Parameter estimation.split needs to indicate a point in time such that it yields a holdout period of at least 2 time.units!", call. = FALSE) # + 1 day is the same for all because most fine-grained change that Date can do tp.holdout.start <- tp.estimation.end + clv.time.epsilon(clv.time=clv.time) - tp.holdout.end <- tp.last.transaction + tp.holdout.end <- tp.data.end holdout.period.in.tu <- clv.time.interval.in.number.tu(clv.time, interv=interval(start = tp.holdout.start, end = tp.holdout.end)) }else{ - # NULL: no specific end - until end of data (last transaction) - # **TODO: last transaction or full period where last transaction is in? + # NULL: no specific end - until data end - # tp.holdout.start and .end HAVE to be end of estimation period as this is used elsewhere! + # tp.holdout.start/.end and HAVE to be end of estimation period as this is used elsewhere! # ie to ensure prediction.end (with clv.time.get.prediction.table) finds correct end if user gives NULL - tp.estimation.end <- tp.last.transaction + tp.estimation.end <- tp.data.end tp.holdout.start <- tp.estimation.end tp.holdout.end <- tp.estimation.end holdout.period.in.tu <- 0 diff --git a/R/f_clvdata_inputchecks.R b/R/f_clvdata_inputchecks.R index eee0d4f4..42be999f 100644 --- a/R/f_clvdata_inputchecks.R +++ b/R/f_clvdata_inputchecks.R @@ -210,6 +210,31 @@ check_userinput_datanocov_estimationsplit <- function(estimation.split, date.for return(c()) } +#' @importFrom lubridate is.POSIXt is.Date parse_date_time +check_userinput_datanocov_dataend <- function(data.end, date.format){ + + # May be NULL + if(is.null(data.end)) + return(c()) + + if(length(data.end) != 1) + return("data.end must contain exactly one single element!") + + if(anyNA(data.end)) + return("data.end may not contain any NAs!") + + if(!is.character(data.end) + & !is.Date(data.end) + & !is.POSIXt(data.end)) + return("data.end needs to either of type character or date-like (Date or POSIXt)") + + if(is.character(data.end)) + if(anyNA(parse_date_time(x=data.end, quiet=TRUE, orders=date.format))) + return("Please provide a valid data.end to that can be converted with the given date.format!") + + return(c()) +} + #' @importFrom lubridate is.POSIXct check_userinput_datanocov_datatransactions <- function(data.transactions.dt, has.spending){ diff --git a/R/f_interface_clvdata.R b/R/f_interface_clvdata.R index 08d7c6b1..cb5e2446 100644 --- a/R/f_interface_clvdata.R +++ b/R/f_interface_clvdata.R @@ -32,8 +32,17 @@ #' (i.e., "2010-06-17") is indicated with \code{"ymd"}. Other combinations such as \code{"dmy"}, \code{"dym"}, #' \code{"ymd HMS"}, or \code{"HMS dmy"} are possible as well. #' +#' \code{data.end} A point in time beyond the last purchase at which the data should fictionally end. +#' It defines the total time frame in which customers could be observed: The combined estimation and holdout periods. +#' For example, when the last recorded transaction was on "2000-12-29" but customers were actually observed until "2000-12-31". +#' Using \code{data.end="2000-12-31"} without holdout period, +#' the estimation period will be until "2000-12-31" and the prediction period will start on "2001-01-01". +#' Required to be after the last recorded transaction. +#' #' \code{estimation.split} May be specified as either the number of periods since the first transaction or the timepoint -#' (either as character, Date, or POSIXct) at which the estimation period ends. The indicated timepoint itself will be part of the estimation sample. +#' (either as character, Date, or POSIXct) at which the estimation period ends. +#' Required to be before the last transaction. +#' The indicated timepoint itself will be part of the estimation sample. #' If no value is provided or set to \code{NULL}, the whole dataset will used for fitting the model (no holdout sample). #' #' @details ## Aggregation of Transactions @@ -84,6 +93,15 @@ #' time.unit = "w", #' estimation.split = "1997-10-15") #' +#' # Extend data fictionally until 31th Dec 1998 +#' # In this case, this only moves the holdout period and has no effect on the +#' # estimation. +#' clv.data.cdnow <- clvdata(data.transactions = cdnow, +#' date.format="ymd", +#' time.unit = "w", +#' data.end = "1998-12-31", +#' estimation.split = "1997-10-15") +#' #' # summary of the transaction data #' summary(clv.data.cdnow) #' @@ -112,7 +130,7 @@ #' #' #' @export -clvdata <- function(data.transactions, date.format, time.unit, estimation.split=NULL, name.id="Id", name.date="Date", name.price="Price"){ +clvdata <- function(data.transactions, date.format, time.unit, estimation.split=NULL, data.end=NULL, name.id="Id", name.date="Date", name.price="Price"){ # silence CRAN notes Date <- Price <- Id <- x <- previous <- date.first.actual.trans <- NULL @@ -136,6 +154,7 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split= err.msg <- c(err.msg, .check_userinput_charactervec(char=date.format, var.name = "date.format", n=1)) err.msg <- c(err.msg, check_userinput_datanocov_estimationsplit(estimation.split=estimation.split, date.format=date.format)) + err.msg <- c(err.msg, check_userinput_datanocov_dataend(data.end=data.end, date.format=date.format)) check_err_msg(err.msg) @@ -208,14 +227,9 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split= clv.t <- clv.time.set.sample.periods(clv.time = clv.t, tp.first.transaction = tp.first.transaction, tp.last.transaction = tp.last.transaction, + user.data.end = data.end, user.estimation.end = estimation.split) - if(clv.t@timepoint.estimation.end > dt.trans[, max(Date)]) - stop("Parameter estimation.split needs to indicate a point in the data!", call. = FALSE) - - if(clv.t@estimation.period.in.tu < 1) - stop("Parameter estimation.split needs to be at least 1 time.unit after the start!", call. = FALSE) - # Check if the estimation.split is valid ---------------------------------------- # - estimation period long enough @@ -229,7 +243,7 @@ clvdata <- function(data.transactions, date.format, time.unit, estimation.split= everyones.first.trans <- dt.trans[, list(date.first.actual.trans = min(Date)), by="Id"] date.last.first.trans <- everyones.first.trans[, max(date.first.actual.trans)] if(clv.t@timepoint.estimation.end < date.last.first.trans) - stop("The estimation split is too short! Not all customers of this cohort had their first actual transaction until the specified estimation.split!", call. = F) + stop("The estimation period is too short! Not all customers had their first transaction until the end of the estimation period!", call. = FALSE) diff --git a/R/f_s3generics_clvdata.R b/R/f_s3generics_clvdata.R index 198ccdfd..e12f6402 100644 --- a/R/f_s3generics_clvdata.R +++ b/R/f_s3generics_clvdata.R @@ -291,8 +291,10 @@ subset.clv.data <- function(x, #' @rdname as.clv.data #' @export as.clv.data.data.frame <- function(x, - date.format="ymd", time.unit="weeks", + date.format="ymd", + time.unit="weeks", estimation.split = NULL, + data.end = NULL, name.id="Id", name.date="Date", name.price="Price", ...){ return(clvdata(data.transactions = x, @@ -304,8 +306,10 @@ as.clv.data.data.frame <- function(x, #' @rdname as.clv.data #' @export as.clv.data.data.table <- function(x, - date.format="ymd", time.unit="weeks", + date.format="ymd", + time.unit="weeks", estimation.split = NULL, + data.end = NULL, name.id="Id", name.date="Date", name.price="Price", ...){ return(clvdata(data.transactions = x, diff --git a/R/f_s3generics_clvdata_plot.R b/R/f_s3generics_clvdata_plot.R index 054ecdf7..c6acac54 100644 --- a/R/f_s3generics_clvdata_plot.R +++ b/R/f_s3generics_clvdata_plot.R @@ -95,7 +95,7 @@ #' \item{variable}{"tracking": The number of actual repeat transactions in the period that ends at \code{period.until}.\cr #' "timings": Coordinate (x or y) for which to use the value in this row for.} #' \item{value}{"timings": Date or numeric (stored as string) \cr -#' "tracking": numeric} +#' "tracking": numeric, may be \code{NA} if no repeat-transactions were recorded in this period} #' #' #' @examples @@ -310,10 +310,26 @@ clv.data.plot.tracking <- function(x, prediction.end, cumulative, plot, verbose, dt.dates.expectation[dt.repeat.trans, (label.transactions) := get(label.transactions), on="period.until"] dt.plot <- melt(dt.dates.expectation, id.vars="period.until") - # last period often has NA as it marks the full span of the period - dt.plot <- dt.plot[!is.na(value)] - - # data.table does not print when returned because it is returned directly after last [:=] + # The last period usually is set to NA because the data does not reach to the end of it. + # The last period has to be a full period because of the expectation plot. + # At the same time, the transaction data often ends before the last period (is only a partial period). + # This leads to a much lower number of transactions recorded in the last period + # and a noticeable, hard-to-explain drop at the end. + # Periods for which there are no transactions contain 0 not NA. Only the last + # period may contain NA. + # We remove it to not have it in the data and not raise a warning when plotting. + # + # Since introducing `data.end`, we no loner remove NAs as now there can be many + # periods without transactions and these should be shown (plotted) and known (returned data). + # Instead `geom_line(na.rm=T)` is used to remove them during plotting. + # Returning them helps users who want to create their own plots to plot the + # correct range (total time span of data). + # Alternative: Drop NA but set x-axis scale until holdout.end using + # `+ xlim(c(x@clv.time@timepoint.estimation.start, x@clv.time@timepoint.holdout.end))` + # + # dt.plot <- dt.plot[!is.na(value)] + + # # data.table does not print when returned because it is returned directly after last [:=] # " if a := is used inside a function with no DT[] before the end of the function, then the next # time DT or print(DT) is typed at the prompt, nothing will be printed. A repeated DT or print(DT) # will print. To avoid this: include a DT[] after the last := in your function." diff --git a/R/f_s3generics_clvfittedtransactions_plot.R b/R/f_s3generics_clvfittedtransactions_plot.R index 7217629e..d2b1f454 100644 --- a/R/f_s3generics_clvfittedtransactions_plot.R +++ b/R/f_s3generics_clvfittedtransactions_plot.R @@ -78,7 +78,7 @@ #' \item{period.until}{The timepoint that marks the end (up until and including) of the period to which the data in this row refers.} #' \item{variable}{Type of variable that 'value' refers to. Either "model name" or "Actual" (if \code{transactions=TRUE}).} #' \item{value}{Depending on variable either (Actual) the actual number of repeat transactions in the period that ends at \code{period.until}, -#' or the unconditional expectation for the period that ends on \code{period.until} ("model name").} +#' or the unconditional expectation for the period that ends on \code{period.until} ("model name"). Actuals may be \code{NA} if no transaction was recorded.} #' #' For the PMF plot: #' \item{num.transactions}{The number of repeat transactions in the estimation period (as ordered factor).} @@ -241,7 +241,7 @@ clv.controlflow.plot.tracking.base <- function(dt.plot, clv.data, color.mapping, # Plotting order dt.plot[, variable := factor(variable, levels=names(color.mapping), ordered = TRUE)] - p <- ggplot(data = dt.plot, aes(x=period.until, y=value, colour=variable)) + geom_line() + p <- ggplot(data = dt.plot, aes(x=period.until, y=value, colour=variable)) + geom_line(na.rm = TRUE) # Add holdout line if there is a holdout period if(clv.data.has.holdout(clv.data)){ @@ -382,7 +382,13 @@ clv.fitted.transactions.plot.tracking.get.data <- function(x, prediction.end, cu dt.plot <- melt(dt.dates.expectation, id.vars='period.until') # last period often has NA as it marks the full span of the period - dt.plot <- dt.plot[!is.na(value)] + # The last period usually was NA because of explanations it was a partial + # period. See explanations in `clv.data.plot.tracking`. + # dt.plot <- dt.plot[!is.na(value)] + # Since introducing `data.end`, many periods can be NA. The NAs are now removed + # during plotting (`geom_line(na.rm=T)`). For consistency with plot(clvdata), + # the returned data also keeps the NA. + return(dt.plot) } diff --git a/man-roxygen/template_params_clvdata.R b/man-roxygen/template_params_clvdata.R index e3cf6454..7e184d4b 100644 --- a/man-roxygen/template_params_clvdata.R +++ b/man-roxygen/template_params_clvdata.R @@ -1,5 +1,6 @@ #' @param date.format Character string that indicates the format of the date variable in the data used. See details. #' @param time.unit What time unit defines a period. May be abbreviated, capitalization is ignored. See details. +#' @param data.end The fictional end of the data, after the last recorded transaction in \code{<%=name_param_trans%>}. See details. #' @param estimation.split Indicates the length of the estimation period. See details. #' @param name.id Column name of the customer id in \code{<%=name_param_trans%>}. #' @param name.date Column name of the transaction date in \code{<%=name_param_trans%>}. diff --git a/man-roxygen/template_summary_data.R b/man-roxygen/template_summary_data.R index bca429a2..7f5ed15e 100644 --- a/man-roxygen/template_summary_data.R +++ b/man-roxygen/template_summary_data.R @@ -16,6 +16,8 @@ #' be limited to a subset of customers. #' \describe{ #' \item{\code{Number of customers}}{Count of individual customers.} +#' \item{\code{Period Start}}{Start of the indicated period.} +#' \item{\code{Period End}}{End of indicated period.} #' \item{\code{First Transaction in period}}{Time point of the first transaction occurring in the indicated period.} #' \item{\code{Last Transaction in period}}{Time point of the last transaction occurring in the indicated period.} #' \item{\code{Total # Transactions}}{Count of transactions occurring in the indicated period.} diff --git a/man/as.clv.data.Rd b/man/as.clv.data.Rd index 297a620f..652068f4 100644 --- a/man/as.clv.data.Rd +++ b/man/as.clv.data.Rd @@ -11,6 +11,7 @@ as.clv.data( date.format = "ymd", time.unit = "weeks", estimation.split = NULL, + data.end = NULL, name.id = "Id", name.date = "Date", name.price = "Price", @@ -22,6 +23,7 @@ as.clv.data( date.format = "ymd", time.unit = "weeks", estimation.split = NULL, + data.end = NULL, name.id = "Id", name.date = "Date", name.price = "Price", @@ -33,6 +35,7 @@ as.clv.data( date.format = "ymd", time.unit = "weeks", estimation.split = NULL, + data.end = NULL, name.id = "Id", name.date = "Date", name.price = "Price", @@ -48,6 +51,8 @@ as.clv.data( \item{estimation.split}{Indicates the length of the estimation period. See details.} +\item{data.end}{The fictional end of the data, after the last recorded transaction in \code{x}. See details.} + \item{name.id}{Column name of the customer id in \code{x}.} \item{name.date}{Column name of the transaction date in \code{x}.} diff --git a/man/clvdata.Rd b/man/clvdata.Rd index 6987312e..1eeae7d3 100644 --- a/man/clvdata.Rd +++ b/man/clvdata.Rd @@ -9,6 +9,7 @@ clvdata( date.format, time.unit, estimation.split = NULL, + data.end = NULL, name.id = "Id", name.date = "Date", name.price = "Price" @@ -23,6 +24,8 @@ clvdata( \item{estimation.split}{Indicates the length of the estimation period. See details.} +\item{data.end}{The fictional end of the data, after the last recorded transaction in \code{data.transactions}. See details.} + \item{name.id}{Column name of the customer id in \code{data.transactions}.} \item{name.date}{Column name of the transaction date in \code{data.transactions}.} @@ -62,8 +65,17 @@ and hence all formats it accepts in argument \code{orders} can be used. For exam (i.e., "2010-06-17") is indicated with \code{"ymd"}. Other combinations such as \code{"dmy"}, \code{"dym"}, \code{"ymd HMS"}, or \code{"HMS dmy"} are possible as well. +\code{data.end} A point in time beyond the last purchase at which the data should fictionally end. +It defines the total time frame in which customers could be observed: The combined estimation and holdout periods. +For example, when the last recorded transaction was on "2000-12-29" but customers were actually observed until "2000-12-31". +Using \code{data.end="2000-12-31"} without holdout period, +the estimation period will be until "2000-12-31" and the prediction period will start on "2001-01-01". +Required to be after the last recorded transaction. + \code{estimation.split} May be specified as either the number of periods since the first transaction or the timepoint -(either as character, Date, or POSIXct) at which the estimation period ends. The indicated timepoint itself will be part of the estimation sample. +(either as character, Date, or POSIXct) at which the estimation period ends. +Required to be before the last transaction. +The indicated timepoint itself will be part of the estimation sample. If no value is provided or set to \code{NULL}, the whole dataset will used for fitting the model (no holdout sample). \subsection{Aggregation of Transactions}{ @@ -101,6 +113,15 @@ clv.data.cdnow <- clvdata(data.transactions = cdnow, time.unit = "w", estimation.split = "1997-10-15") +# Extend data fictionally until 31th Dec 1998 +# In this case, this only moves the holdout period and has no effect on the +# estimation. +clv.data.cdnow <- clvdata(data.transactions = cdnow, + date.format="ymd", + time.unit = "w", + data.end = "1998-12-31", + estimation.split = "1997-10-15") + # summary of the transaction data summary(clv.data.cdnow) diff --git a/man/plot.clv.data.Rd b/man/plot.clv.data.Rd index bac706d1..b871fedd 100644 --- a/man/plot.clv.data.Rd +++ b/man/plot.clv.data.Rd @@ -81,7 +81,7 @@ excluding customers with no repeat-transactions.} \item{variable}{"tracking": The number of actual repeat transactions in the period that ends at \code{period.until}.\cr "timings": Coordinate (x or y) for which to use the value in this row for.} \item{value}{"timings": Date or numeric (stored as string) \cr - "tracking": numeric} + "tracking": numeric, may be \code{NA} if no repeat-transactions were recorded in this period} } \description{ Depending on the value of parameter \code{which}, one of the following plots will be produced. diff --git a/man/plot.clv.fitted.transactions.Rd b/man/plot.clv.fitted.transactions.Rd index 93ac58ad..236374e6 100644 --- a/man/plot.clv.fitted.transactions.Rd +++ b/man/plot.clv.fitted.transactions.Rd @@ -82,7 +82,7 @@ For the Tracking plot: \item{period.until}{The timepoint that marks the end (up until and including) of the period to which the data in this row refers.} \item{variable}{Type of variable that 'value' refers to. Either "model name" or "Actual" (if \code{transactions=TRUE}).} \item{value}{Depending on variable either (Actual) the actual number of repeat transactions in the period that ends at \code{period.until}, -or the unconditional expectation for the period that ends on \code{period.until} ("model name").} +or the unconditional expectation for the period that ends on \code{period.until} ("model name"). Actuals may be \code{NA} if no transaction was recorded.} For the PMF plot: \item{num.transactions}{The number of repeat transactions in the estimation period (as ordered factor).} diff --git a/man/summary.clv.data.Rd b/man/summary.clv.data.Rd index b5999cfd..e7cb5aa5 100644 --- a/man/summary.clv.data.Rd +++ b/man/summary.clv.data.Rd @@ -60,6 +60,8 @@ for the overall time period (estimation + holdout). By using the \code{ids} argu be limited to a subset of customers. \describe{ \item{\code{Number of customers}}{Count of individual customers.} +\item{\code{Period Start}}{Start of the indicated period.} +\item{\code{Period End}}{End of indicated period.} \item{\code{First Transaction in period}}{Time point of the first transaction occurring in the indicated period.} \item{\code{Last Transaction in period}}{Time point of the last transaction occurring in the indicated period.} \item{\code{Total # Transactions}}{Count of transactions occurring in the indicated period.} diff --git a/tests/testthat/helper_arrange.R b/tests/testthat/helper_arrange.R index 23c2b2a7..3ea9a1a5 100644 --- a/tests/testthat/helper_arrange.R +++ b/tests/testthat/helper_arrange.R @@ -19,7 +19,12 @@ fct.helper.load.apparelDynCov <- function(){.load.data.locally("apparelDynCov")} -fct.helper.create.clvdata.cdnow <- function(data.cdnow = NULL, estimation.split = 37, name.price = "Price") { +fct.helper.create.clvdata.cdnow <- function( + data.cdnow = NULL, + data.end = NULL, + estimation.split = 37, + name.price = "Price") +{ if (is.null(data.cdnow)) { data.cdnow <- fct.helper.load.cdnow() } @@ -27,6 +32,7 @@ fct.helper.create.clvdata.cdnow <- function(data.cdnow = NULL, estimation.split data.transactions = data.cdnow, date.format = "ymd", time.unit = "w", + data.end = data.end, estimation.split = estimation.split, name.price = name.price ) @@ -35,6 +41,7 @@ fct.helper.create.clvdata.cdnow <- function(data.cdnow = NULL, estimation.split fct.helper.create.clvdata.apparel.nocov <- function( data.apparelTrans = NULL, + data.end = NULL, estimation.split = 104) { if (is.null(data.apparelTrans)) { @@ -45,6 +52,7 @@ fct.helper.create.clvdata.apparel.nocov <- function( data.transactions = data.apparelTrans, date.format = "ymd", time.unit = "W", + data.end = data.end, estimation.split = estimation.split )) } @@ -52,6 +60,7 @@ fct.helper.create.clvdata.apparel.nocov <- function( fct.helper.create.clvdata.apparel.staticcov <- function( data.apparelTrans = NULL, data.apparelStaticCov = NULL, + data.end = NULL, estimation.split = 104, names.cov.life = c("Gender", "Channel"), names.cov.trans = c("Gender", "Channel")) { @@ -64,7 +73,10 @@ fct.helper.create.clvdata.apparel.staticcov <- function( return(SetStaticCovariates( clvdata( - data.transactions = data.apparelTrans, date.format = "ymd", time.unit = "W", + data.transactions = data.apparelTrans, + date.format = "ymd", + time.unit = "W", + data.end = data.end, estimation.split = estimation.split ), data.cov.life = data.apparelStaticCov, @@ -77,6 +89,7 @@ fct.helper.create.clvdata.apparel.staticcov <- function( fct.helper.create.clvdata.apparel.dyncov <- function( data.apparelTrans = NULL, data.apparelDynCov = NULL, + data.end = NULL, estimation.split = 104, names.cov.life = c("High.Season", "Gender", "Channel"), names.cov.trans = c("High.Season", "Gender", "Channel")) { @@ -92,6 +105,7 @@ fct.helper.create.clvdata.apparel.dyncov <- function( data = data.apparelTrans, date.format = "ymd", time.unit = "w", + data.end = data.end, estimation.split = estimation.split )) @@ -110,6 +124,7 @@ fct.helper.create.clvdata.apparel.dyncov <- function( fit.cdnow <- function( data.cdnow = NULL, + data.end = NULL, estimation.split = 37, name.price = 'Price', model = pnbd, @@ -119,6 +134,7 @@ fit.cdnow <- function( clv.cdnow <- fct.helper.create.clvdata.cdnow( data.cdnow = data.cdnow, + data.end = data.end, estimation.split = estimation.split, name.price=name.price ) @@ -137,6 +153,7 @@ fit.cdnow <- function( fit.apparel.nocov <- function( data.apparelTrans = NULL, + data.end = NULL, estimation.split = 104, model = pnbd, verbose=FALSE, @@ -147,6 +164,7 @@ fit.apparel.nocov <- function( clv.data.apparel <- fct.helper.create.clvdata.apparel.nocov( data.apparelTrans = data.apparelTrans, + data.end = data.end, estimation.split = estimation.split ) @@ -163,6 +181,7 @@ fit.apparel.nocov <- function( fit.apparel.static <- function( data.apparelTrans = NULL, data.apparelStaticCov = NULL, + data.end = NULL, estimation.split = 104, names.cov.life = c("Gender", "Channel"), names.cov.trans = c("Gender", "Channel"), @@ -176,6 +195,7 @@ fit.apparel.static <- function( clv.data.apparel.cov <- fct.helper.create.clvdata.apparel.staticcov( data.apparelTrans = data.apparelTrans, data.apparelStaticCov = data.apparelStaticCov, + data.end = data.end, estimation.split = estimation.split, names.cov.life = names.cov.life, names.cov.trans = names.cov.trans @@ -195,6 +215,7 @@ fit.apparel.static <- function( fit.apparel.dyncov <- function( data.apparelTrans = NULL, data.apparelDynCov = NULL, + data.end = NULL, estimation.split = 104, names.cov.life = c("High.Season", "Gender", "Channel"), names.cov.trans = c("High.Season", "Gender", "Channel"), @@ -206,6 +227,7 @@ fit.apparel.dyncov <- function( data.apparelTrans = data.apparelTrans, data.apparelDynCov = data.apparelDynCov, estimation.split = estimation.split, + data.end = data.end, names.cov.life = names.cov.life, names.cov.trans = names.cov.trans ) diff --git a/tests/testthat/helper_s3_fitted_plot.R b/tests/testthat/helper_s3_fitted_plot.R index b9f4005c..936a004b 100644 --- a/tests/testthat/helper_s3_fitted_plot.R +++ b/tests/testthat/helper_s3_fitted_plot.R @@ -120,10 +120,12 @@ fct.testthat.runability.clvfittedtransactions.plot.tracking <- function(clv.fitt test_that("Works with newdata", { skip_on_cran() expect_silent(dt.plot <- plot(clv.fitted, newdata = clv.newdata.nohold, prediction.end=3, plot=FALSE, verbose=FALSE)) - expect_false(anyNA(dt.plot)) + # Since introducing data.end: Actuals after last transaction are kept and contain NA. Only check model + expect_false(anyNA(dt.plot[variable != "Actual"])) expect_warning(dt.plot <- plot(clv.fitted, newdata = clv.newdata.withhold, prediction.end=3, plot=FALSE, verbose=FALSE), regexp = "full holdout") - expect_false(anyNA(dt.plot)) + # Since introducing data.end: Actuals after last transaction are kept and contain NA. Only check model + expect_false(anyNA(dt.plot[variable != "Actual"])) }) test_that("Works for prediction.end in different formats, after holdout", { diff --git a/tests/testthat/helper_testthat_correctness_clvfitted.R b/tests/testthat/helper_testthat_correctness_clvfitted.R index 33db3404..86cefbcc 100644 --- a/tests/testthat/helper_testthat_correctness_clvfitted.R +++ b/tests/testthat/helper_testthat_correctness_clvfitted.R @@ -14,7 +14,8 @@ fct.testthat.correctness.clvfitted.flawless.results.out.of.the.box <- function(m expect_false(fct.DT.any.non.finite(predict(fitted, verbose = FALSE))) if(is(fitted, "clv.fitted.transactions")){ - expect_false(fct.DT.any.non.finite(plot(fitted, plot = FALSE, verbose = FALSE))) + # No NAs, except last period may have (partial period that is set to NA on purpose) + expect_false(fct.DT.any.non.finite(plot(fitted, plot = FALSE, verbose = FALSE)[period.until != max(period.until)])) } # KKTs both true expect_true(res.sum$kkt1) diff --git a/tests/testthat/helper_testthat_correctness_clvtime.R b/tests/testthat/helper_testthat_correctness_clvtime.R index b0d0f1c5..ee848177 100644 --- a/tests/testthat/helper_testthat_correctness_clvtime.R +++ b/tests/testthat/helper_testthat_correctness_clvtime.R @@ -27,23 +27,35 @@ fct.helper.clv.time.create.test.objects <- function(with.holdout){ pred.tp.last <- as.Date("2008-09-27") if(with.holdout){ expect_silent(clv.t.hours <-clv.time.set.sample.periods(clv.t.hours, user.estimation.end = 37, + user.data.end = NULL, tp.first.transaction = lubridate::ymd("2005-01-20", tz="UTC"), tp.last.transaction = lubridate::ymd("2008-09-27", tz="UTC"))) expect_silent(clv.t.days <-clv.time.set.sample.periods(clv.t.days, user.estimation.end = 37, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) expect_silent(clv.t.weeks <-clv.time.set.sample.periods(clv.t.weeks, user.estimation.end = 37, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) expect_silent(clv.t.years <-clv.time.set.sample.periods(clv.t.years, user.estimation.end = 1, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) }else{ - expect_silent(clv.t.hours <-clv.time.set.sample.periods(clv.t.hours, user.estimation.end = NULL, + expect_silent(clv.t.hours <-clv.time.set.sample.periods(clv.t.hours, + user.estimation.end = NULL, + user.data.end = NULL, tp.first.transaction = lubridate::ymd("2005-01-20", tz="UTC"), tp.last.transaction = lubridate::ymd("2008-09-27", tz="UTC"))) - expect_silent(clv.t.days <-clv.time.set.sample.periods(clv.t.days, user.estimation.end = NULL, + expect_silent(clv.t.days <-clv.time.set.sample.periods(clv.t.days, + user.estimation.end = NULL, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) - expect_silent(clv.t.weeks <-clv.time.set.sample.periods(clv.t.weeks, user.estimation.end = NULL, + expect_silent(clv.t.weeks <-clv.time.set.sample.periods(clv.t.weeks, + user.estimation.end = NULL, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) - expect_silent(clv.t.years <-clv.time.set.sample.periods(clv.t.years, user.estimation.end = NULL, + expect_silent(clv.t.years <-clv.time.set.sample.periods(clv.t.years, + user.estimation.end = NULL, + user.data.end = NULL, tp.first.transaction = pred.tp.first, tp.last.transaction = pred.tp.last)) } @@ -96,7 +108,9 @@ fct.testthat.correctness.clvtime.set.sample.periods.estimation.start <- function tp.first <- fct.helper.clv.time.correct.datetype("2018-01-01", clv.t) tp.last <- fct.helper.clv.time.correct.datetype("2019-06-15", clv.t) - expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = NULL, + expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = NULL, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) expect_equal(clv.t@timepoint.estimation.start, tp.first) @@ -112,7 +126,10 @@ fct.testthat.correctness.clvtime.set.sample.periods.no.estimation.end <- functio tp.last <- fct.helper.clv.time.correct.datetype("2019-06-15", clv.t) # Dates - expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = NULL, tp.first.transaction = tp.first, + expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = NULL, + user.data.end = NULL, + tp.first.transaction = tp.first, tp.last.transaction = tp.last)) expect_equal(clv.t@timepoint.estimation.end, tp.last) expect_equal(clv.t@timepoint.holdout.start, tp.last) @@ -135,7 +152,9 @@ fct.testthat.correctness.clvtime.set.sample.periods.numeric.estimation.end <- fu NULL) stopifnot(!is.null(splitting.end)) - expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = splitting.end, + expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = splitting.end, + user.data.end = NULL, tp.first.transaction =tp.first, tp.last.transaction = tp.last)) expect_equal(clv.t@timepoint.estimation.end, tp.first+lubridate::period(splitting.end, period.type)) @@ -158,7 +177,9 @@ fct.testthat.correctness.clvtime.set.sample.periods.warn.partial.period <- funct NULL) stopifnot(!is.null(splitting.end)) - expect_warning(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = splitting.end, + expect_warning(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = splitting.end, + user.data.end = NULL, tp.first.transaction =tp.first, tp.last.transaction = tp.last), regexp = "partial periods") @@ -176,22 +197,26 @@ fct.testthat.correctness.clvtime.set.sample.periods.stop.estimation.period.less. # Numeric expect_error(clv.time.set.sample.periods(clv.t, user.estimation.end = 0, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last), regexp = "1 time.unit after") expect_error(clv.time.set.sample.periods(clv.t, user.estimation.end = -3, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last), regexp = "1 time.unit after") # Date expect_error(clv.time.set.sample.periods(clv.t, user.estimation.end = tp.first, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last), regexp = "1 time.unit after") expect_error(clv.time.set.sample.periods(clv.t, user.estimation.end = tp.first-lubridate::days(1), + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last), regexp = "1 time.unit after") @@ -199,21 +224,25 @@ fct.testthat.correctness.clvtime.set.sample.periods.stop.estimation.period.less. expect_error(clv.time.set.sample.periods(clv.time = clv.t.hours, user.estimation.end = "2018-01-01 00:35:49", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.hours), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.hours)), regexp = "1 time.unit after") expect_error(clv.time.set.sample.periods(clv.time = clv.t.days, user.estimation.end = "2018-01-01", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.days), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.days)), regexp = "1 time.unit after") expect_error(clv.time.set.sample.periods(clv.time = clv.t.weeks, user.estimation.end = "2018-01-03", # Wed + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.weeks), # Mon tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.weeks)), regexp = "1 time.unit after") expect_error(clv.time.set.sample.periods(clv.time = clv.t.years, user.estimation.end = "2018-12-31", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.years), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.years)), regexp = "1 time.unit after") @@ -226,32 +255,37 @@ fct.testthat.correctness.clvtime.set.sample.periods.stop.holdout.length.less.2.p tp.first <- fct.helper.clv.time.correct.datetype("2018-01-01", clv.t) tp.last <- fct.helper.clv.time.correct.datetype("2025-06-15", clv.t) expect_error(clv.time.set.sample.periods(clv.t, + user.data.end = NULL, user.estimation.end = tp.last-lubridate::hours(1), tp.first.transaction = tp.first, tp.last.transaction = tp.last), - regexp = "2 periods before") + regexp = "at least 2 time.units") } expect_error(clv.time.set.sample.periods(clv.time = clv.t.hours, user.estimation.end = "2025-06-14 22:40:11", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.hours), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.hours)), - regexp = "2 periods before") + regexp = "at least 2 time.units") expect_error(clv.time.set.sample.periods(clv.time = clv.t.days, user.estimation.end = "2025-06-14", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.days), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.days)), - regexp = "2 periods before") + regexp = "at least 2 time.units") expect_error(clv.time.set.sample.periods(clv.time = clv.t.weeks, user.estimation.end = "2025-06-11", # Wed + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.weeks), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-13", clv.t.weeks)),# Fr - regexp = "2 periods before") + regexp = "at least 2 time.units") expect_error(clv.time.set.sample.periods(clv.time = clv.t.years, user.estimation.end = "2025-01-01", + user.data.end = NULL, tp.first.transaction = fct.helper.clv.time.correct.datetype("2018-01-01", clv.t.years), tp.last.transaction = fct.helper.clv.time.correct.datetype("2025-06-15", clv.t.years)), - regexp = "2 periods before") + regexp = "at least 2 time.units") } @@ -265,13 +299,17 @@ fct.testthat.correctness.clvtime.set.sample.periods.date.estimation.end <- funct if(is(clv.t, "clv.time.datetime")){ # POSIX dates in transactions - but split with Date (ymd by user) - expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) # Split same Date but as posix expect_equal(clv.t@timepoint.estimation.end, as.POSIXct.POSIXlt(as.POSIXlt.Date(tp.split), tz = "UTC")) }else{ - expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, + user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) expect_equal(clv.t@timepoint.estimation.end, tp.split) @@ -297,6 +335,7 @@ fct.testthat.correctness.clvtime.set.sample.periods.posixct.estimation.end <- fu if(is(clv.t, "clv.time.datetime")){ expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) @@ -306,6 +345,7 @@ fct.testthat.correctness.clvtime.set.sample.periods.posixct.estimation.end <- fu # Date transactions - but split with POSIXct (given by user) expect_message(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last), regexp = "is ignored") @@ -331,6 +371,7 @@ fct.testthat.correctness.clvtime.set.sample.periods.char.estimation.end <- funct tp.split <- "2019-07-19 15:36:19" expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) @@ -341,6 +382,7 @@ fct.testthat.correctness.clvtime.set.sample.periods.char.estimation.end <- funct tp.split <- "2019-07-19" expect_silent(clv.t <- clv.time.set.sample.periods(clv.t, user.estimation.end = tp.split, + user.data.end = NULL, tp.first.transaction = tp.first, tp.last.transaction = tp.last)) # Split same but as Date @@ -355,6 +397,162 @@ fct.testthat.correctness.clvtime.set.sample.periods.char.estimation.end <- funct } +fct.testthat.correctness.clvtime.set.sample.periods.data.end <- function(){ + + # + clv.t <- clv.time.weeks("ymd") + + test_that("Fail if data.end is before last transaction",{ + expect_error( + clv.time.set.sample.periods( + clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t), + user.estimation.end = NULL, + user.data.end = "2000-12-30"), + regexp = "may not be before the last recorded transaction" + ) + + }) + + test_that("Fail if data.end leads to holdout period < 2 periods", { + expect_error( + clv.time.set.sample.periods( + clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t), + user.estimation.end = "2000-12-28", + user.data.end = "2001-01-03"), + regexp = "holdout period of at least 2 time.units" + ) + + }) + + + # Not possible anymore since requiring estimation.split to be before last + # transaction and data.end after last transaction + # test_that("Fail if data.end leads to estimation period < 1 period", { + # expect_error( + # clv.time.set.sample.periods( + # clv.t, + # tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + # tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-01-10", clv.t = clv.t), + # user.estimation.end = "2000-01-05", + # user.data.end = "2000-01-31"), + # regexp = "least 1 time.unit" + # ) + # }) + + # Not possible anymore since requiring estimation.split to be before last + # transaction and data.end after last transaction + # test_that("Fail if data.end is before estimation.split", { + # expect_error( + # clv.time.set.sample.periods( + # clv.t, + # tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + # tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t), + # user.estimation.end = "2001-02-15", + # user.data.end = "2001-01-31"), + # regexp = "holdout period of at least 2 time.units" + # ) + # }) + + test_that("Same object when no data.end as when data.end=tp.last.transaction", { + l.args <- list( + clv.time = clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t) + ) + + # No holdout + l.args["user.estimation.end"] <- list(NULL) + expect_equal( + do.call(clv.time.set.sample.periods, c(l.args, list(user.data.end = NULL))), + do.call(clv.time.set.sample.periods, c(l.args, list(user.data.end = "2000-12-31"))) + ) + + # With holdout + l.args$user.estimation.end <- "2000-06-15" + expect_equal( + do.call(clv.time.set.sample.periods, c(l.args, list(user.data.end = NULL))), + do.call(clv.time.set.sample.periods, c(l.args, list(user.data.end = "2000-12-31"))) + ) + }) + + + # Not possible anymore since requiring estimation.split to be before last + # transaction and data.end after last transaction + # test_that("estimation.split can be after last transaction if data.end is given", { + # expect_silent( + # clv.time.set.sample.periods( + # clv.t, + # tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t=clv.t), + # tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t), + # user.estimation.end = "2001-01-10", + # user.data.end = "2001-01-31") + # ) + # }) + + test_that("data.end only moves holdout.end (if estimation.split is before)",{ + clv.t.no.obsend <- clv.time.set.sample.periods( + clv.time = clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t = clv.t), + user.estimation.end = "2000-06-15", + user.data.end = NULL) + + clv.t.with.obsend <- clv.time.set.sample.periods( + clv.time = clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t=clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-31", clv.t=clv.t), + user.estimation.end = "2000-06-15", + user.data.end = "2001-02-28") + + expect_true(all( + clv.t.with.obsend@timepoint.estimation.start == clv.t.no.obsend@timepoint.estimation.start, + clv.t.with.obsend@timepoint.estimation.end == clv.t.no.obsend@timepoint.estimation.end, + clv.t.with.obsend@timepoint.holdout.start == clv.t.no.obsend@timepoint.holdout.start, + clv.t.with.obsend@estimation.period.in.tu == clv.t.no.obsend@estimation.period.in.tu + )) + + expect_true(clv.t.with.obsend@timepoint.holdout.end > clv.t.no.obsend@timepoint.holdout.end) + expect_true(clv.t.with.obsend@holdout.period.in.tu > clv.t.no.obsend@holdout.period.in.tu) + }) + + + + test_that("Manually check if yields correct timepoints", { + + + # # With holdout + split after last transaction + clv.holdout <- clv.time.set.sample.periods( + clv.time = clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-06-15", clv.t = clv.t), + user.estimation.end = "2000-04-04", + user.data.end = "2000-12-31") + + expect_true(clv.holdout@timepoint.estimation.start == "2000-01-01") + expect_true(clv.holdout@timepoint.estimation.end == "2000-04-04") + expect_true(clv.holdout@timepoint.holdout.start == "2000-04-05") + expect_true(clv.holdout@timepoint.holdout.end == "2000-12-31") + + + clv.no.holdout <- clv.time.set.sample.periods( + clv.time = clv.t, + tp.first.transaction = fct.helper.clv.time.correct.datetype("2000-01-01", clv.t = clv.t), + tp.last.transaction = fct.helper.clv.time.correct.datetype("2000-12-28", clv.t=clv.t), + user.estimation.end = NULL, + user.data.end = "2000-12-31") + + expect_true(clv.no.holdout@timepoint.estimation.start == "2000-01-01") + expect_true(clv.no.holdout@timepoint.estimation.end == "2000-12-31") + expect_true(clv.no.holdout@timepoint.holdout.start == "2000-12-31") + expect_true(clv.no.holdout@timepoint.holdout.end == "2000-12-31") + }) +} + + # convert.user.input.to.timepoint ------------------------------------------------------------------------------------- fct.testthat.correctness.clvtime.convert.user.input.chars.to.posixct <- function(clv.t.datetime){ stopifnot(is(clv.t.datetime, "clv.time.datetime")) diff --git a/tests/testthat/helper_testthat_correctness_transactions.R b/tests/testthat/helper_testthat_correctness_transactions.R index 0d0a32f1..5138b5d6 100644 --- a/tests/testthat/helper_testthat_correctness_transactions.R +++ b/tests/testthat/helper_testthat_correctness_transactions.R @@ -296,6 +296,69 @@ fct.testthat.correctness.clvfittedtransactions.staticcov.predict.newcustomer.0.f }) } +fct.testthat.correctness.clvfittedtransactions.data.end.moves.prediction.period <- function(method){ + + test_that("Using data.end moves prediction period",{ + # Only valid for no holdout data + skip_on_cran() + + fitted <- fit.cdnow( + model = method, + estimation.split = NULL, + data.end = "1998-07-15" + ) + + dt.pred <- predict(fitted, prediction.end = "1998-07-30") + + # Prediction period starts first `eps` after data.end + expect_true(dt.pred[1, "period.first"] == "1998-07-16") + # Nothing else changed + expect_true(dt.pred[1, "period.last"] == "1998-07-30") + }) + +} + +fct.testthat.correctness.clvfittedtransactions.nocov.plot.until.data.end <- function(method){ + test_that("Plotting until data.end",{ + + expect_warning(fitted <- fit.cdnow( + model = method, + estimation.split = NULL, + data.end = "1998-12-31", + # PNBD requires NM + optimx.args = list(method="Nelder-Mead", hessian=FALSE, control=list(kkt=FALSE))), + regexp = "Hessian could not be derived") + clv.time <- fitted@clv.data@clv.time + + # Data + expect_silent(dt.plot <- plot(fitted, verbose=FALSE, plot=FALSE)) + dt.after <- dt.plot[period.until > "1998-06-30"] + + # Actuals and expectation are for the same dates (none lost for either (mostly Actual)) + expect_true(all( + dt.plot[variable == "Actual", "period.until"] == dt.plot[variable != "Actual", "period.until"])) + + # Actuals are NA after last transaction but model not + expect_true(dt.after[variable == "Actual", all(is.na(value))]) + expect_false(dt.after[variable != "Actual", any(is.na(value))]) + + # Actuals and expectation are until data.end + expect_true(dt.after[, max(period.until)] >= clv.time@timepoint.holdout.end) + + + # Plotting + # Plots without warnings + expect_silent(p <- plot(fitted, verbose = FALSE, plot = TRUE)) + + # Plot has x-axis limits until data.end + # $x.range: What is really rendered + # $limits: The user set limits + p.xlim <- ggplot2::ggplot_build(p)$layout$panel_params[[1]]$x$limits + expect_true(min(p.xlim) <= clv.time@timepoint.estimation.start) + expect_true(max(p.xlim) >= clv.time@timepoint.holdout.end) + }) +} + fct.testthat.correctness.clvfittedtransactions <- function(name.model, method, correct.start.params.model, correct.params.nocov.coef, correct.LL.nocov, kkt2.true){ @@ -317,6 +380,8 @@ fct.testthat.correctness.clvfittedtransactions <- function(name.model, method, fct.testthat.correctness.clvfittedtransactions.nocov.newdata.fitting.sample.predicting.full.data.equal(method = method, clv.cdnow = clv.cdnow) fct.testhat.correctness.clvfittedtransactions.same.spending.as.independent.spending.model(method = method, clv.data = clv.cdnow) + fct.testthat.correctness.clvfittedtransactions.nocov.plot.until.data.end(method=method) + if(fct.helper.has.pmf(obj.fitted)){ fct.testthat.correctness.clvfittedtransactions.pmf.more.x.more.p(clv.fitted = obj.fitted) fct.testthat.correctness.clvfittedtransactions.pmf.valid.values(clv.fitted = obj.fitted) @@ -326,6 +391,8 @@ fct.testthat.correctness.clvfittedtransactions <- function(name.model, method, # predict(newdata=newcustomer): no cov fct.testthat.correctness.clvfittedtransactions.nocov.predict.newcustomer.0.for.num.periods.eq.0(obj.fitted) + # data.end moves start of prediction period + fct.testthat.correctness.clvfittedtransactions.data.end.moves.prediction.period(method = method) # Static cov data -------------------------------------------------------------------------------------------- # why 100 and not 104??????? diff --git a/tests/testthat/test_correctness_clvdata_s3.R b/tests/testthat/test_correctness_clvdata_s3.R index cdd7262b..3e09d1c8 100644 --- a/tests/testthat/test_correctness_clvdata_s3.R +++ b/tests/testthat/test_correctness_clvdata_s3.R @@ -268,6 +268,36 @@ test_that("Always returns a copy of the data", { # plot --------------------------------------------------------------------- +# . tracking --------------------------------------------------------------- +test_that("tracking plot - without data.end: Last period is NA and plots without warnings", { + skip_on_cran() + clv.cdnow <- fct.helper.create.clvdata.cdnow() + + # Last period is NA in cdnow (on purpose because its a partial period and no longer dropped) + dt.plot <- plot(clv.cdnow, which = "tracking", verbose = FALSE, plot=FALSE) + expect_true(dt.plot[period.until==max(period.until), is.na(value)]) + + # Plots without warnings although data contains NA + expect_silent(plot(clv.cdnow, which="tracking", verbose=FALSE, plot=TRUE)) +}) + +test_that("tracking plot - with data.end: NA after last transaction until data.end and plots w/o warning", { + skip_on_cran() + clv.cdnow <- fct.helper.create.clvdata.cdnow(data.end="1998-12-31") + + # All periods after last transaction are NA + dt.plot <- plot(clv.cdnow, which = "tracking", verbose = FALSE, plot=FALSE) + dt.plot.empty <- dt.plot[period.until > "1998-06-30"] + # Data until at least data.end + expect_true(dt.plot.empty[, max(period.until)] >= "1998-12-31") + # They are all NA + expect_true(dt.plot.empty[, all(is.na(value))]) + + # Plots without warnings although data contains NA + expect_silent(plot(clv.cdnow, which="tracking", verbose=FALSE, plot=TRUE)) +}) + + # . frequency --------------------------------------------------------------- test_that("frequency plot - actual trans has no 0", { diff --git a/tests/testthat/test_correctness_clvtime.R b/tests/testthat/test_correctness_clvtime.R index 44d16760..ccfdf7c3 100644 --- a/tests/testthat/test_correctness_clvtime.R +++ b/tests/testthat/test_correctness_clvtime.R @@ -11,6 +11,7 @@ for(clv.t in c(fct.helper.clv.time.create.test.objects(with.holdout = FALSE), # set.sample.periods -------------------------------------------------------------------------------- +# . no data.end ----------------------------------------------------------------------------- for(clv.t in list(clv.time.hours(time.format="ymd HMS"), clv.time.days( time.format="ymd"), clv.time.weeks(time.format="ymd"), @@ -33,6 +34,12 @@ fct.testthat.correctness.clvtime.set.sample.periods.stop.holdout.length.less.2.p clv.t.weeks = clv.time.weeks(time.format="ymd"), clv.t.years = clv.time.years(time.format="ymd")) +# . with data.end -------------------------------------------------------------------------- +fct.testthat.correctness.clvtime.set.sample.periods.data.end() + + + + # convert.user.input.to.timepoint -------------------------------------------------------------------------------- for(clv.t in c(fct.helper.clv.time.create.test.objects(with.holdout = FALSE), fct.helper.clv.time.create.test.objects(with.holdout = TRUE))){ diff --git a/tests/testthat/test_inputchecks_clvdata_clvdata.R b/tests/testthat/test_inputchecks_clvdata_clvdata.R index d25ed279..918719f7 100644 --- a/tests/testthat/test_inputchecks_clvdata_clvdata.R +++ b/tests/testthat/test_inputchecks_clvdata_clvdata.R @@ -202,14 +202,10 @@ test_that("Fails with split after last transaction",{ }) test_that("Fails with split in 2 periods before last transaction (ie in last period)",{ - expect_error(clvdata(estimation.split = "1998-06-30",time.unit = "d", data.transactions = cdnow, - date.format="ymd"), regexp = "before the last transaction") expect_error(clvdata(estimation.split = "1998-06-29",time.unit = "d", data.transactions = cdnow, - date.format="ymd"), regexp = "before the last transaction") - expect_error(clvdata(estimation.split = "1998-06-30",time.unit = "w", data.transactions = cdnow, - date.format="ymd"), regexp = "before the last transaction") + date.format="ymd"), regexp = "a holdout period of at least 2 time.units") expect_error(clvdata(estimation.split = "1998-06-21",time.unit = "w", data.transactions = cdnow, - date.format="ymd"), regexp = "before the last transaction") + date.format="ymd"), regexp = "a holdout period of at least 2 time.units") }) test_that("Fails if before all first transactions by customer", { diff --git a/tests/testthat/test_inputchecks_clvdata_setdynamiccov.R b/tests/testthat/test_inputchecks_clvdata_setdynamiccov.R index 7736d58b..17fdf0e4 100644 --- a/tests/testthat/test_inputchecks_clvdata_setdynamiccov.R +++ b/tests/testthat/test_inputchecks_clvdata_setdynamiccov.R @@ -13,22 +13,32 @@ fct.expect.error.setdyncov <- function( names.cov.trans=c("High.Season", "Gender", "Channel"), name.id = "Id", name.date = "Cov.Date", + make.missing = NULL, regexp=NULL){ + if(!is.null(make.missing)){ + e <- environment() + # e[[make.missing]] <- NULL + rm(list=make.missing, envir = e) + } + expect_error(SetDynamicCovariates( clv.data = clv.data, data.cov.life = data.cov.life, names.cov.life = names.cov.life, data.cov.trans = data.cov.trans, - names.cov.trans = names.cov.trans - )) + names.cov.trans = names.cov.trans, + name.id = name.id, + name.date = name.date + ), + regexp = regexp) } # Parameter clv.data --------------------------------------------------------------------------------------- test_that("Fails if not clv.data input", { # missing/NA/NULL - fct.expect.error.setdyncov(clv.data = ) + fct.expect.error.setdyncov(make.missing = "clv.data") fct.expect.error.setdyncov(clv.data = NULL) fct.expect.error.setdyncov(clv.data = NA_real_) @@ -52,14 +62,14 @@ test_that("Fails if already has covariates", { test_that("Fails if is wrong type ", { # data.cov.life - fct.expect.error.setdyncov(data.cov.life = , regexp = "missing") + fct.expect.error.setdyncov(make.missing="data.cov.life", regexp = "not found") fct.expect.error.setdyncov(data.cov.life = NULL, regexp = "type data.frame or data.table") fct.expect.error.setdyncov(data.cov.life = NA, regexp = "type data.frame or data.table") fct.expect.error.setdyncov(data.cov.life = as.list(apparelDynCov), regexp = "type data.frame or data.table") # data.cov.trans - fct.expect.error.setdyncov(data.cov.trans = , regexp = "missing") + fct.expect.error.setdyncov(make.missing = "data.cov.trans", regexp = "not found") fct.expect.error.setdyncov(data.cov.trans = NULL, regexp = "type data.frame or data.table") fct.expect.error.setdyncov(data.cov.trans = NA, regexp = "type data.frame or data.table") fct.expect.error.setdyncov(data.cov.trans = as.list(apparelDynCov), regexp = "type data.frame or data.table") @@ -90,6 +100,18 @@ test_that("Fails if covariate data is to short for all customers",{ fct.expect.error.setdyncov(data.cov.trans = apparelDynCov.tooshort, regexp = "covariate data exactly from") }) +test_that("Fails if covariate data ends before data.end", { + + clv.data.apparel.obsE <- fct.helper.create.clvdata.apparel.nocov( + estimation.split = NULL, + data.end = "2012-12-31") + + fct.expect.error.setdyncov( + clv.data = clv.data.apparel.obsE, + regexp = "There need to be weekly covariate data exactly") + +}) + test_that("Fails if there are Ids in the covariates that are not in the transaction data", { dt.cov.1additional <- data.table::copy(apparelDynCov[Id == "1"]) dt.cov.1additional[, Id := "ABC"] @@ -214,13 +236,13 @@ test_that("Fails for variable with single category", { test_that("Fails if missing/NULL/NA/empty",{ # names.cov.life - fct.expect.error.setdyncov(names.cov.life = , regexp = "missing") + fct.expect.error.setdyncov(make.missing = "names.cov.life", regexp = "not found") fct.expect.error.setdyncov(names.cov.life = NULL, regexp = "may not be NULL") fct.expect.error.setdyncov(names.cov.life = NA_character_, regexp = "any NA") fct.expect.error.setdyncov(names.cov.life = "", regexp = "could not be found") # names.cov.trans - fct.expect.error.setdyncov(names.cov.trans = , regexp = "missing") + fct.expect.error.setdyncov(make.missing = "names.cov.trans", regexp = "not found") fct.expect.error.setdyncov(names.cov.trans = NULL, regexp = "may not be NULL") fct.expect.error.setdyncov(names.cov.trans = NA_character_, regexp = "any NA") fct.expect.error.setdyncov(names.cov.trans = "", regexp = "could not be found") @@ -313,7 +335,7 @@ test_that("Has default argument Id",{ # Parameter name.date --------------------------------------------------------------------------------------- test_that("Fails if NA/NULL", { - fct.expect.error.setdyncov(name.date = "id", regexp = "NULL") + fct.expect.error.setdyncov(name.date = "id", regexp = "could not be found") fct.expect.error.setdyncov(name.date = NA_character_, regexp = "any NA") fct.expect.error.setdyncov(name.date = character(0), regexp = "exactly 1 element") }) diff --git a/tests/testthat/test_runability_clvdata_clvdata.R b/tests/testthat/test_runability_clvdata_clvdata.R index 2750b080..887a5d88 100644 --- a/tests/testthat/test_runability_clvdata_clvdata.R +++ b/tests/testthat/test_runability_clvdata_clvdata.R @@ -249,3 +249,40 @@ test_that("Works when called from as.clv.data()", { expect_silent(as.clv.data(as.data.frame(cdnow))) expect_silent(as.clv.data(as.data.table(cdnow))) }) + +# data.end ---------------------------------------------------------------- + +test_that("Works with data.end with and without holdout",{ + skip_on_cran() + l.args <- list( + data.end = "2000-01-01", + data.transactions = cdnow, + time.unit = "w", + date.format = "ymd" + ) + + l.args["estimation.split"] <- list(NULL) + expect_silent(do.call(clvdata, l.args)) + + l.args$estimation.split <- 37 + expect_silent(do.call(clvdata, l.args)) +}) + +test_that("Works with data.end and time.units hours, days, years", { + l.args <- list( + data.transactions = cdnow, + data.end = "2000-01-01", + estimation.split = NULL, + date.format = "ymd" + ) + + l.args$time.unit <- "days" + expect_silent(do.call(clvdata, l.args)) + + l.args$time.unit <- "year" + expect_silent(do.call(clvdata, l.args)) + + l.args$time.unit <- "hours" + expect_silent(do.call(clvdata, l.args)) +}) + diff --git a/tests/testthat/test_runability_clvdata_s3.R b/tests/testthat/test_runability_clvdata_s3.R index a4259260..0422d72e 100644 --- a/tests/testthat/test_runability_clvdata_s3.R +++ b/tests/testthat/test_runability_clvdata_s3.R @@ -25,9 +25,11 @@ fct.helper.test.runability.clv.data.summary <- function(clv.data){ # warning if inexistent ids expect_warning(summary(clv.data, ids=c(ids, "abczxy")), regexp = "Not all given ids were found") - # id with trans in holdout - expect_silent(id.with.holdout <- clv.data@data.transactions[Date>=clv.data@clv.time@timepoint.holdout.start, head(Id,n=1)]) - expect_silent(summary(clv.data, ids=id.with.holdout)) + if(clv.data.has.holdout(clv.data)){ + # id with trans in holdout + expect_silent(id.with.holdout <- clv.data@data.transactions[Date>=clv.data@clv.time@timepoint.holdout.start, head(Id,n=1)]) + expect_silent(summary(clv.data, ids=id.with.holdout)) + } # id without trans in holdout # any zero-repeater @@ -270,20 +272,6 @@ fct.helper.test.runability.clv.data.others3 <- function(clv.data){ } - - -# Create all combos: {w/, w/o} holdout, {w/, w/o} {static, dynamic} covs - -apparel.holdout <- fct.helper.create.clvdata.apparel.nocov() -apparel.no.holdout <- fct.helper.create.clvdata.apparel.nocov(estimation.split = NULL) - -apparel.holdout.static.cov <- fct.helper.create.clvdata.apparel.staticcov() -apparel.no.holdout.static.cov <- fct.helper.create.clvdata.apparel.staticcov(estimation.split = NULL) - -apparel.holdout.dyn.cov <- fct.helper.create.clvdata.apparel.dyncov() -apparel.no.holdout.dyn.cov <- fct.helper.create.clvdata.apparel.dyncov(estimation.split = NULL) - - fct.helper.test.runability.clv.data.runall <- function(clv.data){ fct.helper.test.runability.clv.data.trackingplot(clv.data) fct.helper.test.runability.clv.data.plotfrequency(clv.data) @@ -295,10 +283,25 @@ fct.helper.test.runability.clv.data.runall <- function(clv.data){ } -fct.helper.test.runability.clv.data.runall(apparel.holdout) -fct.helper.test.runability.clv.data.runall(apparel.no.holdout) -fct.helper.test.runability.clv.data.runall(apparel.holdout.static.cov) -fct.helper.test.runability.clv.data.runall(apparel.no.holdout.static.cov) -fct.helper.test.runability.clv.data.runall(apparel.holdout.dyn.cov) -fct.helper.test.runability.clv.data.runall(apparel.no.holdout.dyn.cov) + +for(fn in list( + fct.helper.create.clvdata.apparel.nocov, + fct.helper.create.clvdata.apparel.staticcov +)){ + # With holdout + fct.helper.test.runability.clv.data.runall(fn()) + # . with data.end + fct.helper.test.runability.clv.data.runall(fn(data.end="2011-01-31")) + + # Without holdout + fct.helper.test.runability.clv.data.runall(fn(estimation.split=NULL)) + # . with data.end + fct.helper.test.runability.clv.data.runall(fn(estimation.split=NULL, data.end="2011-01-31")) + +} + +# data.end would require to prepare transaction data (cut before cov end) +# But dyncovs are not used for any of the s3 methods, therefore skip testing with data.end for dyncov +fct.helper.test.runability.clv.data.runall(fct.helper.create.clvdata.apparel.dyncov()) +fct.helper.test.runability.clv.data.runall(fct.helper.create.clvdata.apparel.dyncov(estimation.split = NULL)) diff --git a/tests/testthat/test_runability_pnbd_dynamiccov.R b/tests/testthat/test_runability_pnbd_dynamiccov.R index d5277fc7..acc99163 100644 --- a/tests/testthat/test_runability_pnbd_dynamiccov.R +++ b/tests/testthat/test_runability_pnbd_dynamiccov.R @@ -93,3 +93,68 @@ test_that("Dyncov works with additional model specifications", { names.cov.constr = "Gender", reg.lambda = c(trans=10, life=10)) }) + + +# With data.end --------------------------------------------------------- + +test_that("Fit, plot, predict work with partially empty estimation/holdout period", { + + covs.life <- c("High.Season", "Gender") + covs.trans <- c("High.Season", "Gender", "Channel") + + apparelTrans.cut.obsE <- apparelTrans[Date < "2010-12-01"] + date.original.trans.max <- apparelTrans[, max(Date)] + + + # fct.helper.runability.dyncov.all.downstream requires holdout + + names.params <- c( + 'r', 'alpha', 's', 'beta', + paste0('life.', covs.life), + paste0('trans.', covs.trans)) + + + # Create object with no transactions in December 2010 but with the observation + # period as originally (2010-12-20) + + # No holdout + expect_silent(fitted.dyncov.noholdout.obsE <- fit.apparel.dyncov( + data.apparelTrans = apparelTrans.cut.obsE, + estimation.split = NULL, + data.end = date.original.trans.max, + names.cov.life = covs.life, + names.cov.trans = covs.trans, + optimx.args=fct.helper.dyncov.get.optimxargs.quickfit(hessian=TRUE) + )) + + .fct.helper.clvfitted.all.s3.except.plot.and.predict( + clv.fitted=fitted.dyncov.noholdout.obsE, + full.names=names.params + ) + + expect_silent(predict(fitted.dyncov.noholdout.obsE, prediction.end=1, verbose=FALSE)) + expect_silent(plot(fitted.dyncov.noholdout.obsE, prediction.end = NULL, verbose=FALSE)) + + + # With holdout + expect_silent(fitted.dyncov.holdout.obsE <- fit.apparel.dyncov( + data.apparelTrans = apparelTrans.cut.obsE, + estimation.split = 104, + data.end = date.original.trans.max, + names.cov.life = covs.life, + names.cov.trans = covs.trans, + optimx.args=fct.helper.dyncov.get.optimxargs.quickfit(hessian=TRUE) + )) + + .fct.helper.clvfitted.all.s3.except.plot.and.predict( + clv.fitted=fitted.dyncov.holdout.obsE, + full.names=names.params + ) + + expect_silent(predict(fitted.dyncov.holdout.obsE, prediction.end=5, verbose=FALSE)) + expect_warning( + plot(fitted.dyncov.holdout.obsE, prediction.end = 5, verbose=FALSE), + regexp = "Not plotting full holdout period" + ) + +})