diff --git a/NAMESPACE b/NAMESPACE index 00b6170..8878dc0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,10 @@ export(get_hierarchical_clusters) export(get_robust_colocalization) export(get_robust_ucos) export(get_ucos_summary) +importFrom(Rfast,correls) +importFrom(Rfast,med) +importFrom(Rfast,standardise) +importFrom(Rfast,upper_tri) importFrom(grDevices,adjustcolor) importFrom(graphics,abline) importFrom(graphics,axis) diff --git a/R/colocboost.R b/R/colocboost.R index 3f9f860..ef65278 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -22,12 +22,20 @@ #' \code{variant} is required if sumstat for different outcomes do not have the same number of variables. #' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale). #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -#' obtained from the same genotypes. +#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +#' If neither is provided, LD-free mode is used. +#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +#' When the number of reference panel samples is less than the number of variants (N_ref < P), +#' this avoids storing the full P x P LD matrix and reduces memory usage. +#' When N_ref >= P, LD is precomputed from \code{X_ref} internally. +#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used. #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. #' The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. 
#' The innovation: do not provide the same matrix in \code{X} to reduce the computational burden. -#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -#' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +#' corresponding to the same sumstat. +#' The first column should be 1:L for L sumstat. The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. #' The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden. #' @param outcome_names The names of outcomes, which has the same order for Y. #' @param focal_outcome_idx The index of the focal outcome if perform GWAS-xQTL ColocBoost @@ -86,6 +94,7 @@ #' @param check_null_max_ucos The smallest value of change of profile loglikelihood for each outcome in uCoS. #' @param weaker_effect If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects #' @param LD_free When \code{LD_free = FALSE}, objective function doesn't include LD information. +#' @param use_entropy A logical variable to consider the heterogeneity of traits for a single-effect. #' @param output_level When \code{output_level = 1}, return basic cos details for colocalization results #' When \code{output_level = 2}, return the ucos details for the single specific effects. #' When \code{output_level = 3}, return the entire Colocboost model to diagnostic results (more space). 
@@ -129,9 +138,10 @@ #' #' @family colocboost #' @importFrom stats na.omit +#' @importFrom Rfast correls standardise upper_tri med #' @export colocboost <- function(X = NULL, Y = NULL, # individual data - sumstat = NULL, LD = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, + sumstat = NULL, LD = NULL, X_ref = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, ###### - index dict for X match multiple Y / LD match multiple sumstat dict_YX = NULL, # Y index for 1st column, X index for 2nd column dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column @@ -187,6 +197,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data check_null_max_ucos = 0.015, # the smallest value of change of profile loglikelihood for each outcome in uCoS. weaker_effect = TRUE, LD_free = FALSE, + use_entropy = FALSE, output_level = 1, ###### - Post filtering parameters cos_npc_cutoff = 0.2, # remove the CoS with cos_npc less than this cutoff @@ -202,11 +213,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning("Error: No individual data (X, Y) or summary statistics (sumstat) or (effect_est, effect_se) are provided! 
Please check!") return(NULL) } + if (!is.null(LD) && !is.null(X_ref)) { + warning("Error: Provide either LD or X_ref, not both.") + return(NULL) + } # - check input data: individual level data and summary-level data validated_data <- colocboost_validate_input_data( X = X, Y = Y, - sumstat = sumstat, LD = LD, + sumstat = sumstat, LD = LD, X_ref = X_ref, dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, effect_est = effect_est, effect_se = effect_se, effect_n = effect_n, overlap_variables = overlap_variables, @@ -227,6 +242,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data keep_variable_individual <- validated_data$keep_variable_individual sumstat <- validated_data$sumstat LD <- validated_data$LD + X_ref <- validated_data$X_ref + ref_label <- validated_data$ref_label sumstatLD_dict <- validated_data$sumstatLD_dict keep_variable_sumstat <- validated_data$keep_variable_sumstat Z <- validated_data$Z @@ -277,15 +294,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } cb_data <- colocboost_init_data( X = X, Y = Y, dict_YX = yx_dict, - Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, + Z = Z, LD = LD, X_ref = X_ref, ref_label = ref_label, + N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, Var_y = Var_y, SeBhat = SeBhat, keep_variables = keep_variables, focal_outcome_idx = focal_outcome_idx, focal_outcome_variables = focal_outcome_variables, overlap_variables = overlap_variables, intercept = intercept, - standardize = standardize, - residual_correlation = residual_correlation + standardize = standardize ) ################## colocboost updates ################################### @@ -335,6 +352,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data median_cos_abs_corr = median_cos_abs_corr, weaker_effect = weaker_effect, merge_cos = merge_cos, + residual_correlation = residual_correlation, + use_entropy = use_entropy, tol = tol, output_level = output_level ) @@ -347,6 +366,8 @@ colocboost <- 
function(X = NULL, Y = NULL, # individual data npc_outcome_cutoff = npc_outcome_cutoff, pvalue_cutoff = pvalue_cutoff, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, + residual_correlation = residual_correlation, coverage = coverage ) # ---- post filtering of the colocboost results (get robust trait-specific events) @@ -377,7 +398,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' @param X A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes. #' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes. #' @param sumstat A list of data.frames of summary statistics. -#' @param LD A list of correlation matrices indicating the LD matrix for each genotype. +#' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were +#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +#' If neither is provided, LD-free mode is used. +#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +#' When the number of reference panel samples is less than the number of variants (N_ref < P), +#' this avoids storing the full P x P LD matrix and reduces memory usage. +#' When N_ref >= P, LD is precomputed from \code{X_ref} internally. +#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used. #' @param dict_YX A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix. #' @param dict_sumstatLD A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat. 
#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region @@ -394,6 +423,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' \item{keep_variable_individual}{List of variable names for each X matrix} #' \item{sumstat}{Processed list of summary statistics data.frames} #' \item{LD}{Processed list of LD matrices} +#' \item{X_ref}{Processed list of reference genotype matrices} +#' \item{ref_label}{Style of reference matrices} #' \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} #' \item{keep_variable_sumstat}{List of variant names for each sumstat} #' \item{Z}{List of z-scores for each outcome} @@ -408,7 +439,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' #' @keywords internal colocboost_validate_input_data <- function(X = NULL, Y = NULL, - sumstat = NULL, LD = NULL, + sumstat = NULL, LD = NULL, X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, effect_se = NULL, effect_n = NULL, overlap_variables = FALSE, @@ -418,7 +449,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, cos_npc_cutoff = 0.2, npc_outcome_cutoff = 0.2) { - # - check individual level data + ############### Check individual level data ########################### if (!is.null(X) & !is.null(Y)) { # --- check input if (is.data.frame(X)) X <- as.matrix(X) @@ -557,8 +588,9 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, } else { yx_dict <- keep_variable_individual <- NULL } + ############### Done! Check individual level data ########################### - # - check summary-level data + ############### Check summary level data ########################### if ((!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se))) { # --- check input of (effect_est, effect_se) if ((!is.null(effect_est) & !is.null(effect_se))) { @@ -649,8 +681,9 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, } return(xx) }) + ############### Done! 
Check summary level data ########################### - # --- check input of LD + ############### Check input of LD or reference X_ref ########################### M_updated <- M min_abs_corr_updated <- min_abs_corr jk_equiv_corr_updated <- jk_equiv_corr @@ -659,10 +692,13 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, cos_npc_cutoff_updated <- cos_npc_cutoff npc_outcome_cutoff_updated <- npc_outcome_cutoff - if (is.null(LD)) { + # --- Handle X_ref: convert to LD when N_ref >= P, or keep for on-the-fly computation + + + if (is.null(LD) && is.null(X_ref)) { # if no LD input, set diagonal matrix to LD warning( - "Providing the LD for summary statistics data is highly recommended. ", + "Providing the LD or X_ref for summary statistics data is highly recommended. ", "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ", "Additionally, the purity of CoS cannot be evaluated!" ) @@ -677,61 +713,98 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, func_simplex_updated <- "only_z2z" cos_npc_cutoff_updated <- 0 npc_outcome_cutoff_updated <- 0 + ref_label <- "No_ref" } else { - if (is.data.frame(LD)) LD <- as.matrix(LD) - if (is.matrix(LD)) LD <- list(LD) - # - check if NA in LD matrix - num_na <- sapply(LD, sum) - if (any(is.na(num_na))){ - warning("Error: Input LD must not contain missing values (NA).") - return(NULL) + # -- Determine reference list and variant extraction for LD or X_ref + if (!is.null(LD)){ + + if (is.data.frame(LD)) LD <- as.matrix(LD) + if (is.matrix(LD)) LD <- list(LD) + # - check if NA in LD matrix + num_na <- sapply(LD, sum) + if (any(is.na(num_na))){ + warning("Error: Input LD must not contain missing values (NA).") + return(NULL) + } + ref_list <- LD + ref_label <- "LD" + + } else { + if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref) + if (is.matrix(X_ref)) X_ref <- list(X_ref) + + # When N_ref >= P, precompute LD (avoids repeated O(N_ref*P) in boosting 
loop) + # When N_ref < P, keep X_ref for on-the-fly computation (avoids P*P memory) + all_large <- all(sapply(X_ref, function(xr) nrow(xr) >= ncol(xr))) + if (all_large) { + message("N_ref >= P: precomputing LD from X_ref.") + LD <- lapply(X_ref, function(xr) { + ld <- get_cormat(xr) + rownames(ld) <- colnames(ld) <- colnames(xr) + ld + }) + X_ref <- NULL + ref_list <- LD + ref_label <- "LD" + } else { + # N_ref < P: standardize and keep for on-the-fly crossprod/(N_ref-1) + for (idx in seq_along(X_ref)) { + X_ref[[idx]] <- standardise(X_ref[[idx]], center = TRUE, scale = TRUE) + X_ref[[idx]][which(is.na(X_ref[[idx]]))] <- 0 + } + ref_list <- X_ref + ref_label <- "X_ref" + } } - # Create sumstat-LD mapping === - if (length(LD) == 1) { + + # -- Create sumstat - LD/X_ref mapping === + if (length(ref_list) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) - } else if (length(LD) == length(sumstat)) { + } else if (length(ref_list) == length(sumstat)) { sumstatLD_dict <- seq_along(sumstat) } else { if (is.null(dict_sumstatLD)) { warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), - ' sumstats but only ', length(LD), ' LD matrices') + ' sumstats but only ', length(ref_list), ' ', ref_label, ' matrices') return(NULL) } else { - # - dict for sumstat to LD mapping + # - dict for sumstat to LD/X_ref mapping sumstatLD_dict <- rep(NA, length(sumstat)) for (i in 1:length(sumstat)) { tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2]) if (length(tmp) == 0) { - warning(paste("Error: You don't provide matched LD for sumstat", i)) + warning("Error: You don't provide matched", ref_label, "for sumstat", i) return(NULL) } else if (length(tmp) != 1) { - warning(paste("Error: You provide multiple matched LD for sumstat", i)) + warning("Error: You provide multiple matched", ref_label, "for sumstat", i) return(NULL) } else { sumstatLD_dict[i] <- tmp } } - if (max(sumstatLD_dict) > length(LD)) { - warning("Error: You don't provide enough LD matrices!") + if 
(max(sumstatLD_dict) > length(ref_list)) { + warning("Error: You don't provide enough", ref_label, "matrices!") return(NULL) } } } - # === Filter variants for each sumstat === + # -- Filter variants for each sumstat + get_ref_variants <- function(ref_mat) colnames(ref_mat) + ref_ncol <- function(ref_mat) ncol(ref_mat) for (i in seq_along(sumstat)) { # Get sumstat variants (adjust field name based on your data structure) sumstat_variants <- sumstat[[i]]$variant n_total <- length(sumstat_variants) - # Get LD variants + # Get LD/X_ref variants ld_idx <- sumstatLD_dict[i] - current_ld <- LD[[ld_idx]] - ld_variants <- rownames(current_ld) + current_ref <- ref_list[[ld_idx]] + ld_variants <- get_ref_variants(current_ref) if (is.null(ld_variants)) { - if (ncol(current_ld) != n_total){ - warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.') + if (ncol(current_ref) != n_total){ + warning('Error: ', ref_label, ' matrix ', ld_idx, ' has no variant names. Please ensure all ', ref_label, ' matrices have variant names.') return(NULL) } } @@ -744,7 +817,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, ' variants since those variants are not in LD matrix ', ld_idx) keep_idx <- match(common_variants, sumstat_variants) if (length(keep_idx) == 0){ - warning('Error: Sumstat data ', i, ' is empty after filtering. Returning NULL') + warning('Error: Sumstat data ', i, ' is empty after filtering. Please check variant in sumstat data! Returning NULL!') return(NULL) } # Filter all relevant fields - ADJUST THESE FIELD NAMES TO YOUR DATA @@ -755,6 +828,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, keep_variable_sumstat <- lapply(sumstat, function(xx) { xx$variant }) + ############### Done! 
Check input of LD or reference X_ref ########################### # - checking sample size existency n_exist <- sapply(sumstat, function(ss) { @@ -834,7 +908,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, Z[[i.summstat]] <- z } } else { - Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL + Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- X_ref <- ref_label <- NULL M_updated <- M min_abs_corr_updated <- min_abs_corr jk_equiv_corr_updated <- jk_equiv_corr @@ -851,6 +925,8 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, keep_variable_individual = keep_variable_individual, sumstat = sumstat, LD = LD, + X_ref = X_ref, + ref_label = ref_label, sumstatLD_dict = sumstatLD_dict, keep_variable_sumstat = keep_variable_sumstat, Z = Z, diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 94cbbe5..96b8ab1 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -36,6 +36,8 @@ colocboost_assemble <- function(cb_obj, median_cos_abs_corr = 0.8, weaker_effect = TRUE, merge_cos = TRUE, + residual_correlation = NULL, + use_entropy = FALSE, tol = 1e-9, output_level = 1) { if (!inherits(cb_obj, "colocboost")) { @@ -84,6 +86,7 @@ colocboost_assemble <- function(cb_obj, cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_max_ucos = check_null_max_ucos, check_null_method = check_null_method) + # --------- about colocalized confidence sets --------------------------------- out_cos <- colocboost_assemble_cos(cb_obj, coverage = coverage, @@ -98,6 +101,8 @@ colocboost_assemble <- function(cb_obj, median_abs_corr = median_abs_corr, min_cluster_corr = min_cluster_corr, median_cos_abs_corr = median_cos_abs_corr, + use_entropy = use_entropy, + residual_correlation = residual_correlation, tol = tol ) @@ -132,10 +137,13 @@ colocboost_assemble <- function(cb_obj, } } if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) { + X_dict <- 
cb_obj$cb_data$dict[i] if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) { - X_dict <- cb_obj$cb_data$dict[i] cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX } + if (is.null(cb_obj_single$cb_data$data[[1]]$ref_label)) { + cb_obj_single$cb_data$data[[1]]$ref_label <- cb_obj$cb_data$data[[X_dict]]$ref_label + } } class(cb_obj_single) <- "colocboost" out_ucos_each <- colocboost_assemble_ucos(cb_obj_single, @@ -208,6 +216,8 @@ colocboost_assemble <- function(cb_obj, ############# - extract colocboost output - #################### # - colocalization results + cb_obj$cb_model_para$use_entropy <- use_entropy + cb_obj$cb_model_para$residual_correlation <- residual_correlation cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor cb_obj$cb_model_para$coverage <- coverage cb_obj$cb_model_para$min_abs_corr <- min_abs_corr diff --git a/R/colocboost_assemble_cos.R b/R/colocboost_assemble_cos.R index 6a1243a..5912070 100644 --- a/R/colocboost_assemble_cos.R +++ b/R/colocboost_assemble_cos.R @@ -12,6 +12,8 @@ colocboost_assemble_cos <- function(cb_obj, median_abs_corr = NULL, min_cluster_corr = 0.8, median_cos_abs_corr = 0.8, + use_entropy = FALSE, + residual_correlation = NULL, tol = 1e-9) { if (!inherits(cb_obj, "colocboost")) { stop("Input must from colocboost function!") @@ -43,7 +45,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[iiii] <- length(tmp) == 1 } @@ -53,7 +56,8 @@ colocboost_assemble_cos <- function(cb_obj, pos_purity <- which(check_purity) avWeight <- avWeight[, pos_purity, drop = FALSE] coloc_outcomes <- 
coloc_outcomes[pos_purity] - weights <- get_integrated_weight(avWeight, weight_fudge_factor = weight_fudge_factor) + weights <- get_integrated_weight(avWeight, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, residual_correlation = residual_correlation) coloc_cos <- get_in_cos(weights, coverage = coverage) evidence_strength <- sum(weights[coloc_cos[[1]]]) @@ -83,7 +87,8 @@ colocboost_assemble_cos <- function(cb_obj, p_tmp <- matrix(get_purity(pos, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, - N = cb_data$data[[i]]$N, n = n_purity + N = cb_data$data[[i]]$N, n = n_purity, + ref_label = cb_data$data[[X_dict]]$ref_label ), 1, 3) purity <- c(purity, list(p_tmp)) } @@ -148,7 +153,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[iiii] <- length(tmp) == 1 } @@ -158,7 +164,8 @@ colocboost_assemble_cos <- function(cb_obj, pos_purity <- which(check_purity) avWeight <- avWeight[, pos_purity, drop = FALSE] coloc_outcomes <- coloc_outcomes[pos_purity] - weights <- get_integrated_weight(avWeight, weight_fudge_factor = weight_fudge_factor) + weights <- get_integrated_weight(avWeight, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, residual_correlation = residual_correlation) coloc_cos <- get_in_cos(weights, coverage = coverage) evidence_strength <- sum(weights[coloc_cos[[1]]]) @@ -208,7 +215,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = 
min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[[iiii]] <- tmp } @@ -245,7 +253,8 @@ colocboost_assemble_cos <- function(cb_obj, for (i.w in 1:length(avWeight_coloc)) { w <- avWeight_coloc[[i.w]] if (sum(w) != 0) { - weights <- get_integrated_weight(w, weight_fudge_factor = weight_fudge_factor) + weights <- get_integrated_weight(w, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, residual_correlation = residual_correlation) csets <- get_in_cos(weights, coverage = coverage) coloc_cos[[fl]] <- unlist(csets) evidence_strength[[fl]] <- sum(weights[coloc_cos[[fl]]]) @@ -351,7 +360,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, miss_idx = cb_data$data[[i]]$variable_miss, - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) } res <- Reduce(pmax, res) @@ -438,7 +448,8 @@ colocboost_assemble_cos <- function(cb_obj, tmp <- matrix(get_purity(pos, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, - N = cb_data$data[[i3]]$N, n = n_purity + N = cb_data$data[[i3]]$N, n = n_purity, + ref_label = cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R index 3336f19..2b44f54 100644 --- a/R/colocboost_assemble_ucos.R +++ b/R/colocboost_assemble_ucos.R @@ -64,7 +64,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, } purity <- matrix(get_purity(pos, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, - N = cb_data$data[[1]]$N, n = n_purity + N = cb_data$data[[1]]$N, n = n_purity, + ref_label = cb_data$data[[1]]$ref_label ), 1, 3) purity <- as.data.frame(purity) colnames(purity) <- c("min_abs_corr", "mean_abs_corr", "median_abs_corr") @@ -131,7 +132,8 @@ 
colocboost_assemble_ucos <- function(cb_obj_single, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, N = cb_data$data[[1]]$N, n = n_purity, coverage = coverage, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[1]]$variable_miss + miss_idx = cb_data$data[[1]]$variable_miss, + ref_label = cb_data$data[[1]]$ref_label ) if (length(check_purity) != 0) { w <- w[check_purity] @@ -237,7 +239,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, miss_idx = cb_data$data[[1]]$variable_miss, - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[1]]$ref_label ) min_between[i.between, j.between] <- min_between[j.between, i.between] <- res[1] max_between[i.between, j.between] <- max_between[j.between, i.between] <- res[2] @@ -301,7 +304,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, purity, matrix(get_purity(pos, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, - N = cb_data$data[[1]]$N, n = n_purity + N = cb_data$data[[1]]$N, n = n_purity, + ref_label = cb_data$data[[1]]$ref_label ), 1, 3) ) } diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index d449b03..aef6b0b 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -61,13 +61,10 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { model_update <- cb_model[pos.update] # data_update = cb_data$data[pos.update] X_dict <- cb_data$dict[pos.update] - # adj_dependence - adj_dep <- sapply(pos.update, function(ii) cb_data$data[[ii]]$dependency) # -- define jk and jk_each cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation)) - # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) - abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each)) + abs_cor_vals_each <- abs(cor_vals_each) abs_cor_vals <- rowSums(abs_cor_vals_each) # jk <- 
which(abs_cor_vals == max(abs_cor_vals)) @@ -122,7 +119,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -197,7 +196,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -241,7 +242,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -346,13 +349,10 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, model_update <- cb_model[pos.update] # data_update = cb_data$data[pos.update] X_dict <- cb_data$dict[pos.update] - # adj_dependence - adj_dep <- sapply(pos.update, function(ii) cb_data$data[[ii]]$dependency) # -- define jk and jk_each cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) as.vector(cc$correlation))) - # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) - abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each)) + abs_cor_vals_each <- abs(cor_vals_each) 
# jk_each <- apply(abs_cor_vals_each, 2, which.max) jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j])) pp_focal <- which(pos.update == focal_outcome_idx) @@ -368,7 +368,8 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, X = cb_data$data[[X_dict[pp_focal]]]$X, XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, N = cb_data$data[[X_dict[pp_focal]]]$N, - remain_jk = 1:cb_model_para$P + remain_jk = 1:cb_model_para$P, + ref_label = cb_data$data[[X_dict[pp_focal]]]$ref_label ) }) # ----- second, if within the same LD buddies, select the following variants @@ -379,7 +380,8 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, # ----- third, if picked variant within the same LD buddies ld_tmp <- get_LD_jk1_jk2(jk_focal, jk_focal_tmp, XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, - remain_jk = 1:cb_model_para$P + remain_jk = 1:cb_model_para$P, + ref_label = cb_data$data[[X_dict[pp_focal]]]$ref_label ) if (ld_tmp > jk_equiv_corr) { jk_focal <- jk_focal_tmp @@ -491,7 +493,7 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, #' @importFrom stats cor get_LD_jk1_jk2 <- function(jk1, jk2, X = NULL, XtX = NULL, N = NULL, - remain_jk = NULL) { + remain_jk = NULL, ref_label = "LD") { if (!is.null(X)) { LD_temp <- suppressWarnings({ get_cormat(X[, c(jk1, jk2)]) @@ -499,12 +501,18 @@ get_LD_jk1_jk2 <- function(jk1, jk2, LD_temp[which(is.na(LD_temp))] <- 0 LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { LD_temp <- 0 } else { jk1.remain <- match(jk1, remain_jk) jk2.remain <- match(jk2, remain_jk) - LD_temp <- XtX[jk1.remain, jk2.remain] + if (identical(ref_label, "X_ref")) { + LD_temp <- suppressWarnings({ get_cormat(XtX[, c(jk1.remain, jk2.remain)]) }) + LD_temp[which(is.na(LD_temp))] <- 0 + LD_temp <- LD_temp[1, 2] + } else { + LD_temp <- XtX[jk1.remain, jk2.remain] + } } } return(LD_temp) @@ -528,7 +536,8 @@ 
check_jk_jkeach <- function(jk, jk_each, X = cb_data$data[[X_dict[i]]]$X, XtX = cb_data$data[[X_dict[i]]]$XtX, N = data_update[[i]]$N, - remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss) + remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss), + ref_label = cb_data$data[[X_dict[i]]]$ref_label ) judge[i] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr) } else { @@ -549,7 +558,7 @@ check_pair_jkeach <- function(jk_each, #' @importFrom stats cor get_LD_jk_each <- function(jk_each, X = NULL, XtX = NULL, N = NULL, - remain_jk = NULL) { + remain_jk = NULL, ref_label = "LD") { if (!is.null(X)) { LD_temp <- suppressWarnings({ get_cormat(X[, jk_each]) @@ -557,11 +566,15 @@ check_pair_jkeach <- function(jk_each, LD_temp[which(is.na(LD_temp))] <- 0 # LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { LD_temp <- matrix(0, length(jk_each), length(jk_each)) } else { jk.remain <- match(jk_each, remain_jk) - LD_temp <- XtX[jk.remain, jk.remain] + if (identical(ref_label, "X_ref")) { + LD_temp <- suppressWarnings({ get_cormat(XtX[, jk.remain]) }) + } else { + LD_temp <- XtX[jk.remain, jk.remain] + } LD_temp[which(is.na(LD_temp))] <- 0 } } @@ -582,7 +595,8 @@ check_pair_jkeach <- function(jk_each, X = cb_data$data[[X_dict[idx]]]$X, XtX = cb_data$data[[X_dict[idx]]]$XtX, N = data_update[[idx]]$N, - remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss) + remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss), + ref_label = cb_data$data[[X_dict[idx]]]$ref_label ) }) @@ -612,7 +626,9 @@ estimate_change_profile_res <- function(jk, X = NULL, res = NULL, N = NULL, XtX = NULL, YtY = NULL, XtY = NULL, beta_k = NULL, - miss_idx = NULL) { + miss_idx = NULL, + ref_label = "LD", + XtX_beta_cache = NULL) { if (!is.null(X)) { rtr <- sum(res^2) / (N - 1) xtr <- t(X[, jk]) %*% res 
/ (N - 1) @@ -629,10 +645,13 @@ estimate_change_profile_res <- function(jk, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { rtr <- yty - 2 * sum(beta_k * xty) + sum(beta_k^2) + } else if (!is.null(XtX_beta_cache)) { + rtr <- yty - 2 * sum(beta_k * xty) + sum(XtX_beta_cache * beta_k) } else { - rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) + XtX_beta <- compute_XtX_product(xtx, beta_k, ref_label) + rtr <- yty - 2 * sum(beta_k * xty) + sum(XtX_beta * beta_k) } } numerator <- xtr^2 / (2 * rtr) diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index ff9f312..5e9bfd5 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -190,7 +190,7 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) { #' @keywords cb_post_inference #' @noRd w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverage = 0.95, - min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL) { + min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL, ref_label = "LD") { tmp_purity <- apply(weights, 2, function(w) { pos <- w_cs(w, coverage = coverage) @@ -198,7 +198,7 @@ w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverag if (!is.null(Xcorr)) { pos <- match(pos, setdiff(1:length(w), miss_idx)) } - get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n) + get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n, ref_label = ref_label) }) if (is.null(median_abs_corr)) { is_pure <- which(tmp_purity[1, ] >= min_abs_corr) @@ -227,7 +227,8 @@ check_null_post <- function(cb_obj, } get_profile <- function(cs_beta, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, + ref_label = "LD") { if (!is.null(X)) { mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) } else if (!is.null(XtY)) { @@ -242,20 +243,23 @@ check_null_post <- 
function(cb_obj, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + if (identical(ref_label, "No_ref")) { + (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) } else { - (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) } } } - get_cs_obj <- function(cs_beta, res, tau, func_simplex, lambda, adj_dep, LD_free, + get_cs_obj <- function(cs_beta, res, tau, func_simplex, lambda, LD_free, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, + ref_label = "LD") { correlation <- get_correlation( X = X, res = res, XtY = XtY, N = N, YtY = YtY, - XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx + XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx, + ref_label = ref_label ) z <- get_z(correlation, n = N, res) abs_cor <- abs(correlation) @@ -264,12 +268,12 @@ check_null_post <- function(cb_obj, P <- length(z) ld_jk <- get_LD_jk(jk, X = X, XtX = XtX, N = N, - remain_idx = setdiff(1:P, miss_idx), P = P + remain_idx = setdiff(1:P, miss_idx), P = P, ref_label = ref_label ) ld_feature <- sqrt(abs(ld_jk)) # - calculate delta delta <- boost_KL_delta( - z = z, ld_feature = ld_feature, adj_dep = adj_dep, + z = z, ld_feature = ld_feature, func_simplex = func_simplex, lambda = lambda ) scaling_factor <- if (!is.null(N)) (N - 1) else 1 @@ -280,31 +284,32 @@ check_null_post <- function(cb_obj, } obj_ld <- if (LD_free) ld_feature else rep(1, length(ld_feature)) obj_ld[miss_idx] <- 0 - exp_term <- adj_dep * obj_ld * (abs(cov_Xtr)) + exp_term <- obj_ld * (abs(cov_Xtr)) return(tau * matrixStats::logSumExp(exp_term / tau + log(delta))) } - update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx) { + update_res <- function(X = NULL, Y = NULL, XtX = 
NULL, XtY = NULL, N = NULL, cs_beta, miss_idx, ref_label = "LD") { if (!is.null(X)) { return(Y - X %*% cs_beta) } else if (!is.null(XtX)) { scaling.factor <- if (!is.null(N)) N - 1 else 1 beta_scaling <- if (!is.null(N)) 1 else 100 - xtx <- XtX / scaling.factor if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { res.tmp[-miss_idx] <- xty - cs_beta[-miss_idx] / beta_scaling } else { - res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + XtX_beta <- compute_XtX_product(XtX, cs_beta[-miss_idx] / beta_scaling, ref_label) + res.tmp[-miss_idx] <- xty - XtX_beta / scaling.factor } } else { xty <- XtY / scaling.factor - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { res.tmp <- xty - (cs_beta / beta_scaling) } else { - res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + XtX_beta <- compute_XtX_product(XtX, cs_beta / beta_scaling, ref_label) + res.tmp <- xty - XtX_beta / scaling.factor } } return(res.tmp) @@ -324,13 +329,14 @@ check_null_post <- function(cb_obj, cs_beta <- cb_obj$cb_model[[j]]$beta cs_beta[cs_variants] <- 0 X_dict <- cb_data$dict[j] - adj_dep <- cb_data$data[[j]]$dependency + ref_label_j <- cb_data$data[[X_dict]]$ref_label if (check_null_method == "profile") { cs_profile <- get_profile(cs_beta, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, YtY = cb_data$data[[j]]$YtY, N = cb_data$data[[j]]$N, - miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep + miss_idx = cb_data$data[[j]]$variable_miss, + ref_label = ref_label_j ) last_profile <- extract_last(cb_obj$cb_model[[j]]$profile_loglike_each) change <- abs(cs_profile - last_profile) @@ -348,16 +354,17 @@ check_null_post <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, N = cb_data$data[[j]]$N, 
cs_beta, - miss_idx = cb_data$data[[j]]$variable_miss + miss_idx = cb_data$data[[j]]$variable_miss, + ref_label = ref_label_j ) cs_obj <- get_cs_obj(cs_beta, res, cb_obj$cb_model_para$tau, cb_obj$cb_model_para$func_simplex, cb_obj$cb_model_para$lambda, - adj_dep = adj_dep, LD_free = cb_obj$cb_model_para$LD_free, X = cb_data$data[[X_dict]]$X, N = cb_data$data[[j]]$N, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[X_dict]]$XtY, YtY = cb_data$data[[X_dict]]$YtY, - miss_idx = cb_data$data[[j]]$variable_miss + miss_idx = cb_data$data[[j]]$variable_miss, + ref_label = ref_label_j ) last_obj <- min(cb_obj$cb_model[[j]]$obj_path) change <- abs(cs_obj - last_obj) @@ -401,7 +408,7 @@ check_null_post <- function(cb_obj, #' @keywords cb_post_inference #' @noRd #' @importFrom stats na.omit -get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { +get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100, ref_label = "LD") { get_upper_tri <- Rfast::upper_tri get_median <- Rfast::med @@ -420,16 +427,17 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { if (is.null(Xcorr)) { X_sub <- X[, pos] X_sub <- as.matrix(X_sub) - corr <- suppressWarnings({ - get_cormat(X_sub) - }) + corr <- suppressWarnings({ get_cormat(X_sub) }) corr[which(is.na(corr))] <- 0 value <- abs(get_upper_tri(corr)) } else { - if (length(Xcorr) == 1){ + if (identical(ref_label, "No_ref") || length(Xcorr) == 1) { value <- 0 + } else if (identical(ref_label, "X_ref")) { + corr <- suppressWarnings({ get_cormat(Xcorr[, pos]) }) + corr[which(is.na(corr))] <- 0 + value <- abs(get_upper_tri(corr)) } else { - Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr value <- abs(get_upper_tri(Xcorr[pos, pos])) } } @@ -445,7 +453,7 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { #' @keywords cb_post_inference #' @noRd #' @importFrom stats na.omit -get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NULL, P = 
NULL) { +get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NULL, P = NULL, ref_label = "LD") { get_matrix_mult <- function(X_sub1, X_sub2) { X_sub1 <- t(X_sub1) X_sub2 <- t(X_sub2) @@ -465,15 +473,21 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU X_sub2 <- scale(X[, pos2, drop = FALSE], center = T, scale = F) value <- abs(get_matrix_mult(X_sub1, X_sub2)) } else { - if (sum(Xcorr)==1){ + if (identical(ref_label, "No_ref") || sum(Xcorr) == 1) { value <- 0 } else { - if (length(miss_idx)!=0){ + if (length(miss_idx) != 0) { pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) } if (length(pos1) != 0 & length(pos2) != 0) { - value <- abs(Xcorr[pos1, pos2]) + if (identical(ref_label, "X_ref")) { + X_sub1 <- scale(Xcorr[, pos1, drop = FALSE], center = T, scale = F) + X_sub2 <- scale(Xcorr[, pos2, drop = FALSE], center = T, scale = F) + value <- abs(get_matrix_mult(X_sub1, X_sub2)) + } else { + value <- abs(Xcorr[pos1, pos2]) + } } else { value <- 0 } @@ -488,17 +502,20 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU #' @importFrom stats var #' @importFrom utils tail get_cos_evidence <- function(cb_obj, coloc_out, data_info) { - get_cos_config <- function(w, config_idx, weight_fudge_factor = 1.5, coverage = 0.95) { + get_cos_config <- function(w, config_idx, weight_fudge_factor = 1.5, use_entropy = FALSE, + residual_correlation = NULL, coverage = 0.95) { w_outcome <- colnames(w) config_outcome <- paste0("outcome", config_idx) pos <- which(w_outcome %in% config_outcome) w_config <- w[, pos, drop = FALSE] - int_w <- get_integrated_weight(w_config, weight_fudge_factor = weight_fudge_factor) + int_w <- get_integrated_weight(w_config, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, residual_correlation = residual_correlation) unlist(get_in_cos(int_w, coverage = coverage)) } get_cos_profile <- 
function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, + ref_label = "LD") { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -514,10 +531,11 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + if (identical(ref_label, "No_ref")) { + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) } } delta <- yty - cos_profile @@ -543,7 +561,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + ref_label = cb_data$data[[X_dict]]$ref_label ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) @@ -584,7 +602,10 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { w <- avWeight[[i]] outcomes <- coloc_out$coloc_outcomes[[i]] # most likely cos - cos <- get_cos_config(w, outcomes, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor, coverage = cb_obj$cb_model_para$coverage) + cos <- get_cos_config(w, outcomes, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor, + use_entropy = cb_obj$cb_model_para$use_entropy, + residual_correlation = cb_obj$cb_model_para$residual_correlation, + coverage = cb_obj$cb_model_para$coverage) 
profile_change_outcome <- sapply(outcomes, function(outcome_idx) { get_outcome_profile_change(outcome_idx, cos, cb_obj, check_null_max) }) @@ -612,7 +633,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { get_ucos_evidence <- function(cb_obj, ucoloc_info) { get_ucos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, + ref_label = "LD") { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -628,10 +650,11 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + if (identical(ref_label, "No_ref")) { + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) } } delta <- yty - cos_profile @@ -657,7 +680,7 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + ref_label = cb_data$data[[X_dict]]$ref_label ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 4fa8815..077a217 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -28,14 +28,14 @@ colocboost_inits <- function() { #' @noRd #' @keywords cb_objects colocboost_init_data <- function(X, Y, dict_YX, - Z, LD, N_sumstat, 
dict_sumstatLD, + Z, LD, X_ref, ref_label, + N_sumstat, dict_sumstatLD, Var_y, SeBhat, keep_variables, focal_outcome_idx = NULL, focal_outcome_variables = TRUE, overlap_variables = FALSE, - intercept = TRUE, standardize = TRUE, - residual_correlation = NULL) { + intercept = TRUE, standardize = TRUE) { ################# initialization ####################################### cb_data <- list("data" = list()) class(cb_data) <- "colocboost" @@ -98,6 +98,7 @@ colocboost_init_data <- function(X, Y, dict_YX, ) for (i in 1:length(Y)) { cb_data$data[[flag]] <- ind_formated$result[[i]] + cb_data$data[[flag]]$ref_label <- "individual" names(cb_data$data)[flag] <- paste0("ind_outcome_", i) flag <- flag + 1 } @@ -105,39 +106,25 @@ colocboost_init_data <- function(X, Y, dict_YX, } n_ind <- flag - 1 # if summary: XtX XtY, YtY - if (!is.null(Z) & !is.null(LD)) { + if (!is.null(Z)) { ####################### need to consider more ######################### # ------ only code up one sumstat variant_lists <- keep_variables[c((n_ind_variable+1):length(keep_variables))] sumstat_formated <- process_sumstat( - Z, N_sumstat, Var_y, SeBhat, LD, + Z, N_sumstat, Var_y, SeBhat, + ld_matrices = if (ref_label == "X_ref") X_ref else LD, variant_lists, dict_sumstatLD, - keep_variable_names + keep_variable_names, + ref_label = ref_label ) for (i in 1:length(Z)) { cb_data$data[[flag]] <- sumstat_formated$results[[i]] + cb_data$data[[flag]]$ref_label <- ref_label names(cb_data$data)[flag] <- paste0("sumstat_outcome_", i) flag <- flag + 1 } cb_data$dict <- c(cb_data$dict, sumstat_formated$unified_dict + n_ind) } - # - if residual correlation matrix is not NULL, we need to adjust the study dependence - if (is.null(residual_correlation)) { - for (i in 1:length(cb_data$data)) { - cb_data$data[[i]]$dependency <- 1 - } - } else { - pseudo_inverse <- function(residual_correlation) { - eigen_Sigma <- eigen(residual_correlation) - L <- which(cumsum(eigen_Sigma$values) / sum(eigen_Sigma$values) > 0.999)[1] - 
return(eigen_Sigma$vectors[, 1:L] %*% diag(1 / eigen_Sigma$values[1:L]) %*% t(eigen_Sigma$vectors[, 1:L])) - } - Theta <- pseudo_inverse(residual_correlation) - for (i in 1:length(cb_data$data)) { - cb_data$data[[i]]$dependency <- Theta[i, i] - } - } - return(cb_data) } @@ -187,7 +174,7 @@ colocboost_init_model <- function(cb_data, YtY = data_each$YtY, XtY = data_each$XtY ) # - initial profile loglikelihood - tmp$profile_loglike_each <- estimate_profile_loglike(Y = data_each[["Y"]], N = data_each$N, YtY = data_each$YtY) * data_each$dependency + tmp$profile_loglike_each <- estimate_profile_loglike(Y = data_each[["Y"]], N = data_each$N, YtY = data_each$YtY) # - initial residual: res for ind; xtr for sumstat tmp$res <- inital_residual(Y = data_each[["Y"]], XtY = data_each$XtY) # - initial correlation between X and residual @@ -196,7 +183,8 @@ colocboost_init_model <- function(cb_data, N = data_each$N, YtY = data_each$YtY, XtX = cb_data$data[[X_dict]]$XtX, beta_k = tmp$beta, - miss_idx = data_each$variable_miss + miss_idx = data_each$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) # - initial z-score between X and residual based on correlation tmp$z <- get_z(tmp$correlation, n = data_each$N, tmp$res) @@ -378,7 +366,8 @@ inital_residual <- function(Y = NULL, XtY = NULL) { # - Calculate correlation between X and res get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, YtY = NULL, XtX = NULL, beta_k = NULL, miss_idx = NULL, - XtX_beta_cache = NULL) { + XtX_beta_cache = NULL, + ref_label = "LD") { if (!is.null(X)) { corr <- suppressWarnings({ Rfast::correls(res, X)[, "correlation"] @@ -400,12 +389,13 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, Xtr <- res / scaling_factor XtY <- XtY / scaling_factor } - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) } else if (!is.null(XtX_beta_cache)) { var_r <- YtY - 2 * sum(beta_k * XtY) + 
sum(XtX_beta_cache * beta_k) } else { - var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + XtX_beta_val <- compute_XtX_product(XtX, beta_k, ref_label) + var_r <- YtY - 2 * sum(beta_k * XtY) + sum(XtX_beta_val * beta_k) } if (var_r > 1e-6) { corr_nomiss <- Xtr / sqrt(var_r) @@ -561,7 +551,10 @@ get_multiple_testing_correction <- function(z, miss_idx = NULL, func_multi_test #' #' @return List containing processed data with optimized LD submatrix storage #' @noRd -process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dict, target_variants) { +process_sumstat <- function(Z, N, Var_y, SeBhat, + ld_matrices, variant_lists, dict, + target_variants, + ref_label = "LD") { # Step 1: Identify unique combinations of (variant list, LD matrix) @@ -582,7 +575,6 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic break } } - if (!is_duplicate) { # If not a duplicate, assign its exact index unified_dict[i] <- i @@ -606,14 +598,12 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic current_variants <- variant_lists[[i]] current_z <- Z[[i]] current_n <- N[[i]] - # Get corresponding LD matrix from original dictionary mapping ld_index <- dict[i] - current_ld_matrix <- ld_matrices[[ld_index]] + current_ref <- ld_matrices[[ld_index]] # Find common variants between current list and target variants common_variants <- intersect(current_variants, target_variants) - # Find variants in target but not in current list missing_variants <- setdiff(target_variants, current_variants) tmp$variable_miss <- which(target_variants %in% missing_variants) @@ -625,51 +615,76 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - if (length(current_ld_matrix) == 1){ - ld_submatrix <- current_ld_matrix + ref_submatrix <- NULL + + if (ref_label == "No_ref"){ + + 
ref_submatrix <- current_ref + + } else if (ref_label == "X_ref") { + + # match by column names only (rows = samples) + if (length(common_variants) > 0) { + if (i == unified_dict[i]) { + col_idx <- match(common_variants, colnames(current_ref)) + col_idx <- col_idx[!is.na(col_idx)] + if (length(col_idx) > 0) { + ref_submatrix <- current_ref[, col_idx, drop = FALSE] + colnames(ref_submatrix) <- common_variants + } + } + } + } else { - ld_submatrix <- NULL + + # ref_label == "LD": match by both row and column names if (length(common_variants) > 0) { - # Only include the submatrix if this entry is unique or is the first occurrence if (i == unified_dict[i]) { - # Check if common_variants and rownames have identical order - if (identical(common_variants, rownames(current_ld_matrix))) { - # If order is identical, use the matrix directly without reordering - ld_submatrix <- current_ld_matrix + if (identical(common_variants, rownames(current_ref))) { + # Order is identical, use matrix directly without reordering + ref_submatrix <- current_ref } else { - # If order is different, reorder using matched indices - matched_indices <- match(common_variants, rownames(current_ld_matrix)) - ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] - rownames(ld_submatrix) <- common_variants - colnames(ld_submatrix) <- common_variants + # Reorder to match common_variants order + matched_indices <- match(common_variants, rownames(current_ref)) + ref_submatrix <- current_ref[matched_indices, matched_indices, drop = FALSE] + rownames(ref_submatrix) <- common_variants + colnames(ref_submatrix) <- common_variants } } } + } # Organize data if (is.null(current_n)) { - tmp$XtX <- ld_submatrix + tmp$XtX <- ref_submatrix tmp$XtY <- Z_extend tmp$YtY <- 1 } else { if (!is.null(SeBhat[[i]]) & !is.null(Var_y[[i]])) { + # var_y, shat (and bhat) are provided, so the effects are on the # *original scale*. 
adj <- 1 / (Z_extend^2 + current_n - 2) - if (!is.null(ld_submatrix)) { + tmp$YtY <- (current_n - 1) * Var_y[[i]] + tmp$XtY <- Z_extend * sqrt(adj) * Var_y[[i]] / SeBhat[[i]] + if (ref_label == "X_ref") { + # Store X_ref slice directly; XtX computed on-the-fly downstream + tmp$XtX <- ref_submatrix + } else { + # LD or No_ref: scale to original scale XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2) - xtx <- t(ld_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) + xtx <- t(ref_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) tmp$XtX <- (xtx + t(xtx)) / 2 } - tmp$YtY <- (current_n - 1) * Var_y[[i]] - tmp$XtY <- Z_extend * sqrt(adj) * Var_y[[i]] / SeBhat[[i]] + } else { - if (!is.null(ld_submatrix)) { - tmp$XtX <- ld_submatrix - } + # Standardised scale tmp$YtY <- (current_n - 1) tmp$XtY <- sqrt(current_n - 1) * Z_extend + if (!is.null(ref_submatrix)) { + tmp$XtX <- ref_submatrix # LD, X_ref, or scalar 1 stored as-is + } } } diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index 09692d5..938a7cd 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -121,11 +121,10 @@ boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data) { # -- extract data and model based on pos.update model_update <- cb_model[pos.update] X_dict <- cb_data$dict[pos.update] - adj_dep <- sapply(pos.update, function(ii) cb_data$data[[ii]]$dependency) # -- define jk and jk_each cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation)) - abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) + abs_cor_vals_each <- abs(cor_vals_each) jk_each <- apply(abs_cor_vals_each, 2, function(temp) { jk_temp <- which(temp == max(temp)) return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1))) @@ -213,9 +212,8 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) { pos.update <- which(cb_model_para$update_y == 1) model_update <- cb_model[pos.update] X_dict <- cb_data$dict[pos.update] - adj_dep <- sapply(pos.update, 
function(ii) cb_data$data[[ii]]$dependency) cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation)) - abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) + abs_cor_vals_each <- abs(cor_vals_each) jk_each <- apply(abs_cor_vals_each, 2, function(temp) { jk_temp <- which(temp == max(temp)) return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1))) diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 9502ade..09b3323 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -125,6 +125,8 @@ get_colocboost_summary <- function(cb_output, #' @param npc_outcome_cutoff Minimum threshold of normalized probability of colocalized traits in each CoS. #' @param pvalue_cutoff Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS. #' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 +#' @param use_entropy A logical variable to consider the heterogeneity of traits for a single-effect. +#' @param residual_correlation The residual correlation based on the sample overlap; it is diagonal if it is NULL. #' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95). 
#' #' @return A \code{"colocboost"} object with some or all of the following elements: @@ -170,6 +172,8 @@ get_robust_colocalization <- function(cb_output, npc_outcome_cutoff = 0.2, pvalue_cutoff = NULL, weight_fudge_factor = 1.5, + use_entropy = FALSE, + residual_correlation = NULL, coverage = 0.95) { if (!inherits(cb_output, "colocboost")) { stop("Input must from colocboost object!") @@ -311,11 +315,14 @@ get_robust_colocalization <- function(cb_output, w_outcome <- colnames(w) config_outcome <- paste0("outcome", config_idx) pos <- which(w_outcome %in% config_outcome) - w[, pos, drop = FALSE] + ww = w[, pos, drop = FALSE] + colnames(ww) <- gsub("outcome", "Y", colnames(ww)) + ww }) - cos_details$cos_weights <- cos_weights - int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = weight_fudge_factor) + int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = weight_fudge_factor, + use_entropy = use_entropy, residual_correlation = residual_correlation) names(int_weight) <- names(cos_weights) <- colocset_names + cos_details$cos_weights <- cos_weights vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod)) names(vcp) <- cb_output$data_info$variables cb_output$vcp <- vcp @@ -1168,8 +1175,49 @@ get_cos <- function(cb_output, coverage = 0.95, X = NULL, Xcorr = NULL, n_purity #' Get integrated weight from different outcomes #' @keywords cb_get_functions #' @noRd -get_integrated_weight <- function(weights, weight_fudge_factor = 1.5) { - av <- apply(weights, 1, function(w) prod(w^(weight_fudge_factor / ncol(weights)))) +get_integrated_weight <- function(weights, weight_fudge_factor = 1.5, + use_entropy = FALSE, + residual_correlation = NULL) { + + # Collapse duplicate outcome columns by geometric mean (only if RE is given + # and colnames are parseable as "outcomeN" or "YN"). 
+ if (!is.null(residual_correlation) && !is.null(colnames(weights))) { + idx <- as.integer(sub("^(outcome|Y)", "", colnames(weights))) + if (anyDuplicated(idx)) { + uniq <- unique(idx) + weights <- sapply(uniq, function(k) { + cols <- which(idx == k) + if (length(cols) == 1) weights[, cols] + else exp(rowMeans(log(pmax(weights[, cols, drop = FALSE], 1e-300)))) + }) + colnames(weights) <- paste0("outcome", uniq) + idx <- uniq + } + } + + L <- ncol(weights) + + if (is.null(residual_correlation)){ + if (use_entropy){ + h <- apply(weights, 2, function(w) { w <- w[w > 0]; -sum(w * log(w)) }) + h <- h + mean(h) + alpha <- h / sum(h) + } else { + alpha <- rep(1 / L, L) + } + } else { + idx <- as.integer(sub("^(outcome|Y)", "", colnames(weights))) + RE_inv <- solve(residual_correlation[idx, idx, drop = FALSE]) + v <- if (use_entropy) { + h <- apply(weights, 2, function(w) { w <- w[w > 0]; -sum(w * log(w)) }) + h + mean(h) # same soft floor + } else { + rep(1, L) + } + alpha <- as.numeric(RE_inv %*% v) + alpha <- alpha / sum(alpha) + } + av <- apply(weights, 1, function(w) prod(w^(weight_fudge_factor * alpha))) return(av / sum(av)) } @@ -1287,78 +1335,87 @@ get_cos_purity <- function(cos, X = NULL, Xcorr = NULL, n_purity = 100) { #' @keywords cb_get_functions #' @noRd merge_ucos_details <- function(ucos_details, ucos_from_cos) { - - get_cos_ucos_purity <- function(from_ucos, from_cos){ - if (is.null(from_cos)){ - return(from_ucos) - } else { - cos <- intersect(rownames(from_ucos), rownames(from_cos)) - tmp_from_ucos <- from_ucos[match(cos, rownames(from_ucos)), , drop = FALSE] - tmp_from_cos <- from_cos[match(cos, rownames(from_cos)), , drop = FALSE] - cbind(tmp_from_ucos, tmp_from_cos) - } - } - - get_ucos_purity <- function(from_ucos, from_cos, cross_from_ucos, cross_from_cos) { - - from_ucos = ucos_details$ucos_purity$min_abs_cor - from_cos = ucos_from_cos$ucos_purity$min_abs_cor - cross_from_ucos = ucos_details$cos_ucos_purity$min_abs_cor - cross_from_cos = 
ucos_from_cos$cos_ucos_purity$min_abs_cor - - - for (id in unique(sub(":.*", "", rownames(from_cos)))) { - old <- grep(paste0("^", id, ":"), rownames(cross_from_ucos), value = TRUE)[1] - new <- grep(paste0("^", id, ":"), rownames(from_cos), value = TRUE)[1] - if (!is.na(old) && !is.na(new)) { - rownames(cross_from_ucos) <- sub(old, new, rownames(cross_from_ucos), fixed = TRUE) - colnames(cross_from_ucos) <- sub(old, new, colnames(cross_from_ucos), fixed = TRUE) - if (!is.null(cross_from_cos)){ - rownames(cross_from_cos) <- sub(old, new, rownames(cross_from_cos), fixed = TRUE) - colnames(cross_from_cos) <- sub(old, new, colnames(cross_from_cos), fixed = TRUE) + + # Build a merged purity matrix from up to 5 sources, dimensioned by the + # union of uCoS names (NOT by rownames of the inputs). + build_merged <- function(stat) { + a <- ucos_details$ucos_purity[[stat]] + b <- ucos_from_cos$ucos_purity[[stat]] + + nm_a <- names(ucos_details$ucos$ucos_index) + nm_b <- names(ucos_from_cos$ucos$ucos_index) + all_ucos <- c(nm_a, nm_b) + n <- length(all_ucos) + out <- matrix(NA_real_, n, n, dimnames = list(all_ucos, all_ucos)) + + # within-original block + if (!is.null(a) && length(nm_a)) { + ra <- intersect(nm_a, rownames(a)) + if (length(ra)) out[ra, ra] <- a[ra, ra, drop = FALSE] } - } + # within-demoted block + if (!is.null(b) && length(nm_b)) { + rb <- intersect(nm_b, rownames(b)) + if (length(rb)) out[rb, rb] <- b[rb, rb, drop = FALSE] + } + # cross block: 0 (no LD info available at this stage) + if (length(nm_a) && length(nm_b)) { + out[nm_a, nm_b] <- 0 + out[nm_b, nm_a] <- 0 + } + out } - all_ucos <- c(rownames(from_ucos), rownames(from_cos)) - n <- length(all_ucos) - result <- matrix(NA_real_, n, n, dimnames = list(all_ucos, all_ucos)) - mats <- list(from_ucos, from_cos, cross_from_ucos, cross_from_cos) - for (i in 1:n) { - for (j in 1:n) { - for (m in mats) { - if (all_ucos[i] %in% rownames(m) && all_ucos[j] %in% colnames(m)) { - result[i, j] <- result[j, i] <- 
m[all_ucos[i], all_ucos[j]] - break - } + + # cos_ucos_purity for the merged object: stack remaining-CoS rows against + # both old and new uCoS columns. + build_cos_ucos <- function(stat) { + old <- ucos_details$cos_ucos_purity[[stat]] + new <- ucos_from_cos$cos_ucos_purity[[stat]] + if (is.null(old) && is.null(new)) return(NULL) + if (is.null(old)) return(new) + if (is.null(new)) return(old) + + # Both are CoS-rows x uCoS-cols. Align by union of CoS rownames, pad with NA. + rn_old <- rownames(old); rn_new <- rownames(new) + if (is.null(rn_old)) rn_old <- paste0("cos_old_", seq_len(nrow(old))) + if (is.null(rn_new)) rn_new <- paste0("cos_new_", seq_len(nrow(new))) + rownames(old) <- rn_old + rownames(new) <- rn_new + + all_rows <- union(rn_old, rn_new) + + pad <- function(m, ncol_target, colnames_target) { + out <- matrix(NA_real_, nrow = length(all_rows), ncol = ncol_target, + dimnames = list(all_rows, colnames_target)) + out[rownames(m), ] <- m + out } - } + + old_p <- pad(old, ncol(old), colnames(old)) + new_p <- pad(new, ncol(new), colnames(new)) + cbind(old_p, new_p) } - result - } - + list( "ucos" = list( - "ucos_index" = c(ucos_details$ucos$ucos_index, ucos_from_cos$ucos$ucos_index), + "ucos_index" = c(ucos_details$ucos$ucos_index, ucos_from_cos$ucos$ucos_index), "ucos_variables" = c(ucos_details$ucos$ucos_variables, ucos_from_cos$ucos$ucos_variables) ), "ucos_outcomes" = list( "outcome_index" = c(ucos_details$ucos_outcomes$outcome_index, ucos_from_cos$ucos_outcomes$outcome_index), - "outcome_name" = c(ucos_details$ucos_outcomes$outcome_name, ucos_from_cos$ucos_outcomes$outcome_name) + "outcome_name" = c(ucos_details$ucos_outcomes$outcome_name, ucos_from_cos$ucos_outcomes$outcome_name) ), - "ucos_weight" = c(ucos_details$ucos_weight, ucos_from_cos$ucos_weight), + "ucos_weight" = c(ucos_details$ucos_weight, ucos_from_cos$ucos_weight), "ucos_top_variables" = rbind(ucos_details$ucos_top_variables, ucos_from_cos$ucos_top_variables), "ucos_purity" = list( - 
"min_abs_cor" = get_ucos_purity(ucos_details$ucos_purity$min_abs_cor, ucos_from_cos$ucos_purity$min_abs_cor, - ucos_details$cos_ucos_purity$min_abs_cor, ucos_from_cos$cos_ucos_purity$min_abs_cor), - "median_abs_cor" = get_ucos_purity(ucos_details$ucos_purity$median_abs_cor, ucos_from_cos$ucos_purity$median_abs_cor, - ucos_details$cos_ucos_purity$median_abs_cor, ucos_from_cos$cos_ucos_purity$median_abs_cor), - "max_abs_cor" = get_ucos_purity(ucos_details$ucos_purity$max_abs_cor, ucos_from_cos$ucos_purity$max_abs_cor, - ucos_details$cos_ucos_purity$max_abs_cor, ucos_from_cos$cos_ucos_purity$max_abs_cor) + "min_abs_cor" = build_merged("min_abs_cor"), + "median_abs_cor" = build_merged("median_abs_cor"), + "max_abs_cor" = build_merged("max_abs_cor") ), "cos_ucos_purity" = list( - "min_abs_cor" = get_cos_ucos_purity(ucos_from_cos$cos_ucos_purity$min_abs_cor, ucos_details$cos_ucos_purity$min_abs_cor), - "median_abs_cor" = get_cos_ucos_purity(ucos_from_cos$cos_ucos_purity$median_abs_cor, ucos_details$cos_ucos_purity$median_abs_cor), - "max_abs_cor" = get_cos_ucos_purity(ucos_from_cos$cos_ucos_purity$max_abs_cor, ucos_details$cos_ucos_purity$max_abs_cor) + "min_abs_cor" = build_cos_ucos("min_abs_cor"), + "median_abs_cor" = build_cos_ucos("median_abs_cor"), + "max_abs_cor" = build_cos_ucos("max_abs_cor") ), "ucos_outcomes_npc" = rbind(ucos_details$ucos_outcomes_npc, ucos_from_cos$ucos_outcomes_npc) ) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 0e50aba..7d51bce 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -16,8 +16,6 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { for (i in pos.update) { update_jk <- cb_model_para$update_temp$real_update_jk[i] X_dict <- cb_data$dict[i] - # adj_dependence - adj_dep <- cb_data$data[[i]]$dependency ########## BEGIN: MAIN CALCULATION ################### @@ -32,7 +30,8 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { XtX = cb_data$data[[X_dict]]$XtX, N = 
cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) cb_model[[i]]$ld_jk <- rbind(cb_model[[i]]$ld_jk, ld_jk) } @@ -48,7 +47,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { } delta <- boost_KL_delta( z = cb_model[[i]]$z, - ld_feature = ld_feature, adj_dep = adj_dep, + ld_feature = ld_feature, func_simplex = cb_model_para$func_simplex, lambda = lambda_outcome ) @@ -64,15 +63,14 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { if (length(cb_data$data[[i]]$variable_miss) != 0) { obj_ld[cb_data$data[[i]]$variable_miss] <- 0 } - exp_term <- adj_dep * obj_ld * (abs(cov_Xtr)) + exp_term <- obj_ld * (abs(cov_Xtr)) # - calculate individual objective function cb_model[[i]]$obj_path <- c(cb_model[[i]]$obj_path, tau * matrixStats::logSumExp(exp_term / tau + log(delta))) cb_model[[i]]$obj_single <- c(cb_model[[i]]$obj_single, abs(cov_Xtr[update_jk])) exp_term <- exp_term - max(exp_term) exp_abs_cor <- delta * exp(exp_term / tau) - # weights <- adj_dep * ld_feature / scaling_factor * exp_abs_cor / sum(exp_abs_cor) - weights <- adj_dep * obj_ld * exp_abs_cor / sum(exp_abs_cor) + weights <- obj_ld * exp_abs_cor / sum(exp_abs_cor) weights <- weights / sum(weights) # cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights)) cb_model[[i]]$weights_path <- c(cb_model[[i]]$weights_path, list(as.vector(weights))) @@ -102,44 +100,32 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { x <- cb_data$data[[X_dict]]$X y <- cb_data$data[[i]]$Y beta <- cb_model[[i]]$beta - profile_log <- mean((y - x %*% beta)^2) * adj_dep + profile_log <- mean((y - x %*% beta)^2) } else if (!is.null(cb_data$data[[X_dict]]$XtX)) { beta_scaling <- cb_model[[i]]$beta_scaling # - summary statistics xtx <- cb_data$data[[X_dict]]$XtX + ref_label_i <- cb_data$data[[X_dict]]$ref_label cb_model[[i]]$res 
<- rep(0, cb_model_para$P) if (length(cb_data$data[[i]]$variable_miss) != 0) { beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - if (length(xtx) == 1){ - XtX_beta <- beta - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * beta - } else { - XtX_beta <- xtx %*% beta - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta - } + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - if (length(xtx) == 1){ - XtX_beta <- beta - cb_model[[i]]$res <- xty - scaling_factor * beta - } else { - XtX_beta <- xtx %*% beta - cb_model[[i]]$res <- xty - scaling_factor * XtX_beta - } + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + cb_model[[i]]$res <- xty - scaling_factor * XtX_beta } # - cache XtX %*% beta for reuse in get_correlation (avoids redundant O(P^2) computation) cb_model[[i]]$XtX_beta_cache <- XtX_beta # - profile-loglikelihood (reuses cached XtX_beta) yty <- cb_data$data[[i]]$YtY / scaling_factor xty <- xty / scaling_factor - if (length(xtx) == 1){ - profile_log <- (yty - 2 * sum(beta * xty) + sum(beta^2)) * adj_dep - } else { - profile_log <- (yty - 2 * sum(beta * xty) + sum(XtX_beta * beta)) * adj_dep - } + profile_log <- (yty - 2 * sum(beta * xty) + sum(XtX_beta * beta)) + } cb_model[[i]]$profile_loglike_each <- c(cb_model[[i]]$profile_loglike_each, profile_log) } @@ -149,7 +135,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { # - calculate LD for update_jk -get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P = NULL) { +get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P = NULL, ref_label = "LD") { if (!is.null(X)) { corr <- suppressWarnings({ Rfast::correls(X[, 
jk1], X)[, "correlation"] @@ -158,8 +144,14 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P } else if (!is.null(XtX)) { jk1.remain <- which(remain_idx == jk1) corr <- rep(0, P) - if (length(XtX) == 1 | length(jk1.remain)==0){ + if (identical(ref_label, "No_ref") | length(jk1.remain) == 0) { corr[remain_idx] <- 1 + } else if (identical(ref_label, "X_ref")) { + corr_common <- suppressWarnings({ + correls(XtX[, jk1.remain], XtX)[, "correlation"] + }) + corr_common[which(is.na(corr_common))] <- 0 + corr[remain_idx] <- corr_common } else { corr[remain_idx] <- XtX[, jk1.remain] } @@ -168,7 +160,7 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P } -boost_KL_delta <- function(z, ld_feature, adj_dep, +boost_KL_delta <- function(z, ld_feature, func_simplex = "LD_z2z", lambda = 0.5) { # if (!is.null(n)){ z <- z * sqrt( (n-1)/(z^2+n-2) ) } @@ -177,15 +169,15 @@ boost_KL_delta <- function(z, ld_feature, adj_dep, if (func_simplex == "Range_Z") { z2z <- lambda * 0.5 * z^2 + (1 - lambda) * abs(z) z2z <- (z2z - min(z2z)) / (max(z2z) - min(z2z)) * 5 - z2z <- adj_dep * ld_feature * z2z + z2z <- ld_feature * z2z delta <- exp(z2z - max(z2z)) } else if (func_simplex == "LD_z2z") { z2z <- lambda * 0.5 * z^2 + (1 - lambda) * abs(z) - z2z <- adj_dep * ld_feature * z2z + z2z <- ld_feature * z2z delta <- exp(z2z - max(z2z)) } else if (func_simplex == "only_z2z") { z2z <- lambda * 0.5 * z^2 + (1 - lambda) * abs(z) - z2z <- adj_dep * z2z + z2z <- z2z delta <- exp(z2z - max(z2z)) } else if (func_simplex == "entropy") { delta <- rep(1, length(z)) @@ -252,8 +244,6 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para) { abs_cor <- abs(correlation) jk <- which(abs_cor == max(abs_cor)) jk <- ifelse(length(jk) == 1, jk, sample(jk, 1)) - # adj_dependence - adj_dep <- cb_data$data[[i]]$dependency ########## MAIN CALCULATION ################### X_dict <- cb_data$dict[i] @@ -262,7 +252,8 @@ boost_obj_last <- function(cb_data, 
cb_model, cb_model_para) { XtX = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) ld_feature <- sqrt(abs(ld_jk)) @@ -276,7 +267,7 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para) { } delta <- boost_KL_delta( z = cb_model[[i]]$z, - ld_feature = ld_feature, adj_dep = adj_dep, + ld_feature = ld_feature, func_simplex = cb_model_para$func_simplex, lambda = lambda_outcome ) @@ -293,7 +284,7 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para) { if (length(cb_data$data[[i]]$variable_miss) != 0) { obj_ld[cb_data$data[[i]]$variable_miss] <- 0 } - exp_term <- adj_dep * obj_ld * (abs(cov_Xtr)) + exp_term <- obj_ld * (abs(cov_Xtr)) # - calculate individual objective function cb_model[[i]]$obj_path <- c(cb_model[[i]]$obj_path, tau * matrixStats::logSumExp(exp_term / tau + log(delta))) cb_model[[i]]$obj_single <- c(cb_model[[i]]$obj_single, abs(cov_Xtr[jk])) diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index b486bf8..7e956bc 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -43,7 +43,8 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[fine_outcome]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) # is.between <- length(intersect(cset1, cset2)) != 0 is.between <- (abs(res[2] - 1) < tol) @@ -66,7 +67,8 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) } res <- Reduce(pmax, res) @@ 
-151,7 +153,8 @@ merge_ucos <- function(cb_obj, past_out, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -221,7 +224,8 @@ merge_ucos <- function(cb_obj, past_out, tmp <- matrix(get_purity(pos, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[i3]]$N, n = n_purity + N = cb_obj$cb_data$data[[i3]]$N, n = n_purity, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } @@ -286,71 +290,6 @@ merge_ucos <- function(cb_obj, past_out, return(ll) } -#' @title Set of internal utils functions -#' -#' @description -#' The `colocboost_utils` functions serves as a summary for the following two refine functions. -#' -#' @details -#' The following functions are included in this set: -#' `merge_cos_ucos` merge a trait-specific confidence set CS into a colocalization set CoS. -#' `merge_ucos` merge two trait-specific confidence sets. -#' -#' These functions are not exported individually and are accessed via `colocboost_utils`. 
-#' -#' @rdname colocboost_utils -#' @keywords cb_utils -#' @noRd -get_vcp <- function(past_out, P) { - if (!is.null(past_out$cos$cos$cos)) { - avW_coloc_vcp <- sapply(past_out$cos$cos$avWeight, get_integrated_weight) - } else { - avW_coloc_vcp <- NULL - } - all_weight <- avW_coloc_vcp - if (length(all_weight) == P) { - all_weight <- as.matrix(unlist(all_weight)) - } - if (!is.null(all_weight)) { - all_weight <- apply(all_weight, 2, as.numeric) - all_weight <- as.matrix(all_weight) - vcp <- as.vector(1 - apply(1 - all_weight, 1, prod)) - } else { - vcp <- rep(0, P) - } - return(vcp) -} - - -get_pip <- function(past_out, R, P) { - if (length(past_out$cos$cos$cos) != 0) { - av_coloc <- do.call(cbind, past_out$cos$cos$avWeight) - } else { - av_coloc <- NULL - } - if (length(past_out$ucos$ucos_each) != 0) { - av_noncoloc <- past_out$ucos$avW_ucos_each - tmp <- do.call(rbind, strsplit(colnames(av_noncoloc), ":")) - colnames(av_noncoloc) <- paste0("outcome", gsub("[^0-9.]+", "", tmp[, 2])) - } else { - av_noncoloc <- NULL - } - av_all <- cbind(av_coloc, av_noncoloc) - pip <- vector(mode = "list", length = R) - if (!is.null(av_all)) { - av_name <- colnames(av_all) - for (i in 1:R) { - pos <- grep(i, av_name) - if (length(pos) != 0) { - av_i <- as.matrix(av_all[, pos]) - pip[[i]] <- as.vector(1 - apply(1 - av_i, 1, prod)) - } else { - pip[[i]] <- rep(0, P) - } - } - } - return(pip) -} check_two_overlap_sets <- function(total, i, j) { t1 <- total[[i]] @@ -663,7 +602,10 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { pos <- match(data_info$variables, cb_obj$cb_model_para$variables) return(w[pos, , drop = FALSE]) }) - int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor) + int_weight <- lapply(cos_weights, get_integrated_weight, + weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor, + use_entropy = cb_obj$cb_model_para$use_entropy, + residual_correlation = 
cb_obj$cb_model_para$residual_correlation) names(int_weight) <- names(cos_weights) <- colocset_names # - resummary results @@ -689,7 +631,8 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { tmp <- matrix(get_purity(pos, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity + N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } @@ -763,7 +706,8 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1025,7 +969,8 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1069,7 +1014,8 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1124,3 +1070,28 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output return(ll) } + +#' Compute XtX %*% beta, dispatching on ref_label +#' @noRd +compute_XtX_product <- function(XtX, beta, ref_label = "LD") { + if (identical(ref_label, "No_ref")) return(beta) + if (identical(ref_label, "X_ref")) { + N_ref <- nrow(XtX) + 
temp <- XtX %*% as.matrix(beta) + return(as.vector(crossprod(XtX, temp)) / (N_ref - 1)) + } + as.vector(XtX %*% as.matrix(beta)) +} + +#' Pseudo-inverse via truncated eigendecomposition (99.9% variance). +#' Used to invert residual_correlation, which may be rank-deficient when +#' the submatrix contains duplicated traits. +#' @noRd +pseudo_inverse <- function(mat) { + eig <- eigen(mat, symmetric = TRUE) + keep <- which(cumsum(eig$values) / sum(eig$values) > 0.999)[1] + eig$vectors[, 1:keep, drop = FALSE] %*% + diag(1 / eig$values[1:keep], keep, keep) %*% + t(eig$vectors[, 1:keep, drop = FALSE]) +} + diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 2635e00..2f0a4f7 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -284,7 +284,8 @@ cb_model_update <- function(cb_data, cb_model, cb_model_para) { XtX = cb_data$data[[X_dict]]$XtX, beta_k = model_each$beta, miss_idx = data_each$variable_miss, - XtX_beta_cache = model_each$XtX_beta_cache + XtX_beta_cache = model_each$XtX_beta_cache, + ref_label = cb_data$data[[X_dict]]$ref_label ) cb_model[[i]]$correlation <- tmp cb_model[[i]]$z <- get_z(tmp, n = data_each$N, model_each$res) diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 4f6c3df..ebbb8bb 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -12,6 +12,7 @@ colocboost( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, outcome_names = NULL, @@ -62,6 +63,7 @@ colocboost( check_null_max_ucos = 0.015, weaker_effect = TRUE, LD_free = FALSE, + use_entropy = FALSE, output_level = 1, cos_npc_cutoff = 0.2, npc_outcome_cutoff = 0.2, @@ -81,14 +83,23 @@ The columns of data.frame should include either \code{z} or \code{beta}/\code{se \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).} \item{LD}{A list of correlation matrix indicating the LD matrix for each genotype. 
It also could be a single matrix if all sumstats were -obtained from the same genotypes.} +obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +If neither is provided, LD-free mode is used.} + +\item{X_ref}{A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +\code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +When the number of reference panel samples is less than the number of variants (N_ref < P), +this avoids storing the full P x P LD matrix and reduces memory usage. +When N_ref >= P, LD is precomputed from \code{X_ref} internally. +Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.} \item{dict_YX}{A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.} -\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +corresponding to the same sumstat. +The first column should be 1:L for L sumstat The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. 
The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden.} \item{outcome_names}{The names of outcomes, which has the same order for Y.} @@ -194,6 +205,8 @@ to merge colocalized sets, which may resulting in a huge set.} \item{LD_free}{When \code{LD_free = FALSE}, objective function doesn't include LD information.} +\item{use_entropy}{A logical variable to consider the heterogeneity of traits for a single-effect.} + \item{output_level}{When \code{output_level = 1}, return basic cos details for colocalization results When \code{output_level = 2}, return the ucos details for the single specific effects. When \code{output_level = 3}, return the entire Colocboost model to diagnostic results (more space).} diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd index d9c45f4..ec7e40a 100644 --- a/man/colocboost_validate_input_data.Rd +++ b/man/colocboost_validate_input_data.Rd @@ -9,6 +9,7 @@ colocboost_validate_input_data( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, @@ -31,7 +32,16 @@ colocboost_validate_input_data( \item{sumstat}{A list of data.frames of summary statistics.} -\item{LD}{A list of correlation matrices indicating the LD matrix for each genotype.} +\item{LD}{A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were +obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +If neither is provided, LD-free mode is used.} + +\item{X_ref}{A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +\code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +When the number of reference panel samples is less than the number of variants (N_ref < P), +this avoids storing the full P x P LD matrix and reduces memory usage.
+When N_ref >= P, LD is precomputed from \code{X_ref} internally. +Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.} \item{dict_YX}{A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.} @@ -57,6 +67,8 @@ A list containing: \item{keep_variable_individual}{List of variable names for each X matrix} \item{sumstat}{Processed list of summary statistics data.frames} \item{LD}{Processed list of LD matrices} +\item{X_ref}{Processed list of reference genotype matrices} +\item{ref_label}{Style of reference matrices} \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} \item{keep_variable_sumstat}{List of variant names for each sumstat} \item{Z}{List of z-scores for each outcome} diff --git a/man/get_robust_colocalization.Rd b/man/get_robust_colocalization.Rd index 4128c64..87d20f9 100644 --- a/man/get_robust_colocalization.Rd +++ b/man/get_robust_colocalization.Rd @@ -14,6 +14,8 @@ get_robust_colocalization( npc_outcome_cutoff = 0.2, pvalue_cutoff = NULL, weight_fudge_factor = 1.5, + use_entropy = FALSE, + residual_correlation = NULL, coverage = 0.95 ) } @@ -28,6 +30,10 @@ get_robust_colocalization( \item{weight_fudge_factor}{The strength to integrate weight from different outcomes, default is 1.5} +\item{use_entropy}{A logical variable to consider the heterogeneity of traits for a single-effect.} + +\item{residual_correlation}{The residual correlation based on the sample overlap; it is diagonal if it is NULL.} + \item{coverage}{A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95).} } \value{ diff --git a/tests/testthat/test_RE.R b/tests/testthat/test_RE.R new file mode 100644 index 0000000..f6949c4 --- /dev/null +++ b/tests/testthat/test_RE.R @@ -0,0 +1,563 @@ +library(testthat) + +# ============================================================================ +# Tests for the integration
changes across commits c0a2e15 and b61d7cb: +# (1) `use_entropy` parameter: heterogeneity-aware alpha for integrating +# per-trait weights (vs uniform 1/L). +# (2) `residual_correlation` refactor: removed from the core boosting +# loop (no more `dependency` field on cb_data), now consumed at +# assembly time in `get_integrated_weight` as a GLS-style alpha +# adjustment, with: +# - duplicate outcome columns collapsed by geometric mean +# - colname parsing of both "outcomeN" and "YN" +# - sub-selection of residual_correlation[idx, idx] +# ============================================================================ + + +# ---- Shared test data generator (L-adaptive so L=2 works) ---- +generate_entropy_test_data <- function(n = 300, p = 40, L = 3, seed = 2026) { + set.seed(seed) + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + + # Shared causal at SNP10, descending heterogeneous per-trait effects. + het_effects <- c(0.9, 0.3, 0.15, 0.1, 0.08) + true_beta <- matrix(0, p, L) + true_beta[10, ] <- het_effects[seq_len(L)] + if (L >= 2) true_beta[25, 2] <- 0.5 # trait-2-specific + + Y <- matrix(0, n, L) + for (l in seq_len(L)) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + list( + X = X, Y = Y, + Y_list = lapply(seq_len(L), function(l) Y[, l]), + X_list = replicate(L, X, simplify = FALSE), + LD = cor(X), + true_beta = true_beta + ) +} + + +# ============================================================================ +# (1a) Unit tests for get_integrated_weight with use_entropy +# ============================================================================ + +test_that("get_integrated_weight: use_entropy=FALSE returns a valid distribution", { + w1 <- c(0.6, 0.3, 0.05, 0.05) + w2 <- c(0.1, 0.1, 0.4, 0.4) + weights <- cbind(w1, w2) + + result <- get_integrated_weight(weights, weight_fudge_factor = 1.5, use_entropy = FALSE) + + expect_equal(length(result), nrow(weights)) + 
expect_equal(sum(result), 1, tolerance = 1e-10) + expect_true(all(result >= 0)) +}) + +test_that("get_integrated_weight: use_entropy=TRUE returns a valid distribution", { + w1 <- c(0.6, 0.3, 0.05, 0.05) # low entropy + w2 <- c(0.25, 0.25, 0.25, 0.25) # max entropy + weights <- cbind(w1, w2) + + result_ent <- get_integrated_weight(weights, use_entropy = TRUE) + + expect_equal(length(result_ent), nrow(weights)) + expect_equal(sum(result_ent), 1, tolerance = 1e-10) + expect_true(all(result_ent >= 0)) +}) + +test_that("get_integrated_weight: use_entropy=TRUE and FALSE differ for heterogeneous-entropy traits", { + w_sharp <- c(0.85, 0.10, 0.03, 0.02) + w_flat <- c(0.26, 0.25, 0.25, 0.24) + weights <- cbind(w_sharp, w_flat) + + r_uniform <- get_integrated_weight(weights, use_entropy = FALSE) + r_entropy <- get_integrated_weight(weights, use_entropy = TRUE) + + expect_false(isTRUE(all.equal(r_uniform, r_entropy))) + expect_equal(sum(r_uniform), 1, tolerance = 1e-10) + expect_equal(sum(r_entropy), 1, tolerance = 1e-10) +}) + +test_that("get_integrated_weight: identical trait weights give identical results under both modes", { + w <- c(0.5, 0.25, 0.15, 0.10) + weights <- cbind(w, w, w) + + r_uniform <- get_integrated_weight(weights, use_entropy = FALSE) + r_entropy <- get_integrated_weight(weights, use_entropy = TRUE) + + expect_equal(r_uniform, r_entropy, tolerance = 1e-10) +}) + +test_that("get_integrated_weight: use_entropy handles zero-weight entries without NaN", { + # Both columns have mass on the same three rows; column 1 is zero on row 4. + # The entropy branch filters w > 0 when computing H, so zeros must not + # produce NaN in alpha. + w1 <- c(0.5, 0.3, 0.2, 0.0) + w2 <- c(0.4, 0.3, 0.2, 0.1) + weights <- cbind(w1, w2) + + expect_error( + result <- get_integrated_weight(weights, use_entropy = TRUE), + NA + ) + expect_true(all(is.finite(result))) + expect_equal(sum(result), 1, tolerance = 1e-10) + # Row 4 has w1 = 0, so 0^(positive alpha) = 0 on that row. 
+ expect_equal(result[4], 0) +}) + + +# ============================================================================ +# (1b) get_integrated_weight with residual_correlation (new GLS path) +# ============================================================================ + +test_that("get_integrated_weight: identity residual_correlation matches NULL residual_correlation", { + # With RC = I, alpha = I %*% 1 / sum(I %*% 1) = 1/L, identical to the + # legacy uniform-alpha path. + set.seed(42) + w1 <- c(0.5, 0.2, 0.15, 0.15) + w2 <- c(0.1, 0.4, 0.3, 0.2) + w3 <- c(0.25, 0.25, 0.3, 0.2) + weights <- cbind(w1, w2, w3) + colnames(weights) <- paste0("outcome", 1:3) + + r_null <- get_integrated_weight(weights, residual_correlation = NULL) + r_I <- get_integrated_weight(weights, residual_correlation = diag(3)) + + expect_equal(as.numeric(r_null), as.numeric(r_I), tolerance = 1e-10) +}) + +test_that("get_integrated_weight: identity RC also matches NULL RC under use_entropy=TRUE", { + set.seed(43) + w1 <- c(0.7, 0.1, 0.1, 0.1) + w2 <- c(0.1, 0.4, 0.3, 0.2) + w3 <- c(0.25, 0.25, 0.25, 0.25) + weights <- cbind(w1, w2, w3) + colnames(weights) <- paste0("outcome", 1:3) + + r_null <- get_integrated_weight(weights, use_entropy = TRUE, residual_correlation = NULL) + r_I <- get_integrated_weight(weights, use_entropy = TRUE, residual_correlation = diag(3)) + + expect_equal(as.numeric(r_null), as.numeric(r_I), tolerance = 1e-10) +}) + +test_that("get_integrated_weight: asymmetric RC differs from NULL RC", { + # Equal-correlation RC (e.g. 0.8 everywhere off-diagonal) is actually a + # no-op because its inverse times a vector of 1s is still a multiple of 1. + # We need an asymmetric correlation structure for alpha to shift. + w1 <- c(0.6, 0.2, 0.1, 0.1) + w2 <- c(0.55, 0.25, 0.1, 0.1) + w3 <- c(0.1, 0.2, 0.35, 0.35) + weights <- cbind(w1, w2, w3) + colnames(weights) <- paste0("outcome", 1:3) + + # Traits 1 and 2 heavily correlated, trait 3 nearly independent. 
+ RC <- matrix(c(1, 0.9, 0.1, + 0.9, 1, 0.1, + 0.1, 0.1, 1), nrow = 3, byrow = TRUE) + + r_null <- get_integrated_weight(weights, residual_correlation = NULL) + r_rc <- get_integrated_weight(weights, residual_correlation = RC) + + expect_false(isTRUE(all.equal(as.numeric(r_null), as.numeric(r_rc)))) + expect_equal(sum(r_rc), 1, tolerance = 1e-10) + expect_true(all(is.finite(r_rc))) +}) + +test_that("get_integrated_weight: 'outcomeN' and 'YN' colnames parse equivalently", { + # `get_robust_colocalization` relabels cos_weights columns from "outcomeN" + # to "YN" before calling `get_integrated_weight`, so both prefixes must + # produce the same output for the same numerical inputs. + w1 <- c(0.5, 0.3, 0.1, 0.1) + w2 <- c(0.2, 0.2, 0.3, 0.3) + weights_o <- cbind(w1, w2); colnames(weights_o) <- c("outcome1", "outcome2") + weights_y <- cbind(w1, w2); colnames(weights_y) <- c("Y1", "Y2") + + RC <- matrix(c(1, 0.3, 0.3, 1), nrow = 2) + + r_o <- get_integrated_weight(weights_o, residual_correlation = RC) + r_y <- get_integrated_weight(weights_y, residual_correlation = RC) + + expect_equal(as.numeric(r_o), as.numeric(r_y), tolerance = 1e-10) +}) + +test_that("get_integrated_weight: subset of outcomes picks the right RC sub-block", { + # 4x4 RC, but weights only carry traits 2 and 4. The function must subset + # RC[c(2,4), c(2,4)] automatically from the colnames. + w1 <- c(0.7, 0.1, 0.1, 0.1) + w2 <- c(0.1, 0.4, 0.3, 0.2) + weights <- cbind(w1, w2) + colnames(weights) <- c("outcome2", "outcome4") + + RC_full <- matrix(c(1, 0.2, 0.1, 0.05, + 0.2, 1, 0.3, 0.4, + 0.1, 0.3, 1, 0.15, + 0.05, 0.4, 0.15, 1), nrow = 4, byrow = TRUE) + RC_sub <- RC_full[c(2, 4), c(2, 4)] + + # Running with full RC (letting the function subset by index) should match + # running with the pre-subsetted 2x2 block with colnames relabeled to 1/2. 
+ weights_sub <- weights + colnames(weights_sub) <- c("outcome1", "outcome2") + + r_full <- get_integrated_weight(weights, residual_correlation = RC_full) + r_sub <- get_integrated_weight(weights_sub, residual_correlation = RC_sub) + + expect_equal(as.numeric(r_full), as.numeric(r_sub), tolerance = 1e-10) +}) + +test_that("get_integrated_weight: duplicate outcome columns are collapsed by geometric mean", { + # Two columns for the same trait (outcome1 appearing twice) should produce + # the same integrated weight as a single column whose values are the + # geometric mean of the two duplicates. + set.seed(77) + w1a <- c(0.5, 0.3, 0.1, 0.1) + w1b <- c(0.4, 0.4, 0.1, 0.1) + w2 <- c(0.2, 0.2, 0.3, 0.3) + + gm_w1 <- exp((log(w1a) + log(w1b)) / 2) # geometric mean + + weights_dup <- cbind(w1a, w1b, w2) + colnames(weights_dup) <- c("outcome1", "outcome1", "outcome2") + + weights_collapsed <- cbind(gm_w1, w2) + colnames(weights_collapsed) <- c("outcome1", "outcome2") + + RC <- matrix(c(1, 0.4, 0.4, 1), nrow = 2) + + r_dup <- get_integrated_weight(weights_dup, residual_correlation = RC) + r_col <- get_integrated_weight(weights_collapsed, residual_correlation = RC) + + expect_equal(as.numeric(r_dup), as.numeric(r_col), tolerance = 1e-10) + expect_equal(sum(r_dup), 1, tolerance = 1e-10) +}) + +test_that("get_integrated_weight: duplicate collapse only triggers when RC is provided", { + # Without residual_correlation the collapse branch is skipped — two + # identical-index columns should NOT be collapsed (legacy behavior). 
+ w1a <- c(0.5, 0.3, 0.1, 0.1) + w1b <- c(0.4, 0.4, 0.1, 0.1) + w2 <- c(0.2, 0.2, 0.3, 0.3) + + weights_dup <- cbind(w1a, w1b, w2) + colnames(weights_dup) <- c("outcome1", "outcome1", "outcome2") + + r_null <- get_integrated_weight(weights_dup, residual_correlation = NULL) + expect_equal(sum(r_null), 1, tolerance = 1e-10) + expect_true(all(is.finite(r_null))) +}) + +test_that("get_integrated_weight: duplicate collapse + use_entropy + RC composes", { + set.seed(91) + w1a <- c(0.5, 0.3, 0.1, 0.1) + w1b <- c(0.45, 0.35, 0.1, 0.1) + w2 <- c(0.25, 0.25, 0.25, 0.25) + + weights_dup <- cbind(w1a, w1b, w2) + colnames(weights_dup) <- c("outcome1", "outcome1", "outcome2") + + RC <- matrix(c(1, 0.3, 0.3, 1), nrow = 2) + + expect_error( + res <- get_integrated_weight( + weights_dup, + use_entropy = TRUE, + residual_correlation = RC + ), + NA + ) + expect_equal(sum(res), 1, tolerance = 1e-10) + expect_true(all(is.finite(res))) +}) + + +# ============================================================================ +# (2) End-to-end: use_entropy wired through colocboost() +# ============================================================================ + +test_that("colocboost runs with use_entropy = TRUE and returns a valid object", { + td <- generate_entropy_test_data() + + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + use_entropy = TRUE, M = 20, output_level = 2 + ) + })) + + expect_s3_class(res, "colocboost") + expect_equal(res$data_info$n_outcomes, 3) + expect_equal(length(res$data_info$variables), ncol(td$X)) + if (!is.null(res$vcp)) { + expect_true(all(res$vcp >= 0 & res$vcp <= 1 + 1e-10)) + } +}) + +test_that("colocboost use_entropy default is FALSE and preserves legacy behavior", { + td <- generate_entropy_test_data() + + suppressWarnings(suppressMessages({ + res_default <- colocboost( + X = td$X_list, Y = td$Y_list, M = 20, output_level = 2 + ) + res_explicit_false <- colocboost( + X = td$X_list, Y = td$Y_list, + use_entropy = 
FALSE, M = 20, output_level = 2 + ) + })) + + expect_equal(res_default$vcp, res_explicit_false$vcp) + expect_equal( + names(res_default$cos_details$cos$cos_index), + names(res_explicit_false$cos_details$cos$cos_index) + ) +}) + +test_that("colocboost use_entropy=TRUE can yield different VCP than FALSE under heterogeneous effects", { + td <- generate_entropy_test_data() + + suppressWarnings(suppressMessages({ + res_F <- colocboost(X = td$X_list, Y = td$Y_list, + use_entropy = FALSE, M = 30, output_level = 2) + res_T <- colocboost(X = td$X_list, Y = td$Y_list, + use_entropy = TRUE, M = 30, output_level = 2) + })) + + if (!is.null(res_F$vcp) && !is.null(res_T$vcp) && + length(res_F$vcp) == length(res_T$vcp)) { + expect_false(isTRUE(all.equal(as.numeric(res_F$vcp), + as.numeric(res_T$vcp), + tolerance = 1e-12))) + } else { + succeed("One run returned no colocalization; entropy-vs-uniform comparison skipped") + } +}) + +test_that("colocboost use_entropy flag is propagated into cb_model_para", { + td <- generate_entropy_test_data() + + suppressWarnings(suppressMessages({ + res <- colocboost(X = td$X_list, Y = td$Y_list, + use_entropy = TRUE, M = 10, output_level = 3) + })) + + expect_true("diagnostic_details" %in% names(res)) + expect_equal(res$diagnostic_details$cb_model_para$use_entropy, TRUE) +}) + +test_that("get_robust_colocalization accepts and propagates use_entropy", { + td <- generate_entropy_test_data() + + suppressWarnings(suppressMessages({ + res <- colocboost(X = td$X_list, Y = td$Y_list, + use_entropy = TRUE, M = 20, output_level = 2) + })) + + expect_error( + suppressMessages( + filtered <- get_robust_colocalization( + res, + cos_npc_cutoff = 0.2, + npc_outcome_cutoff = 0.1, + use_entropy = TRUE + ) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + + +# ============================================================================ +# (3) End-to-end: residual_correlation wired through colocboost() +# 
============================================================================ + +test_that("colocboost accepts residual_correlation without error", { + td <- generate_entropy_test_data(L = 2, seed = 7) + resid_cor <- matrix(c(1, 0.4, 0.4, 1), nrow = 2) + + expect_error( + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = resid_cor, + M = 15, output_level = 2 + ) + })), + NA + ) + expect_s3_class(res, "colocboost") +}) + +test_that("colocboost_init_data no longer creates a 'dependency' field on cb_data entries", { + # Structural guard: the per-outcome `dependency` scalar was the mechanism + # that used to leak residual_correlation into the fitting loop. It must + # stay deleted even though RC is now consumed at assembly time. + td <- generate_entropy_test_data(L = 2, seed = 11) + Y_list_mat <- lapply(td$Y_list, as.matrix) + + cb_data <- colocboost_init_data( + X = td$X_list, Y = Y_list_mat, + dict_YX = c(1, 2), + Z = NULL, LD = NULL, X_ref = NULL, ref_label = NULL, + N_sumstat = NULL, dict_sumstatLD = NULL, + Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(td$X_list, colnames) + ) + + expect_s3_class(cb_data, "colocboost") + for (i in seq_along(cb_data$data)) { + expect_false("dependency" %in% names(cb_data$data[[i]]), + info = paste0("data element ", i, " still has a 'dependency' field")) + } +}) + +test_that("residual_correlation does NOT affect the core boosting loop (only assembly)", { + # The refactor keeps residual_correlation out of the fitting loop, so the + # fitted per-trait model objects (beta, profile_loglike_each, z) must be + # IDENTICAL with and without RC. Only the downstream integrated VCP is + # allowed to differ. 
+ td <- generate_entropy_test_data(L = 2, seed = 13) + resid_cor <- matrix(c(1, 0.6, 0.6, 1), nrow = 2) + + suppressWarnings(suppressMessages({ + res_no_rc <- colocboost( + X = td$X_list, Y = td$Y_list, + M = 20, output_level = 3 + ) + res_rc <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = resid_cor, + M = 20, output_level = 3 + ) + })) + + cb_no <- res_no_rc$diagnostic_details$cb_model + cb_rc <- res_rc$diagnostic_details$cb_model + expect_equal(length(cb_no), length(cb_rc)) + + for (i in seq_along(cb_no)) { + expect_equal(cb_no[[i]]$beta, cb_rc[[i]]$beta, tolerance = 1e-12, + info = paste0("beta differs at outcome ", i)) + expect_equal(cb_no[[i]]$profile_loglike_each, + cb_rc[[i]]$profile_loglike_each, + tolerance = 1e-12, + info = paste0("profile_loglike_each differs at outcome ", i)) + expect_equal(cb_no[[i]]$z, cb_rc[[i]]$z, tolerance = 1e-12, + info = paste0("z differs at outcome ", i)) + } +}) + +test_that("identity residual_correlation matches NULL residual_correlation end-to-end", { + # RC = I should produce VCP identical to RC = NULL because the GLS alpha + # collapses to uniform 1/L for an identity matrix. 
+ td <- generate_entropy_test_data(L = 3, seed = 29) + + suppressWarnings(suppressMessages({ + res_null <- colocboost( + X = td$X_list, Y = td$Y_list, + M = 20, output_level = 2 + ) + res_I <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = diag(3), + M = 20, output_level = 2 + ) + })) + + if (!is.null(res_null$vcp) && !is.null(res_I$vcp)) { + expect_equal(as.numeric(res_null$vcp), as.numeric(res_I$vcp), + tolerance = 1e-10) + } + expect_equal( + names(res_null$cos_details$cos$cos_index), + names(res_I$cos_details$cos$cos_index) + ) +}) + +test_that("residual_correlation is stored on cb_model_para via diagnostic_details", { + td <- generate_entropy_test_data(L = 2, seed = 31) + RC <- matrix(c(1, 0.35, 0.35, 1), nrow = 2) + + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = RC, + M = 10, output_level = 3 + ) + })) + + expect_true("diagnostic_details" %in% names(res)) + stored <- res$diagnostic_details$cb_model_para$residual_correlation + expect_false(is.null(stored)) + expect_equal(stored, RC) +}) + +test_that("residual_correlation composes with use_entropy = TRUE", { + td <- generate_entropy_test_data(L = 3, seed = 19) + resid_cor <- diag(3) + resid_cor[1, 2] <- resid_cor[2, 1] <- 0.3 + + expect_error( + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = resid_cor, + use_entropy = TRUE, + M = 15, output_level = 2 + ) + })), + NA + ) + expect_s3_class(res, "colocboost") + expect_equal(res$data_info$n_outcomes, 3) +}) + +test_that("get_robust_colocalization accepts and propagates residual_correlation", { + td <- generate_entropy_test_data(L = 3, seed = 37) + RC <- matrix(0.2, 3, 3); diag(RC) <- 1 + + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = RC, + M = 20, output_level = 2 + ) + })) + + expect_error( + suppressMessages( + filtered <- 
get_robust_colocalization( + res, + cos_npc_cutoff = 0.2, + npc_outcome_cutoff = 0.1, + residual_correlation = RC + ) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + +test_that("residual_correlation end-to-end runs with non-identity RC and L=4", { + td <- generate_entropy_test_data(L = 4, seed = 41) + RC <- matrix(0.3, 4, 4); diag(RC) <- 1 + + suppressWarnings(suppressMessages({ + res <- colocboost( + X = td$X_list, Y = td$Y_list, + residual_correlation = RC, + use_entropy = TRUE, + M = 20, output_level = 3 + ) + })) + + expect_s3_class(res, "colocboost") + expect_equal(res$data_info$n_outcomes, 4) + expect_equal(res$diagnostic_details$cb_model_para$residual_correlation, RC) +}) \ No newline at end of file diff --git a/tests/testthat/test_Xref.R b/tests/testthat/test_Xref.R new file mode 100644 index 0000000..5d30abc --- /dev/null +++ b/tests/testthat/test_Xref.R @@ -0,0 +1,758 @@ +library(testthat) + +# ============================================================================ +# Shared test data generators for X_ref tests +# ============================================================================ + +generate_xref_test_data <- function(n = 200, n_ref = 50, p = 30, L = 2, seed = 42) { + set.seed(seed) + + # Generate X with LD structure (for individual-level truth) + sigma <- matrix(0, p, p) + for (i in 1:p) { + for (j in 1:p) { + sigma[i, j] <- 0.9^abs(i - j) + } + } + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + + # Generate X_ref (reference panel, separate samples) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + colnames(X_ref) <- paste0("SNP", 1:p) + + # Generate true effects + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.7 # SNP5 affects trait 1 + true_beta[5, 2] <- 0.6 # SNP5 also affects trait 2 (colocalized) + true_beta[20, 2] <- 0.5 # SNP20 only affects trait 2 + + # Generate Y with some noise + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + # 
Calculate summary statistics + beta <- matrix(0, p, L) + se <- matrix(0, p, L) + for (i in 1:L) { + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { + beta[j, i] <- fit[2, 1] + se[j, i] <- fit[2, 2] + } + } + } + + sumstat_list <- lapply(1:L, function(i) { + data.frame( + beta = beta[, i], + sebeta = se[, i], + n = n, + variant = colnames(X) + ) + }) + + LD <- cor(X) + + list( + X = X, + Y = Y, + X_ref = X_ref, + LD = LD, + sumstat = sumstat_list, + true_beta = true_beta + ) +} + + +# ============================================================================ +# Test 1: colocboost rejects LD + X_ref together +# ============================================================================ + +test_that("colocboost rejects when both LD and X_ref are provided", { + test_data <- generate_xref_test_data() + + expect_warning( + result <- colocboost( + sumstat = test_data$sumstat, + LD = test_data$LD, + X_ref = test_data$X_ref, + M = 1 + ), + "Provide either LD or X_ref, not both" + ) + expect_null(result) +}) + + +# ============================================================================ +# Test 2: X_ref with N_ref >= P precomputes LD +# ============================================================================ + +test_that("X_ref with N_ref >= P precomputes LD and produces valid results", { + # N_ref = 50, P = 30 => N_ref >= P, should precompute LD + test_data <- generate_xref_test_data(n_ref = 50, p = 30) + + expect_message( + suppressWarnings({ + result_xref <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + }), + "N_ref >= P: precomputing LD from X_ref" + ) + + expect_s3_class(result_xref, "colocboost") + expect_equal(result_xref$data_info$n_outcomes, 2) + expect_equal(length(result_xref$data_info$variables), 30) +}) + + +# ============================================================================ +# Test 3: X_ref with N_ref < P keeps X_ref for on-the-fly computation +# 
============================================================================
+
+test_that("X_ref with N_ref < P runs correctly without precomputing LD", {
+  # N_ref = 20, P = 30 => N_ref < P, should keep X_ref
+  test_data <- generate_xref_test_data(n_ref = 20, p = 30)
+
+  suppressWarnings(suppressMessages({
+    result_xref <- colocboost(
+      sumstat = test_data$sumstat,
+      X_ref = test_data$X_ref,
+      M = 10,
+      output_level = 2
+    )
+  }))
+
+  expect_s3_class(result_xref, "colocboost")
+  expect_equal(result_xref$data_info$n_outcomes, 2)
+  expect_equal(length(result_xref$data_info$variables), 30)
+})
+
+
+# ============================================================================
+# Test 4: X_ref results are consistent with LD results (N_ref >= P case)
+# ============================================================================
+
+test_that("X_ref (N_ref >= P) produces results consistent with precomputed LD", {
+  test_data <- generate_xref_test_data(n = 200, n_ref = 200, p = 20, seed = 99)
+
+  # Run with precomputed LD
+  suppressWarnings(suppressMessages({
+    result_ld <- colocboost(
+      sumstat = test_data$sumstat,
+      LD = test_data$LD,
+      M = 50,
+      output_level = 2
+    )
+  }))
+
+  # Run with X_ref (same samples as LD source, so N_ref >= P => precomputes LD)
+  suppressWarnings(suppressMessages({
+    result_xref <- colocboost(
+      sumstat = test_data$sumstat,
+      X_ref = test_data$X_ref,
+      M = 50,
+      output_level = 2
+    )
+  }))
+
+  # Both should produce valid colocboost objects
+  expect_s3_class(result_ld, "colocboost")
+  expect_s3_class(result_xref, "colocboost")
+
+  # Same number of outcomes and variables
+  expect_equal(result_ld$data_info$n_outcomes, result_xref$data_info$n_outcomes)
+  expect_equal(length(result_ld$data_info$variables), length(result_xref$data_info$variables))
+})
+
+
+# ============================================================================
+# Test 5: X_ref with N_ref < P and one iteration (one_causal mode)
+# 
============================================================================ + +test_that("X_ref with N_ref < P works in one-causal mode (M=1 with LD)", { + test_data <- generate_xref_test_data(n_ref = 15, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 1, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$model_info$model_coveraged, "one_causal") + expect_equal(result$model_info$n_updates, 1) +}) + + +# ============================================================================ +# Test 6: X_ref with 3 outcomes +# ============================================================================ + +test_that("X_ref works with 3 outcomes", { + set.seed(123) + n <- 200 + n_ref <- 25 + p <- 25 + L <- 3 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + colnames(X_ref) <- paste0("SNP", 1:p) + + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.6 + true_beta[5, 2] <- 0.5 + true_beta[15, 3] <- 0.7 + + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + beta <- se <- matrix(0, p, L) + for (i in 1:L) { + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { beta[j, i] <- fit[2, 1]; se[j, i] <- fit[2, 2] } + } + } + sumstat <- lapply(1:L, function(i) data.frame(beta = beta[, i], sebeta = se[, i], n = n, variant = colnames(X))) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + X_ref = X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 3) +}) + + +# ============================================================================ +# Test 7: X_ref with focal outcome +# ============================================================================ + 
+test_that("X_ref works with focal outcome", {
+  test_data <- generate_xref_test_data(n_ref = 20, p = 30)
+
+  suppressWarnings(suppressMessages({
+    result <- colocboost(
+      sumstat = test_data$sumstat,
+      X_ref = test_data$X_ref,
+      focal_outcome_idx = 1,
+      M = 10,
+      output_level = 2
+    )
+  }))
+
+  expect_s3_class(result, "colocboost")
+  expect_equal(result$data_info$outcome_info$is_focal[1], TRUE)
+  expect_equal(result$data_info$outcome_info$is_focal[2], FALSE)
+})
+
+
+# ============================================================================
+# Test 8: X_ref with dict_sumstatLD mapping
+# ============================================================================
+
+test_that("X_ref works with dict_sumstatLD mapping for shared reference panel", {
+  test_data <- generate_xref_test_data(n_ref = 20, p = 30)
+
+  # Both sumstats share the same X_ref
+  # byrow = TRUE so rows are (sumstat, panel) pairs: first column is 1:L,
+  # second column maps both sumstats to reference panel 1 (per the
+  # dict_sumstatLD contract documented in ?colocboost).
+  suppressWarnings(suppressMessages({
+    result <- colocboost(
+      sumstat = test_data$sumstat,
+      X_ref = list(test_data$X_ref),
+      dict_sumstatLD = matrix(c(1, 1, 2, 1), ncol = 2, byrow = TRUE),
+      M = 10,
+      output_level = 2
+    )
+  }))
+
+  expect_s3_class(result, "colocboost")
+  expect_equal(result$data_info$n_outcomes, 2)
+})
+
+
+# ============================================================================
+# Test 9: X_ref with multiple reference panels
+# ============================================================================
+
+test_that("X_ref works with multiple reference panels", {
+  test_data <- generate_xref_test_data(n_ref = 20, p = 30)
+
+  # Two separate X_ref panels (identical for testing)
+  suppressWarnings(suppressMessages({
+    result <- colocboost(
+      sumstat = test_data$sumstat,
+      X_ref = list(test_data$X_ref, test_data$X_ref),
+      M = 10,
+      output_level = 2
+    )
+  }))
+
+  expect_s3_class(result, "colocboost")
+  expect_equal(result$data_info$n_outcomes, 2)
+})
+
+
+# ============================================================================
+# Test 10: X_ref with missing variants (partial overlap)
+# 
============================================================================ + +test_that("X_ref handles partial variant overlap with sumstat", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # X_ref has fewer variants than sumstat + X_ref_partial <- test_data$X_ref[, 1:25] # Only first 25 of 30 SNPs + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = X_ref_partial, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + # Should have 25 variables (intersection) + expect_equal(length(result$data_info$variables), 25) +}) + + +# ============================================================================ +# Test 11: compute_XtX_product correctness +# ============================================================================ + +test_that("compute_XtX_product produces correct results for all ref_label types", { + # Access internal function + compute_XtX_product <- get("compute_XtX_product", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref_raw <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + + # Standardize X_ref (as done in colocboost_validate_input_data) + X_ref <- Rfast::standardise(X_ref_raw, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + # Precompute LD from X_ref + LD <- get_cormat(X_ref_raw) + + beta <- rnorm(p) + + # Test LD case + result_ld <- compute_XtX_product(LD, beta, ref_label = "LD") + expected_ld <- as.vector(LD %*% beta) + expect_equal(result_ld, expected_ld) + + # Test X_ref case + result_xref <- compute_XtX_product(X_ref, beta, ref_label = "X_ref") + # Should approximate LD %*% beta + # X_ref' * X_ref / (N_ref - 1) ≈ LD + expected_xref <- as.vector(crossprod(X_ref, X_ref %*% beta) / (n_ref - 1)) + expect_equal(result_xref, expected_xref) + + # Test No_ref case + result_noref <- compute_XtX_product(1, beta, ref_label = "No_ref") + expect_equal(result_noref, beta) + + # 
X_ref and LD results should be similar (not exact due to finite N_ref) + expect_equal(result_xref, expected_ld, tolerance = 0.3) +}) + + +# ============================================================================ +# Test 12: X_ref validation in colocboost_validate_input_data +# ============================================================================ + +test_that("colocboost_validate_input_data correctly handles X_ref", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Test N_ref < P case + validated <- colocboost_validate_input_data( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref + ) + + expect_equal(validated$ref_label, "X_ref") + expect_true(!is.null(validated$X_ref)) + expect_null(validated$LD) + + # Test N_ref >= P case + test_data_large <- generate_xref_test_data(n_ref = 50, p = 30) + + validated_large <- suppressMessages( + colocboost_validate_input_data( + sumstat = test_data_large$sumstat, + X_ref = test_data_large$X_ref + ) + ) + + expect_equal(validated_large$ref_label, "LD") + expect_null(validated_large$X_ref) + expect_true(!is.null(validated_large$LD)) +}) + + +# ============================================================================ +# Test 13: X_ref with output_level 3 (diagnostic details) +# ============================================================================ + +test_that("X_ref works with output_level 3 and has correct ref_label in cb_data", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 3 + ) + })) + + expect_s3_class(result, "colocboost") + expect_true("diagnostic_details" %in% names(result)) + + # Check that ref_label is correctly stored in the model's cb_data + cb_data <- result$diagnostic_details$cb_data + for (i in seq_along(cb_data$data)) { + ref_label_i <- cb_data$data[[i]]$ref_label + expect_true(!is.null(ref_label_i), + info = 
paste("ref_label missing for data element", i)) + expect_true(ref_label_i %in% c("X_ref", "LD", "No_ref", "individual"), + info = paste("Invalid ref_label for data element", i)) + } +}) + + +# ============================================================================ +# Test 14: X_ref with XtX_beta_cache in diagnostic output +# ============================================================================ + +test_that("X_ref model has XtX_beta_cache in diagnostic output", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 3 + ) + })) + + cb_model <- result$diagnostic_details$cb_model + for (i in seq_along(cb_model)) { + expect_true("XtX_beta_cache" %in% names(cb_model[[i]]), + info = paste("XtX_beta_cache missing for outcome", i)) + } +}) + + +# ============================================================================ +# Test 15: X_ref purity functions work correctly +# ============================================================================ + +test_that("purity functions dispatch correctly for X_ref", { + get_purity <- get("get_purity", envir = asNamespace("colocboost")) + get_between_purity <- get("get_between_purity", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + X_ref <- Rfast::standardise(X_ref, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + LD <- cor(MASS::mvrnorm(200, rep(0, p), sigma)) + + # Test get_purity with X_ref + purity_xref <- get_purity(c(1, 2, 3), Xcorr = X_ref, ref_label = "X_ref") + expect_length(purity_xref, 3) + expect_true(all(purity_xref >= 0 & purity_xref <= 1)) + + # Test get_purity with LD + purity_ld <- get_purity(c(1, 2, 3), Xcorr = LD, ref_label = "LD") + expect_length(purity_ld, 3) + expect_true(all(purity_ld >= 0 & purity_ld <= 1)) + + 
# Test get_purity with No_ref + purity_noref <- get_purity(c(1, 2, 3), Xcorr = 1, ref_label = "No_ref") + expect_equal(purity_noref, c(0, 0, 0)) + + # Results should be similar between X_ref and LD (not exact due to different N) + expect_equal(purity_xref[1], purity_ld[1], tolerance = 0.3) + + # Test get_between_purity with X_ref + between_xref <- get_between_purity(c(1, 2), c(3, 4), Xcorr = X_ref, + miss_idx = NULL, P = p, ref_label = "X_ref") + expect_length(between_xref, 3) + expect_true(all(between_xref >= 0 & between_xref <= 1)) + + # Test get_between_purity with LD + between_ld <- get_between_purity(c(1, 2), c(3, 4), Xcorr = LD, + miss_idx = NULL, P = p, ref_label = "LD") + expect_length(between_ld, 3) + + # Test get_between_purity with No_ref + between_noref <- get_between_purity(c(1, 2), c(3, 4), Xcorr = 1, + miss_idx = NULL, P = p, ref_label = "No_ref") + expect_equal(between_noref, c(0, 0, 0)) +}) + + +# ============================================================================ +# Test 16: X_ref with get_robust_colocalization post-processing +# ============================================================================ + +test_that("get_robust_colocalization works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + # Should not error + expect_error( + suppressMessages( + filtered <- get_robust_colocalization(result, cos_npc_cutoff = 0.2, npc_outcome_cutoff = 0.1) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + + +# ============================================================================ +# Test 17: X_ref with get_robust_ucos post-processing +# ============================================================================ + +test_that("get_robust_ucos works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 500, p = 30) + + 
suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + skip_if(is.null(result$ucos_details), "No ucos detected") + + expect_error( + suppressMessages( + filtered <- get_robust_ucos(result, npc_outcome_cutoff = 0.1) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + + +# ============================================================================ +# Test 18: X_ref plotting works +# ============================================================================ + +test_that("colocboost_plot works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_error(suppressWarnings(colocboost_plot(result)), NA) + expect_error(suppressWarnings(colocboost_plot(result, y = "vcp")), NA) +}) + + +# ============================================================================ +# Test 19: X_ref with get_colocboost_summary +# ============================================================================ + +test_that("get_colocboost_summary works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + summary1 <- get_colocboost_summary(result, summary_level = 1) + expect_type(summary1, "list") + + summary2 <- get_colocboost_summary(result, summary_level = 2) + expect_type(summary2, "list") +}) + + +# ============================================================================ +# Test 20: X_ref with HyPrColoc format input +# ============================================================================ + +test_that("X_ref works with effect_est and effect_se input", { + test_data <- 
generate_xref_test_data(n_ref = 20, p = 30) + + beta <- se <- matrix(0, 30, 2) + for (i in 1:2) { + for (j in 1:30) { + fit <- summary(lm(test_data$Y[, i] ~ test_data$X[, j]))$coef + if (nrow(fit) == 2) { beta[j, i] <- fit[2, 1]; se[j, i] <- fit[2, 2] } + } + } + rownames(beta) <- rownames(se) <- colnames(test_data$X) + + suppressWarnings(suppressMessages({ + result <- colocboost( + effect_est = beta, + effect_se = se, + effect_n = rep(200, 2), + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + + +# ============================================================================ +# Test 21: LD_jk functions dispatch correctly for X_ref +# ============================================================================ + +test_that("get_LD_jk and get_LD_jk1_jk2 dispatch correctly for X_ref", { + get_LD_jk <- get("get_LD_jk", envir = asNamespace("colocboost")) + get_LD_jk1_jk2 <- get("get_LD_jk1_jk2", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + X_ref <- Rfast::standardise(X_ref, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + LD <- cor(MASS::mvrnorm(200, rep(0, p), sigma)) + + remain_idx <- 1:p + + # get_LD_jk with X_ref + ld_jk_xref <- get_LD_jk(5, XtX = X_ref, remain_idx = remain_idx, P = p, ref_label = "X_ref") + expect_length(ld_jk_xref, p) + expect_equal(ld_jk_xref[5], 1.0, tolerance = 0.01) # Self-correlation ~ 1 + + # get_LD_jk with LD + ld_jk_ld <- get_LD_jk(5, XtX = LD, remain_idx = remain_idx, P = p, ref_label = "LD") + expect_length(ld_jk_ld, p) + expect_equal(ld_jk_ld[5], 1.0, tolerance = 0.01) + + # get_LD_jk with No_ref + ld_jk_noref <- get_LD_jk(5, XtX = 1, remain_idx = remain_idx, P = p, ref_label = "No_ref") + expect_true(all(ld_jk_noref[remain_idx] == 1)) + + # get_LD_jk1_jk2 with X_ref + ld_pair_xref <- 
get_LD_jk1_jk2(1, 2, XtX = X_ref, remain_jk = remain_idx, ref_label = "X_ref") + expect_true(abs(ld_pair_xref) <= 1) + expect_true(abs(ld_pair_xref) > 0.5) # Adjacent SNPs with 0.9 LD should be high + + # get_LD_jk1_jk2 with LD + ld_pair_ld <- get_LD_jk1_jk2(1, 2, XtX = LD, remain_jk = remain_idx, ref_label = "LD") + expect_true(abs(ld_pair_ld) <= 1) + + # get_LD_jk1_jk2 with No_ref + ld_pair_noref <- get_LD_jk1_jk2(1, 2, XtX = 1, remain_jk = remain_idx, ref_label = "No_ref") + expect_equal(ld_pair_noref, 0) +}) + + +# ============================================================================ +# Test 22: ref_label is never NULL in internal cb_data after processing +# ============================================================================ + +test_that("ref_label is always explicitly set in cb_data, never NULL", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Build cb_data directly to inspect ref_label + sumstat <- test_data$sumstat + Z_list <- lapply(sumstat, function(s) s$beta / s$sebeta) + + # X_ref case + cb_data_xref <- colocboost_init_data( + X = NULL, Y = NULL, dict_YX = NULL, + Z = Z_list, LD = NULL, X_ref = list(test_data$X_ref), ref_label = "X_ref", + N_sumstat = lapply(sumstat, function(s) s$n[1]), + dict_sumstatLD = c(1, 1), Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(sumstat, function(s) s$variant) + ) + for (i in seq_along(cb_data_xref$data)) { + expect_equal(cb_data_xref$data[[i]]$ref_label, "X_ref") + } + + # LD case + cb_data_ld <- colocboost_init_data( + X = NULL, Y = NULL, dict_YX = NULL, + Z = Z_list, LD = list(test_data$LD), X_ref = NULL, ref_label = "LD", + N_sumstat = lapply(sumstat, function(s) s$n[1]), + dict_sumstatLD = c(1, 1), Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(sumstat, function(s) s$variant) + ) + for (i in seq_along(cb_data_ld$data)) { + expect_equal(cb_data_ld$data[[i]]$ref_label, "LD") + } + + # Individual data case + Y_list <- lapply(1:2, function(l) as.matrix(test_data$Y[, l])) 
+ X_list <- list(test_data$X, test_data$X) + cb_data_ind <- colocboost_init_data( + X = X_list, Y = Y_list, dict_YX = c(1, 2), + Z = NULL, LD = NULL, X_ref = NULL, ref_label = NULL, + N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(X_list, colnames) + ) + for (i in seq_along(cb_data_ind$data)) { + expect_equal(cb_data_ind$data[[i]]$ref_label, "individual") + } +}) \ No newline at end of file diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 9c2c9d0..1b23c0f 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -790,6 +790,8 @@ generate_test_cb_obj_with_ucos <- function(n = 100, p = 20, L = 2, seed = 42) { dict_YX = c(1, 2), Z = NULL, LD = NULL, + X_ref = NULL, + ref_label = NULL, N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, @@ -935,6 +937,8 @@ test_that("get_ucos_evidence handles individual-level data", { dict_YX = c(1, 2), Z = NULL, LD = NULL, + X_ref = NULL, + ref_label = NULL, N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, @@ -1040,6 +1044,8 @@ test_that("get_ucos_evidence handles summary statistics data", { dict_YX = NULL, Z = Z_list, LD = list(LD_matrix), + X_ref = NULL, + ref_label = "LD", N_sumstat = lapply(sumstat_list, function(s) s$n[1]), dict_sumstatLD = c(1, 1), Var_y = NULL, diff --git a/tests/testthat/test_model.R b/tests/testthat/test_model.R index df998e8..fa28c1d 100644 --- a/tests/testthat/test_model.R +++ b/tests/testthat/test_model.R @@ -237,6 +237,7 @@ test_that("colocboost correctly maps focal outcome to keep_variables with dict_k dict_YX = dict_YX, Z = lapply(sumstat_list, function(s) s$beta / s$sebeta), # z-scores LD = list(LD_superset), + ref_label = "LD", N_sumstat = lapply(sumstat_list, function(s) s$n[1]), dict_sumstatLD = dict_sumstatLD, Var_y = NULL, diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index fb1b999..e62fcf9 100644 --- a/tests/testthat/test_utils.R +++ 
b/tests/testthat/test_utils.R @@ -571,6 +571,7 @@ test_that("colocboost_init_data handles complex dictionary mappings", { dict_YX = dict_YX, Z = Z_list, LD = LD_list, + ref_label = "LD", N_sumstat = N_sumstat, dict_sumstatLD = dict_sumstatLD, Var_y = NULL, diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index d1deba3..9a7aca5 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -59,6 +59,8 @@ head(Sumstat_5traits$sumstat[[1]]) - `LD` is a matrix of LD. This matrix does not need to contain the exact same variants as in `sumstat`, but the `colnames` and `rownames` of `LD` should include the `variant` names for proper alignment. +- `X_ref` (alternative to `LD`) is a reference panel genotype matrix (N_ref x P). When the number of variants is large, passing `X_ref` directly avoids storing the full P x P LD matrix. See [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Statistics_Colocalization.html) for details. + The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. 
The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html)): diff --git a/vignettes/Partial_Overlap_Variants.Rmd b/vignettes/Partial_Overlap_Variants.Rmd index e179fa4..b74d7f3 100644 --- a/vignettes/Partial_Overlap_Variants.Rmd +++ b/vignettes/Partial_Overlap_Variants.Rmd @@ -22,7 +22,7 @@ This vignette demonstrates how ColocBoost handles partial overlapping variants a library(colocboost) ``` -![](../man/figures/missing_representation.png) +![Illustration of partial overlapping variants across traits](../man/figures/missing_representation.png) diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 3177a68..2dc127a 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -177,7 +177,31 @@ res$cos_details$cos$cos_index ``` -## 3.4. HyPrColoc compatible format: effect size and standard error matrices +## 3.4. Using a reference panel genotype matrix (X_ref) instead of LD + +When the number of variants P is very large, storing the full P x P LD matrix may be infeasible. +If you have the reference panel genotype matrix from which LD would be computed, you can pass it directly via `X_ref`. +ColocBoost will compute LD products on the fly, avoiding the P x P memory cost. + +This is beneficial when the reference panel sample size (N_ref) is less than the number of variants (P). +When N_ref >= P, ColocBoost automatically precomputes the LD matrix internally for efficiency. + +Provide either `LD` or `X_ref`, not both. The `dict_sumstatLD` dictionary works with `X_ref` the same way as with `LD`. 
+ +```{r x-ref-example} +# Use genotype matrix directly as reference panel +data("Ind_5traits") +X_ref <- Ind_5traits$X[[1]] + +# Run colocboost with X_ref instead of LD +res <- colocboost(sumstat = Sumstat_5traits$sumstat, X_ref = X_ref) + +# Identified CoS +res$cos_details$cos$cos_index +``` + + +## 3.5. HyPrColoc compatible format: effect size and standard error matrices ColocBoost also provides a flexibility to use HyPrColoc compatible format for summary statistics with and without LD matrix.