diff --git a/NAMESPACE b/NAMESPACE index 979ad852..fbe51414 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -54,9 +54,9 @@ export(harmonize_twas) export(invert_minmax_scaling) export(lasso_weights) export(lassosum_rss) -export(ld_loader) export(lassosum_rss_weights) export(lbf_to_alpha) +export(ld_loader) export(load_LD_matrix) export(load_genotype_region) export(load_multicontext_sumstats) @@ -88,6 +88,8 @@ export(multigene_udr) export(multivariate_analysis_pipeline) export(mvsusie_weights) export(normalize_variant_id) +export(otters_association) +export(otters_weights) export(parse_cs_corr) export(parse_dentist_output) export(parse_region) diff --git a/R/misc.R b/R/misc.R index bf4ec16f..a02f0b4f 100644 --- a/R/misc.R +++ b/R/misc.R @@ -14,13 +14,11 @@ pval_acat <- function(pvals) { if (length(pvals) == 1) { return(pvals[1]) } - stat <- 0.00 - pval_min <- 1.00 - - stat <- sum(qcauchy(pvals)) - pval_min <- min(pval_min, min(qcauchy(pvals))) - - return(pcauchy(stat / length(pvals), lower.tail = FALSE)) + # ACAT statistic: T = mean(tan(pi*(0.5 - p_i))) + # Liu & Xie (2020) "Cauchy combination test" + # For very small p, tan(pi*(0.5-p)) ~ 1/(pi*p), so T is large and positive. + stat <- mean(tan(pi * (0.5 - pvals))) + return(pcauchy(stat, lower.tail = FALSE)) } pval_hmp <- function(pvals) { diff --git a/R/otters.R b/R/otters.R new file mode 100644 index 00000000..404db65b --- /dev/null +++ b/R/otters.R @@ -0,0 +1,234 @@ +#' Train eQTL weights using multiple RSS methods (OTTERS Stage I) +#' +#' Implements the training stage of the OTTERS framework (Omnibus Transcriptome +#' Test using Expression Reference Summary data, Zhang et al. 2024). Trains +#' eQTL effect size weights for a gene region using multiple summary-statistics-based +#' methods in parallel, enabling downstream omnibus TWAS testing. +#' +#' Methods are dispatched dynamically via \code{do.call(paste0(method, "_weights"), ...)}, +#' so any function following the \code{*_weights(stat, LD, ...)} convention can be used +#' (e.g., \code{lassosum_rss_weights}, \code{prs_cs_weights}, \code{sdpr_weights}, +#' \code{mr_ash_rss_weights}). +#' +#' P+T (pruning and thresholding) is handled internally: for each threshold, SNPs with +#' eQTL p-value below the threshold are selected, and their marginal z-scores (scaled +#' to correlation units: \code{z / sqrt(n)}) are used as weights. +#' +#' @param sumstats A data.frame of eQTL summary statistics. Must contain column \code{z} +#' (z-scores). If \code{z} is absent but \code{beta} and \code{se} are present, +#' z-scores are computed as \code{beta / se}. +#' @param LD LD correlation matrix R for the gene region (single matrix, not a list). +#' Should have row/column names matching variant identifiers if variant alignment +#' is desired. +#' @param n eQTL study sample size (scalar). +#' @param methods Named list of RSS methods and their extra arguments. Each element +#' name must correspond to a \code{*_weights} function in pecotmr (without the +#' \code{_weights} suffix). Defaults match the original OTTERS pipeline +#' (Zhang et al. 2024): +#' \itemize{ +#' \item \code{lassosum_rss}: s grid = c(0.2, 0.5, 0.9, 1.0), lambda from +#' 0.0001 to 0.1 (20 values on log scale) +#' \item \code{prs_cs}: phi = 1e-4 (fixed, not learned), 1000 iterations, +#' 500 burn-in, thin = 5 +#' \item \code{sdpr}: 1000 iterations, 200 burn-in, thin = 1 (no thinning) +#' } +#' To add learners (e.g., \code{mr_ash_rss}), simply append to this list. +#' @param p_thresholds Numeric vector of p-value thresholds for P+T. Set to +#' \code{NULL} to skip P+T. Default: \code{c(0.001, 0.05)}. +#' +#' @return A named list of weight vectors (one per method). Each vector has length +#' equal to \code{nrow(sumstats)}. P+T results are named \code{PT_}. +#' +#' @examples +#' set.seed(42) +#' n <- 500; p <- 20 +#' z <- rnorm(p, sd = 2) +#' R <- diag(p) +#' sumstats <- data.frame(z = z) +#' weights <- otters_weights(sumstats, R, n, +#' methods = list(lassosum_rss = list()), +#' p_thresholds = c(0.05)) +#' +#' @export +otters_weights <- function(sumstats, LD, n, + methods = list( + lassosum_rss = list(), + prs_cs = list(phi = 1e-4, + n_iter = 1000, n_burnin = 500, thin = 5), + sdpr = list(iter = 1000, burn = 200, thin = 1, verbose = FALSE) + ), + p_thresholds = c(0.001, 0.05)) { + # Compute z-scores if not present + if (is.null(sumstats$z)) { + if (!is.null(sumstats$beta) && !is.null(sumstats$se)) { + sumstats$z <- sumstats$beta / sumstats$se + } else { + stop("sumstats must have 'z' or ('beta' and 'se') columns.") + } + } + + p <- nrow(sumstats) + z <- sumstats$z + + # Build stat object for _weights() convention + # Safeguard: clamp marginal correlations to (-1, 1) as required by lassosum + # (matches OTTERS shrink_factor logic in PRSmodels/lassosum.R lines 71-77) + b <- z / sqrt(n) + max_abs_b <- max(abs(b)) + if (max_abs_b >= 1) { + b <- b / (max_abs_b / 0.9999) + } + stat <- list(b = b, n = rep(n, p)) + + results <- list() + + # --- P+T (Pruning and Thresholding) --- + if (!is.null(p_thresholds)) { + pvals <- pchisq(z^2, df = 1, lower.tail = FALSE) + for (thr in p_thresholds) { + selected <- pvals < thr + # Weights = clamped marginal correlation (stat$b) for selected SNPs + w <- ifelse(selected, stat$b, 0) + results[[paste0("PT_", thr)]] <- w + } + } + + # --- RSS methods --- + for (method_name in names(methods)) { + fn_name <- paste0(method_name, "_weights") + if (!exists(fn_name, mode = "function")) { + warning(sprintf("Method '%s' not found (looking for function '%s'). Skipping.", + method_name, fn_name)) + next + } + tryCatch({ + w <- do.call(fn_name, c(list(stat = stat, LD = LD), methods[[method_name]])) + results[[method_name]] <- as.numeric(w) + }, error = function(e) { + warning(sprintf("Method '%s' failed: %s", method_name, e$message)) + results[[method_name]] <<- rep(0, p) + }) + } + + results +} + + +#' TWAS association testing with omnibus combination (OTTERS Stage II) +#' +#' Computes per-method TWAS z-scores using the FUSION formula and combines +#' p-values across methods via ACAT (Aggregated Cauchy Association Test) or +#' HMP (Harmonic Mean P-value). +#' +#' The FUSION TWAS statistic (Gusev et al. 2016) is: +#' \deqn{Z_{TWAS} = \frac{w^T z}{\sqrt{w^T R w}}} +#' where \eqn{w} are eQTL weights, \eqn{z} are GWAS z-scores, and \eqn{R} +#' is the LD correlation matrix. +#' +#' @param weights Named list of weight vectors (output from \code{\link{otters_weights}} +#' or any named list of numeric vectors). +#' @param gwas_z Numeric vector of GWAS z-scores, same length and order as the +#' weights vectors. Must be aligned to the same variants and allele orientation +#' as the weights and LD matrix. Use \code{\link{allele_qc}} or +#' \code{\link{rss_basic_qc}} for harmonization before calling this function. +#' @param LD LD correlation matrix R, aligned to the same variants as weights +#' and gwas_z. +#' @param combine_method Method to combine p-values across methods: \code{"acat"} +#' (default) or \code{"hmp"}. +#' +#' @return A data.frame with columns: +#' \describe{ +#' \item{method}{Method name (per-method rows plus a combined row).} +#' \item{twas_z}{TWAS z-score (\code{NA} for combined row).} +#' \item{twas_pval}{TWAS p-value.} +#' \item{n_snps}{Number of non-zero weight SNPs used.} +#' } +#' +#' @examples +#' set.seed(42) +#' p <- 20 +#' gwas_z <- rnorm(p) +#' R <- diag(p) +#' weights <- list(method1 = rnorm(p, sd = 0.01), method2 = rnorm(p, sd = 0.01)) +#' otters_association(weights, gwas_z, R) +#' +#' @export +otters_association <- function(weights, gwas_z, LD, + combine_method = c("acat", "hmp")) { + combine_method <- match.arg(combine_method) + + # Validate dimensions + p <- length(gwas_z) + if (nrow(LD) != p || ncol(LD) != p) { + stop(sprintf("LD dimensions (%d x %d) do not match gwas_z length (%d).", + nrow(LD), ncol(LD), p)) + } + for (nm in names(weights)) { + if (length(weights[[nm]]) != p) { + stop(sprintf("Weight vector '%s' has length %d but gwas_z has length %d.", + nm, length(weights[[nm]]), p)) + } + } + + results <- data.frame( + method = character(), + twas_z = numeric(), + twas_pval = numeric(), + n_snps = integer(), + stringsAsFactors = FALSE + ) + + valid_pvals <- c() + + for (method_name in names(weights)) { + w <- weights[[method_name]] + + # Skip all-zero weights + if (all(w == 0)) { + results <- rbind(results, data.frame( + method = method_name, twas_z = NA_real_, + twas_pval = NA_real_, n_snps = 0L, + stringsAsFactors = FALSE + )) + next + } + + # Use non-zero SNPs + nz <- which(w != 0) + n_snps <- length(nz) + + # Compute TWAS z-score via twas_z() + res <- twas_z(weights = w[nz], z = gwas_z[nz], R = LD[nz, nz, drop = FALSE]) + + z_val <- as.numeric(res$z) + p_val <- as.numeric(res$pval) + + results <- rbind(results, data.frame( + method = method_name, twas_z = z_val, + twas_pval = p_val, n_snps = n_snps, + stringsAsFactors = FALSE + )) + + if (!is.na(p_val) && is.finite(p_val) && p_val > 0 && p_val < 1) { + valid_pvals <- c(valid_pvals, p_val) + } + } + + # Combine p-values across methods + if (length(valid_pvals) >= 2) { + combined_pval <- if (combine_method == "acat") { + pval_acat(valid_pvals) + } else { + pval_hmp(valid_pvals) + } + results <- rbind(results, data.frame( + method = paste0(toupper(combine_method), "_combined"), + twas_z = NA_real_, + twas_pval = as.numeric(combined_pval), + n_snps = NA_integer_, + stringsAsFactors = FALSE + )) + } + + results +} diff --git a/R/regularized_regression.R b/R/regularized_regression.R index b3d8f271..e8ff298d 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -823,7 +823,7 @@ bayes_r_rss_weights <- function(sumstats, LD, ...) { #' out <- lassosum_rss(bhat, LD, n) #' @export lassosum_rss <- function(bhat, LD, n, - lambda = exp(seq(log(0.001), log(0.1), length.out = 20)), + lambda = exp(seq(log(0.0001), log(0.1), length.out = 20)), thr = 1e-4, maxiter = 10000) { if (!is.list(LD)) { stop("Please provide a valid list of LD blocks using 'LD'.") @@ -851,10 +851,38 @@ lassosum_rss <- function(bhat, LD, n, result } -#' Extract weights from lassosum_rss function -#' @return A numeric vector of the posterior SNP coefficients. +#' Extract weights from lassosum_rss with shrinkage grid search +#' +#' Searches over a grid of shrinkage parameters \code{s} (default: +#' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS). +#' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then +#' \code{lassosum_rss()} is called across the lambda path. The best +#' \code{(s, lambda)} combination is selected by the lowest objective value. +#' +#' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes). +#' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk). +#' @param s Numeric vector of shrinkage parameters to search over. Default: +#' \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS. +#' @param ... Additional arguments passed to \code{lassosum_rss()}. +#' +#' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda). #' @export -lassosum_rss_weights <- function(stat, LD, ...) { - model <- lassosum_rss(bhat = stat$b, LD = list(blk1 = LD), n = median(stat$n), ...) - return(model$beta_est) +lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), ...) { + n <- median(stat$n) + p <- nrow(LD) + best_fbeta <- Inf + best_beta <- rep(0, p) + + for (s_val in s) { + # Shrink LD: R_s = (1 - s) * R + s * I + LD_s <- (1 - s_val) * LD + s_val * diag(p) + model <- lassosum_rss(bhat = stat$b, LD = list(blk1 = LD_s), n = n, ...) + min_fbeta <- min(model$fbeta) + if (min_fbeta < best_fbeta) { + best_fbeta <- min_fbeta + best_beta <- model$beta_est + } + } + + return(best_beta) } diff --git a/man/lassosum_rss.Rd b/man/lassosum_rss.Rd index fcca1c52..bce3d7b0 100644 --- a/man/lassosum_rss.Rd +++ b/man/lassosum_rss.Rd @@ -8,7 +8,7 @@ lassosum_rss( bhat, LD, n, - lambda = exp(seq(log(0.001), log(0.1), length.out = 20)), + lambda = exp(seq(log(1e-04), log(0.1), length.out = 20)), thr = 1e-04, maxiter = 10000 ) diff --git a/man/lassosum_rss_weights.Rd b/man/lassosum_rss_weights.Rd index 25163c49..3546860b 100644 --- a/man/lassosum_rss_weights.Rd +++ b/man/lassosum_rss_weights.Rd @@ -2,13 +2,27 @@ % Please edit documentation in R/regularized_regression.R \name{lassosum_rss_weights} \alias{lassosum_rss_weights} -\title{Extract weights from lassosum_rss function} +\title{Extract weights from lassosum_rss with shrinkage grid search} \usage{ -lassosum_rss_weights(stat, LD, ...) +lassosum_rss_weights(stat, LD, s = c(0.2, 0.5, 0.9, 1), ...) +} +\arguments{ +\item{stat}{A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).} + +\item{LD}{LD correlation matrix R (single matrix, NOT pre-shrunk).} + +\item{s}{Numeric vector of shrinkage parameters to search over. Default: +\code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.} + +\item{...}{Additional arguments passed to \code{lassosum_rss()}.} } \value{ -A numeric vector of the posterior SNP coefficients. +A numeric vector of the posterior SNP coefficients at the best (s, lambda). } \description{ -Extract weights from lassosum_rss function +Searches over a grid of shrinkage parameters \code{s} (default: +\code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS). +For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then +\code{lassosum_rss()} is called across the lambda path. The best +\code{(s, lambda)} combination is selected by the lowest objective value. } diff --git a/man/ld_loader.Rd b/man/ld_loader.Rd new file mode 100644 index 00000000..fe8351a2 --- /dev/null +++ b/man/ld_loader.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ld_loader.R +\name{ld_loader} +\alias{ld_loader} +\title{Create an LD loader for on-demand block-wise LD retrieval} +\usage{ +ld_loader( + R_list = NULL, + X_list = NULL, + ld_meta_path = NULL, + regions = NULL, + return_genotype = FALSE, + max_variants = NULL +) +} +\arguments{ +\item{R_list}{List of G precomputed LD correlation matrices (p_g x p_g). +Mutually exclusive with \code{X_list} and \code{ld_meta_path}.} + +\item{X_list}{List of G genotype matrices (n x p_g). Mutually exclusive +with \code{R_list} and \code{ld_meta_path}.} + +\item{ld_meta_path}{Path to a pecotmr LD metadata TSV file (as used by +\code{\link{load_LD_matrix}}). Mutually exclusive with \code{R_list} +and \code{X_list}.} + +\item{regions}{Character vector of G region strings (e.g., +\code{"chr22:17238266-19744294"}). Required when \code{ld_meta_path} +is used.} + +\item{return_genotype}{Logical. When using file mode, return the +genotype matrix X (\code{TRUE}) or LD correlation R (\code{FALSE}, +default). Returning X enables low-rank RSS paths in downstream methods.} + +\item{max_variants}{Integer or \code{NULL}. If set, randomly subsample +blocks larger than this to control memory usage.} +} +\value{ +A function \code{loader(g)} that, given a block index \code{g}, + returns the corresponding LD matrix or genotype matrix. +} +\description{ +Constructs a loader function that retrieves per-block LD matrices on +demand. This avoids loading all blocks into memory simultaneously, +which is critical for genome-wide analyses with hundreds of blocks. +} +\details{ +Three modes are supported: + +\describe{ + \item{list mode (R)}{Pre-loaded list of LD correlation matrices. + Simple but uses more memory. Set \code{R_list}.} + \item{list mode (X)}{Pre-loaded list of genotype matrices (n x p_g). + Set \code{X_list}.} + \item{file mode}{Loads LD from a pecotmr metadata TSV file on the fly + via \code{\link{load_LD_matrix}}. Memory-efficient for large datasets. + Set \code{ld_meta_path} and \code{regions}.} +} +} +\examples{ +# List mode with pre-computed LD +R1 <- diag(10) +R2 <- diag(15) +loader <- ld_loader(R_list = list(R1, R2)) +loader(1) # returns R1 +loader(2) # returns R2 + +} diff --git a/man/otters_association.Rd b/man/otters_association.Rd new file mode 100644 index 00000000..edd373c0 --- /dev/null +++ b/man/otters_association.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/otters.R +\name{otters_association} +\alias{otters_association} +\title{TWAS association testing with omnibus combination (OTTERS Stage II)} +\usage{ +otters_association(weights, gwas_z, LD, combine_method = c("acat", "hmp")) +} +\arguments{ +\item{weights}{Named list of weight vectors (output from \code{\link{otters_weights}} +or any named list of numeric vectors).} + +\item{gwas_z}{Numeric vector of GWAS z-scores, same length and order as the +weights vectors. Must be aligned to the same variants and allele orientation +as the weights and LD matrix. Use \code{\link{allele_qc}} or +\code{\link{rss_basic_qc}} for harmonization before calling this function.} + +\item{LD}{LD correlation matrix R, aligned to the same variants as weights +and gwas_z.} + +\item{combine_method}{Method to combine p-values across methods: \code{"acat"} +(default) or \code{"hmp"}.} +} +\value{ +A data.frame with columns: +\describe{ + \item{method}{Method name (per-method rows plus a combined row).} + \item{twas_z}{TWAS z-score (\code{NA} for combined row).} + \item{twas_pval}{TWAS p-value.} + \item{n_snps}{Number of non-zero weight SNPs used.} +} +} +\description{ +Computes per-method TWAS z-scores using the FUSION formula and combines +p-values across methods via ACAT (Aggregated Cauchy Association Test) or +HMP (Harmonic Mean P-value). +} +\details{ +The FUSION TWAS statistic (Gusev et al. 2016) is: +\deqn{Z_{TWAS} = \frac{w^T z}{\sqrt{w^T R w}}} +where \eqn{w} are eQTL weights, \eqn{z} are GWAS z-scores, and \eqn{R} +is the LD correlation matrix. +} +\examples{ +set.seed(42) +p <- 20 +gwas_z <- rnorm(p) +R <- diag(p) +weights <- list(method1 = rnorm(p, sd = 0.01), method2 = rnorm(p, sd = 0.01)) +otters_association(weights, gwas_z, R) + +} diff --git a/man/otters_weights.Rd b/man/otters_weights.Rd new file mode 100644 index 00000000..6f7ed755 --- /dev/null +++ b/man/otters_weights.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/otters.R +\name{otters_weights} +\alias{otters_weights} +\title{Train eQTL weights using multiple RSS methods (OTTERS Stage I)} +\usage{ +otters_weights( + sumstats, + LD, + n, + methods = list(lassosum_rss = list(), prs_cs = list(phi = 1e-04, n_iter = 1000, + n_burnin = 500, thin = 5), sdpr = list(iter = 1000, burn = 200, thin = 1, verbose = + FALSE)), + p_thresholds = c(0.001, 0.05) +) +} +\arguments{ +\item{sumstats}{A data.frame of eQTL summary statistics. Must contain column \code{z} +(z-scores). If \code{z} is absent but \code{beta} and \code{se} are present, +z-scores are computed as \code{beta / se}.} + +\item{LD}{LD correlation matrix R for the gene region (single matrix, not a list). +Should have row/column names matching variant identifiers if variant alignment +is desired.} + +\item{n}{eQTL study sample size (scalar).} + +\item{methods}{Named list of RSS methods and their extra arguments. Each element +name must correspond to a \code{*_weights} function in pecotmr (without the +\code{_weights} suffix). Defaults match the original OTTERS pipeline +(Zhang et al. 2024): +\itemize{ + \item \code{lassosum_rss}: s grid = c(0.2, 0.5, 0.9, 1.0), lambda from + 0.0001 to 0.1 (20 values on log scale) + \item \code{prs_cs}: phi = 1e-4 (fixed, not learned), 1000 iterations, + 500 burn-in, thin = 5 + \item \code{sdpr}: 1000 iterations, 200 burn-in, thin = 1 (no thinning) +} +To add learners (e.g., \code{mr_ash_rss}), simply append to this list.} + +\item{p_thresholds}{Numeric vector of p-value thresholds for P+T. Set to +\code{NULL} to skip P+T. Default: \code{c(0.001, 0.05)}.} +} +\value{ +A named list of weight vectors (one per method). Each vector has length + equal to \code{nrow(sumstats)}. P+T results are named \code{PT_}. +} +\description{ +Implements the training stage of the OTTERS framework (Omnibus Transcriptome +Test using Expression Reference Summary data, Zhang et al. 2024). Trains +eQTL effect size weights for a gene region using multiple summary-statistics-based +methods in parallel, enabling downstream omnibus TWAS testing. +} +\details{ +Methods are dispatched dynamically via \code{do.call(paste0(method, "_weights"), ...)}, +so any function following the \code{*_weights(stat, LD, ...)} convention can be used +(e.g., \code{lassosum_rss_weights}, \code{prs_cs_weights}, \code{sdpr_weights}, +\code{mr_ash_rss_weights}). + +P+T (pruning and thresholding) is handled internally: for each threshold, SNPs with +eQTL p-value below the threshold are selected, and their marginal z-scores (scaled +to correlation units: \code{z / sqrt(n)}) are used as weights. +} +\examples{ +set.seed(42) +n <- 500; p <- 20 +z <- rnorm(p, sd = 2) +R <- diag(p) +sumstats <- data.frame(z = z) +weights <- otters_weights(sumstats, R, n, + methods = list(lassosum_rss = list()), + p_thresholds = c(0.05)) + +} diff --git a/tests/testthat/test_otters.R b/tests/testthat/test_otters.R new file mode 100644 index 00000000..fe3e4135 --- /dev/null +++ b/tests/testthat/test_otters.R @@ -0,0 +1,174 @@ +context("otters") + +# ---- otters_weights ---- +test_that("otters_weights returns named list of weight vectors", { + set.seed(42) + p <- 20 + n <- 500 + z <- rnorm(p, sd = 2) + R <- diag(p) + sumstats <- data.frame(z = z) + result <- otters_weights(sumstats, R, n, + methods = list(lassosum_rss = list()), + p_thresholds = c(0.05) + ) + expect_type(result, "list") + expect_true("PT_0.05" %in% names(result)) + expect_true("lassosum_rss" %in% names(result)) + expect_equal(length(result$PT_0.05), p) + expect_equal(length(result$lassosum_rss), p) +}) + +test_that("otters_weights computes z from beta/se if z missing", { + set.seed(42) + p <- 10 + n <- 100 + sumstats <- data.frame(beta = rnorm(p, sd = 0.1), se = rep(0.05, p)) + R <- diag(p) + result <- otters_weights(sumstats, R, n, + methods = list(lassosum_rss = list()), + p_thresholds = c(0.05) + ) + expect_true("lassosum_rss" %in% names(result)) + expect_equal(length(result$lassosum_rss), p) +}) + +test_that("otters_weights errors when no z or beta/se", { + sumstats <- data.frame(x = 1:5) + expect_error(otters_weights(sumstats, diag(5), 100), "z.*beta.*se") +}) + +test_that("otters_weights P+T selects correct SNPs", { + set.seed(42) + p <- 20 + n <- 500 + # Large z-scores for first 3 SNPs (should pass threshold) + z <- c(rep(5, 3), rep(0.1, 17)) + sumstats <- data.frame(z = z) + R <- diag(p) + result <- otters_weights(sumstats, R, n, + methods = list(), p_thresholds = c(0.001) + ) + w <- result$PT_0.001 + # First 3 should be non-zero, rest should be zero + expect_true(all(w[1:3] != 0)) + expect_true(all(w[4:20] == 0)) +}) + +test_that("otters_weights warns on unknown method", { + sumstats <- data.frame(z = rnorm(5)) + expect_warning( + otters_weights(sumstats, diag(5), 100, + methods = list(nonexistent_method = list()), + p_thresholds = NULL), + "not found" + ) +}) + +test_that("otters_weights with multiple methods returns all", { + set.seed(42) + p <- 15 + n <- 500 + z <- rnorm(p, sd = 2) + R <- diag(p) + for (i in 1:(p - 1)) { R[i, i + 1] <- 0.3; R[i + 1, i] <- 0.3 } + sumstats <- data.frame(z = z) + result <- otters_weights(sumstats, R, n, + methods = list( + lassosum_rss = list(), + prs_cs = list(n_iter = 50, n_burnin = 10, thin = 2, seed = 42) + ), + p_thresholds = c(0.001, 0.05) + ) + expect_true(all(c("PT_0.001", "PT_0.05", "lassosum_rss", "prs_cs") %in% names(result))) + for (nm in names(result)) { + expect_equal(length(result[[nm]]), p) + expect_true(all(is.finite(result[[nm]]))) + } +}) + +# ---- otters_association ---- +test_that("otters_association returns correct structure", { + set.seed(42) + p <- 20 + gwas_z <- rnorm(p) + R <- diag(p) + weights <- list( + method1 = rnorm(p, sd = 0.01), + method2 = rnorm(p, sd = 0.01) + ) + result <- otters_association(weights, gwas_z, R) + expect_true(is.data.frame(result)) + expect_true(all(c("method", "twas_z", "twas_pval", "n_snps") %in% colnames(result))) + # Two methods + ACAT combined + expect_equal(nrow(result), 3) + expect_true("ACAT_combined" %in% result$method) +}) + +test_that("otters_association handles all-zero weights gracefully", { + p <- 10 + gwas_z <- rnorm(p) + R <- diag(p) + weights <- list(zero_method = rep(0, p), nonzero = rnorm(p, sd = 0.01)) + result <- otters_association(weights, gwas_z, R) + zero_row <- result[result$method == "zero_method", ] + expect_true(is.na(zero_row$twas_z)) + expect_equal(zero_row$n_snps, 0) +}) + +test_that("otters_association with single method has no combined row", { + p <- 10 + gwas_z <- rnorm(p) + R <- diag(p) + weights <- list(only_method = rnorm(p, sd = 0.01)) + result <- otters_association(weights, gwas_z, R) + # Only one valid p-value, so no ACAT combination + expect_false("ACAT_combined" %in% result$method) +}) + +test_that("otters_association uses HMP when specified", { + skip_if_not_installed("harmonicmeanp") + set.seed(42) + p <- 20 + gwas_z <- rnorm(p) + R <- diag(p) + weights <- list(m1 = rnorm(p, sd = 0.01), m2 = rnorm(p, sd = 0.01)) + result <- otters_association(weights, gwas_z, R, combine_method = "hmp") + expect_true("HMP_combined" %in% result$method) +}) + +# ---- end-to-end integration ---- +test_that("otters_weights + otters_association end-to-end on simulated data", { + set.seed(2024) + n_eqtl <- 500 + n_gwas <- 10000 + p <- 20 + + # Simulate genotypes and eQTL + X <- matrix(rbinom(n_eqtl * p, 2, 0.3), nrow = n_eqtl) + beta_eqtl <- rep(0, p) + beta_eqtl[c(3, 10)] <- c(0.3, -0.2) + expr <- X %*% beta_eqtl + rnorm(n_eqtl) + eqtl_z <- as.vector(cor(expr, X)) * sqrt(n_eqtl) + R <- cor(X) + + # Simulate GWAS (gene affects trait) + beta_gwas_gene <- 0.1 + gwas_z <- R %*% (beta_eqtl * beta_gwas_gene * sqrt(n_gwas)) + rnorm(p) + + # Stage I: train weights + sumstats <- data.frame(z = eqtl_z) + weights <- otters_weights(sumstats, R, n_eqtl, + methods = list(lassosum_rss = list()), + p_thresholds = c(0.05) + ) + expect_true(length(weights) >= 2) + + # Stage II: test association + result <- otters_association(weights, as.numeric(gwas_z), R) + expect_true(is.data.frame(result)) + expect_true(nrow(result) >= 2) + # At least one method should have a small-ish p-value (gene is truly associated) + min_pval <- min(result$twas_pval, na.rm = TRUE) + expect_true(min_pval < 0.5) +}) diff --git a/vignettes/otters_pipeline.Rmd b/vignettes/otters_pipeline.Rmd new file mode 100644 index 00000000..fdad3923 --- /dev/null +++ b/vignettes/otters_pipeline.Rmd @@ -0,0 +1,229 @@ +--- +title: "OTTERS: Omnibus TWAS using multiple RSS methods" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{OTTERS: Omnibus TWAS using multiple RSS methods} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") +``` + +## Overview + +The OTTERS pipeline (Zhang et al. 2024) performs TWAS in two stages: + +1. **Stage I** (`otters_weights`): Train eQTL weights using multiple + summary-statistics-based (RSS) methods. +2. **Stage II** (`otters_association`): Compute per-method TWAS z-scores + and combine p-values across methods using ACAT. + +## Loading real LD data + +In real analyses, LD matrices are loaded from PLINK2 pgen files or +pre-computed LD blocks using pecotmr's data infrastructure. Here we show +the loading pattern using chr22 data from the ADSP reference panel (the +code is not evaluated here since the data paths are machine-specific): + +```{r load_real_data, eval=FALSE} +library(pecotmr) + +# LD metadata file points to PLINK2 pgen/pvar/psam files +ld_meta <- "~/Documents/ADSP_chr22_sketch/ld_meta_chr22.tsv" + +# Define gene regions of interest +gene_regions <- c( + "chr22:20000000-21000000", + "chr22:25000000-26000000", + "chr22:30000000-31000000" +) + +# Option 1: Load LD on the fly via ld_loader() (memory-efficient) +loader <- ld_loader(ld_meta_path = ld_meta, regions = gene_regions) +R_gene1 <- loader(1) # loads LD for first gene region on demand + +# Option 2: Pre-load all blocks into a list +R_list <- lapply(gene_regions, function(reg) { + ld <- load_LD_matrix(ld_meta, region = reg) + ld$LD_matrix +}) +loader_from_list <- ld_loader(R_list = R_list) + +# Option 3: Load genotype matrix and compute LD with shrinkage +ld_data <- load_LD_matrix(ld_meta, region = gene_regions[1], + return_genotype = TRUE) +X <- ld_data$LD_matrix +R_shrunk <- compute_LD(X, shrinkage = 0.5) # lassosum-style regularization + +# eQTL summary stats are loaded similarly: +# eqtl <- load_rss_data(sumstat_path, column_file_path, region, n) +``` + +## Simulating data for demonstration + +We simulate a realistic scenario: load genotype matrices from real data +(or simulate them), compute LD, then simulate eQTL effects and GWAS +z-scores on top. + +```{r simulate} +library(pecotmr) +set.seed(2024) + +# Simulate 3 gene regions with realistic LD +# (In practice, these would come from ld_loader() as shown above) +n_eqtl <- 500 +n_gwas <- 50000 +n_genes <- 3 +gene_p <- c(40, 50, 30) # variants per gene + +# Simulate genotype matrices and LD per gene +X_list <- lapply(gene_p, function(p) { + X <- matrix(rbinom(n_eqtl * p, 2, 0.3), nrow = n_eqtl) + X <- scale(X); X[is.na(X)] <- 0 + X +}) +R_list <- lapply(X_list, cor) + +# Create a loader from the pre-loaded list +loader <- ld_loader(R_list = R_list) + +# Simulate eQTL effects and GWAS signal per gene +eqtl_sumstats_list <- list() +gwas_z_list <- list() +beta_true_list <- list() + +for (g in 1:n_genes) { + p <- gene_p[g] + beta_eqtl <- rep(0, p) + # 2-3 causal eQTLs per gene + n_causal <- sample(2:3, 1) + causal <- sort(sample(p, n_causal)) + beta_eqtl[causal] <- rnorm(n_causal, sd = 0.25) + beta_true_list[[g]] <- beta_eqtl + + # eQTL z-scores + expr <- X_list[[g]] %*% beta_eqtl + rnorm(n_eqtl) + eqtl_z <- as.vector(cor(expr, X_list[[g]])) * sqrt(n_eqtl) + eqtl_sumstats_list[[g]] <- data.frame(z = eqtl_z) + + # GWAS z-scores (gene g affects trait with effect 0.05) + gwas_z_list[[g]] <- as.numeric( + R_list[[g]] %*% (beta_eqtl * 0.05 * sqrt(n_gwas)) + ) + rnorm(p) +} +``` + +## Running the OTTERS pipeline per gene + +### Default methods and parameters + +`otters_weights()` runs three methods by default, with parameters matching +the original OTTERS (Zhang et al. 2024): + +| Method | Key defaults | Description | +|--------|-------------|-------------| +| **P+T** | `p_thresholds = c(0.001, 0.05)` | Select SNPs by eQTL p-value; weights = marginal z/sqrt(n) | +| **lassosum** | `s = c(0.2, 0.5, 0.9, 1.0)`, `lambda` = 20 values from 1e-4 to 0.1 | L1-penalized regression; grid search over shrinkage | +| **PRS-CS** | `phi = 1e-4` (fixed), `n_iter = 1000`, `n_burnin = 500`, `thin = 5` | Bayesian continuous shrinkage; fixed global shrinkage | +| **SDPR** | `iter = 1000`, `burn = 200`, `thin = 1` | Dirichlet process mixture; no thinning | + +### Process all genes using the loader + +```{r run_pipeline, message=FALSE} +all_results <- list() + +for (g in 1:n_genes) { + # Get LD from loader (or could pass R_list[[g]] directly) + R_g <- loader(g) + + # Stage I: train weights + weights_g <- otters_weights( + sumstats = eqtl_sumstats_list[[g]], + LD = R_g, + n = n_eqtl + ) + + # Stage II: association test + twas_g <- otters_association( + weights = weights_g, + gwas_z = gwas_z_list[[g]], + LD = R_g + ) + twas_g$gene <- paste0("gene_", g) + all_results[[g]] <- twas_g +} + +results_df <- do.call(rbind, all_results) +print(results_df[, c("gene", "method", "twas_z", "twas_pval", "n_snps")]) +``` + +## Extending with additional learners + +Any function following the `*_weights(stat, LD, ...)` convention can be +added. No code changes needed — methods are dispatched dynamically. + +### Example: adding mr.ash.rss + +`mr_ash_rss_weights()` (from susieR) uses a flexible mixture-of-normals +prior that can capture both sparse and polygenic architectures: + +```{r extend, eval=FALSE} +weights <- otters_weights( + sumstats = eqtl_sumstats_list[[1]], + LD = R_list[[1]], + n = n_eqtl, + methods = list( + # Default OTTERS methods + lassosum_rss = list(), + prs_cs = list(phi = 1e-4, n_iter = 1000, n_burnin = 500, thin = 5), + sdpr = list(iter = 1000, burn = 200, thin = 1, verbose = FALSE), + # Additional learner + mr_ash_rss = list( + var_y = 1, + sigma2_e = 0.5, + s0 = c(0, 0.001, 0.01, 0.1, 0.5), + w0 = rep(0.2, 5) + ) + ) +) +``` + +### Example: quick two-method run + +```{r minimal, eval=FALSE} +weights <- otters_weights( + sumstats = eqtl_sumstats_list[[1]], + LD = R_list[[1]], + n = n_eqtl, + methods = list(lassosum_rss = list()), + p_thresholds = c(0.05) +) +``` + +## Note on data preparation + +`otters_association()` assumes that weights, GWAS z-scores, and LD are +aligned to the same variants and allele orientation. For real data, use +pecotmr's QC functions before calling the pipeline: + +- `allele_qc()` — match alleles between summary stats and LD reference, + handle strand flips (A/T, C/G), negate effects for allele-flipped SNPs +- `rss_basic_qc()` — full QC pipeline including allele matching, indel + filtering, and region-based variant selection + +## References + +- Zhang et al. (2024). OTTERS: a powerful TWAS framework leveraging + summary-level reference data. *Nature Communications*. +- Gusev et al. (2016). Integrative approaches for large-scale + transcriptome-wide association studies. *Nature Genetics*. +- Mak et al. (2017). Polygenic scores via penalized regression on + summary statistics. *Genetic Epidemiology*. +- Ge et al. (2019). Polygenic prediction via Bayesian regression and + continuous shrinkage priors. *Nature Communications*. +- Zhou et al. (2023). SDPR: a statistical method for leveraging + summary-level data for polygenic risk prediction. +- Liu & Xie (2020). Cauchy combination test: a powerful test with + analytic p-value calculation. *JASA*.