diff --git a/NAMESPACE b/NAMESPACE index 2458d828..2334a687 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,7 +51,10 @@ export(get_susie_result) export(glmnet_weights) export(harmonize_gwas) export(harmonize_twas) +export(invert_minmax_scaling) export(lasso_weights) +export(lassosum_rss) +export(lassosum_rss_weights) export(lbf_to_alpha) export(load_LD_matrix) export(load_genotype_region) diff --git a/R/RcppExports.R b/R/RcppExports.R index 67d76a8d..e434983b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -23,6 +23,10 @@ dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, p .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose) } +lassosum_rss_rcpp <- function(z, LD, lambda, thr, maxiter) { + .Call('_pecotmr_lassosum_rss_rcpp', PACKAGE = 'pecotmr', z, LD, lambda, thr, maxiter) +} + prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) { .Call('_pecotmr_prs_cs_rcpp', PACKAGE = 'pecotmr', a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) } diff --git a/R/misc.R b/R/misc.R index 54830be2..bf4ec16f 100644 --- a/R/misc.R +++ b/R/misc.R @@ -153,6 +153,10 @@ is_zero_variance <- function(x) length(unique(x)) == 1 #' drops trailing samples so that \code{nrow(X)} is a multiple of 4, matching #' PLINK .bed file chunk processing. Ignored when \code{method = "sample"}. #' Default is \code{FALSE}. +#' @param shrinkage Numeric in (0, 1]. Shrink the LD matrix toward the identity: +#' \code{R_s = (1 - shrinkage) * R + shrinkage * I}. Useful for regularizing +#' LD for summary-statistics-based methods such as lassosum (Mak et al 2017). +#' Default is 0 (no shrinkage). #' #' @return A symmetric correlation matrix with row and column names taken from #' \code{colnames(X)}. @@ -188,7 +192,7 @@ is_zero_variance <- function(x) length(unique(x)) == 1 #' #' @export compute_LD <- function(X, method = c("sample", "population"), - trim_samples = FALSE) { + trim_samples = FALSE, shrinkage = 0) { if (is.null(X)) { stop("X must be provided.") } @@ -235,6 +239,13 @@ compute_LD <- function(X, method = c("sample", "population"), # Ensure clean output diag(R) <- 1.0 R[is.na(R) | is.nan(R)] <- 0 + + # Optional shrinkage toward identity: R_s = (1 - shrinkage) * R + shrinkage * I + # Used e.g. by lassosum (Mak et al 2017) to regularize LD for RSS methods. + if (shrinkage > 0 && shrinkage <= 1) { + R <- (1 - shrinkage) * R + shrinkage * diag(nrow(R)) + } + colnames(R) <- rownames(R) <- nms R } diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 67106af6..b3d8f271 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -782,3 +782,79 @@ bayes_c_rss_weights <- function(sumstats, LD, ...) { bayes_r_rss_weights <- function(sumstats, LD, ...) { return(bayes_alphabet_rss_weights(sumstats, LD, method = "bayesR", ...)) } + +#' Lassosum RSS: LASSO on summary statistics with LD reference +#' +#' Coordinate descent to solve the penalized regression on summary statistics: +#' \deqn{f(\beta) = \beta' R \beta - 2\beta' r + 2\lambda ||\beta||_1} +#' where \eqn{R} is the LD matrix (pre-shrunk if desired) and \eqn{r = \hat\beta / \sqrt{n}}. +#' +#' Based on Mak et al (2017) "Polygenic scores via penalized regression on summary statistics", +#' Genetic Epidemiology 41(6):469-480. +#' +#' @param bhat A vector of marginal effect sizes. +#' @param LD A list of LD blocks, where each element is a matrix representing an LD block. +#' If shrinkage is desired, apply it before passing (e.g., \code{(1-s)*R + s*I}). +#' @param n Sample size of the GWAS. +#' @param lambda A vector of L1 penalty values. Default: 20 values from 0.001 to 0.1 on log scale. +#' @param thr Convergence threshold. Default: 1e-4. +#' @param maxiter Maximum number of iterations. Default: 10000. +#' +#' @return A list containing: +#' \item{beta_est}{Posterior estimates of SNP effect sizes at best lambda.} +#' \item{beta}{Matrix of estimates (p x nlambda).} +#' \item{lambda}{The lambda values used.} +#' \item{conv}{Convergence indicators (1 = converged).} +#' \item{loss}{Quadratic loss at each lambda.} +#' \item{fbeta}{Full objective value at each lambda.} +#' \item{nparams}{Number of non-zero coefficients at each lambda.} +#' +#' @examples +#' set.seed(42) +#' p <- 10 +#' n <- 100 +#' bhat <- rnorm(p, sd = 0.1) +#' R <- diag(p) +#' for (i in 1:(p - 1)) { +#' R[i, i + 1] <- 0.3 +#' R[i + 1, i] <- 0.3 +#' } +#' LD <- list(blk1 = R) +#' out <- lassosum_rss(bhat, LD, n) +#' @export +lassosum_rss <- function(bhat, LD, n, + lambda = exp(seq(log(0.001), log(0.1), length.out = 20)), + thr = 1e-4, maxiter = 10000) { + if (!is.list(LD)) { + stop("Please provide a valid list of LD blocks using 'LD'.") + } + if (missing(n) || n <= 0) { + stop("Please provide a valid sample size using 'n'.") + } + total_rows_in_LD <- sum(sapply(LD, nrow)) + if (length(bhat) != total_rows_in_LD) { + stop("The length of 'bhat' must be the same as the sum of the number of rows of all elements in the 'LD' list.") + } + + z <- bhat / sqrt(n) + order <- order(lambda, decreasing = TRUE) + result <- lassosum_rss_rcpp(z, LD, lambda[order], thr, maxiter) + + # Reorder back to original lambda order + result$beta[, order] <- result$beta + result$conv[order] <- result$conv + result$loss[order] <- result$loss + result$fbeta[order] <- result$fbeta + result$lambda <- lambda + result$nparams <- as.integer(colSums(result$beta != 0)) + result$beta_est <- as.numeric(result$beta[, which.min(result$fbeta)]) + result +} + +#' Extract weights from lassosum_rss function +#' @return A numeric vector of the posterior SNP coefficients. +#' @export +lassosum_rss_weights <- function(stat, LD, ...) { + model <- lassosum_rss(bhat = stat$b, LD = list(blk1 = LD), n = median(stat$n), ...) + return(model$beta_est) +} diff --git a/man/compute_LD.Rd b/man/compute_LD.Rd index d37eb521..3a2c9578 100644 --- a/man/compute_LD.Rd +++ b/man/compute_LD.Rd @@ -4,7 +4,12 @@ \alias{compute_LD} \title{Compute LD (Linkage Disequilibrium) Correlation Matrix from Genotypes} \usage{ -compute_LD(X, method = c("sample", "population"), trim_samples = FALSE) +compute_LD( + X, + method = c("sample", "population"), + trim_samples = FALSE, + shrinkage = 0 +) } \arguments{ \item{X}{Numeric genotype matrix (samples x SNPs). May contain \code{NA} @@ -17,6 +22,11 @@ for missing genotypes.} drops trailing samples so that \code{nrow(X)} is a multiple of 4, matching PLINK .bed file chunk processing. Ignored when \code{method = "sample"}. Default is \code{FALSE}.} + +\item{shrinkage}{Numeric in (0, 1]. Shrink the LD matrix toward the identity: +\code{R_s = (1 - shrinkage) * R + shrinkage * I}. Useful for regularizing +LD for summary-statistics-based methods such as lassosum (Mak et al 2017). +Default is 0 (no shrinkage).} } \value{ A symmetric correlation matrix with row and column names taken from diff --git a/man/invert_minmax_scaling.Rd b/man/invert_minmax_scaling.Rd new file mode 100644 index 00000000..616feed5 --- /dev/null +++ b/man/invert_minmax_scaling.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{invert_minmax_scaling} +\alias{invert_minmax_scaling} +\title{Invert min-max [0,2] scaling to recover the original U matrix.} +\usage{ +invert_minmax_scaling(X, u_min, u_max) +} +\arguments{ +\item{X}{Numeric matrix (B x p) of min-max scaled values in [0, 2].} + +\item{u_min}{Numeric vector of per-variant minimum values before scaling.} + +\item{u_max}{Numeric vector of per-variant maximum values before scaling.} +} +\value{ +Matrix of original U values with same dimensions. +} +\description{ +Stochastic genotype data (from rss_ld_sketch) is stored in PLINK2 pgen +format after min-max scaling: U_scaled = 2 * (U - u_min) / (u_max - u_min). +This function exactly inverts that transform using the stored per-variant +u_min and u_max values from the companion .afreq file. +} +\details{ +The recovered U satisfies U'U/B ~ Wishart(B, R)/B, the correct distributional +property for LD-based fine-mapping with dynamic variance tracking. +} diff --git a/man/lassosum_rss.Rd b/man/lassosum_rss.Rd new file mode 100644 index 00000000..fcca1c52 --- /dev/null +++ b/man/lassosum_rss.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regularized_regression.R +\name{lassosum_rss} +\alias{lassosum_rss} +\title{Lassosum RSS: LASSO on summary statistics with LD reference} +\usage{ +lassosum_rss( + bhat, + LD, + n, + lambda = exp(seq(log(0.001), log(0.1), length.out = 20)), + thr = 1e-04, + maxiter = 10000 +) +} +\arguments{ +\item{bhat}{A vector of marginal effect sizes.} + +\item{LD}{A list of LD blocks, where each element is a matrix representing an LD block. +If shrinkage is desired, apply it before passing (e.g., \code{(1-s)*R + s*I}).} + +\item{n}{Sample size of the GWAS.} + +\item{lambda}{A vector of L1 penalty values. Default: 20 values from 0.001 to 0.1 on log scale.} + +\item{thr}{Convergence threshold. Default: 1e-4.} + +\item{maxiter}{Maximum number of iterations. Default: 10000.} +} +\value{ +A list containing: + \item{beta_est}{Posterior estimates of SNP effect sizes at best lambda.} + \item{beta}{Matrix of estimates (p x nlambda).} + \item{lambda}{The lambda values used.} + \item{conv}{Convergence indicators (1 = converged).} + \item{loss}{Quadratic loss at each lambda.} + \item{fbeta}{Full objective value at each lambda.} + \item{nparams}{Number of non-zero coefficients at each lambda.} +} +\description{ +Coordinate descent to solve the penalized regression on summary statistics: +\deqn{f(\beta) = \beta' R \beta - 2\beta' r + 2\lambda ||\beta||_1} +where \eqn{R} is the LD matrix (pre-shrunk if desired) and \eqn{r = \hat\beta / \sqrt{n}}. +} +\details{ +Based on Mak et al (2017) "Polygenic scores via penalized regression on summary statistics", +Genetic Epidemiology 41(6):469-480. +} +\examples{ +set.seed(42) +p <- 10 +n <- 100 +bhat <- rnorm(p, sd = 0.1) +R <- diag(p) +for (i in 1:(p - 1)) { + R[i, i + 1] <- 0.3 + R[i + 1, i] <- 0.3 +} +LD <- list(blk1 = R) +out <- lassosum_rss(bhat, LD, n) +} diff --git a/man/lassosum_rss_weights.Rd b/man/lassosum_rss_weights.Rd new file mode 100644 index 00000000..25163c49 --- /dev/null +++ b/man/lassosum_rss_weights.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regularized_regression.R +\name{lassosum_rss_weights} +\alias{lassosum_rss_weights} +\title{Extract weights from lassosum_rss function} +\usage{ +lassosum_rss_weights(stat, LD, ...) +} +\value{ +A numeric vector of the posterior SNP coefficients. +} +\description{ +Extract weights from lassosum_rss function +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 99d81a8b..ce21e098 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -44,6 +44,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// lassosum_rss_rcpp +Rcpp::List lassosum_rss_rcpp(const arma::vec& z, const Rcpp::List& LD, const arma::vec& lambda, double thr, int maxiter); +RcppExport SEXP _pecotmr_lassosum_rss_rcpp(SEXP zSEXP, SEXP LDSEXP, SEXP lambdaSEXP, SEXP thrSEXP, SEXP maxiterSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type z(zSEXP); + Rcpp::traits::input_parameter< const Rcpp::List& >::type LD(LDSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type lambda(lambdaSEXP); + Rcpp::traits::input_parameter< double >::type thr(thrSEXP); + Rcpp::traits::input_parameter< int >::type maxiter(maxiterSEXP); + rcpp_result_gen = Rcpp::wrap(lassosum_rss_rcpp(z, LD, lambda, thr, maxiter)); + return rcpp_result_gen; +END_RCPP +} // prs_cs_rcpp Rcpp::List prs_cs_rcpp(double a, double b, Rcpp::Nullable phi, Rcpp::NumericVector bhat, Rcpp::Nullable maf, int n, Rcpp::List ld_blk, int n_iter, int n_burnin, int thin, bool verbose, Rcpp::Nullable seed); RcppExport SEXP _pecotmr_prs_cs_rcpp(SEXP aSEXP, SEXP bSEXP, SEXP phiSEXP, SEXP bhatSEXP, SEXP mafSEXP, SEXP nSEXP, SEXP ld_blkSEXP, SEXP n_iterSEXP, SEXP n_burninSEXP, SEXP thinSEXP, SEXP verboseSEXP, SEXP seedSEXP) { @@ -113,6 +128,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_pecotmr_compute_LD_gcta_cpp", (DL_FUNC) &_pecotmr_compute_LD_gcta_cpp, 2}, {"_pecotmr_dentist_iterative_impute", (DL_FUNC) &_pecotmr_dentist_iterative_impute, 11}, + {"_pecotmr_lassosum_rss_rcpp", (DL_FUNC) &_pecotmr_lassosum_rss_rcpp, 5}, {"_pecotmr_prs_cs_rcpp", (DL_FUNC) &_pecotmr_prs_cs_rcpp, 12}, {"_pecotmr_qtl_enrichment_rcpp", (DL_FUNC) &_pecotmr_qtl_enrichment_rcpp, 7}, {"_pecotmr_sdpr_rcpp", (DL_FUNC) &_pecotmr_sdpr_rcpp, 16}, diff --git a/src/lassosum_rss.cpp b/src/lassosum_rss.cpp new file mode 100644 index 00000000..0200d26a --- /dev/null +++ b/src/lassosum_rss.cpp @@ -0,0 +1,120 @@ +// lassosum_rss.cpp — Coordinate descent for LASSO on RSS objective +// Ported from lassosum (Mak et al 2017) functions.cpp elnet()/repelnet() +// +// Objective: min_beta beta'R beta - 2 beta'z + 2 lambda ||beta||_1 +// where R is a (possibly pre-shrunk) LD matrix and z = bhat / sqrt(n). + +#include +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; + +// Single-block coordinate descent — mirrors lassosum elnet() +// Tracks Rbeta = R * beta as a running sum (analogous to yhat = X * beta +// in the genotype version). This avoids recomputing the full dot product +// each iteration and matches the original algorithm exactly. +static int elnet_rss(double lambda1, const arma::vec& diag_R, + const arma::mat& R, const arma::vec& z, + double thr, arma::vec& beta, arma::vec& Rbeta, + int maxiter) { + int p = z.n_elem; + double dlx, del, t, bj; + + int conv = 0; + for (int k = 0; k < maxiter; k++) { + dlx = 0.0; + for (int j = 0; j < p; j++) { + bj = beta(j); + // t = z(j) - sum_{k != j} R[j,k]*beta[k] + // = z(j) - (Rbeta(j) - R[j,j]*beta(j)) + t = z(j) - Rbeta(j) + diag_R(j) * bj; + // Soft-thresholding (same as lassosum line 438) + beta(j) = 0.0; + if (std::abs(t) - lambda1 > 0.0) + beta(j) = std::copysign(std::abs(t) - lambda1, t) / diag_R(j); + if (beta(j) == bj) continue; + del = beta(j) - bj; + dlx = std::max(dlx, std::abs(del)); + // Update running Rbeta (analogous to yhat += del * X.col(j)) + Rbeta += del * R.col(j); + } + Rcpp::checkUserInterrupt(); + if (dlx < thr) { + conv = 1; + break; + } + } + return conv; +} + +// [[Rcpp::export]] +Rcpp::List lassosum_rss_rcpp(const arma::vec& z, + const Rcpp::List& LD, + const arma::vec& lambda, + double thr, + int maxiter) { + // Compute total number of variants across all blocks + int n_blocks = LD.size(); + std::vector block_start(n_blocks), block_end(n_blocks); + int p = 0; + for (int b = 0; b < n_blocks; b++) { + arma::mat Rb = Rcpp::as(LD[b]); + block_start[b] = p; + p += Rb.n_rows; + block_end[b] = p - 1; + } + + if ((int)z.n_elem != p) + Rcpp::stop("Length of z must equal total rows across all LD blocks."); + + int nlambda = lambda.n_elem; + arma::mat beta_mat(p, nlambda, arma::fill::zeros); + arma::ivec conv_vec(nlambda, arma::fill::zeros); + arma::vec loss_vec(nlambda, arma::fill::zeros); + arma::vec fbeta_vec(nlambda, arma::fill::zeros); + + // Working beta vector — warm-started across lambda path + arma::vec beta(p, arma::fill::zeros); + + for (int i = 0; i < nlambda; i++) { + // Block-wise coordinate descent — mirrors lassosum repelnet() + int out = 1; + for (int b = 0; b < n_blocks; b++) { + arma::mat Rb = Rcpp::as(LD[b]); + int s = block_start[b]; + int e = block_end[b]; + arma::vec diag_R = Rb.diag(); + arma::vec z_blk = z.subvec(s, e); + arma::vec beta_blk = beta.subvec(s, e); + arma::vec Rbeta_blk = Rb * beta_blk; + + int conv_blk = elnet_rss(lambda(i), diag_R, Rb, z_blk, + thr, beta_blk, Rbeta_blk, maxiter); + beta.subvec(s, e) = beta_blk; + out = std::min(out, conv_blk); + } + + beta_mat.col(i) = beta; + conv_vec(i) = out; + + // Compute loss = beta'R beta - 2 z'beta (block-wise) + double loss = -2.0 * arma::dot(z, beta); + for (int b = 0; b < n_blocks; b++) { + arma::mat Rb = Rcpp::as(LD[b]); + int s = block_start[b]; + int e = block_end[b]; + arma::vec beta_blk = beta.subvec(s, e); + loss += arma::as_scalar(beta_blk.t() * Rb * beta_blk); + } + loss_vec(i) = loss; + fbeta_vec(i) = loss + 2.0 * lambda(i) * arma::sum(arma::abs(beta)); + } + + return List::create( + Named("beta") = beta_mat, + Named("lambda") = lambda, + Named("conv") = conv_vec, + Named("loss") = loss_vec, + Named("fbeta") = fbeta_vec + ); +} diff --git a/tests/testthat/test_regularized_regression.R b/tests/testthat/test_regularized_regression.R index 4238259e..3c2e2f14 100644 --- a/tests/testthat/test_regularized_regression.R +++ b/tests/testthat/test_regularized_regression.R @@ -786,3 +786,84 @@ test_that("mrash_weights with init_prior_sd = FALSE passes NULL sa2", { expect_true(is.numeric(result)) expect_equal(length(result), p) }) + +# ---- lassosum_rss ---- +test_that("lassosum_rss errors on invalid LD input", { + expect_error(lassosum_rss(bhat = rnorm(5), LD = "not_a_list", n = 100), + "valid list of LD blocks") +}) + +test_that("lassosum_rss errors on non-positive sample size", { + expect_error(lassosum_rss(bhat = rnorm(5), LD = list(blk1 = diag(5)), n = -1), + "valid sample size") +}) + +test_that("lassosum_rss errors on mismatched bhat and LD dimensions", { + expect_error( + lassosum_rss(bhat = rnorm(10), LD = list(blk1 = diag(5)), n = 100), + "same as the sum" + ) +}) + +test_that("lassosum_rss runs successfully with valid input", { + set.seed(42) + p <- 10 + n <- 100 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + for (i in 1:(p - 1)) { + R[i, i + 1] <- 0.3 + R[i + 1, i] <- 0.3 + } + result <- lassosum_rss(bhat = bhat, LD = list(blk1 = R), n = n) + expect_type(result, "list") + expect_true("beta_est" %in% names(result)) + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) + expect_equal(nrow(result$beta), p) + expect_equal(ncol(result$beta), 20) +}) + +test_that("lassosum_rss accepts multiple LD blocks", { + set.seed(42) + p1 <- 5 + p2 <- 5 + p <- p1 + p2 + n <- 100 + bhat <- rnorm(p, sd = 0.1) + R1 <- diag(p1) + R2 <- diag(p2) + result <- lassosum_rss(bhat = bhat, LD = list(blk1 = R1, blk2 = R2), n = n) + expect_type(result, "list") + expect_true("beta_est" %in% names(result)) + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) +}) + +test_that("lassosum_rss with large lambda gives all-zero betas", { + set.seed(42) + p <- 10 + n <- 100 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + result <- lassosum_rss(bhat = bhat, LD = list(blk1 = R), n = n, + lambda = c(100)) + expect_true(all(result$beta_est == 0)) +}) + +# ---- lassosum_rss_weights (wrapper) ---- +test_that("lassosum_rss_weights calls lassosum_rss and returns beta_est", { + set.seed(42) + p <- 10 + n <- 100 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + for (i in 1:(p - 1)) { + R[i, i + 1] <- 0.3 + R[i + 1, i] <- 0.3 + } + stat <- list(b = bhat, n = rep(n, p)) + result <- lassosum_rss_weights(stat = stat, LD = R) + expect_equal(length(result), p) + expect_true(is.numeric(result)) +})