diff --git a/NAMESPACE b/NAMESPACE index 16414fcc..0ee1d142 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(bayes_n_weights) export(bayes_r_rss_weights) export(bayes_r_weights) export(calculate_xi_correlation) +export(check_ld) export(clean_context_names) export(coloc_post_processor) export(coloc_wrapper) @@ -27,7 +28,6 @@ export(colocboost_analysis_pipeline) export(compute_LD) export(compute_qtl_enrichment) export(ctwas_bimfile_loader) -export(ctwas_ld_loader) export(dentist) export(dentist_from_files) export(dentist_single_window) @@ -52,7 +52,6 @@ export(glmnet_weights) export(harmonize_gwas) export(harmonize_twas) export(invert_minmax_scaling) -export(check_ld) export(lasso_weights) export(lassosum_rss) export(lassosum_rss_weights) diff --git a/R/LD.R b/R/LD.R index 6d246da5..9d05dbee 100644 --- a/R/LD.R +++ b/R/LD.R @@ -127,27 +127,55 @@ get_regional_ld_meta <- function(ld_reference_meta_file, region, complete_covera #' @importFrom utils read.table #' @importFrom stats setNames #' @noRd -process_LD_matrix <- function(LD_file_path, bim_file_path) { +process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) { # Read .cor.xz matrix LD_file_con <- xzfile(LD_file_path) LD_matrix <- scan(LD_file_con, quiet = TRUE) close(LD_file_con) LD_matrix <- matrix(LD_matrix, ncol = sqrt(length(LD_matrix)), byrow = TRUE) - # Read bim (variant metadata) - bim_file_name <- if (!is.null(bim_file_path)) bim_file_path else paste0(LD_file_path, ".bim") - LD_variants <- read.table(bim_file_name) - col_names <- if (ncol(LD_variants) == 9) { - c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") - } else if (ncol(LD_variants) == 6) { - c("chrom", "variants", "GD", "pos", "A1", "A2") + # Auto-detect variant metadata file: .bim (PLINK1) or .pvar/.pvar.zst (PLINK2) + if (is.null(snp_file_path)) { + candidates <- paste0(LD_file_path, c(".bim", ".pvar", ".pvar.zst")) + found <- candidates[file.exists(candidates)] + if (length(found) == 0) stop("No variant file found for: ", LD_file_path, + " (tried .bim, .pvar, .pvar.zst)") + snp_file_path <- found[1] + } + + # Detect format by extension or header + is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path) + if (!is_pvar) { + # Check if first line starts with #CHROM (pvar header) + first_line <- readLines(snp_file_path, n = 1) + is_pvar <- grepl("^#CHROM", first_line) + } + + if (is_pvar) { + # PLINK2 .pvar format: read via existing read_pvar_text() + LD_variants <- read_pvar_text(snp_file_path) + # read_pvar_text returns: chrom, id, pos, A2 (REF), A1 (ALT) + LD_variants <- LD_variants %>% + mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), + variants = normalize_variant_id(id)) %>% + rename(GD = pos) # reuse pos column for ordering + LD_variants$GD <- LD_variants$pos <- as.integer( + sapply(LD_variants$variants, function(v) strsplit(v, ":")[[1]][2])) } else { - stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", bim_file_name) + # PLINK1 .bim format + LD_variants <- read.table(snp_file_path) + col_names <- if (ncol(LD_variants) == 9) { + c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") + } else if (ncol(LD_variants) == 6) { + c("chrom", "variants", "GD", "pos", "A1", "A2") + } else { + stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", snp_file_path) + } + LD_variants <- LD_variants %>% + setNames(col_names) %>% + mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), + variants = normalize_variant_id(variants)) } - LD_variants <- LD_variants %>% - setNames(col_names) %>% - mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), - variants = normalize_variant_id(variants)) # Label and symmetrize the matrix colnames(LD_matrix) <- rownames(LD_matrix) <- LD_variants$variants diff --git a/R/ctwas_wrapper.R b/R/ctwas_wrapper.R index ec626729..90425819 100644 --- a/R/ctwas_wrapper.R +++ b/R/ctwas_wrapper.R @@ -1,11 +1,3 @@ -#' Utility function to load LD in ctwas analyses, to interface with cTWAS package -#' @param ld_matrix_file_path A string of file path to the LD matrix. -#' @export -ctwas_ld_loader <- function(ld_matrix_file_path) { - ld_loaded <- process_LD_matrix(ld_matrix_file_path, paste0(ld_matrix_file_path, ".bim")) - ld_loaded <- ld_loaded$LD_matrix - return(ld_loaded) -} #' @importFrom vroom vroom #' @export ctwas_bimfile_loader <- function(bim_file_path) { diff --git a/R/ld_loader.R b/R/ld_loader.R index d0c357d1..7b2c4640 100644 --- a/R/ld_loader.R +++ b/R/ld_loader.R @@ -4,31 +4,36 @@ #' demand. This avoids loading all blocks into memory simultaneously, #' which is critical for genome-wide analyses with hundreds of blocks. #' -#' Three modes are supported: +#' Four modes are supported: #' #' \describe{ #' \item{list mode (R)}{Pre-loaded list of LD correlation matrices. #' Simple but uses more memory. Set \code{R_list}.} #' \item{list mode (X)}{Pre-loaded list of genotype matrices (n x p_g). #' Set \code{X_list}.} -#' \item{file mode}{Loads LD from a pecotmr metadata TSV file on the fly +#' \item{region mode}{Loads LD from a pecotmr metadata TSV file on the fly #' via \code{\link{load_LD_matrix}}. Memory-efficient for large datasets. #' Set \code{ld_meta_path} and \code{regions}.} +#' \item{LD_info mode}{Loads pre-computed LD blocks from \code{.cor.xz} +#' files listed in an \code{LD_info} data.frame (as returned by +#' \code{\link{get_ctwas_meta_data}}). Set \code{LD_info}.} #' } #' #' @param R_list List of G precomputed LD correlation matrices (p_g x p_g). -#' Mutually exclusive with \code{X_list} and \code{ld_meta_path}. -#' @param X_list List of G genotype matrices (n x p_g). Mutually exclusive -#' with \code{R_list} and \code{ld_meta_path}. +#' @param X_list List of G genotype matrices (n x p_g). #' @param ld_meta_path Path to a pecotmr LD metadata TSV file (as used by -#' \code{\link{load_LD_matrix}}). Mutually exclusive with \code{R_list} -#' and \code{X_list}. +#' \code{\link{load_LD_matrix}}). #' @param regions Character vector of G region strings (e.g., #' \code{"chr22:17238266-19744294"}). Required when \code{ld_meta_path} #' is used. -#' @param return_genotype Logical. When using file mode, return the +#' @param LD_info A data.frame with column \code{LD_file} (paths to +#' \code{.cor.xz} LD matrix files) and optionally \code{SNP_file} +#' (paths to companion \code{.bim} files; defaults to +#' \code{paste0(LD_file, ".bim")} if absent). As returned by +#' \code{\link{get_ctwas_meta_data}}. +#' @param return_genotype Logical. When using region mode, return the #' genotype matrix X (\code{TRUE}) or LD correlation R (\code{FALSE}, -#' default). Returning X enables low-rank RSS paths in downstream methods. +#' default). #' @param max_variants Integer or \code{NULL}. If set, randomly subsample #' blocks larger than this to control memory usage. #' @@ -46,12 +51,14 @@ #' @export ld_loader <- function(R_list = NULL, X_list = NULL, ld_meta_path = NULL, regions = NULL, + LD_info = NULL, return_genotype = FALSE, max_variants = NULL) { # Validate: exactly one source - n_sources <- sum(!is.null(R_list), !is.null(X_list), !is.null(ld_meta_path)) + n_sources <- sum(!is.null(R_list), !is.null(X_list), + !is.null(ld_meta_path), !is.null(LD_info)) if (n_sources != 1) - stop("Provide exactly one of R_list, X_list, or ld_meta_path.") + stop("Provide exactly one of R_list, X_list, ld_meta_path, or LD_info.") if (!is.null(R_list)) { # List mode (R matrices) @@ -73,8 +80,8 @@ ld_loader <- function(R_list = NULL, X_list = NULL, } X } - } else { - # File mode: load on the fly via load_LD_matrix() + } else if (!is.null(ld_meta_path)) { + # Region mode: load on the fly via load_LD_matrix() if (is.null(regions)) stop("'regions' is required when using ld_meta_path.") @@ -97,6 +104,44 @@ ld_loader <- function(R_list = NULL, X_list = NULL, } mat } + } else { + # LD_info mode: load LD blocks by index from file paths + # Supports all three formats: + # 1. Pre-computed .cor.xz + .bim/.pvar (custom block format) + # 2. PLINK1 prefix (.bed/.bim/.fam) — LD computed on the fly + # 3. PLINK2 prefix (.pgen/.pvar/.psam) — LD computed on the fly + if (!is.data.frame(LD_info) || !"LD_file" %in% colnames(LD_info)) + stop("LD_info must be a data.frame with column 'LD_file'.") + + loader <- function(g) { + ld_path <- LD_info$LD_file[g] + + # Auto-detect format by checking what files exist + if (has_plink2_files(ld_path)) { + # PLINK2: load genotypes and compute LD + geno <- load_genotype_region(ld_path) + mat <- compute_LD(geno) + } else if (has_plink1_files(ld_path)) { + # PLINK1: load genotypes and compute LD + geno <- load_genotype_region(ld_path) + mat <- compute_LD(geno) + } else { + # Pre-computed .cor.xz block + snp_file <- if ("SNP_file" %in% colnames(LD_info)) { + LD_info$SNP_file[g] + } else { + NULL # let process_LD_matrix auto-detect .bim/.pvar/.pvar.zst + } + ld <- process_LD_matrix(ld_path, snp_file) + mat <- ld$LD_matrix + } + + if (!is.null(max_variants) && ncol(mat) > max_variants) { + keep <- sort(sample(ncol(mat), max_variants)) + mat <- mat[keep, keep] + } + mat + } } loader diff --git a/man/check_ld.Rd b/man/check_ld.Rd new file mode 100644 index 00000000..cb136c53 --- /dev/null +++ b/man/check_ld.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LD.R +\name{check_ld} +\alias{check_ld} +\title{Check and optionally repair LD matrix quality} +\usage{ +check_ld( + R, + method = c("check", "shrink", "eigenfix"), + r_tol = 1e-08, + shrinkage = 0.01 +) +} +\arguments{ +\item{R}{Symmetric correlation matrix.} + +\item{method}{One of \code{"check"}, \code{"shrink"}, or \code{"eigenfix"}.} + +\item{r_tol}{Eigenvalue tolerance. Eigenvalues with absolute value below +\code{r_tol} are treated as zero. Default: \code{1e-8}.} + +\item{shrinkage}{Shrinkage parameter for \code{method = "shrink"}. +Default: \code{0.01}.} +} +\value{ +A list with components: +\describe{ + \item{R}{The (possibly repaired) LD matrix.} + \item{is_pd}{Logical: is the matrix positive definite?} + \item{is_psd}{Logical: is the matrix positive semidefinite (within r_tol)?} + \item{min_eigenvalue}{Smallest eigenvalue of the original matrix.} + \item{n_negative}{Number of negative eigenvalues (below -r_tol).} + \item{condition_number}{Ratio of largest to smallest positive eigenvalue + (\code{Inf} if any eigenvalue is zero).} + \item{method_applied}{Character: \code{"none"}, \code{"shrink"}, or + \code{"eigenfix"}.} +} +} +\description{ +Diagnoses positive-definiteness of an LD correlation matrix and optionally +repairs it. Downstream methods like PRS-CS require positive-definite LD +(Cholesky decomposition), while others (lassosum, SDPR) handle non-PD +matrices internally via their own regularization. +} +\details{ +Three modes are available: +\describe{ + \item{\code{"check"}}{Diagnostic only — returns eigenvalue statistics + without modifying the matrix.} + \item{\code{"shrink"}}{Apply shrinkage toward identity: + \code{R_s = (1 - shrinkage) * R + shrinkage * I}. Simple and fast; + always produces a positive-definite matrix when \code{shrinkage > 0}.} + \item{\code{"eigenfix"}}{Set negative eigenvalues to zero and + reconstruct the matrix. Matches the approach used in susieR's + \code{rss_lambda_constructor} and is the closest positive + semidefinite matrix in the Frobenius norm. Does not inflate the + diagonal like shrinkage does.} +} +} +\examples{ +# A well-conditioned matrix +R_good <- diag(5) +check_ld(R_good)$is_pd # TRUE + +# A matrix with negative eigenvalues +R_bad <- matrix(0.9, 3, 3); diag(R_bad) <- 1; R_bad[1,3] <- R_bad[3,1] <- -0.5 +check_ld(R_bad)$is_psd # FALSE +R_fixed <- check_ld(R_bad, method = "eigenfix")$R +check_ld(R_fixed)$is_psd # TRUE + +} diff --git a/man/ctwas_ld_loader.Rd b/man/ctwas_ld_loader.Rd deleted file mode 100644 index 66c43f3e..00000000 --- a/man/ctwas_ld_loader.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ctwas_wrapper.R -\name{ctwas_ld_loader} -\alias{ctwas_ld_loader} -\title{Utility function to load LD in ctwas analyses, to interface with cTWAS package} -\usage{ -ctwas_ld_loader(ld_matrix_file_path) -} -\arguments{ -\item{ld_matrix_file_path}{A string of file path to the LD matrix.} -} -\description{ -Utility function to load LD in ctwas analyses, to interface with cTWAS package -} diff --git a/man/ld_loader.Rd b/man/ld_loader.Rd index fe8351a2..03955fb5 100644 --- a/man/ld_loader.Rd +++ b/man/ld_loader.Rd @@ -9,28 +9,32 @@ ld_loader( X_list = NULL, ld_meta_path = NULL, regions = NULL, + LD_info = NULL, return_genotype = FALSE, max_variants = NULL ) } \arguments{ -\item{R_list}{List of G precomputed LD correlation matrices (p_g x p_g). -Mutually exclusive with \code{X_list} and \code{ld_meta_path}.} +\item{R_list}{List of G precomputed LD correlation matrices (p_g x p_g).} -\item{X_list}{List of G genotype matrices (n x p_g). Mutually exclusive -with \code{R_list} and \code{ld_meta_path}.} +\item{X_list}{List of G genotype matrices (n x p_g).} \item{ld_meta_path}{Path to a pecotmr LD metadata TSV file (as used by -\code{\link{load_LD_matrix}}). Mutually exclusive with \code{R_list} -and \code{X_list}.} +\code{\link{load_LD_matrix}}).} \item{regions}{Character vector of G region strings (e.g., \code{"chr22:17238266-19744294"}). Required when \code{ld_meta_path} is used.} -\item{return_genotype}{Logical. When using file mode, return the +\item{LD_info}{A data.frame with column \code{LD_file} (paths to +\code{.cor.xz} LD matrix files) and optionally \code{SNP_file} +(paths to companion \code{.bim} files; defaults to +\code{paste0(LD_file, ".bim")} if absent). As returned by +\code{\link{get_ctwas_meta_data}}.} + +\item{return_genotype}{Logical. When using region mode, return the genotype matrix X (\code{TRUE}) or LD correlation R (\code{FALSE}, -default). Returning X enables low-rank RSS paths in downstream methods.} +default).} \item{max_variants}{Integer or \code{NULL}. If set, randomly subsample blocks larger than this to control memory usage.} @@ -45,16 +49,19 @@ demand. This avoids loading all blocks into memory simultaneously, which is critical for genome-wide analyses with hundreds of blocks. } \details{ -Three modes are supported: +Four modes are supported: \describe{ \item{list mode (R)}{Pre-loaded list of LD correlation matrices. Simple but uses more memory. Set \code{R_list}.} \item{list mode (X)}{Pre-loaded list of genotype matrices (n x p_g). Set \code{X_list}.} - \item{file mode}{Loads LD from a pecotmr metadata TSV file on the fly + \item{region mode}{Loads LD from a pecotmr metadata TSV file on the fly via \code{\link{load_LD_matrix}}. Memory-efficient for large datasets. Set \code{ld_meta_path} and \code{regions}.} + \item{LD_info mode}{Loads pre-computed LD blocks from \code{.cor.xz} + files listed in an \code{LD_info} data.frame (as returned by + \code{\link{get_ctwas_meta_data}}). Set \code{LD_info}.} } } \examples{ diff --git a/man/otters_weights.Rd b/man/otters_weights.Rd index 6f7ed755..d9d8bd25 100644 --- a/man/otters_weights.Rd +++ b/man/otters_weights.Rd @@ -11,7 +11,8 @@ otters_weights( methods = list(lassosum_rss = list(), prs_cs = list(phi = 1e-04, n_iter = 1000, n_burnin = 500, thin = 5), sdpr = list(iter = 1000, burn = 200, thin = 1, verbose = FALSE)), - p_thresholds = c(0.001, 0.05) + p_thresholds = c(0.001, 0.05), + check_ld_method = "eigenfix" ) } \arguments{ @@ -40,6 +41,11 @@ To add learners (e.g., \code{mr_ash_rss}), simply append to this list.} \item{p_thresholds}{Numeric vector of p-value thresholds for P+T. Set to \code{NULL} to skip P+T. Default: \code{c(0.001, 0.05)}.} + +\item{check_ld_method}{LD quality check method passed to \code{\link{check_ld}}. +Default \code{"eigenfix"} sets negative eigenvalues to zero (required for +PRS-CS Cholesky, matching OTTERS' SVD-based PD forcing). Set to \code{NULL} +to skip checking.} } \value{ A named list of weight vectors (one per method). Each vector has length