From 0b8e80735525619813fed8c19dc5451abd779cd1 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Tue, 7 Apr 2026 15:45:51 -0400 Subject: [PATCH] Fix 6 pre-existing bugs across pipeline 1. pval_hmp: remove unique() that silently discarded duplicate p-values, changing both the count L and the harmonic mean. 2. rss_basic_qc: convert skip_region start/end to integer after separate(). Was doing lexicographic comparison ("9" > "10" = TRUE). 3. allele_qc: use duplicated() instead of vec_duplicate_detect() to keep the first occurrence of duplicates. The old code removed ALL copies (including first), silently losing data. Now warns with count of removed duplicates. 4. compute_LD population method: add warning when missingness differs by >10% across columns, since the N-denominator approximation becomes biased with heterogeneous missingness. 5. gigrnd: remove the [0,1] clamp from inside the GIG sampler (which is a general-purpose distribution, not bounded at 1). Move psi clamping to after the GIG call in the MCMC loop via arma::clamp(psi, 0, 1), matching original Python PRS-CS exactly. 6. sdpr: add M >= 4 guard in R wrapper. With M < 4, sample_V() does out-of-bounds array access (a[M-2] with vector size M-1). Co-Authored-By: Claude Opus 4.6 (1M context) --- R/allele_qc.R | 5 +++-- R/misc.R | 22 ++++++++++++++++++---- R/regularized_regression.R | 5 +++++ R/sumstats_qc.R | 3 ++- src/prscs_mcmc.h | 8 ++++---- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/R/allele_qc.R b/R/allele_qc.R index 93335ce9..0c4f787e 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -169,10 +169,11 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, result <- match_result[match_result$keep, , drop = FALSE] if (remove_dups) { - dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) + dups <- duplicated(result[, c("chrom", "pos", "variants_id_qced")]) if (any(dups)) { + n_removed <- sum(dups) + warning(sprintf("Removed %d duplicate variant(s), keeping first occurrence.", n_removed)) result <- result[!dups, , drop = FALSE] - warning("Duplicate variants were removed.") } } diff --git a/R/misc.R b/R/misc.R index 43ffa1e9..0adedaca 100644 --- a/R/misc.R +++ b/R/misc.R @@ -33,9 +33,8 @@ pval_hmp <- function(pvals) { stop("To use this function, please install harmonicmeanp: https://cran.r-project.org/web/packages/harmonicmeanp/index.html") } # https://search.r-project.org/CRAN/refmans/harmonicmeanp/html/pLandau.html - pvalues <- unique(pvals) - L <- length(pvalues) - HMP <- L / sum(pvalues^-1) + L <- length(pvals) + HMP <- L / sum(pvals^-1) LOC_L1 <- 0.874367040387922 SCALE <- 1.5707963267949 @@ -230,7 +229,22 @@ compute_LD <- function(X, method = c("sample", "population"), col_means <- colMeans(X, na.rm = TRUE) # Population variance: E[X^2] - E[X]^2 col_vars <- colMeans(X^2, na.rm = TRUE) - col_means^2 - # Center; set NA -> 0 so missing pairs don't contribute to cross-products + # Center; set NA -> 0 so missing pairs don't contribute to cross-products. + # NOTE: the covariance divides by total N (not pairwise non-missing counts), + # which is an approximation that assumes uniform missingness across SNPs. + # With heterogeneous missingness, correlations between high-missing and + # low-missing columns will be slightly deflated. This matches the GCTA + # convention and is standard for PLINK-style LD computation. + if (anyNA(X)) { + na_rates <- colMeans(is.na(X)) + if (max(na_rates) - min(na_rates) > 0.1) { + warning("Population LD method with heterogeneous missingness ", + "(max NA rate ", round(max(na_rates), 3), + ", min ", round(min(na_rates), 3), + "): correlations may be biased. Consider using method='sample' ", + "which handles missingness via mean imputation.") + } + } X_c <- sweep(X, 2, col_means) X_c[is.na(X_c)] <- 0 # Covariance with N denominator diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 53e97c43..e63ff495 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -214,6 +214,11 @@ sdpr <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = stop("The total sample size 'n' must be a positive integer.") } + # M must be >= 4 (SDPR uses M-2 indexing in sample_V; M < 4 causes buffer overflow) + if (M < 4) { + stop("'M' must be at least 4.") + } + # Check if per_variant_sample_size vector contains only positive values (if provided) if (!is.null(per_variant_sample_size) && any(per_variant_sample_size <= 0)) { stop("The 'per_variant_sample_size' vector must contain only positive values.") diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index 615e4057..2200634e 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -36,7 +36,8 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, keep_indel = TRU if (!is.null(skip_region)) { skip_table <- tibble(region = skip_region) %>% - separate(region, into = c("chrom", "start", "end"), sep = "[-:]") + separate(region, into = c("chrom", "start", "end"), sep = "[-:]") %>% + mutate(start = as.integer(start), end = as.integer(end)) skip_variant <- c() for (region_index in 1:nrow(skip_table)) { diff --git a/src/prscs_mcmc.h b/src/prscs_mcmc.h index 468ed44c..ac8b9f8f 100644 --- a/src/prscs_mcmc.h +++ b/src/prscs_mcmc.h @@ -164,13 +164,10 @@ double gigrnd(double p, double a, double b, std::mt19937& rng) { double result = rnd / std::sqrt(a / b); - // Check if the result is zero and replace it with a small value + // Guard against exact zero (can happen with extreme parameters) if (result == 0.0) { result = std::numeric_limits::min(); } - if (result > 1.0) { - result = 1.0; - } return result; } @@ -280,6 +277,9 @@ std::map prs_cs_mcmc(double a, double b, double* phi, for (int jj = 0; jj < p; ++jj) { psi(jj) = gigrnd(a - 0.5, 2.0 * delta(jj), n * std::pow(beta(jj), 2) / sigma, rng); } + // Clamp psi to [0, 1] — matches original PRS-CS (Ge et al.): + // psi[psi>1] = 1.0 (prevents explosive local shrinkage) + psi = arma::clamp(psi, 0.0, 1.0); if (phi_updt) { std::gamma_distribution gamma_dist_phi(1.0, 1.0 / (*phi + 1.0));