From 0b8e80735525619813fed8c19dc5451abd779cd1 Mon Sep 17 00:00:00 2001
From: Gao Wang <wangow@gmail.com>
Date: Tue, 7 Apr 2026 15:45:51 -0400
Subject: [PATCH] Fix 6 pre-existing bugs across pipeline

1. pval_hmp: remove unique() that silently discarded duplicate
   p-values, changing both the count L and the harmonic mean.

2. rss_basic_qc: convert skip_region start/end to integer after
   separate(). Was doing lexicographic comparison ("9" > "10" = TRUE).

3. allele_qc: use duplicated() instead of vec_duplicate_detect() to
   keep the first occurrence of duplicates. The old code removed ALL
   copies (including first), silently losing data. Now warns with
   count of removed duplicates.

4. compute_LD population method: add warning when missingness differs
   by >10% across columns, since the N-denominator approximation
   becomes biased with heterogeneous missingness.

5. gigrnd: remove the [0,1] clamp from inside the GIG sampler
   (which is a general-purpose distribution, not bounded at 1).
   Move psi clamping to after the GIG call in the MCMC loop via
   arma::clamp(psi, 0, 1), matching original Python PRS-CS exactly.

6. sdpr: add M >= 4 guard in R wrapper. With M < 4, sample_V()
   does out-of-bounds array access (a[M-2] with vector size M-1).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 R/allele_qc.R              |  5 +++--
 R/misc.R                   | 22 ++++++++++++++++++----
 R/regularized_regression.R |  5 +++++
 R/sumstats_qc.R            |  3 ++-
 src/prscs_mcmc.h           |  8 ++++----
 5 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/R/allele_qc.R b/R/allele_qc.R
index 93335ce9..0c4f787e 100644
--- a/R/allele_qc.R
+++ b/R/allele_qc.R
@@ -169,10 +169,11 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
   result <- match_result[match_result$keep, , drop = FALSE]
 	
   if (remove_dups) {
-	dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")])
+	dups <- duplicated(result[, c("chrom", "pos", "variants_id_qced")])
 	if (any(dups)) {
+	  n_removed <- sum(dups)
+	  warning(sprintf("Removed %d duplicate variant(s), keeping first occurrence.", n_removed))
 	  result <- result[!dups, , drop = FALSE]
-	  warning("Duplicate variants were removed.")
 	}
   }
 
diff --git a/R/misc.R b/R/misc.R
index 43ffa1e9..0adedaca 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -33,9 +33,8 @@ pval_hmp <- function(pvals) {
     stop("To use this function, please install harmonicmeanp: https://cran.r-project.org/web/packages/harmonicmeanp/index.html")
   }
   # https://search.r-project.org/CRAN/refmans/harmonicmeanp/html/pLandau.html
-  pvalues <- unique(pvals)
-  L <- length(pvalues)
-  HMP <- L / sum(pvalues^-1)
+  L <- length(pvals)
+  HMP <- L / sum(pvals^-1)
 
   LOC_L1 <- 0.874367040387922
   SCALE <- 1.5707963267949
@@ -230,7 +229,22 @@ compute_LD <- function(X, method = c("sample", "population"),
     col_means <- colMeans(X, na.rm = TRUE)
     # Population variance: E[X^2] - E[X]^2
     col_vars <- colMeans(X^2, na.rm = TRUE) - col_means^2
-    # Center; set NA -> 0 so missing pairs don't contribute to cross-products
+    # Center; set NA -> 0 so missing pairs don't contribute to cross-products.
+    # NOTE: the covariance divides by total N (not pairwise non-missing counts),
+    # which is an approximation that assumes uniform missingness across SNPs.
+    # With heterogeneous missingness, correlations between high-missing and
+    # low-missing columns will be slightly deflated. This matches the GCTA
+    # convention and is standard for PLINK-style LD computation.
+    if (anyNA(X)) {
+      na_rates <- colMeans(is.na(X))
+      if (max(na_rates) - min(na_rates) > 0.1) {
+        warning("Population LD method with heterogeneous missingness ",
+                "(max NA rate ", round(max(na_rates), 3),
+                ", min ", round(min(na_rates), 3),
+                "): correlations may be biased. Consider using method='sample' ",
+                "which handles missingness via mean imputation.")
+      }
+    }
     X_c <- sweep(X, 2, col_means)
     X_c[is.na(X_c)] <- 0
     # Covariance with N denominator
diff --git a/R/regularized_regression.R b/R/regularized_regression.R
index 53e97c43..e63ff495 100644
--- a/R/regularized_regression.R
+++ b/R/regularized_regression.R
@@ -214,6 +214,11 @@ sdpr <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a =
     stop("The total sample size 'n' must be a positive integer.")
   }
 
+  # M must be >= 4 (SDPR uses M-2 indexing in sample_V; M < 4 causes buffer overflow)
+  if (M < 4) {
+    stop("'M' must be at least 4.")
+  }
+
   # Check if per_variant_sample_size vector contains only positive values (if provided)
   if (!is.null(per_variant_sample_size) && any(per_variant_sample_size <= 0)) {
     stop("The 'per_variant_sample_size' vector must contain only positive values.")
diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R
index 615e4057..2200634e 100644
--- a/R/sumstats_qc.R
+++ b/R/sumstats_qc.R
@@ -36,7 +36,8 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, keep_indel = TRU
 
   if (!is.null(skip_region)) {
     skip_table <- tibble(region = skip_region) %>%
-      separate(region, into = c("chrom", "start", "end"), sep = "[-:]")
+      separate(region, into = c("chrom", "start", "end"), sep = "[-:]") %>%
+      mutate(start = as.integer(start), end = as.integer(end))
 
     skip_variant <- c()
     for (region_index in 1:nrow(skip_table)) {
diff --git a/src/prscs_mcmc.h b/src/prscs_mcmc.h
index 468ed44c..ac8b9f8f 100644
--- a/src/prscs_mcmc.h
+++ b/src/prscs_mcmc.h
@@ -164,13 +164,10 @@ double gigrnd(double p, double a, double b, std::mt19937& rng) {
 
 	double result = rnd / std::sqrt(a / b);
 
-	// Check if the result is zero and replace it with a small value
+	// Guard against exact zero (can happen with extreme parameters)
 	if (result == 0.0) {
 		result = std::numeric_limits<double>::min();
 	}
-	if (result > 1.0) {
-		result = 1.0;
-	}
 
 	return result;
 }
@@ -280,6 +277,9 @@ std::map<std::string, arma::vec> prs_cs_mcmc(double a, double b, double* phi,
 		for (int jj = 0; jj < p; ++jj) {
 			psi(jj) = gigrnd(a - 0.5, 2.0 * delta(jj), n * std::pow(beta(jj), 2) / sigma, rng);
 		}
+		// Clamp psi to [0, 1] — matches original PRS-CS (Ge et al.):
+		// psi[psi>1] = 1.0 (prevents explosive local shrinkage)
+		psi = arma::clamp(psi, 0.0, 1.0);
 
 		if (phi_updt) {
 			std::gamma_distribution<double> gamma_dist_phi(1.0, 1.0 / (*phi + 1.0));