diff --git a/R/allele_qc.R b/R/allele_qc.R index 93335ce9..0c4f787e 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -169,10 +169,11 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, result <- match_result[match_result$keep, , drop = FALSE] if (remove_dups) { - dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) + dups <- duplicated(result[, c("chrom", "pos", "variants_id_qced")]) if (any(dups)) { + n_removed <- sum(dups) + warning(sprintf("Removed %d duplicate variant(s), keeping first occurrence.", n_removed)) result <- result[!dups, , drop = FALSE] - warning("Duplicate variants were removed.") } } diff --git a/R/misc.R b/R/misc.R index 43ffa1e9..0adedaca 100644 --- a/R/misc.R +++ b/R/misc.R @@ -33,9 +33,8 @@ pval_hmp <- function(pvals) { stop("To use this function, please install harmonicmeanp: https://cran.r-project.org/web/packages/harmonicmeanp/index.html") } # https://search.r-project.org/CRAN/refmans/harmonicmeanp/html/pLandau.html - pvalues <- unique(pvals) - L <- length(pvalues) - HMP <- L / sum(pvalues^-1) + L <- length(pvals) + HMP <- L / sum(pvals^-1) LOC_L1 <- 0.874367040387922 SCALE <- 1.5707963267949 @@ -230,7 +229,22 @@ compute_LD <- function(X, method = c("sample", "population"), col_means <- colMeans(X, na.rm = TRUE) # Population variance: E[X^2] - E[X]^2 col_vars <- colMeans(X^2, na.rm = TRUE) - col_means^2 - # Center; set NA -> 0 so missing pairs don't contribute to cross-products + # Center; set NA -> 0 so missing pairs don't contribute to cross-products. + # NOTE: the covariance divides by total N (not pairwise non-missing counts), + # which is an approximation that assumes uniform missingness across SNPs. + # With heterogeneous missingness, correlations between high-missing and + # low-missing columns will be slightly deflated. This matches the GCTA + # convention and is standard for PLINK-style LD computation. + if (anyNA(X)) { + na_rates <- colMeans(is.na(X)) + if (max(na_rates) - min(na_rates) > 0.1) { + warning("Population LD method with heterogeneous missingness ", + "(max NA rate ", round(max(na_rates), 3), + ", min ", round(min(na_rates), 3), + "): correlations may be biased. Consider using method='sample' ", + "which handles missingness via mean imputation.") + } + } X_c <- sweep(X, 2, col_means) X_c[is.na(X_c)] <- 0 # Covariance with N denominator diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 53e97c43..e63ff495 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -214,6 +214,11 @@ sdpr <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = stop("The total sample size 'n' must be a positive integer.") } + # M must be >= 4 (SDPR uses M-2 indexing in sample_V; M < 4 causes buffer overflow) + if (M < 4) { + stop("'M' must be at least 4.") + } + # Check if per_variant_sample_size vector contains only positive values (if provided) if (!is.null(per_variant_sample_size) && any(per_variant_sample_size <= 0)) { stop("The 'per_variant_sample_size' vector must contain only positive values.") diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index 615e4057..2200634e 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -36,7 +36,8 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, keep_indel = TRU if (!is.null(skip_region)) { skip_table <- tibble(region = skip_region) %>% - separate(region, into = c("chrom", "start", "end"), sep = "[-:]") + separate(region, into = c("chrom", "start", "end"), sep = "[-:]") %>% + mutate(start = as.integer(start), end = as.integer(end)) skip_variant <- c() for (region_index in 1:nrow(skip_table)) { diff --git a/src/prscs_mcmc.h b/src/prscs_mcmc.h index 468ed44c..ac8b9f8f 100644 --- a/src/prscs_mcmc.h +++ b/src/prscs_mcmc.h @@ -164,13 +164,10 @@ double gigrnd(double p, double a, double b, std::mt19937& rng) { double result = rnd / std::sqrt(a / b); - // Check if the result is zero and replace it with a small value + // Guard against exact zero (can happen with extreme parameters) if (result == 0.0) { result = std::numeric_limits::min(); } - if (result > 1.0) { - result = 1.0; - } return result; } @@ -280,6 +277,9 @@ std::map prs_cs_mcmc(double a, double b, double* phi, for (int jj = 0; jj < p; ++jj) { psi(jj) = gigrnd(a - 0.5, 2.0 * delta(jj), n * std::pow(beta(jj), 2) / sigma, rng); } + // Clamp psi to [0, 1] — matches original PRS-CS (Ge et al.): + // psi[psi>1] = 1.0 (prevents explosive local shrinkage) + psi = arma::clamp(psi, 0.0, 1.0); if (phi_updt) { std::gamma_distribution gamma_dist_phi(1.0, 1.0 / (*phi + 1.0));