From c022e45527d3b97cef93e58c44d5e2df04bb3f97 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Tue, 17 Mar 2026 12:05:40 +0100 Subject: [PATCH 1/7] chore(DHIS2 format report): add copy for NER --- .../snt_dhis2_formatting_report_NER.ipynb | 1398 +++++++++++++++++ 1 file changed, 1398 insertions(+) create mode 100644 pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb new file mode 100644 index 0000000..dc316d8 --- /dev/null +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb @@ -0,0 +1,1398 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "47551f88-b40b-449f-9dc1-59db71183611", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 💡 Comments / Questions & To Do's:\n", + "# - filter by YEAR (keep only 2022-2024): \n", + "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", + "# 2. Is this a parameter in some other pipeline? if so, should be integrated here somehow \n", + "# - Missing data: why do we have NA values for population? 
Are these real NA (missing data) or 0?\n", + "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", + "# - Population categories (breaks): do we have a specific scale in mind \n", + "# (i.e., use same as another country) or can I set it based on the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "342b6b54-4812-4b07-b408-68a034b4014e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# TO DO / FINISH:\n", + "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", + "# - (maybe) also add meaningful messages\n", + "# - Add code to export PNG files of relevant figures\n", + "# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)\n", + "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" + ] + }, + { + "cell_type": "markdown", + "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5", + "metadata": {}, + "source": [ + "## 0. 
Paths and Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d3285c7-1a60-46ad-9541-36a703d51924", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "\n", + "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", + "\n", + "# Create output directories if they don't exist (before loading utils)\n", + "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", + "if (!dir.exists(figures_dir)) {\n", + " dir.create(figures_dir, recursive = TRUE)\n", + " print(paste0(\"Created figures directory: \", figures_dir))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "732733e7-8890-4c3e-be64-496fd4a2c800", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "required_packages <- c(\n", + " \"tidyverse\", \n", + " \"arrow\", \n", + " \"sf\", \n", + " \"reticulate\",\n", + " \"patchwork\"\n", + ") \n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20475dd9-5091-4f87-9ae2-d0235921fe94", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", 
+ "# Load openhexa.sdk\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f70d726-1c34-47dc-b963-bb23e42994bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90d58c60-fb4e-40e4-add8-5f258f541843", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# print function\n", + "printdim <- function(df, name = deparse(substitute(df))) {\n", + " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69", + "metadata": {}, + "source": [ + "## 1. 
Import data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import analytics DHIS2 data\n", + "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", + " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", + " stop(msg)\n", + " })\n", + "\n", + "printdim(routine_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " population_data <- NULL\n", + " })\n", + "\n", + "printdim(population_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes_data <- NULL\n", + " })\n", + "\n", + "printdim(shapes_data)" + ] + }, + { + "cell_type": "markdown", + "id": "c881f748-e391-46c9-a36a-ed11c238a6ce", + "metadata": {}, + "source": [] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", + "metadata": {}, + "source": [ + "# **Complétude des indicateurs composites**\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de", + "metadata": {}, + "source": [ + "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7691e61-6542-4d40-af2a-c018d29b86a8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(routine_data)" + ] + }, + { + "cell_type": "markdown", + "id": "c109e82d-8c72-41f0-857a-322163cf213e", + "metadata": {}, + "source": [ + "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Rename your data for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to DATE\n", + "data <- data %>%\n", + " mutate(\n", + " DATE = ymd(paste0(PERIOD, \"01\"))\n", + " )\n", + "\n", + "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", + "indicator_vars <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", + "))\n", + "\n", + "long_data <- data %>%\n", + " pivot_longer(cols = all_of(indicator_vars),\n", + " names_to = \"INDICATOR\",\n", + " values_to = \"VALUE\") %>%\n", + " rename(OU = OU_ID)\n", + "\n", + "# Step 3: Build expected full grid (OU × 
INDICATOR × DATE)\n", + "full_grid <- expand_grid(\n", + " OU = unique(long_data$OU),\n", + " INDICATOR = unique(long_data$INDICATOR),\n", + " DATE = unique(long_data$DATE)\n", + ")\n", + "\n", + "# Step 4: Join and assess reporting status\n", + "reporting_check <- full_grid %>%\n", + " left_join(\n", + " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", + " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", + " ) %>%\n", + " mutate(\n", + " is_missing = is.na(VALUE),\n", + " is_zero = VALUE == 0 & !is.na(VALUE),\n", + " is_positive = VALUE > 0 & !is.na(VALUE)\n", + " )\n", + "\n", + "# Step 5: Summarise reporting status\n", + "reporting_summary <- reporting_check %>%\n", + " group_by(INDICATOR, DATE) %>%\n", + " summarise(\n", + " n_total = n_distinct(OU),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Prepare plot-ready data\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(\n", + " cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\",\n", + " values_to = \"Percentage\"\n", + " ) %>%\n", + " mutate(\n", + " Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")\n", + " ) %>%\n", + " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "options(repr.plot.width = 17, repr.plot.height = 10)\n", + "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", + " geom_col(position = 
\"stack\") +\n", + " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", + " scale_y_continuous() +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", + " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", + " x = \"Mois\", y = \"% des formations sanitaires\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " strip.text = element_text(size = 16),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "e6871759-714b-437a-8b9c-5a5a06656567", + "metadata": {}, + "source": [ + "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Rename for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to proper Date\n", + "data <- data %>%\n", + " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", + "\n", + "# Step 2: Identify indicator columns\n", + "indicator_cols <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", + " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", + "))\n", + "\n", + "# Step 3: Reshape to long format\n", + "data_long <- data %>%\n", + " select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%\n", + " pivot_longer(cols = all_of(indicator_cols),\n", + " names_to = \"Indicator\", values_to = \"value\") %>%\n", + " mutate(value = as.numeric(value))\n", + "\n", + "# Step 4: Full expected grid at ADM2 level\n", + "full_grid <- expand_grid(\n", + " ADM2_ID = unique(data_long$ADM2_ID),\n", + " Indicator = unique(data_long$Indicator),\n", + " Date = unique(data_long$Date)\n", + ")\n", + "\n", + "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", + "reporting_check <- data_long %>%\n", + " group_by(ADM2_ID, Indicator, Date) %>%\n", + " summarise(\n", + " is_missing = all(is.na(value)),\n", + " is_zero = all(value == 0, na.rm = TRUE),\n", + " is_positive = any(value > 0, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Join with full grid to fill in missing ADM2s\n", + "reporting_full <- full_grid %>%\n", + " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", + " mutate(\n", + " is_missing = replace_na(is_missing, TRUE),\n", + " is_zero = replace_na(is_zero, FALSE),\n", + " is_positive = replace_na(is_positive, FALSE)\n", + " )\n", + "\n", + "# Step 7: Summarise by Indicator and Date\n", + 
"reporting_summary <- reporting_full %>%\n", + " group_by(Indicator, Date) %>%\n", + " summarise(\n", + " n_total = n_distinct(ADM2_ID),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero & !is_missing),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 8: Reshape for plotting\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\", values_to = \"Percentage\") %>%\n", + " mutate(Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")) %>%\n", + " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", + "\n", + "# Step 9: Plot\n", + "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " facet_wrap(~ Indicator, scales = \"free_y\") +\n", + " scale_y_continuous(limits = c(0, 100)) +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = \"Taux de rapportage par indicateur (niveau district)\",\n", + " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", + " x = \"Mois\", y = \"% des districts\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 18),\n", + " strip.text = element_text(size = 14),\n", + " axis.title = element_text(size = 14),\n", + " axis.text = element_text(size = 12)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"5cda3985", + "metadata": {}, + "source": [ + "# 2. Cohérence interne des indicateurs composites" + ] + }, + { + "cell_type": "markdown", + "id": "c131a633", + "metadata": {}, + "source": [ + "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", + "\n", + "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "936268f4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Function to detect outliers based on MAD method\n", + "detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = \"mad_flag\") {\n", + " data_long %>%\n", + " group_by(OU, indicator, YEAR) %>%\n", + " mutate(\n", + " median_val = median(value, na.rm = TRUE),\n", + " mad_val = mad(value, na.rm = TRUE),\n", + " \"{outlier_column}\" := value > (median_val + deviation * mad_val) | value < (median_val - deviation * mad_val)\n", + " ) %>%\n", + " ungroup()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881f9625", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Select relevant core indicators\n", + "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", + "\n", + "# Step 1: Convert wide to long format\n", + "routine_long <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(\n", + " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", + " OU = OU_ID # Alias for join 
clarity\n", + " )\n", + "\n", + "# Step 2: Filter to indicators of interest\n", + "routine_long_filtered <- routine_long %>%\n", + " filter(indicator %in% target_indicators)\n", + "\n", + "# Step 3: Calculate MAD15\n", + "mad15_data <- detect_mad_outliers(\n", + " routine_long_filtered,\n", + " deviation = 15,\n", + " outlier_column = \"mad15\"\n", + ")\n", + "\n", + "# Step 4: Calculate MAD10 (only where mad15 not flagged or missing)\n", + "mad10_flags <- mad15_data %>%\n", + " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", + " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", + "\n", + "# Step 5: Combine MAD15 and MAD10 results\n", + "mad_combined <- mad15_data %>%\n", + " left_join(\n", + " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", + " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04d41ed1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", + "outlier_flags <- mad_combined %>%\n", + " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", + " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", + " select(PERIOD, OU, indicator)\n", + "\n", + "# Step 7: Reshape routine_data to long format for filtering\n", + "routine_long_all <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(OU = OU_ID)\n", + "\n", + "# Step 8: Remove outliers\n", + "routine_long_clean <- routine_long_all %>%\n", + " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", + "\n", + "# Step 9: Reshape back to wide format if needed\n", + "routine_data_clean <- routine_long_clean %>%\n", + " select(-OU) %>%\n", + " pivot_wider(names_from = indicator, values_from = value)\n" + ] + }, + { + "cell_type": "markdown", + "id": "c6a5a77b", + "metadata": 
{}, + "source": [ + "## 2.2 Cohérence des indicateurs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cfeb18e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Extract year and month from PERIOD\n", + "routine_hd_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6)\n", + " ) %>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 2: Create scatter plots\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "\n", + "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", + " geom_point(alpha = 0.5, color = \"blue\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", + " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", + " geom_point(alpha = 0.5, color = \"purple\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "# Step 3: Combine plots\n", + "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "0df24272", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Aggregate monthly values\n", + "rds_clean_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6),\n", + " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", + " ) %>%\n", + " group_by(YEAR, MONTH, DATE) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 2: Plot monthly national trends\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "rds_clean_month %>%\n", + " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", + " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", + " geom_line(linewidth = 1.2) +\n", + " labs(\n", + " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", + " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16),\n", + " legend.title = element_text(size = 16),\n", + " legend.text = element_text(size = 16)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48", + "metadata": {}, + "source": [ + "# 3. Carte des populations par district sanitaire (DS)" + ] + }, + { + "cell_type": "markdown", + "id": "da58bbd3", + "metadata": {}, + "source": [ + "## 3.1. 
Carte de la Population pour ADM2 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6965155d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Code from previous version of the notebook\n", + "# Uses continuous scale for population\n", + "\n", + "# Run if population_data is available\n", + "if (!is.null(population_data) & !is.null(shapes_data)) {\n", + " # Join population to spatial shapes\n", + " map_data <- shapes_data %>%\n", + " left_join(population_data, by = \"ADM2_ID\")\n", + " \n", + " # Plot population per district (DS)\n", + " plot <- ggplot(map_data) +\n", + " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", + " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Données DHIS2\",\n", + " caption = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " theme_minimal(base_size = 14) \n", + "\n", + " print(plot)\n", + "\n", + "} else {\n", + " print(\"Population or shapes data not available.\")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "eb276692", + "metadata": {}, + "source": [ + "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", + "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " + ] + }, + { + "cell_type": "markdown", + "id": "62ec70f5", + "metadata": {}, + "source": [ + "### 🇳🇪 NER specific code \n", + "Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", + "* only years 2022 to 2024\n", + "* specific palette (yellowish to brick red)\n", + "* specific intervals\n", + "* looks at **disaggregated** population <- this is sometimes country-specific!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d33724e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "population_data_filtered <- population_data\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " print(\"🇳🇪 Executing NER specific code ... \")\n", + "\n", + " # --- Filter data to keep only 2022-2024 ... ---\n", + " years_to_keep <- 2022:2024\n", + " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", + "\n", + " # --- Read data from SNT_metadata.json ---\n", + " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + " # --- Assign population breaks from metadata ---\n", + " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", + " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", + " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", + "\n", + " # --- Define function to create dynamic labels based on breaks for pop category ---\n", + " create_dynamic_labels <- function(breaks) {\n", + " fmt <- function(x) {\n", + " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", + " }\n", + " \n", + " labels <- c(\n", + " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", + " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", + " paste0(\"> \", fmt(breaks[length(breaks)]), \"k\") # Last label\n", + " ) \n", + " return(labels)\n", + " }\n", + "\n", + " # --- Create dynamic labels based on breaks ---\n", + " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", + " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", + " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "NER_palette_population <- c(\n", + " \"1\" = \"#fae6db\",\n", + " \"2\" = \"#f1b195\",\n", + " \"3\" = \"#ea7354\",\n", + " \"4\" = \"#cc3f32\",\n", + " \"5\" = \"#972620\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "95892df7-e5b8-4d7a-bf96-88673e633370", + "metadata": {}, + "source": [ + "### Population Totale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0a196b8-2db5-478d-899a-48985d1735f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + " # IMPORTANT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", + "names(NER_palette_population) <- labels_tot\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION,\n", + " breaks = c(0, value_breaks_tot, Inf),\n", + " labels = labels_tot, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_tot, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " 
strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", + " width = 14,\n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552", + "metadata": {}, + "source": [ + "### Population Femmes Enceintes (FE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9324a56b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- labels_fe\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_FE,\n", + " breaks = c(0, value_breaks_fe, Inf),\n", + " labels = labels_fe, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_fe, \n", + " drop = FALSE # Prevents dropping empty levels from legend\n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position 
= \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export to see better in high resolution\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d", + "metadata": {}, + "source": [ + "### Population Enfants moins de 5 ans (U5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4046761f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (COUNTRY_CODE == \"NER\") {\n", + "\n", + "names(NER_palette_population) <- labels_u5\n", + "\n", + "plot <- population_data_filtered %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_U5,\n", + " breaks = c(0, value_breaks_u5, Inf),\n", + " labels = labels_u5, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_u5, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = 
margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + "print(plot)\n", + "\n", + "# Export PNG\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", + "metadata": {}, + "source": [ + "## 3.2. Complétude et qualité des données de la Population" + ] + }, + { + "cell_type": "markdown", + "id": "0d86ed4a-e194-496b-9440-ad206157ee17", + "metadata": {}, + "source": [ + "#### Population Totale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# hist(population_data$POPULATION)\n", + "hist(population_data_filtered$POPULATION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION,\n", + " y = fct_reorder(ADM2_NAME, POPULATION),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 
7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "# Export PNG\n", + "ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", + " units = \"cm\",\n", + " width = 15,\n", + " height = 23,\n", + " bg = \"white\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5", + "metadata": {}, + "source": [ + "#### Population Femmes Enceintes (FE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", + "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", + " hist(population_data_filtered$POPULATION_FE)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4200afa2-e2f0-4876-9842-141b96f32fe8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", + " \n", + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION_FE,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " 
strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "} " + ] + }, + { + "cell_type": "markdown", + "id": "e39305c0-3700-48c3-967a-b9c6af3e737f", + "metadata": {}, + "source": [ + "#### Population Enfants moins de 5 ans (U5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbda9b88-9b91-4845-83a8-795a12124999", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", + " hist(population_data_filtered$POPULATION_U5)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", + "\n", + "ggplot(population_data_filtered) +\n", + " geom_point(aes(x = POPULATION_U5,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": 
"text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a113378de7957557e74c955bf33f6b9b79b5d4c8 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Tue, 17 Mar 2026 16:38:32 +0100 Subject: [PATCH 2/7] perf(DHIS2 formatting): make general report notebook --- .../snt_dhis2_formatting_report.ipynb | 526 +++--------------- 1 file changed, 74 insertions(+), 452 deletions(-) diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index 8fdf547..f18c35a 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -171,18 +171,6 @@ "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -272,24 +260,6 @@ "printdim(shapes_data)" ] }, - { - "cell_type": "markdown", - "id": "c881f748-e391-46c9-a36a-ed11c238a6ce", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", @@ -847,326 +817,6 @@ "}\n" ] }, - { - "cell_type": "markdown", - "id": "eb276692", - "metadata": {}, - "source": [ - "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", - "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d33724e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "population_data_filtered <- population_data\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " print(\"🇳🇪 Executing NER specific code ... \")\n", - "\n", - " IRdisplay::display_markdown(\"\n", - " ### 🇳🇪 NER specific code \n", - " Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", - " * only year 2022 to 2024\n", - " * specific palette (yellowish to brick red)\n", - " * specific intervals\n", - " * looks at **disaggregated** population <- this is sometimes contry-specific!\n", - "\")\n", - "\n", - " # --- Filter data to keep only 2022-2024 ... ---\n", - " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", - "\n", - " # --- Read data from SNT_metadata.json ---\n", - " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", - "\n", - " # --- Define function to create dyanic labels based on breaks for pop category ---\n", - " create_dynamic_labels <- function(breaks) {\n", - " fmt <- function(x) {\n", - " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", - " }\n", - " \n", - " labels <- c(\n", - " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", - " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", - " paste0(\"> \", 
fmt(breaks[length(breaks)]), \"k\") # Last label\n", - " ) \n", - " return(labels)\n", - " }\n", - "\n", - " # --- Create dynamic labels based on breaks ---\n", - " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", - " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", - " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "NER_palette_population <- c(\n", - " \"1\" = \"#fae6db\",\n", - " \"2\" = \"#f1b195\",\n", - " \"3\" = \"#ea7354\",\n", - " \"4\" = \"#cc3f32\",\n", - " \"5\" = \"#972620\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "95892df7-e5b8-4d7a-bf96-88673e633370", - "metadata": {}, - "source": [ - "### Population Totales" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0a196b8-2db5-478d-899a-48985d1735f0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " 
scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", - " width = 14,\n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552", - "metadata": {}, - "source": [ - "### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9324a56b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / 
DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d", - "metadata": {}, - "source": [ - "### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4046761f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des enfants de 
moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}" - ] - }, { "cell_type": "markdown", "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", @@ -1195,7 +845,7 @@ "outputs": [], "source": [ "# hist(population_data$POPULATION)\n", - "hist(population_data_filtered$POPULATION)" + "hist(population_data$POPULATION)" ] }, { @@ -1209,7 +859,7 @@ }, "outputs": [], "source": [ - "ggplot(population_data_filtered) +\n", + "ggplot(population_data) +\n", " geom_point(aes(x = POPULATION,\n", " y = fct_reorder(ADM2_NAME, POPULATION),\n", " color = factor(YEAR))\n", @@ -1242,14 +892,6 @@ ")" ] }, - { - "cell_type": "markdown", - "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5", - "metadata": {}, - "source": [ - "#### Population Femmes Enceintes (FE)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1261,74 +903,43 @@ }, "outputs": [], "source": [ - "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " hist(population_data_filtered$POPULATION_FE)\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "4200afa2-e2f0-4876-9842-141b96f32fe8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " \n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_FE,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", + "# If the \"POPULATION_FE\" disaggregation is available\n", "\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "e39305c0-3700-48c3-967a-b9c6af3e737f", - "metadata": {}, - "source": [ - "#### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbda9b88-9b91-4845-83a8-795a12124999", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - " hist(population_data_filtered$POPULATION_U5)\n", + "if (\"POPULATION_FE\" %in% names(population_data)) {\n", + "\n", + " # 1. Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " #### Population Femmes Enceintes (FE)\n", + " \")\n", + "\n", + " # 2. 
Histogram of the pregnant women population\n", + " hist(population_data$POPULATION_FE)\n", + "\n", + " # 3. Plot of the pregnant women population\n", + " ggplot(population_data) +\n", + " geom_point(aes(x = POPULATION_FE,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", "}" ] }, @@ -1343,32 +954,43 @@ }, "outputs": [], "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", + "# If the \"POPULATION_U5\" disaggregation is available\n", "\n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_U5,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = 
element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", + "if (\"POPULATION_U5\" %in% names(population_data)) {\n", + "\n", + " # 1. Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " #### Population Enfants moins de 5 ans (U5)\n", + " \")\n", + "\n", + " # 2. Histogram of the u5 children population\n", + " hist(population_data$POPULATION_U5)\n", + "\n", + " # 3. Plot of the u5 children population\n", + " ggplot(population_data) +\n", + " geom_point(aes(x = POPULATION_U5,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scale = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", "\n", "}" ] From 0f79ab255f97b30e065d42ed42efbbc1ae7dbc94 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Tue, 17 Mar 2026 19:28:36 +0100 Subject: [PATCH 3/7] fix(DHIS2 formatting): import irdisplay package --- .../reporting/snt_dhis2_formatting_report.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index f18c35a..9fd00b1 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ 
b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -103,7 +103,8 @@ "required_packages <- c(\n", " \"tidyverse\", \n", " \"arrow\", \n", - " \"sf\", \n", + " \"sf\",\n", + " \"IRdisplay\",\n", " \"reticulate\",\n", " \"patchwork\"\n", ") \n", From 47448cd221e3aea2f5c02e740864e5fee8ceaa4a Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Tue, 17 Mar 2026 23:35:54 +0100 Subject: [PATCH 4/7] build(DHIS2 formatting): split report NER --- .../snt_dhis2_formatting_report.ipynb | 63 +- .../snt_dhis2_formatting_report_NER.ipynb | 2851 +++++++++-------- 2 files changed, 1513 insertions(+), 1401 deletions(-) diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index 9fd00b1..f45fbc5 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -893,6 +893,23 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "82ff591f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Conditions of existence of disaggregated populations\n", + "\n", + "condition_pregnant_women <- \"POPULATION_FE\" %in% names(population_data)\n", + "condition_u5_children <- \"POPULATION_U5\" %in% names(population_data)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -906,17 +923,32 @@ "source": [ "# If the \"POPULATION_FE\" disaggregation is available\n", "\n", - "if (\"POPULATION_FE\" %in% names(population_data)) {\n", + "if (condition_pregnant_women) {\n", "\n", - " # 1. 
Display formatted markdown (only if the condition is true)\n", + " # Display formatted markdown (only if the condition is true)\n", " IRdisplay::display_markdown(\"\n", " #### Population Femmes Enceintes (FE)\n", " \")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f721048", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_pregnant_women) {\n", "\n", - " # 2. Histogram of the pregnant women population\n", + " # 1. Histogram of the pregnant women population\n", " hist(population_data$POPULATION_FE)\n", "\n", - " # 3. Plot of the pregnant women population\n", + " # 2. Plot of the pregnant women population\n", " ggplot(population_data) +\n", " geom_point(aes(x = POPULATION_FE,\n", " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", @@ -957,17 +989,32 @@ "source": [ "# If the \"POPULATION_U5\" disaggregation is available\n", "\n", - "if (\"POPULATION_U5\" %in% names(population_data)) {\n", + "if (condition_u5_children) {\n", "\n", - " # 1. Display formatted markdown (only if the condition is true)\n", + " # Display formatted markdown (only if the condition is true)\n", " IRdisplay::display_markdown(\"\n", " #### Population Enfants moins de 5 ans (U5)\n", " \")\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10552bb1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_u5_children) {\n", "\n", - " # 2. Histogram of the u5 children population\n", + " # 1. Histogram of the u5 children population\n", " hist(population_data$POPULATION_U5)\n", "\n", - " # 3. Plot of the u5 children population\n", + " # 2. 
Plot of the u5 children population\n", " ggplot(population_data) +\n", " geom_point(aes(x = POPULATION_U5,\n", " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb index dc316d8..6a6c3b5 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb @@ -1,1398 +1,1463 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "47551f88-b40b-449f-9dc1-59db71183611", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "47551f88-b40b-449f-9dc1-59db71183611", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 💡 Comments / Questions & To Do's:\n", + "# - filter by YEAR keep only 2022-2024): \n", + "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", + "# 2. Is this a paramater is some other pipeline? if so, should be integrated here somehow \n", + "# - Missing data: why do we have NA values for population? 
Are these real NA (missing data) or 0?\n", + "# - OUTLIERS: there are clear outliers (e.g., DS AGADEZ): shall we do some simple data cleaning here?\n", + "# - Population categories (breaks): do we have a specific scale in mind \n", + "# (e.g., use the same as another country) or can I set it based on the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "342b6b54-4812-4b07-b408-68a034b4014e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# TO DO / FINISH:\n", + "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", + "# - (maybe) also add meaningful messages\n", + "# - Add code to export PNG files of relevant figures\n", + "# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)\n", + "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy-pasted code ...)" + ] + }, + { + "cell_type": "markdown", + "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5", + "metadata": {}, + "source": [ + "## 0. 
Paths and Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d3285c7-1a60-46ad-9541-36a703d51924", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "\n", + "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", + "\n", + "# Create output directories if they don't exist (before loading utils)\n", + "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", + "if (!dir.exists(figures_dir)) {\n", + " dir.create(figures_dir, recursive = TRUE)\n", + " print(paste0(\"Created figures directory: \", figures_dir))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "732733e7-8890-4c3e-be64-496fd4a2c800", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "required_packages <- c(\n", + " \"tidyverse\", \n", + " \"arrow\", \n", + " \"sf\",\n", + " \"IRdisplay\",\n", + " \"reticulate\",\n", + " \"patchwork\"\n", + ") \n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20475dd9-5091-4f87-9ae2-d0235921fe94", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = 
\"/opt/conda/bin/python\")\n", + "\n", + "# Load openhexa.sdk\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f70d726-1c34-47dc-b963-bb23e42994bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90d58c60-fb4e-40e4-add8-5f258f541843", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# print function\n", + "printdim <- function(df, name = deparse(substitute(df))) {\n", + " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69", + "metadata": {}, + "source": [ + "## 1. 
Import data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import analytics DHIS2 data\n", + "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", + " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", + " stop(msg)\n", + " })\n", + "\n", + "printdim(routine_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " population_data <- NULL\n", + " })\n", + "\n", + "printdim(population_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes_data <- NULL\n", + " })\n", + "\n", + "printdim(shapes_data)" + ] + }, + { + "cell_type": "markdown", + "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", + "metadata": {}, + "source": [ + "# **Complétude des 
indicateurs composites**\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de", + "metadata": {}, + "source": [ + "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" + ] + }, + { + "cell_type": "markdown", + "id": "c109e82d-8c72-41f0-857a-322163cf213e", + "metadata": {}, + "source": [ + "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Rename your data for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to DATE\n", + "data <- data %>%\n", + " mutate(\n", + " DATE = ymd(paste0(PERIOD, \"01\"))\n", + " )\n", + "\n", + "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", + "indicator_vars <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", + "))\n", + "\n", + "long_data <- data %>%\n", + " pivot_longer(cols = all_of(indicator_vars),\n", + " names_to = \"INDICATOR\",\n", + " values_to = \"VALUE\") %>%\n", + " rename(OU = OU_ID)\n", + "\n", + "# Step 3: Build expected full grid (OU × INDICATOR × DATE)\n", + "full_grid <- expand_grid(\n", + " OU = unique(long_data$OU),\n", + " INDICATOR = unique(long_data$INDICATOR),\n", + " DATE = unique(long_data$DATE)\n", + ")\n", + "\n", + "# Step 4: Join and assess reporting status\n", + "reporting_check <- full_grid %>%\n", + " left_join(\n", + " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", + " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", + " ) %>%\n", + " mutate(\n", + " is_missing = is.na(VALUE),\n", + " is_zero = VALUE == 0 & !is.na(VALUE),\n", + " is_positive = 
VALUE > 0 & !is.na(VALUE)\n", + " )\n", + "\n", + "# Step 5: Summarise reporting status\n", + "reporting_summary <- reporting_check %>%\n", + " group_by(INDICATOR, DATE) %>%\n", + " summarise(\n", + " n_total = n_distinct(OU),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Prepare plot-ready data\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(\n", + " cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\",\n", + " values_to = \"Percentage\"\n", + " ) %>%\n", + " mutate(\n", + " Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")\n", + " ) %>%\n", + " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "options(repr.plot.width = 17, repr.plot.height = 10)\n", + "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", + " scale_y_continuous() +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", + " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", + " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", + " x = \"Mois\", y = \"% des formations 
sanitaires\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " strip.text = element_text(size = 16),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "e6871759-714b-437a-8b9c-5a5a06656567", + "metadata": {}, + "source": [ + "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Rename for convenience\n", + "data <- routine_data\n", + "\n", + "# Step 1: Convert PERIOD to proper Date\n", + "data <- data %>%\n", + " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", + "\n", + "# Step 2: Identify indicator columns\n", + "indicator_cols <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", + " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", + "))\n", + "\n", + "# Step 3: Reshape to long format\n", + "data_long <- data %>%\n", + " select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%\n", + " pivot_longer(cols = all_of(indicator_cols),\n", + " names_to = \"Indicator\", values_to = \"value\") %>%\n", + " mutate(value = as.numeric(value))\n", + "\n", + "# Step 4: Full expected grid at ADM2 level\n", + "full_grid <- expand_grid(\n", + " ADM2_ID = unique(data_long$ADM2_ID),\n", + " Indicator = unique(data_long$Indicator),\n", + " Date = unique(data_long$Date)\n", + ")\n", + "\n", + "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", + "reporting_check <- data_long %>%\n", + " group_by(ADM2_ID, Indicator, Date) %>%\n", + " summarise(\n", + " is_missing = all(is.na(value)),\n", 
+ " is_zero = all(value == 0, na.rm = TRUE),\n", + " is_positive = any(value > 0, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 6: Join with full grid to fill in missing ADM2s\n", + "reporting_full <- full_grid %>%\n", + " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", + " mutate(\n", + " is_missing = replace_na(is_missing, TRUE),\n", + " is_zero = replace_na(is_zero, FALSE),\n", + " is_positive = replace_na(is_positive, FALSE)\n", + " )\n", + "\n", + "# Step 7: Summarise by Indicator and Date\n", + "reporting_summary <- reporting_full %>%\n", + " group_by(Indicator, Date) %>%\n", + " summarise(\n", + " n_total = n_distinct(ADM2_ID),\n", + " n_missing = sum(is_missing),\n", + " n_zero = sum(is_zero & !is_missing),\n", + " n_positive = sum(is_positive),\n", + " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", + " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", + " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 8: Reshape for plotting\n", + "plot_data <- reporting_summary %>%\n", + " pivot_longer(cols = starts_with(\"pct_\"),\n", + " names_to = \"Status\", values_to = \"Percentage\") %>%\n", + " mutate(Status = recode(Status,\n", + " pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\",\n", + " pct_positive = \"Valeur positive rapportée\")) %>%\n", + " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", + "\n", + "# Step 9: Plot\n", + "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", + " geom_col(position = \"stack\") +\n", + " facet_wrap(~ Indicator, scales = \"free_y\") +\n", + " scale_y_continuous(limits = c(0, 100)) +\n", + " scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\",\n", + " \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " labs(\n", 
+ " title = \"Taux de rapportage par indicateur (niveau district)\",\n", + " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", + " x = \"Mois\", y = \"% des districts\",\n", + " fill = \"Statut du rapportage\"\n", + " ) +\n", + " theme_minimal(base_size = 14) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 18),\n", + " strip.text = element_text(size = 14),\n", + " axis.title = element_text(size = 14),\n", + " axis.text = element_text(size = 12)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "5cda3985", + "metadata": {}, + "source": [ + "# 2. Cohérence interne des indicateurs composites" + ] + }, + { + "cell_type": "markdown", + "id": "c131a633", + "metadata": {}, + "source": [ + "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", + "\n", + "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "936268f4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Function to detect outliers based on MAD method\n", + "detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = \"mad_flag\") {\n", + " data_long %>%\n", + " group_by(OU, indicator, YEAR) %>%\n", + " mutate(\n", + " median_val = median(value, na.rm = TRUE),\n", + " mad_val = mad(value, na.rm = TRUE),\n", + " \"{outlier_column}\" := value > (median_val + deviation * mad_val) | value < (median_val - deviation * mad_val)\n", + " ) %>%\n", + " ungroup()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881f9625", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 0: Select relevant core indicators\n", + "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", + "\n", + "# Step 1: Convert wide to long format\n", + "routine_long <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(\n", + " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", + " OU = OU_ID # Alias for join clarity\n", + " )\n", + "\n", + "# Step 2: Filter to indicators of interest\n", + "routine_long_filtered <- routine_long %>%\n", + " filter(indicator %in% target_indicators)\n", + "\n", + "# Step 3: Calculate MAD15\n", + "mad15_data <- detect_mad_outliers(\n", + " routine_long_filtered,\n", + " deviation = 15,\n", + " outlier_column = \"mad15\"\n", + ")\n", + "\n", + "# Step 4: Calculate MAD10 (only where mad15 not flagged or missing)\n", + "mad10_flags <- mad15_data %>%\n", + " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", + " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", + "\n", + "# Step 5: Combine MAD15 and MAD10 
results\n", + "mad_combined <- mad15_data %>%\n", + " left_join(\n", + " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", + " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04d41ed1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", + "outlier_flags <- mad_combined %>%\n", + " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", + " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", + " select(PERIOD, OU, indicator)\n", + "\n", + "# Step 7: Reshape routine_data to long format for filtering\n", + "routine_long_all <- routine_data %>%\n", + " pivot_longer(\n", + " cols = all_of(target_indicators),\n", + " names_to = \"indicator\",\n", + " values_to = \"value\"\n", + " ) %>%\n", + " mutate(OU = OU_ID)\n", + "\n", + "# Step 8: Remove outliers\n", + "routine_long_clean <- routine_long_all %>%\n", + " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", + "\n", + "# Step 9: Reshape back to wide format if needed\n", + "routine_data_clean <- routine_long_clean %>%\n", + " select(-OU) %>%\n", + " pivot_wider(names_from = indicator, values_from = value)\n" + ] + }, + { + "cell_type": "markdown", + "id": "c6a5a77b", + "metadata": {}, + "source": [ + "## 2.2 Cohérence des indicateurs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cfeb18e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Extract year and month from PERIOD\n", + "routine_hd_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6)\n", + " ) %>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " MALTREAT = sum(MALTREAT, na.rm = 
TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "# Step 2: Create scatter plots\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "\n", + "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", + " geom_point(alpha = 0.5, color = \"blue\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", + " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", + " geom_point(alpha = 0.5, color = \"purple\") +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", + " theme_minimal(base_size = 16)\n", + "\n", + "# Step 3: Combine plots\n", + "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0df24272", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Aggregate monthly values\n", + "rds_clean_month <- routine_data_clean %>%\n", + " mutate(\n", + " YEAR = substr(PERIOD, 1, 4),\n", + " MONTH = substr(PERIOD, 5, 6),\n", + " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", + " ) %>%\n", + " group_by(YEAR, MONTH, DATE) %>%\n", + " summarise(\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE),\n", + " .groups = 
\"drop\"\n", + " )\n", + "\n", + "# Step 2: Plot monthly national trends\n", + "options(repr.plot.width = 14, repr.plot.height = 6)\n", + "rds_clean_month %>%\n", + " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", + " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", + " geom_line(linewidth = 1.2) +\n", + " labs(\n", + " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", + " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " axis.title = element_text(size = 16),\n", + " axis.text = element_text(size = 16),\n", + " legend.title = element_text(size = 16),\n", + " legend.text = element_text(size = 16)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48", + "metadata": {}, + "source": [ + "# 3. Carte des populations par district sanitaire (DS)" + ] + }, + { + "cell_type": "markdown", + "id": "da58bbd3", + "metadata": {}, + "source": [ + "## 3.1. 
Carte de la Population pour ADM2 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6965155d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Code from previous version of the notebook\n", + "# Uses continuous scale for population\n", + "\n", + "# Run if population_data is available\n", + "if (!is.null(population_data) & !is.null(shapes_data)) {\n", + " # Join population to spatial shapes\n", + " map_data <- shapes_data %>%\n", + " left_join(population_data, by = \"ADM2_ID\")\n", + " \n", + " # Plot population per district (DS)\n", + " plot <- ggplot(map_data) +\n", + " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", + " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Données DHIS2\",\n", + " caption = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " theme_minimal(base_size = 14) \n", + "\n", + " print(plot)\n", + "\n", + "} else {\n", + " print(\"Population or shapes data not available.\")\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "eb276692", + "metadata": {}, + "source": [ + "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", + "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " + ] + }, + { + "cell_type": "markdown", + "id": "62ec70f5", + "metadata": {}, + "source": [ + "### 🇳🇪 NER specific code \n", + "Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", + "* only years 2022 to 2024\n", + "* specific palette (yellowish to brick red)\n", + "* specific intervals\n", + "* looks at **disaggregated** population <- this is sometimes country-specific!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d33724e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!is.null(population_data)) {\n", + " print(\"🇳🇪 Executing NER specific code ... \")\n", + "\n", + " # --- Filter data to keep only 2022-2024 ... ---\n", + " years_to_keep <- 2022:2024\n", + " population_data <- population_data |> filter(YEAR %in% years_to_keep)\n", + "\n", + " # --- Read data from SNT_metadata.json ---\n", + " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + " # --- Assign population breaks from metadata ---\n", + " parse_scale <- function(scale_obj) {\n", + " if (is.character(scale_obj)) {\n", + " return(jsonlite::fromJSON(scale_obj))\n", + " }\n", + " if (is.atomic(scale_obj) || is.list(scale_obj)) {\n", + " return(as.numeric(unlist(scale_obj)))\n", + " }\n", + " stop(\"Invalid SCALE format in SNT_metadata.json\")\n", + " }\n", + "\n", + " get_scale <- function(meta, candidates) {\n", + " for (name in candidates) {\n", + " if (!is.null(meta[[name]]) && !is.null(meta[[name]]$SCALE)) {\n", + " return(meta[[name]]$SCALE)\n", + " }\n", + " }\n", + " return(NULL)\n", + " }\n", + "\n", + " scale_tot <- get_scale(metadata_json, c(\"POPULATION_TOTAL\", \"POPULATION\"))\n", + " scale_u5 <- get_scale(metadata_json, c(\"POPULATION_U5\", \"POPULATION\"))\n", + " scale_fe <- get_scale(metadata_json, c(\"POPULATION_PREGNANT\", \"POPULATION\"))\n", + "\n", + " if (is.null(scale_tot) || is.null(scale_u5) || is.null(scale_fe)) {\n", + " stop(\"Missing population SCALE in SNT_metadata.json\")\n", + " }\n", + "\n", + " value_breaks_tot <- parse_scale(scale_tot)\n", + " value_breaks_u5 <- parse_scale(scale_u5)\n", + " value_breaks_fe <- parse_scale(scale_fe)\n", + "\n", + " if 
(length(value_breaks_tot) < 1 || length(value_breaks_u5) < 1 || length(value_breaks_fe) < 1) {\n", + " stop(\"Population SCALE in SNT_metadata.json must contain at least one break value.\")\n", + " }\n", + "\n", + " # --- Define function to create dynamic labels based on breaks for pop category ---\n", + " create_dynamic_labels <- function(breaks) {\n", + " fmt <- function(x) {\n", + " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", + " }\n", + " \n", + " labels <- c(\n", + " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", + " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", + " paste0(\"> \", fmt(breaks[length(breaks)]), \"k\") # Last label\n", + " ) \n", + " return(labels)\n", + " }\n", + "\n", + " # --- Create dynamic labels based on breaks ---\n", + " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", + " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", + " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", + "\n", + " # --- NER palette: build with same length as labels (yellowish -> brick red) ---\n", + " NER_palette_base <- c(\"#fae6db\", \"#f1b195\", \"#ea7354\", \"#cc3f32\", \"#972620\")\n", + " make_ner_palette <- function(labels) {\n", + " setNames(colorRampPalette(NER_palette_base)(length(labels)), labels)\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# NER palette: base colors (yellowish -> brick red). Actual palette is built\n", + "# per plot with make_ner_palette(labels_*) so length always matches labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fa4cded", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Conditions for display of total and disaggregated population plots\n", + "\n", + "pregnant_women_col <- \"POP_PREGNANT_WOMAN\"\n", + "u5_children_col <- \"POP_UNDER_5\"\n", + "\n", + "# existence of general population data and labels for plot\n", + "condition_population_data <- !is.null(population_data) && nrow(population_data) > 0\n", + "condition_plot_total_population <- condition_population_data && exists(\"labels_tot\") && !is.null(shapes_data)\n", + "\n", + "# existence of pregnant women data and labels for plot\n", + "condition_pregnant_women <- condition_population_data && pregnant_women_col %in% names(population_data)\n", + "condition_plot_pregnant_women <- condition_pregnant_women && exists(\"labels_fe\") && !is.null(shapes_data)\n", + "\n", + "# existence of under5 children disaggregation and labels for plot\n", + "condition_u5_children <- condition_population_data && u5_children_col %in% names(population_data)\n", + "condition_plot_u5_children <- condition_u5_children && exists(\"labels_u5\") && !is.null(shapes_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0a196b8-2db5-478d-899a-48985d1735f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_total_population) {\n", + "\n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " ### Population Totale\n", + " \")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4263ea8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_total_population) {\n", + " \n", + " # Palette length must match number of labels (dynamic from metadata breaks)\n", + " NER_palette_population <- 
make_ner_palette(labels_tot)\n", + "\n", + " plot <- population_data %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION,\n", + " breaks = c(0, value_breaks_tot, Inf),\n", + " labels = labels_tot, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_tot, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + " print(plot)\n", + "\n", + " # Export to see better in high resolution\n", + " ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", + " width = 14,\n", + " height = 8,\n", + " dpi = 300\n", + " )\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9324a56b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_pregnant_women) {\n", + "\n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " ### Population Femmes Enceintes (FE)\n", + " \")\n", + "}" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b83cf3c0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_pregnant_women) {\n", + "\n", + " NER_palette_population <- make_ner_palette(labels_fe)\n", + "\n", + " plot <- population_data %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_FE,\n", + " breaks = c(0, value_breaks_fe, Inf),\n", + " labels = labels_fe, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_fe, \n", + " drop = FALSE # Prevents dropping empty levels from legend\n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + " print(plot)\n", + "\n", + " # Export to see better in high resolution\n", + " ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"4046761f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_u5_children) {\n", + "\n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " #### Population Enfants moins de 5 ans (U5)\n", + " \")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc703bc5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_plot_u5_children) {\n", + " \n", + " NER_palette_population <- make_ner_palette(labels_u5)\n", + "\n", + " plot <- population_data %>%\n", + " mutate(\n", + " CATEGORY_POPULATION = cut(\n", + " POPULATION_U5,\n", + " breaks = c(0, value_breaks_u5, Inf),\n", + " labels = labels_u5, \n", + " right = TRUE,\n", + " include.lowest = TRUE\n", + " )\n", + " ) %>% \n", + " left_join(shapes_data, \n", + " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", + " ggplot() +\n", + " geom_sf(aes(geometry = geometry,\n", + " fill = CATEGORY_POPULATION),\n", + " color = \"black\",\n", + " linewidth = 0.25, \n", + " show.legend = TRUE\n", + " ) +\n", + " labs(\n", + " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", + " subtitle = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " scale_fill_manual(\n", + " values = NER_palette_population, \n", + " limits = labels_u5, \n", + " drop = FALSE \n", + " ) +\n", + " facet_wrap(~YEAR, ncol = 3) +\n", + " theme_void() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " strip.text = element_text(face = \"bold\"),\n", + " legend.key.height = unit(0.5, \"line\"),\n", + " legend.margin = margin(10, 0, 0, 0)\n", + " )\n", + "\n", + " print(plot)\n", + "\n", + " # Export PNG\n", + " ggsave(\n", + " filename = 
file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", + " width = 14, \n", + " height = 8,\n", + " dpi = 300\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", + "metadata": {}, + "source": [ + "## 3.2. Complétude et qualité des données de la Population" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_population_data) {\n", + "\n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " ### Population Totale\n", + " \")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f2e74b5", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_population_data) {\n", + "\n", + " # 1. Histogram of the total population\n", + " hist(population_data$POPULATION)\n", + "\n", + " # 2. 
Plot of the total population (assigned and printed explicitly: inside an if-block only the last value is auto-displayed)\n", + " plot <- ggplot(population_data) +\n", + " geom_point(aes(x = POPULATION,\n", + " y = fct_reorder(ADM2_NAME, POPULATION),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scales = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + " print(plot)\n", + " ggsave(\n", + " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", + " units = \"cm\",\n", + " width = 15,\n", + " height = 23,\n", + " bg = \"white\"\n", + " )\n", + "\n", + "} else {\n", + " print(\"Population data not available.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_pregnant_women) { \n", + " \n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " ### Population Femmes Enceintes (FE)\n", + " \")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d4ca310", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "\n", + "if (condition_pregnant_women) { \n", + "\n", + " # 1. Histogram of the pregnant women population\n", + " hist(population_data$POPULATION_FE)\n", + "\n", + " # 2. 
Plot of the pregnant women population (x-axis breaks on the same 0-150k scale as the U5 plot)\n", + " ggplot(population_data) +\n", + " geom_point(aes(x = POPULATION_FE,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scales = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + " \n", + "} else {\n", + " print(\"Data for pregnant women not available.\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbda9b88-9b91-4845-83a8-795a12124999", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_u5_children) {\n", + " \n", + " # Display formatted markdown (only if the condition is true)\n", + " IRdisplay::display_markdown(\"\n", + " ### Population Enfants moins de 5 ans (U5)\n", + " \")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2512e79", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (condition_u5_children) {\n", + "\n", + " # 1. Make histogram\n", + " hist(population_data$POPULATION_U5)\n", + "\n", + " # 2. 
Make plot of the under-five population\n", + " ggplot(population_data) +\n", + " geom_point(aes(x = POPULATION_U5,\n", + " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", + " color = factor(YEAR))\n", + " ) +\n", + " facet_grid(rows = \"ADM1_NAME\", \n", + " scales = \"free_y\", \n", + " space = \"free_y\", \n", + " switch = \"y\") +\n", + " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", + " labels = scales::comma) +\n", + " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", + " labs(\n", + " # title = \"\"\n", + " color = \"Année\") +\n", + " theme_minimal() +\n", + " theme(\n", + " axis.text = element_text(size = 7),\n", + " axis.title.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + " \n", + "} else {\n", + " print(\"Data for children under five not available.\")\n", + "}" + ] } - }, - "outputs": [], - "source": [ - "# 💡 Comments / Questions & To Do's:\n", - "# - filter by YEAR keep only 2022-2024): \n", - "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", - "# 2. Is this a paramater is some other pipeline? if so, should be integrated here somehow \n", - "# - Missing data: why do we have NA values for population? 
Are these real NA (missing data) or 0?\n", - "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", - "# - Population catagories (breaks) do we have a specific scale in mind \n", - "# (i.e., use same as another country) or can I set it based on the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "342b6b54-4812-4b07-b408-68a034b4014e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# TO DO / FINISH:\n", - "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", - "# - (maybe) also add meaningful messages\n", - "# - Add code to export PNG files of relevant figures\n", - "# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)\n", - "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" - ] - }, - { - "cell_type": "markdown", - "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5", - "metadata": {}, - "source": [ - "## 0. 
Paths and Config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d3285c7-1a60-46ad-9541-36a703d51924", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", - "\n", - "# Create output directories if they don't exist (before loading utils)\n", - "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", - "if (!dir.exists(figures_dir)) {\n", - " dir.create(figures_dir, recursive = TRUE)\n", - " print(paste0(\"Created figures directory: \", figures_dir))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "732733e7-8890-4c3e-be64-496fd4a2c800", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "required_packages <- c(\n", - " \"tidyverse\", \n", - " \"arrow\", \n", - " \"sf\", \n", - " \"reticulate\",\n", - " \"patchwork\"\n", - ") \n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20475dd9-5091-4f87-9ae2-d0235921fe94", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", 
- "# Load openhexa.sdk\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f70d726-1c34-47dc-b963-bb23e42994bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90d58c60-fb4e-40e4-add8-5f258f541843", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration variables\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# print function\n", - "printdim <- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69", - "metadata": {}, - "source": [ - "## 1. 
Import data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# import analytics DHIS2 data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", - " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - " })\n", - "\n", - "printdim(routine_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " population_data <- NULL\n", - " })\n", - "\n", - "printdim(population_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes_data <- NULL\n", - " })\n", - "\n", - "printdim(shapes_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c881f748-e391-46c9-a36a-ed11c238a6ce", - "metadata": {}, - "source": [] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233", - "metadata": {}, - "source": [ - "# **Complétude des indicateurs composites**\n" - ] - }, - { - "cell_type": "markdown", - "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de", - "metadata": {}, - "source": [ - "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7691e61-6542-4d40-af2a-c018d29b86a8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(routine_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c109e82d-8c72-41f0-857a-322163cf213e", - "metadata": {}, - "source": [ - "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Rename your data for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to DATE\n", - "data <- data %>%\n", - " mutate(\n", - " DATE = ymd(paste0(PERIOD, \"01\"))\n", - " )\n", - "\n", - "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", - "indicator_vars <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", - "))\n", - "\n", - "long_data <- data %>%\n", - " pivot_longer(cols = all_of(indicator_vars),\n", - " names_to = \"INDICATOR\",\n", - " values_to = \"VALUE\") %>%\n", - " rename(OU = OU_ID)\n", - "\n", - "# Step 3: Build expected full grid (OU × 
INDICATOR × DATE)\n", - "full_grid <- expand_grid(\n", - " OU = unique(long_data$OU),\n", - " INDICATOR = unique(long_data$INDICATOR),\n", - " DATE = unique(long_data$DATE)\n", - ")\n", - "\n", - "# Step 4: Join and assess reporting status\n", - "reporting_check <- full_grid %>%\n", - " left_join(\n", - " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", - " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", - " ) %>%\n", - " mutate(\n", - " is_missing = is.na(VALUE),\n", - " is_zero = VALUE == 0 & !is.na(VALUE),\n", - " is_positive = VALUE > 0 & !is.na(VALUE)\n", - " )\n", - "\n", - "# Step 5: Summarise reporting status\n", - "reporting_summary <- reporting_check %>%\n", - " group_by(INDICATOR, DATE) %>%\n", - " summarise(\n", - " n_total = n_distinct(OU),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Prepare plot-ready data\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\",\n", - " values_to = \"Percentage\"\n", - " ) %>%\n", - " mutate(\n", - " Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")\n", - " ) %>%\n", - " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "options(repr.plot.width = 17, repr.plot.height = 10)\n", - "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", - " geom_col(position = 
\"stack\") +\n", - " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", - " scale_y_continuous() +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", - " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", - " x = \"Mois\", y = \"% des formations sanitaires\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " strip.text = element_text(size = 16),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "e6871759-714b-437a-8b9c-5a5a06656567", - "metadata": {}, - "source": [ - "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Rename for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to proper Date\n", - "data <- data %>%\n", - " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", - "\n", - "# Step 2: Identify indicator columns\n", - "indicator_cols <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", - " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", - "))\n", - "\n", - "# Step 3: Reshape to long format\n", - "data_long <- data %>%\n", - " select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%\n", - " pivot_longer(cols = all_of(indicator_cols),\n", - " names_to = \"Indicator\", values_to = \"value\") %>%\n", - " mutate(value = as.numeric(value))\n", - "\n", - "# Step 4: Full expected grid at ADM2 level\n", - "full_grid <- expand_grid(\n", - " ADM2_ID = unique(data_long$ADM2_ID),\n", - " Indicator = unique(data_long$Indicator),\n", - " Date = unique(data_long$Date)\n", - ")\n", - "\n", - "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", - "reporting_check <- data_long %>%\n", - " group_by(ADM2_ID, Indicator, Date) %>%\n", - " summarise(\n", - " is_missing = all(is.na(value)),\n", - " is_zero = all(value == 0, na.rm = TRUE),\n", - " is_positive = any(value > 0, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Join with full grid to fill in missing ADM2s\n", - "reporting_full <- full_grid %>%\n", - " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", - " mutate(\n", - " is_missing = replace_na(is_missing, TRUE),\n", - " is_zero = replace_na(is_zero, FALSE),\n", - " is_positive = replace_na(is_positive, FALSE)\n", - " )\n", - "\n", - "# Step 7: Summarise by Indicator and Date\n", - 
"reporting_summary <- reporting_full %>%\n", - " group_by(Indicator, Date) %>%\n", - " summarise(\n", - " n_total = n_distinct(ADM2_ID),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero & !is_missing),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 8: Reshape for plotting\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\", values_to = \"Percentage\") %>%\n", - " mutate(Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")) %>%\n", - " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", - "\n", - "# Step 9: Plot\n", - "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", - " geom_col(position = \"stack\") +\n", - " facet_wrap(~ Indicator, scales = \"free_y\") +\n", - " scale_y_continuous(limits = c(0, 100)) +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau district)\",\n", - " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", - " x = \"Mois\", y = \"% des districts\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 18),\n", - " strip.text = element_text(size = 14),\n", - " axis.title = element_text(size = 14),\n", - " axis.text = element_text(size = 12)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": 
"5cda3985", - "metadata": {}, - "source": [ - "# 2. Cohérence interne des indicateurs composites" - ] - }, - { - "cell_type": "markdown", - "id": "c131a633", - "metadata": {}, - "source": [ - "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", - "\n", - "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "936268f4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Function to detect outliers based on MAD method\n", - "detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = \"mad_flag\") {\n", - " data_long %>%\n", - " group_by(OU, indicator, YEAR) %>%\n", - " mutate(\n", - " median_val = median(value, na.rm = TRUE),\n", - " mad_val = mad(value, na.rm = TRUE),\n", - " \"{outlier_column}\" := value > (median_val + deviation * mad_val) | value < (median_val - deviation * mad_val)\n", - " ) %>%\n", - " ungroup()\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "881f9625", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 0: Select relevant core indicators\n", - "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", - "\n", - "# Step 1: Convert wide to long format\n", - "routine_long <- routine_data %>%\n", - " pivot_longer(\n", - " cols = all_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(\n", - " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", - " OU = OU_ID # Alias for join 
clarity\n", - " )\n", - "\n", - "# Step 2: Filter to indicators of interest\n", - "routine_long_filtered <- routine_long %>%\n", - " filter(indicator %in% target_indicators)\n", - "\n", - "# Step 3: Calculate MAD15\n", - "mad15_data <- detect_mad_outliers(\n", - " routine_long_filtered,\n", - " deviation = 15,\n", - " outlier_column = \"mad15\"\n", - ")\n", - "\n", - "# Step 4: Calculate MAD10 (only where mad15 not flagged or missing)\n", - "mad10_flags <- mad15_data %>%\n", - " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", - " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", - "\n", - "# Step 5: Combine MAD15 and MAD10 results\n", - "mad_combined <- mad15_data %>%\n", - " left_join(\n", - " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", - " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04d41ed1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", - "outlier_flags <- mad_combined %>%\n", - " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", - " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", - " select(PERIOD, OU, indicator)\n", - "\n", - "# Step 7: Reshape routine_data to long format for filtering\n", - "routine_long_all <- routine_data %>%\n", - " pivot_longer(\n", - " cols = all_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(OU = OU_ID)\n", - "\n", - "# Step 8: Remove outliers\n", - "routine_long_clean <- routine_long_all %>%\n", - " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", - "\n", - "# Step 9: Reshape back to wide format if needed\n", - "routine_data_clean <- routine_long_clean %>%\n", - " select(-OU) %>%\n", - " pivot_wider(names_from = indicator, values_from = value)\n" - ] - }, - { - "cell_type": "markdown", - "id": "c6a5a77b", - "metadata": 
{}, - "source": [ - "## 2.2 Cohérence des indicateurs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cfeb18e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Extract year and month from PERIOD\n", - "routine_hd_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6)\n", - " ) %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Create scatter plots\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "\n", - "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", - " geom_point(alpha = 0.5, color = \"blue\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", - " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", - " geom_point(alpha = 0.5, color = \"purple\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "# Step 3: Combine plots\n", - "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "0df24272", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Aggregate monthly values\n", - "rds_clean_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6),\n", - " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", - " ) %>%\n", - " group_by(YEAR, MONTH, DATE) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Plot monthly national trends\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "rds_clean_month %>%\n", - " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", - " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", - " geom_line(linewidth = 1.2) +\n", - " labs(\n", - " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", - " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16),\n", - " legend.title = element_text(size = 16),\n", - " legend.text = element_text(size = 16)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48", - "metadata": {}, - "source": [ - "# 3. Carte des populations par district sanitaire (DS)" - ] - }, - { - "cell_type": "markdown", - "id": "da58bbd3", - "metadata": {}, - "source": [ - "## 3.1. 
Carte de la Population pour ADM2 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6965155d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Code from previous version of the notebook\n", - "# Uses continuos scale for population\n", - "\n", - "# Run if population_data is available\n", - "if (!is.null(population_data) & !is.null(shapes_data)) {\n", - " # Join population to spatial shapes\n", - " map_data <- shapes_data %>%\n", - " left_join(population_data, by = \"ADM2_ID\")\n", - " \n", - " # Plot population per district (DS)\n", - " plot <- ggplot(map_data) +\n", - " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Données DHIS2\",\n", - " caption = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " theme_minimal(base_size = 14) \n", - "\n", - " print(plot)\n", - "\n", - "} else {\n", - " print(\"Population or shapes data not available.\")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "eb276692", - "metadata": {}, - "source": [ - "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", - "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " - ] - }, - { - "cell_type": "markdown", - "id": "62ec70f5", - "metadata": {}, - "source": [ - "### 🇳🇪 NER specific code \n", - "Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", - "* only year 2022 to 2024\n", - "* specific palette (yellowish to brick red)\n", - "* specific intervals\n", - "* looks at **disaggregated** population <- this is sometimes contry-specific!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d33724e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "population_data_filtered <- population_data\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " print(\"🇳🇪 Executing NER specific code ... \")\n", - "\n", - " # --- Filter data to keep only 2022-2024 ... ---\n", - " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", - "\n", - " # --- Read data from SNT_metadata.json ---\n", - " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", - "\n", - " # --- Define function to create dyanic labels based on breaks for pop category ---\n", - " create_dynamic_labels <- function(breaks) {\n", - " fmt <- function(x) {\n", - " format(x / 1000, big.mark = \"'\", scientific = FALSE, trim = TRUE)\n", - " }\n", - " \n", - " labels <- c(\n", - " paste0(\"< \", fmt(breaks[1]), \"k\"), # First label\n", - " paste0(fmt(breaks[-length(breaks)]), \" - \", fmt(breaks[-1]), \"k\"), # Middle\n", - " paste0(\"> \", fmt(breaks[length(breaks)]), \"k\") # Last label\n", - " ) \n", - " return(labels)\n", - " }\n", - "\n", - " # --- Create dynamic labels based on breaks ---\n", - " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", - " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", - " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "NER_palette_population <- c(\n", - " \"1\" = \"#fae6db\",\n", - " \"2\" = \"#f1b195\",\n", - " \"3\" = \"#ea7354\",\n", - " \"4\" = \"#cc3f32\",\n", - " \"5\" = \"#972620\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "95892df7-e5b8-4d7a-bf96-88673e633370", - "metadata": {}, - "source": [ - "### Population Totales" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0a196b8-2db5-478d-899a-48985d1735f0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " 
strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", - " width = 14,\n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552", - "metadata": {}, - "source": [ - "### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9324a56b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position 
= \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d", - "metadata": {}, - "source": [ - "### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4046761f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = 
margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5", - "metadata": {}, - "source": [ - "## 3.2. Complétude et qualité des données de la Population" - ] - }, - { - "cell_type": "markdown", - "id": "0d86ed4a-e194-496b-9440-ad206157ee17", - "metadata": {}, - "source": [ - "#### Population Totale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# hist(population_data$POPULATION)\n", - "hist(population_data_filtered$POPULATION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION,\n", - " y = fct_reorder(ADM2_NAME, POPULATION),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 
7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", - " units = \"cm\",\n", - " width = 15,\n", - " height = 23,\n", - " bg = \"white\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5", - "metadata": {}, - "source": [ - "#### Population Femmes Enceintes (FE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " hist(population_data_filtered$POPULATION_FE)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4200afa2-e2f0-4876-9842-141b96f32fe8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " \n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_FE,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " 
strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "} " - ] - }, - { - "cell_type": "markdown", - "id": "e39305c0-3700-48c3-967a-b9c6af3e737f", - "metadata": {}, - "source": [ - "#### Population Enfants moins de 5 ans (U5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbda9b88-9b91-4845-83a8-795a12124999", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - " hist(population_data_filtered$POPULATION_U5)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - "\n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_U5,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = 
element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } From 5adb64785541c2629c3e4a0a9d43d413b3689c47 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Wed, 18 Mar 2026 00:41:47 +0100 Subject: [PATCH 5/7] perf(DHIS2 formatting): html markdown display --- .../snt_dhis2_formatting_report.ipynb | 8 +-- .../snt_dhis2_formatting_report_NER.ipynb | 62 +++++++++---------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index f45fbc5..eddb1c4 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -926,9 +926,7 @@ "if (condition_pregnant_women) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " #### Population Femmes Enceintes (FE)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Femmes Enceintes (FE)

\")\n", "}" ] }, @@ -992,9 +990,7 @@ "if (condition_u5_children) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " #### Population Enfants moins de 5 ans (U5)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Enfants moins de 5 ans (U5)

\")\n", "}\n" ] }, diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb index 6a6c3b5..9c9fa1b 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report_NER.ipynb @@ -974,9 +974,7 @@ "if (condition_plot_total_population) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " ### Population Totale\n", - " \")\n", + " IRdisplay::display_html(\"

Population Totale

\")\n", "}" ] }, @@ -1062,9 +1060,7 @@ "if (condition_plot_pregnant_women) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " ### Population Femmes Enceintes (FE)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Femmes Enceintes (FE)

\")\n", "}" ] }, @@ -1086,7 +1082,8 @@ " plot <- population_data %>%\n", " mutate(\n", " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", + " # POPULATION_FE,\n", + " !!sym(pregnant_women_col), # avoid hard coding column, because column names in data changed\n", " breaks = c(0, value_breaks_fe, Inf),\n", " labels = labels_fe, \n", " right = TRUE,\n", @@ -1150,9 +1147,7 @@ "if (condition_plot_u5_children) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " #### Population Enfants moins de 5 ans (U5)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Enfants moins de 5 ans (U5)

\")\n", "}" ] }, @@ -1174,7 +1169,8 @@ " plot <- population_data %>%\n", " mutate(\n", " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", + " # POPULATION_U5,\n", + " !!sym(u5_children_col), # avoid hard coding column, because column names in data changed\n", " breaks = c(0, value_breaks_u5, Inf),\n", " labels = labels_u5, \n", " right = TRUE,\n", @@ -1246,9 +1242,7 @@ "if (condition_population_data) {\n", "\n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " ### Population Totale\n", - " \")\n", + " IRdisplay::display_html(\"

Population Totale

\")\n", "}" ] }, @@ -1319,9 +1313,7 @@ "if (condition_pregnant_women) { \n", " \n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " ### Population Femmes Enceintes (FE)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Femmes Enceintes (FE)

\")\n", "\n", "}" ] @@ -1341,14 +1333,18 @@ "if (condition_pregnant_women) { \n", "\n", " # 1. Histogram of the pregnant women population\n", - " hist(population_data$POPULATION_FE)\n", + " hist(population_data[[pregnant_women_col]]) # avoid hard coding \n", "\n", " # 2. Plot of the pregnant women population\n", - " ggplot(population_data) +\n", - " geom_point(aes(x = POPULATION_FE,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", - " color = factor(YEAR))\n", - " ) +\n", + " ggplot(\n", + " population_data,\n", + " aes(\n", + " x = .data[[pregnant_women_col]],\n", + " y = fct_reorder(.data[[\"ADM2_NAME\"]], .data[[pregnant_women_col]]), # these should all be dynamic; avoid hardcoding\n", + " color = factor(.data[[\"YEAR\"]])\n", + " )\n", + " ) +\n", + " geom_point() + \n", " facet_grid(rows = \"ADM1_NAME\", \n", " scale = \"free_y\", \n", " space = \"free_y\", \n", @@ -1388,9 +1384,7 @@ "if (condition_u5_children) {\n", " \n", " # Display formatted markdown (only if the condition is true)\n", - " IRdisplay::display_markdown(\"\n", - " ### Population Enfants moins de 5 ans (U5)\n", - " \")\n", + " IRdisplay::display_html(\"

Population Enfants moins de 5 ans (U5)

\")\n", "\n", "}" ] @@ -1409,14 +1403,18 @@ "if (condition_u5_children) {\n", "\n", " # 1. Make histogram\n", - " hist(population_data$POPULATION_U5)\n", + " hist(population_data[[u5_children_col]]) # avoid hard coding \n", "\n", " # 2. Make plot\n", - " ggplot(population_data) +\n", - " geom_point(aes(x = POPULATION_U5,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", - " color = factor(YEAR))\n", - " ) +\n", + " ggplot(\n", + " population_data,\n", + " aes(\n", + " x = .data[[u5_children_col]],\n", + " y = fct_reorder(.data[[\"ADM2_NAME\"]], .data[[u5_children_col]]), # these should all be dynamic; avoid hardcoding\n", + " color = factor(.data[[\"YEAR\"]])\n", + " )\n", + " ) +\n", + " geom_point() + \n", " facet_grid(rows = \"ADM1_NAME\", \n", " scale = \"free_y\", \n", " space = \"free_y\", \n", From c1b4c298f7b949fd42dd93e6ac7053bbdf37f165 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Thu, 19 Mar 2026 12:30:38 +0100 Subject: [PATCH 6/7] fix(seasonality): ggsave parameters --- .../snt_seasonality_cases_report.ipynb | 24 +++++++++++++++--- .../snt_seasonality_rainfall_report.ipynb | 25 ++++++++++++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb b/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb index 5af17d8..9597bcb 100644 --- a/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb +++ b/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb @@ -257,7 +257,11 @@ ")\n", "\n", "filename_seasonality_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_plot.png', sep = '_')\n", - "ggsave(seasonality_plot, file = file.path(PLOTS_PATH, filename_seasonality_plot), dpi = 500)" + "ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_seasonality_plot), \n", + " 
plot=seasonality_plot,\n", + " dpi = 500\n", + ")" ] }, { @@ -336,7 +340,11 @@ " \n", " # Save the plot\n", " filename_start_month_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'start_month_plot.png', sep = '_')\n", - " ggsave(start_month_plot, file = file.path(PLOTS_PATH, filename_start_month_plot), dpi = 500, width = 12, height = 8)\n", + " ggsave(\n", + " file=file.path(PLOTS_PATH, filename_start_month_plot),\n", + " plot=start_month_plot,\n", + " dpi = 500, width = 12, height = 8\n", + " )\n", " \n", "} else {\n", " cat(paste('Column', season_start_month_col, 'not found in data. Run the updated code notebook first.\\n'))\n", @@ -380,7 +388,11 @@ "\n", "filename_duration_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'duration_plot.png', sep = '_')\n", "\n", - "ggsave(duration_plot, file = file.path(PLOTS_PATH, filename_duration_plot), dpi = 500)" + "ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_duration_plot),\n", + " plot=duration_plot,\n", + " dpi = 500\n", + ")" ] }, { @@ -444,7 +456,11 @@ " \n", " # Save the plot\n", " filename_proportion_plot <- paste(COUNTRY_CODE, data_source, admin_level, 'cases_proportion_plot.png', sep = '_')\n", - " ggsave(proportion_plot, file = file.path(PLOTS_PATH, filename_proportion_plot), dpi = 500)\n", + " ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_proportion_plot),\n", + " plot=proportion_plot,\n", + " dpi = 500\n", + " )\n", " \n", "} else {\n", " cat('CASES_PROPORTION column not found in data. 
Run the updated code notebook first.\\n')\n", diff --git a/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb b/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb index 192a658..d2e0fe9 100644 --- a/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb +++ b/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb @@ -288,7 +288,12 @@ "\n", "# Save with suffix and using perc instead of %\n", "filename_seasonality_plot <- glue::glue(\"rainfall_seasonality_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - "ggsave(seasonality_plot, file = file.path(PLOTS_PATH, filename_seasonality_plot), bg = \"white\", dpi = 300)" + "ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_seasonality_plot)\n", + " plot=seasonality_plot,\n", + " bg=\"white\",\n", + " dpi = 300\n", + ")" ] }, { @@ -388,7 +393,11 @@ " \n", " # Save the plot with suffix\n", " filename_start_month_plot <- glue::glue(\"start_month_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - " ggsave(start_month_plot, file = file.path(PLOTS_PATH, filename_start_month_plot), bg = \"white\", dpi = 300, width = 12, height = 8)\n", + " ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_start_month_plot),\n", + " plot=start_month_plot,\n", + " bg = \"white\", dpi = 300, width = 12, height = 8\n", + " )\n", " \n", "} else {\n", " cat(paste('Column', season_start_month_col, 'not found in data. 
Run the updated code notebook first.\\n'))\n", @@ -423,7 +432,11 @@ "\n", "# Save with suffix\n", "filename_duration_plot <- glue::glue(\"seasonality_duration_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - "ggsave(duration_plot, file = file.path(PLOTS_PATH, filename_duration_plot), bg = \"white\", dpi = 300, width = 12, height = 8)" + "ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_duration_plot),\n", + " plot=duration_plot,\n", + " bg = \"white\", dpi = 300, width = 12, height = 8\n", + ")" ] }, { @@ -494,7 +507,11 @@ " \n", " # Save with suffix\n", " filename_proportion_plot <- glue::glue(\"rain_proportion_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - " ggsave(proportion_plot, file = file.path(PLOTS_PATH, filename_proportion_plot), bg = \"white\", dpi = 300, width = 12, height = 8)\n", + " ggsave(\n", + " filename=file.path(PLOTS_PATH, filename_proportion_plot),\n", + " plot=proportion_plot,\n", + " bg=\"white\", dpi=300, width=12, height=8\n", + " )\n", " \n", "} else {\n", " cat('RAIN_PROPORTION column not found in data. Run the updated code notebook first.\\n')\n", From 536fa673e1bbe3f939b2dbe3ff43bad89afe4d37 Mon Sep 17 00:00:00 2001 From: iulia rautu Date: Thu, 19 Mar 2026 13:05:04 +0100 Subject: [PATCH 7/7] Revert "fix(seasonality): ggsave parameters" This reverts commit c1b4c298f7b949fd42dd93e6ac7053bbdf37f165. 
--- .../snt_seasonality_cases_report.ipynb | 24 +++--------------- .../snt_seasonality_rainfall_report.ipynb | 25 +++---------------- 2 files changed, 8 insertions(+), 41 deletions(-) diff --git a/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb b/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb index 9597bcb..5af17d8 100644 --- a/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb +++ b/pipelines/snt_seasonality_cases/reporting/snt_seasonality_cases_report.ipynb @@ -257,11 +257,7 @@ ")\n", "\n", "filename_seasonality_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_plot.png', sep = '_')\n", - "ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_seasonality_plot), \n", - " plot=seasonality_plot,\n", - " dpi = 500\n", - ")" + "ggsave(seasonality_plot, file = file.path(PLOTS_PATH, filename_seasonality_plot), dpi = 500)" ] }, { @@ -340,11 +336,7 @@ " \n", " # Save the plot\n", " filename_start_month_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'start_month_plot.png', sep = '_')\n", - " ggsave(\n", - " file=file.path(PLOTS_PATH, filename_start_month_plot),\n", - " plot=start_month_plot,\n", - " dpi = 500, width = 12, height = 8\n", - " )\n", + " ggsave(start_month_plot, file = file.path(PLOTS_PATH, filename_start_month_plot), dpi = 500, width = 12, height = 8)\n", " \n", "} else {\n", " cat(paste('Column', season_start_month_col, 'not found in data. 
Run the updated code notebook first.\\n'))\n", @@ -388,11 +380,7 @@ "\n", "filename_duration_plot <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'duration_plot.png', sep = '_')\n", "\n", - "ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_duration_plot),\n", - " plot=duration_plot,\n", - " dpi = 500\n", - ")" + "ggsave(duration_plot, file = file.path(PLOTS_PATH, filename_duration_plot), dpi = 500)" ] }, { @@ -456,11 +444,7 @@ " \n", " # Save the plot\n", " filename_proportion_plot <- paste(COUNTRY_CODE, data_source, admin_level, 'cases_proportion_plot.png', sep = '_')\n", - " ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_proportion_plot),\n", - " plot=proportion_plot,\n", - " dpi = 500\n", - " )\n", + " ggsave(proportion_plot, file = file.path(PLOTS_PATH, filename_proportion_plot), dpi = 500)\n", " \n", "} else {\n", " cat('CASES_PROPORTION column not found in data. Run the updated code notebook first.\\n')\n", diff --git a/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb b/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb index d2e0fe9..192a658 100644 --- a/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb +++ b/pipelines/snt_seasonality_rainfall/reporting/snt_seasonality_rainfall_report.ipynb @@ -288,12 +288,7 @@ "\n", "# Save with suffix and using perc instead of %\n", "filename_seasonality_plot <- glue::glue(\"rainfall_seasonality_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - "ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_seasonality_plot)\n", - " plot=seasonality_plot,\n", - " bg=\"white\",\n", - " dpi = 300\n", - ")" + "ggsave(seasonality_plot, file = file.path(PLOTS_PATH, filename_seasonality_plot), bg = \"white\", dpi = 300)" ] }, { @@ -393,11 +388,7 @@ " \n", " # Save the plot with suffix\n", " filename_start_month_plot <- 
glue::glue(\"start_month_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - " ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_start_month_plot),\n", - " plot=start_month_plot,\n", - " bg = \"white\", dpi = 300, width = 12, height = 8\n", - " )\n", + " ggsave(start_month_plot, file = file.path(PLOTS_PATH, filename_start_month_plot), bg = \"white\", dpi = 300, width = 12, height = 8)\n", " \n", "} else {\n", " cat(paste('Column', season_start_month_col, 'not found in data. Run the updated code notebook first.\\n'))\n", @@ -432,11 +423,7 @@ "\n", "# Save with suffix\n", "filename_duration_plot <- glue::glue(\"seasonality_duration_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - "ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_duration_plot),\n", - " plot=duration_plot,\n", - " bg = \"white\", dpi = 300, width = 12, height = 8\n", - ")" + "ggsave(duration_plot, file = file.path(PLOTS_PATH, filename_duration_plot), bg = \"white\", dpi = 300, width = 12, height = 8)" ] }, { @@ -507,11 +494,7 @@ " \n", " # Save with suffix\n", " filename_proportion_plot <- glue::glue(\"rain_proportion_{seasonality_threshold_perc}{SUFFIX}.png\")\n", - " ggsave(\n", - " filename=file.path(PLOTS_PATH, filename_proportion_plot),\n", - " plot=proportion_plot,\n", - " bg=\"white\", dpi=300, width=12, height=8\n", - " )\n", + " ggsave(proportion_plot, file = file.path(PLOTS_PATH, filename_proportion_plot), bg = \"white\", dpi = 300, width = 12, height = 8)\n", " \n", "} else {\n", " cat('RAIN_PROPORTION column not found in data. Run the updated code notebook first.\\n')\n",