From 7e4fb177d037655bb70c29d6e55b1a4942123f9d Mon Sep 17 00:00:00 2001 From: YONG WOOK KIM Date: Wed, 25 Mar 2026 12:53:09 -0500 Subject: [PATCH] Add explorer S3 cleanup workflow and clean-explorer-bucket --non-interactive/--dry-run (#280) Made-with: Cursor --- .github/workflows/clean_explorer_s3.yml | 54 ++++++++++++++++++ CHANGELOG.md | 12 +++- backend/oeps/clients/s3.py | 23 +++++--- .../oeps/commands/clean_explorer_bucket.py | 57 ++++++++++++++----- 4 files changed, 123 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/clean_explorer_s3.yml diff --git a/.github/workflows/clean_explorer_s3.yml b/.github/workflows/clean_explorer_s3.yml new file mode 100644 index 00000000..3674625a --- /dev/null +++ b/.github/workflows/clean_explorer_s3.yml @@ -0,0 +1,54 @@ +# Remove stale explorer map CSVs from S3 (see https://github.com/healthyregions/oeps/issues/280). +# Uses repo explorer/config/sources.json on the checked-out ref to decide which keys to keep. +name: Clean explorer S3 bucket + +on: + workflow_dispatch: + inputs: + dry_run: + description: "If true, only list keys that would be deleted" + required: false + type: boolean + default: false + +jobs: + clean_explorer_csv: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install backend dependencies + working-directory: backend + run: | + pip install --upgrade pip + pip install -e . + + - name: Create .env file + working-directory: backend + run: echo "FLASK_APP=oeps" > .env + + - name: Clean explorer/csv on S3 (dry run) + if: inputs.dry_run == true + working-directory: backend + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: flask clean-explorer-bucket --non-interactive --dry-run + + - name: Clean explorer/csv on S3 + if: inputs.dry_run != true + working-directory: backend + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: flask clean-explorer-bucket --non-interactive diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fab7928..ffdc2157 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,12 +16,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - "Adding images" section in `metadata/README.md`: HTML vs Markdown comparison, Option 1 (GitHub paste/drop), Option 2 (`metadata/images/`), and examples so images render correctly on the OEPS docs page ([#317](https://github.com/healthyregions/oeps/issues/317)). +- GitHub Action **Clean explorer S3 bucket** (`workflow_dispatch`) to remove stale map CSVs under `explorer/csv` using `explorer/config/sources.json`, with optional dry-run input ([#280](https://github.com/healthyregions/oeps/issues/280)). + +- `flask clean-explorer-bucket`: `--non-interactive` (fail if `sources.json` is missing, for CI) and `--dry-run` (list keys to delete without deleting) ([#280](https://github.com/healthyregions/oeps/issues/280)). + +- `*-geography-keys.csv` resources in Frictionless data packages so CSV Table Schema foreign keys reference tabular data that Frictionless can validate, alongside existing geography CSVs and shapefiles ([#311](https://github.com/healthyregions/oeps/issues/311)). + ### Changed -- Data package links on the download page use stable S3 URLs (`oeps-DSuite2018.zip`, `oeps-DSuite2023_no_foreign_keys.zip`) on `herop-geodata` ([#277](https://github.com/healthyregions/oeps/issues/277)). +- Create Data Packages GitHub Action: run Frictionless validation on each package (removed `--skip-validation`); DSuite2023 builds with foreign keys like other suites (removed `--skip-foreign-keys`) ([#311](https://github.com/healthyregions/oeps/issues/311)). + +- Data package links on the download page use stable S3 URLs (`oeps-DSuite2018.zip`, `oeps-DSuite2023.zip`) on `herop-geodata` ([#277](https://github.com/healthyregions/oeps/issues/277), [#311](https://github.com/healthyregions/oeps/issues/311)). - File size labels for data packages updated to "(100mb+)" on the download page ([#277](https://github.com/healthyregions/oeps/issues/277)). ### Fixed -- (none this release) +- Frictionless data package validation no longer fails when CSV foreign keys pointed at shapefile resources (`FileResource` / `row_stream`) ([#311](https://github.com/healthyregions/oeps/issues/311)). diff --git a/backend/oeps/clients/s3.py b/backend/oeps/clients/s3.py index f8634e01..1bdd7c95 100644 --- a/backend/oeps/clients/s3.py +++ b/backend/oeps/clients/s3.py @@ -53,20 +53,27 @@ def batch_upload_to_s3( for path in paths: upload_to_s3(path, prefix, progress_bar) -def clear_s3_bucket(prefix: str = None, objs_to_keep: list = []): - ''' - Empty all contents in an s3 bucket beginning with a given prefix, possibly preserving a few - ''' +def clear_s3_bucket(prefix: str = None, objs_to_keep: list = None, dry_run: bool = False): + """ + Empty all contents in an s3 bucket beginning with a given prefix, possibly preserving a few. + """ + if objs_to_keep is None: + objs_to_keep = [] s3 = boto3.resource("s3") for obj in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=prefix): - if obj.key.split('/')[-1] in objs_to_keep: continue + if obj.key.split("/")[-1] in objs_to_keep: + continue - print(f'Deleting {obj}..') - obj.delete() + if dry_run: + print(f"Would delete {obj.key}") + else: + print(f"Deleting {obj}..") + obj.delete() def sync_to_s3(local_dir: Path, prefix: str = None, progress_bar: bool = False, clear_bucket: bool = False): - if clear_bucket: clear_s3_bucket(prefix) + if clear_bucket: + clear_s3_bucket(prefix) paths = local_dir.glob("*") batch_upload_to_s3(paths, prefix, progress_bar) diff --git a/backend/oeps/commands/clean_explorer_bucket.py b/backend/oeps/commands/clean_explorer_bucket.py index 604891d2..49a3e7b4 100644 --- a/backend/oeps/commands/clean_explorer_bucket.py +++ b/backend/oeps/commands/clean_explorer_bucket.py @@ -11,38 +11,69 @@ @click.command() +@click.option( + "--non-interactive", + is_flag=True, + default=False, + help="Fail immediately if explorer/config/sources.json is missing (for CI). " + "Without this flag, a missing file triggers an interactive confirmation.", +) +@click.option( + "--dry-run", + is_flag=True, + default=False, + help="Print S3 keys that would be deleted without deleting them.", +) @add_common_opts(explorer_opt) def clean_explorer_bucket( - explorer_path: Path + explorer_path: Path, + non_interactive: bool, + dry_run: bool, ): """Deletes all files from the S3 bucket which are not mentioned in the local - explorer/configs/sources.json file. If no sources.json file exists, optionally - deletes all uploaded files. + explorer/config/sources.json file. If no sources.json file exists, optionally + deletes all uploaded files (interactive only). """ config_dir = Path(explorer_path, "config") + sources_path = Path(config_dir, "sources.json") objs_in_use = [] # command is highly destructive if sources.json is missing - # so prompt user in that case + # so prompt user in that case (unless --non-interactive) try: - sources = load_json(Path(config_dir, 'sources.json'))['sources'] - tables = [source['tables'] for source in sources] + sources = load_json(sources_path)["sources"] + tables = [source["tables"] for source in sources] # grab all filenames nested in the sources.json objs_in_use = [ - value['file'].split('/')[-1] + value["file"].split("/")[-1] for table in tables for value in table.values() ] - except FileNotFoundError: - print(f"{Path(config_dir, 'sources.json')} not found, so continuing will delete all files in the bucket which start with `explorer/csv`.") + except FileNotFoundError: + msg = ( + f"{sources_path} not found; continuing would delete all files under " + "`explorer/csv` except an explicit keep list." + ) + if non_interactive: + raise click.ClickException( + f"{msg} Refusing in --non-interactive mode (e.g. CI). " + "Commit sources.json or run locally without --non-interactive." + ) + print(msg) resp = input("Would you like to continue with deletion? (y/N)") - if resp.lower() != 'y': + if resp.lower() != "y": print("Exiting without deleting files.") return - - objs_in_use = objs_in_use + ['counties.csv', 'states.csv'] - clear_s3_bucket(prefix='explorer/csv', objs_to_keep=objs_in_use) + objs_in_use = objs_in_use + ["counties.csv", "states.csv"] + + if dry_run: + print("Dry run: no objects will be deleted.") + clear_s3_bucket( + prefix="explorer/csv", + objs_to_keep=objs_in_use, + dry_run=dry_run, + )