From 34b8034a4f22487c39afba42d70fd5398c5f6157 Mon Sep 17 00:00:00 2001
From: James Frost
Date: Tue, 10 Feb 2026 15:02:49 +0000
Subject: [PATCH] Add pickle backed load cache to fetch_fcst

This does the loading of the data into cubes (with all associated
callbacks) once in the fetch_fcst step, vastly speeding up the loading
during recipe baking.

The read operator is updated to use this cache when it is provided a
directory, and that directory contains a `loadcache.pickle` file.
Otherwise loading will continue as normal.
---
 .../app/fetch_fcst/bin/fetch_data.py          | 20 +++++++++++++++++++
 src/CSET/operators/read.py                    | 20 +++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/CSET/cset_workflow/app/fetch_fcst/bin/fetch_data.py b/src/CSET/cset_workflow/app/fetch_fcst/bin/fetch_data.py
index 8bf13cfff..dff160de4 100644
--- a/src/CSET/cset_workflow/app/fetch_fcst/bin/fetch_data.py
+++ b/src/CSET/cset_workflow/app/fetch_fcst/bin/fetch_data.py
@@ -20,6 +20,7 @@
 import itertools
 import logging
 import os
+import pickle
 import ssl
 import urllib.parse
 import urllib.request
@@ -30,6 +31,8 @@
 
 import isodate
 
+from CSET.operators import read
+
 logging.basicConfig(
     level=os.getenv("LOGLEVEL", "INFO"), format="%(asctime)s %(levelname)s %(message)s"
 )
@@ -294,6 +297,23 @@ def fetch_data(file_retriever: FileRetrieverABC):
     if not any_files_found:
         raise FileNotFoundError("No files found for model!")
 
+    # Create the load cache for this model.
+    prime_load_cache(cycle_data_dir)
+
+
+def prime_load_cache(data_directory: str):
+    """Create the load cache for directory."""
+    # Load in the cubes, applying all the callbacks and such.
+    logging.info("Reading in cubes for caching.")
+    cubes = read.read_cubes(data_directory)
+    # Remove the added cset_comparison_base attribute, if present.
+    for cube in cubes:
+        cube.attributes.pop("cset_comparison_base", None)
+    logging.info("Writing cache file.")
+    # Pickle to a cache file.
+    with open(Path(data_directory, "loadcache.pickle"), "wb") as fp:
+        pickle.dump(cubes, fp)
+
 
 def fetch_obs(obs_retriever: FileRetrieverABC):
     """Fetch the observations corresponding to a model run.
diff --git a/src/CSET/operators/read.py b/src/CSET/operators/read.py
index 4d231c265..6133b30d1 100644
--- a/src/CSET/operators/read.py
+++ b/src/CSET/operators/read.py
@@ -20,6 +20,7 @@
 import glob
 import itertools
 import logging
+import pickle
 from pathlib import Path
 from typing import Literal
 
@@ -218,12 +219,19 @@ def _load_model(
     constraint: iris.Constraint | None,
 ) -> iris.cube.CubeList:
     """Load a single model's data into a CubeList."""
-    input_files = _check_input_files(paths)
-    # If unset, a constraint of None lets everything be loaded.
-    logging.debug("Constraint: %s", constraint)
-    cubes = iris.load(input_files, constraint, callback=_loading_callback)
-    # Make the UM's winds consistent with LFRic.
-    _fix_um_winds(cubes)
+    cache_file = Path(paths, "loadcache.pickle") if isinstance(paths, (str, Path)) else None
+    if cache_file and cache_file.is_file():
+        # Load from pickled cache.
+        with open(cache_file, "rb") as fp:
+            all_cubes = pickle.load(fp)
+        cubes = all_cubes.extract(constraint)
+    else:
+        input_files = _check_input_files(paths)
+        # If unset, a constraint of None lets everything be loaded.
+        logging.debug("Constraint: %s", constraint)
+        cubes = iris.load(input_files, constraint, callback=_loading_callback)
+        # Make the UM's winds consistent with LFRic.
+        _fix_um_winds(cubes)
 
     # Add model_name attribute to each cube to make it available at any further
     # step without needing to pass it as function parameter.