diff --git a/CHANGES/7482.bugfix b/CHANGES/7482.bugfix new file mode 100644 index 00000000000..8c9decd66fe --- /dev/null +++ b/CHANGES/7482.bugfix @@ -0,0 +1,2 @@ +Added periodic ``gc.collect()`` and ``malloc_trim(0)`` calls in API workers to prevent +unbounded RSS growth caused by glibc heap fragmentation. diff --git a/docs/admin/reference/settings.md b/docs/admin/reference/settings.md index f6b2b9afd4b..fa00c989e54 100644 --- a/docs/admin/reference/settings.md +++ b/docs/admin/reference/settings.md @@ -250,6 +250,14 @@ The number of seconds before a content app should be considered lost. Defaults to `30` seconds. +### MEMORY\_TRIM\_INTERVAL + +Number of requests each API worker handles between periodic `gc.collect()` and `malloc_trim(0)` calls. +This compacts the glibc heap to prevent unbounded RSS growth from memory fragmentation. +Set to `0` to disable. Has no effect where glibc's `malloc_trim` is unavailable (e.g. non-Linux platforms). + +Defaults to `1024`. + ### CONTENT\_ORIGIN A string containing the `protocol`, `fqdn`, and optionally `port` where the content app is reachable by users. 
diff --git a/pulpcore/app/entrypoint.py b/pulpcore/app/entrypoint.py
index fed1d70e2a7..75a0d9c7915 100644
--- a/pulpcore/app/entrypoint.py
+++ b/pulpcore/app/entrypoint.py
@@ -1,6 +1,10 @@
 from contextvars import ContextVar
 from logging import getLogger
+import ctypes
+import ctypes.util
+import gc
 import os
+import platform
 import sys
 import threading
 import time
@@ -18,16 +22,51 @@
 
 logger = getLogger(__name__)
 
+# Best-effort handle to glibc's malloc_trim(); stays None when unavailable
+# (non-Linux platform, libc cannot be loaded, or libc lacks the symbol).
+_malloc_trim = None
+if platform.system() == "Linux":
+    try:
+        _libc = ctypes.CDLL(ctypes.util.find_library("c"))
+        _malloc_trim = _libc.malloc_trim
+        # C prototype: int malloc_trim(size_t pad)
+        _malloc_trim.argtypes = [ctypes.c_size_t]
+        _malloc_trim.restype = ctypes.c_int
+    except (OSError, AttributeError):
+        # Deliberately silent: trimming is an optimization, not a requirement.
+        pass
+
 
 name_template_var = ContextVar("name_template_var", default=None)
 using_pulp_api_worker = ContextVar("using_pulp_api_worker", default=False)
 
 
 class PulpApiWorker(SyncWorker):
+    # Requests handled by this worker since the last trim.
+    _request_counter = 0
+    # Requests between trims; loaded from settings in init_process(). 0 disables.
+    _memory_trim_interval = 0
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.heartbeat_thread = None
 
+    def handle_request(self, listener, req, client, addr):
+        """Handle one request, then give the memory trimmer a chance to run."""
+        super().handle_request(listener, req, client, addr)
+        self._trim_memory_if_needed()
+
+    def _trim_memory_if_needed(self):
+        """Run gc.collect() and malloc_trim(0) once every `_memory_trim_interval` calls."""
+        if not _malloc_trim or self._memory_trim_interval <= 0:
+            return
+        self._request_counter += 1
+        if self._request_counter >= self._memory_trim_interval:
+            self._request_counter = 0
+            # Collect first so memory freed by the GC can be handed back to the OS.
+            gc.collect()
+            _malloc_trim(0)
+
     def _heartbeat_loop(self):
         """Run heartbeat in a loop. Exit process if heartbeat fails."""
         try:
@@ -105,6 +144,13 @@ def init_process(self):
         )
         self.heartbeat_thread.start()
 
+        self._memory_trim_interval = settings.MEMORY_TRIM_INTERVAL
+        if _malloc_trim and self._memory_trim_interval > 0:
+            logger.info(
+                "Memory trim enabled: gc.collect + malloc_trim(0) every %d requests",
+                self._memory_trim_interval,
+            )
+
         super().init_process()
 
     def run(self):
diff --git a/pulpcore/app/settings.py b/pulpcore/app/settings.py
index 71fccf16dcb..b880ffda09b 100644
--- a/pulpcore/app/settings.py
+++ b/pulpcore/app/settings.py
@@ -292,6 +292,7 @@
 API_APP_TTL = 120  # The heartbeat is called from gunicorn notify (defaulting to 45 sec).
 CONTENT_APP_TTL = 30
 WORKER_TTL = 30
+MEMORY_TRIM_INTERVAL = 1024  # Requests per API worker between memory trims; 0 disables.
 
 # Seconds for a task to finish on semi graceful worker shutdown (approx)
 # On SIGHUP, SIGTERM the currently running task will be awaited forever.