From befe4a935949f0acd376ee5b803796607ed329fd Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 30 Mar 2026 09:11:31 -0400 Subject: [PATCH 1/2] ENH: Improve zarr upload retry resilience and diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Batch-level retry backoff: replace linear sleep(1*N) with exponential backoff (10s, 20s, 40s, 80s, 120s) plus random jitter, giving S3 time to recover from throttling - Reduce parallelism on batch retry: halve ThreadPoolExecutor workers on each retry (5→3→2→1), reducing concurrent connections that may trigger S3 prefix-level throttling - Better failure diagnostics in _handle_failed_items_and_raise: log a summary line with failed/total counts, exception types grouped by count, and a "systematic" flag when all failures share the same exception type. This makes it immediately clear whether failures are random (flaky network) or deterministic (server-side issue) Motivated by investigation of #1821 where a previously-aborted OME-Zarr upload consistently fails with ConnectionAbortedError on every retry attempt for all 188 level-0 chunks (100% failure rate, 2259 connection attempts), while level-1 chunks and other fresh zarr uploads succeed fine from the same machine. 
Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 --- dandi/files/zarr.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index efb1efd2d..cafad7a1d 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -1,6 +1,7 @@ from __future__ import annotations from base64 import b64encode +from collections import Counter from collections.abc import Generator, Iterator from concurrent.futures import Future, ThreadPoolExecutor, as_completed from contextlib import closing @@ -8,12 +9,14 @@ from datetime import datetime from enum import Enum import json +import math import os import os.path -import urllib.parse from pathlib import Path +import random from time import sleep from typing import Any, Optional +import urllib.parse from dandischema.models import BareAsset, DigestType from pydantic import BaseModel, ConfigDict, ValidationError @@ -745,6 +748,7 @@ def mkzarr() -> str: items_to_upload = list(items) max_retries = 5 retry_count = 0 + current_jobs = jobs or 5 # Add all items to checksum tree (only done once) for it in items_to_upload: zcc.add_leaf(Path(it.entry_path), it.size, it.digest) @@ -774,7 +778,7 @@ def mkzarr() -> str: r = client.post(f"/zarr/{zarr_id}/files/", json=uploading) # Upload files in parallel - with ThreadPoolExecutor(max_workers=jobs or 5) as executor: + with ThreadPoolExecutor(max_workers=current_jobs) as executor: futures = [ executor.submit( _upload_zarr_file, @@ -817,14 +821,22 @@ def mkzarr() -> str: # Prepare for next iteration with retry items if items_to_upload := retry_items: retry_count += 1 + current_jobs = max(1, math.ceil(current_jobs / 2)) if retry_count <= max_retries: lgr.info( - "%s: %s got 403 errors, requesting new URLs", + "%s: %s got 403 errors, requesting new URLs" + " (attempt %d/%d, workers: %d)", asset_path, pluralize(len(items_to_upload), "file"), + retry_count, + max_retries, + current_jobs, + ) + # Exponential backoff 
with jitter before retry + sleep( + min(2**retry_count * 5, 120) + + random.uniform(0, 5) ) - # Small delay before retry - sleep(1 * retry_count) # Check if we exhausted retries if items_to_upload: @@ -899,6 +911,18 @@ def _handle_failed_items_and_raise( # Log all failures for item, error in failed_items: lgr.error("Failed to upload %s: %s", item.filepath, error) + + # Summary diagnostics + exc_counts = Counter(type(error).__name__ for _, error in failed_items) + exc_summary = ", ".join(f"{k}: {v}" for k, v in exc_counts.most_common()) + lgr.error( + "Upload failure summary: %d/%d files failed; exception types: {%s}%s", + len(failed_items), + len(futures), + exc_summary, + " (systematic — all same exception type)" if len(exc_counts) == 1 else "", + ) + # Raise the first error raise failed_items[0][1] From 4a2da705e12dd4d8af804763b083a9b8cda3359e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 30 Mar 2026 10:54:57 -0400 Subject: [PATCH 2/2] BF: Set explicit timeout for zarr chunk upload PUT requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, zarr chunk uploads used no explicit timeout, relying on OS TCP defaults. For large zarr chunks (e.g., 2.6 GB level-0 OME-Zarr chunks), the upload can take many minutes at typical home upload speeds (~7 min at 50 Mbps, ~70 min at 5 Mbps). Without a timeout, the client either hangs indefinitely on network failures, or the OS TCP stack's default keepalive/idle timeout (often ~120s on Windows) kills the connection before a large upload can complete. Set an explicit requests timeout of (60, 7200) — 60 seconds for TCP connect, 2 hours for response read. The read timeout covers the period after the full request body is sent while waiting for S3's response, which is where ConnectionAbortedError was observed in #1821. Note: this does NOT limit the upload body transfer time itself. 
The requests library's read timeout only applies to waiting for response data, not to sending request data. So even very slow multi-hour uploads will not be interrupted by this timeout. Ref: #1821 Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 --- dandi/files/zarr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index cafad7a1d..943e36288 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -969,6 +969,7 @@ def _upload_zarr_file( json_resp=False, retry_if=_retry_zarr_file, headers=headers, + timeout=(60, 7200), ) except requests.HTTPError as e: post_upload_size_check(item.filepath, item.size, True)