From befe4a935949f0acd376ee5b803796607ed329fd Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 30 Mar 2026 09:11:31 -0400 Subject: [PATCH 1/2] ENH: Improve zarr upload retry resilience and diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Batch-level retry backoff: replace linear sleep(1*N) with exponential backoff (10s, 20s, 40s, 80s, 120s) plus random jitter, giving S3 time to recover from throttling - Reduce parallelism on batch retry: halve ThreadPoolExecutor workers on each retry (5→3→2→1), reducing concurrent connections that may trigger S3 prefix-level throttling - Better failure diagnostics in _handle_failed_items_and_raise: log a summary line with failed/total counts, exception types grouped by count, and a "systematic" flag when all failures share the same exception type. This makes it immediately clear whether failures are random (flaky network) or deterministic (server-side issue) Motivated by investigation of #1821 where a previously-aborted OME-Zarr upload consistently fails with ConnectionAbortedError on every retry attempt for all 188 level-0 chunks (100% failure rate, 2259 connection attempts), while level-1 chunks and other fresh zarr uploads succeed fine from the same machine. 
Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 --- dandi/files/zarr.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index efb1efd2d..cafad7a1d 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -1,6 +1,7 @@ from __future__ import annotations from base64 import b64encode +from collections import Counter from collections.abc import Generator, Iterator from concurrent.futures import Future, ThreadPoolExecutor, as_completed from contextlib import closing @@ -8,12 +9,14 @@ from datetime import datetime from enum import Enum import json +import math import os import os.path -import urllib.parse from pathlib import Path +import random from time import sleep from typing import Any, Optional +import urllib.parse from dandischema.models import BareAsset, DigestType from pydantic import BaseModel, ConfigDict, ValidationError @@ -745,6 +748,7 @@ def mkzarr() -> str: items_to_upload = list(items) max_retries = 5 retry_count = 0 + current_jobs = jobs or 5 # Add all items to checksum tree (only done once) for it in items_to_upload: zcc.add_leaf(Path(it.entry_path), it.size, it.digest) @@ -774,7 +778,7 @@ def mkzarr() -> str: r = client.post(f"/zarr/{zarr_id}/files/", json=uploading) # Upload files in parallel - with ThreadPoolExecutor(max_workers=jobs or 5) as executor: + with ThreadPoolExecutor(max_workers=current_jobs) as executor: futures = [ executor.submit( _upload_zarr_file, @@ -817,14 +821,22 @@ def mkzarr() -> str: # Prepare for next iteration with retry items if items_to_upload := retry_items: retry_count += 1 + current_jobs = max(1, math.ceil(current_jobs / 2)) if retry_count <= max_retries: lgr.info( - "%s: %s got 403 errors, requesting new URLs", + "%s: %s got 403 errors, requesting new URLs" + " (attempt %d/%d, workers: %d)", asset_path, pluralize(len(items_to_upload), "file"), + retry_count, + max_retries, + current_jobs, + ) + # Exponential backoff 
with jitter before retry + sleep( + min(2**retry_count * 5, 120) + + random.uniform(0, 5) ) - # Small delay before retry - sleep(1 * retry_count) # Check if we exhausted retries if items_to_upload: @@ -899,6 +911,18 @@ def _handle_failed_items_and_raise( # Log all failures for item, error in failed_items: lgr.error("Failed to upload %s: %s", item.filepath, error) + + # Summary diagnostics + exc_counts = Counter(type(error).__name__ for _, error in failed_items) + exc_summary = ", ".join(f"{k}: {v}" for k, v in exc_counts.most_common()) + lgr.error( + "Upload failure summary: %d/%d files failed; exception types: {%s}%s", + len(failed_items), + len(futures), + exc_summary, + " (systematic — all same exception type)" if len(exc_counts) == 1 else "", + ) + # Raise the first error raise failed_items[0][1] From 4a2da705e12dd4d8af804763b083a9b8cda3359e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 30 Mar 2026 10:54:57 -0400 Subject: [PATCH 2/2] BF: Set explicit timeout for zarr chunk upload PUT requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, zarr chunk uploads used no explicit timeout, relying on OS TCP defaults. For large zarr chunks (e.g., 2.6 GB level-0 OME-Zarr chunks), the upload can take many minutes at typical home upload speeds (~7 min at 50 Mbps, ~70 min at 5 Mbps). Without a timeout, the client either hangs indefinitely on network failures, or the OS TCP stack's default keepalive/idle timeout (often ~120s on Windows) kills the connection before a large upload can complete. Set an explicit requests timeout of (60, 7200) — 60 seconds for TCP connect, 2 hours for response read. The read timeout covers the period after the full request body is sent while waiting for S3's response, which is where ConnectionAbortedError was observed in #1821. Note: this does NOT limit the upload body transfer time itself. 
The requests library's read timeout only applies to waiting for response data, not to sending request data. So even very slow multi-hour uploads will not be interrupted by this timeout. Ref: #1821 Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 --- dandi/files/zarr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index cafad7a1d..943e36288 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -969,6 +969,7 @@ def _upload_zarr_file( json_resp=False, retry_if=_retry_zarr_file, headers=headers, + timeout=(60, 7200), ) except requests.HTTPError as e: post_upload_size_check(item.filepath, item.size, True)