From 1a350185121d700aa18ff9804f5cb5d322035dfd Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Wed, 25 Mar 2026 16:33:45 -0400
Subject: [PATCH] BF: Seek file data to 0 before every retry in request(), not
 just for retryable status codes

Previously, data.seek(0) was only called inside the
retry_statuses/retry_if branch, which handles specific HTTP response
codes (429, 500-504, and zarr-specific conditions).

When a ConnectionError occurred mid-upload (e.g., connection dropped
during a multi-minute large zarr chunk transfer), tenacity caught the
exception and retried, but the file pointer was left at whatever
position the read was interrupted. On retry, requests computed
Content-Length as (file_size - current_position) and sent only the
tail of the file. S3 received partial data whose MD5 didn't match the
Content-MD5 header (computed from the full file), resulting in a
BadDigest 400 error. This error was also not retried (not in
RETRY_STATUSES, not matched by retry_if), so the upload failed
permanently.

The fix uses tenacity's before_sleep callback to seek the file data
back to position 0 before every retry, regardless of whether the retry
was triggered by ConnectionError, HTTPError, or a retryable status
code.

Evidence from two independent logs (issue #1821):
- Linux: 188 level-0 zarr chunks failed (large, ~6 min upload each),
  67 level-2 chunks succeeded (small, fast upload) -- all with
  "succeeded after 1 retry" + BadDigest
- Windows local disk: 187 level-0 failures after 2-8 retries,
  68 level-1 successes -- ruling out NFS/filesystem as the cause
- Both logs show "Resetting dropped connection: dandiarchive.s3.amazonaws.com"
  confirming ConnectionErrors preceded the BadDigest errors

Closes #1821

Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6
---
 dandi/dandiapi.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 9a5e625c3..61566425b 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -212,6 +212,14 @@ def request( lgr.debug("%s %s", method.upper(), url) + def _rewind_data(retry_state: tenacity.RetryCallState) -> None: + # After a failed attempt (ConnectionError mid-upload, HTTPError, + # etc.), the file pointer may be at an arbitrary position. Seek + # back to 0 so the next attempt sends the complete body. + # See https://github.com/dandi/dandi-cli/issues/1821 + if data is not None and hasattr(data, "seek"): + data.seek(0) + try: for i, attempt in enumerate( tenacity.Retrying( @@ -225,6 +233,7 @@ def request( ), stop=tenacity.stop_after_attempt(REQUEST_RETRIES), reraise=True, + before_sleep=_rewind_data, ) ): with attempt: @@ -249,8 +258,6 @@ def request( url, result.text, ) - if data is not None and hasattr(data, "seek"): - data.seek(0) if retry_after := get_retry_after(result): lgr.debug( "Sleeping for %d seconds as instructed in response "