diff --git a/.github/dependabot.yml b/.github/dependabot.yml index cc3c1983fc3..a9bc4ac0020 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,7 +8,7 @@ updates: ignore: - dependency-name: "avocado-framework*" schedule: - interval: daily + interval: weekly groups: python-packages: patterns: @@ -22,7 +22,7 @@ updates: gha-versions: patterns: - "*" - assignees: + reviewers: - daos-stack/actions-watchers commit-message: prefix: "Doc-only: true \n" @@ -36,7 +36,7 @@ updates: gha-versions: patterns: - "*" - assignees: + reviewers: - daos-stack/actions-watchers commit-message: prefix: "Doc-only: true \n" diff --git a/.github/workflows/bash_unit_testing.yml b/.github/workflows/bash_unit_testing.yml index 872d11a1314..434f27fe1c3 100644 --- a/.github/workflows/bash_unit_testing.yml +++ b/.github/workflows/bash_unit_testing.yml @@ -20,11 +20,11 @@ jobs: runs-on: [self-hosted, light] steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Checkout bash_unit project - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: 'pgrange/bash_unit' path: bash_unit diff --git a/.github/workflows/bullseye-coverage.yml b/.github/workflows/bullseye-coverage.yml index f03227c5dbf..306a0d55a02 100644 --- a/.github/workflows/bullseye-coverage.yml +++ b/.github/workflows/bullseye-coverage.yml @@ -109,7 +109,7 @@ jobs: matrix: ${{ steps.matrix.outputs.text }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Import commit pragmas @@ -201,7 +201,7 @@ jobs: env: CONFIG_POWER_ONLY: false 
PRAGMA_SUFFIX: -vm - OPERATIONS_EMAIL: john.malmberg@hpe.com + OPERATIONS_EMAIL: core-daos-devops@groups.int.hpe.com TEST_RPMS: true COMMIT_MESSAGE: ${{ needs.Call-RPM-Build.outputs.commit-message }} JENKINS_URL: https://jenkins-3.daos.hpc.amslabs.hpecorp.net/ @@ -235,7 +235,7 @@ jobs: COMMIT_STATUS_DISTRO_VERSION: steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -366,7 +366,7 @@ jobs: if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: check_name: ${{ env.STAGE_NAME }} Test Results github_token: ${{ secrets.GITHUB_TOKEN }} @@ -374,14 +374,14 @@ jobs: - name: Publish artifacts if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} artifacts path: ${{ env.STAGE_NAME }}/** - name: Upload test results if: (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} test-results path: ${{ env.STAGE_NAME }}/**/results.xml @@ -409,7 +409,7 @@ jobs: matrix: ${{ steps.matrix.outputs.text }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Import commit pragmas @@ -491,7 +491,7 @@ jobs: env: 
CONFIG_POWER_ONLY: false PRAGMA_SUFFIX: -vm - OPERATIONS_EMAIL: john.malmberg@hpe.com + OPERATIONS_EMAIL: core-daos-devops@groups.int.hpe.com TEST_RPMS: true COMMIT_MESSAGE: ${{ needs.Call-RPM-Build.outputs.commit-message }} JENKINS_URL: https://jenkins-3.daos.hpc.amslabs.hpecorp.net/ @@ -519,7 +519,7 @@ jobs: SIZE: steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -563,11 +563,13 @@ jobs: STAGE_TAGS+=",provider" if [[ '${{ matrix.stage }}' = *\ Verbs\ * ]]; then FTEST_ARG+=' --provider ofi+verbs' + INST_RPMS+=' mercury-libfabric' elif [[ '${{ matrix.stage }}' = *\ UCX\ * ]]; then FTEST_ARG+=' --provider ucx+dc_x' INST_RPMS+=' mercury-ucx' elif [[ '${{ matrix.stage }}' = *\ TCP\ * ]]; then FTEST_ARG+=' --provider ofi+tcp' + INST_RPMS+=' mercury-libfabric' else echo 'Unknown provider in ${{ matrix.stage }}' exit 1 @@ -634,7 +636,7 @@ jobs: if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: check_name: ${{ env.STAGE_NAME }} Test Results github_token: ${{ secrets.GITHUB_TOKEN }} @@ -642,14 +644,14 @@ jobs: - name: Publish artifacts if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} artifacts path: ${{ env.STAGE_NAME }}/** - name: Upload test results if: (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 
with: name: ${{ env.STAGE_NAME }} test-results path: ${{ env.STAGE_NAME }}/**/results.xml diff --git a/.github/workflows/ci2.yml b/.github/workflows/ci2.yml index c49e46f3785..3aad063bda3 100644 --- a/.github/workflows/ci2.yml +++ b/.github/workflows/ci2.yml @@ -34,7 +34,7 @@ jobs: DOCKER_BASE: ${{ matrix.base }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 500 @@ -68,7 +68,7 @@ jobs: - name: Publish NLT test results if: always() # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: github_token: ${{ secrets.GITHUB_TOKEN }} files: nlt-junit.xml @@ -100,7 +100,7 @@ jobs: COMPILER: ${{ matrix.compiler }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 500 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 54f0b0e95db..b8c8c74d4a9 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -18,7 +18,7 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 2 - uses: ./.github/actions/make_release diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml index 6a557999a04..e527e021ff7 100644 --- a/.github/workflows/landing-builds.yml +++ b/.github/workflows/landing-builds.yml @@ -64,7 +64,7 @@ jobs: DOCKER_BASE: ${{ matrix.base }} steps: - name: Checkout code - uses: 
actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -112,7 +112,7 @@ jobs: COMPILER: clang steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -144,7 +144,7 @@ jobs: - name: Publish NLT test results if: always() # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: github_token: ${{ secrets.GITHUB_TOKEN }} files: nlt-junit.xml @@ -181,7 +181,7 @@ jobs: COMPILER: ${{ matrix.compiler }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -255,7 +255,7 @@ jobs: BASE_DISTRO: ${{ matrix.with }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 @@ -344,7 +344,7 @@ jobs: COMPILER: ${{ matrix.compiler }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' fetch-depth: 500 diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 084673c65a2..777cf428652 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -25,11 +25,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: 
actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python environment - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3' - name: Install extra python packages @@ -48,7 +48,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Run @@ -66,7 +66,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Check DAOS logging macro use. @@ -77,7 +77,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Check DAOS ftest tags. run: \[ ! 
-x src/tests/ftest/tags.py \] || ./src/tests/ftest/tags.py lint --verbose @@ -86,11 +86,11 @@ jobs: name: Flake8 check steps: - name: Check out source repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python environment - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3' - name: Add parser @@ -119,7 +119,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Install doxygen @@ -129,7 +129,7 @@ jobs: - name: Run check run: doxygen Doxyfile - name: 'Upload Artifact' - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: API Documentation path: docs/doxygen/html/ @@ -140,10 +140,10 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3.11' - name: Install python packages @@ -160,11 +160,11 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install extra python packages run: 
python3 -m pip install --requirement utils/cq/requirements.txt - name: Run check - uses: codespell-project/actions-codespell@406322ec52dd7b488e48c1c4b82e2a8b3a1bf630 # master + uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # master with: skip: ./src/control/vendor,./src/control/go.sum,./.git,./utils/*.patch ignore_words_file: ci/codespell.ignores @@ -175,7 +175,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Check out source repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} fetch-depth: 0 @@ -191,7 +191,7 @@ jobs: with: target: ${{ steps.get_merge_base.outputs.ref }} - name: Export changes - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 if: failure() with: name: format-patch-for-pr-${{ github.event.pull_request.number }} @@ -202,11 +202,11 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Check out source repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python environment - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3' - name: Install extra python packages @@ -219,7 +219,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Check out source repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} fetch-depth: 0 @@ -247,9 +247,14 @@ jobs: steps: - name: Check if any job failed run: | - if [[ -z 
"$(echo "${{ join(needs.*.result, '') }}" | sed -e 's/success//g')" ]]; then - echo "All jobs succeeded" - else - echo "One or more jobs did not succeed" - exit 1 - fi + ALL_DEPS_RESULT='${{ toJSON(needs) }}' + echo "$ALL_DEPS_RESULT" | jq -rc 'keys[] as $k | "\($k):\(.[$k].result)"' \ + | while read job_result; do + job_name=$(echo "$job_result" | cut -d: -f1) + job_result=$(echo "$job_result" | cut -d: -f2) + echo "$job_name = $job_result" + if [[ "$job_result" != "success" ]]; then + echo "Job $job_name failed" + exit 1 + fi + done diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml index 3f57d3d8d94..21822ae1038 100644 --- a/.github/workflows/ossf-scorecard.yml +++ b/.github/workflows/ossf-scorecard.yml @@ -33,7 +33,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false @@ -62,7 +62,7 @@ jobs: # uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: SARIF file path: results.sarif @@ -71,6 +71,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). 
# Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f443b600d91635bebf5b0d9ebc620189c0d6fba5 # v4.30.8 + uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 with: sarif_file: results.sarif diff --git a/.github/workflows/pr-metadata.yml b/.github/workflows/pr-metadata.yml index 7a9a1838604..edca6d12300 100644 --- a/.github/workflows/pr-metadata.yml +++ b/.github/workflows/pr-metadata.yml @@ -19,7 +19,7 @@ jobs: name: Report Jira data to PR comment steps: - name: Checkout - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: install jira run: python3 -m pip install jira - name: Load jira metadata diff --git a/.github/workflows/rpm-build-and-test-report.yml b/.github/workflows/rpm-build-and-test-report.yml index 456fbc187eb..aae911b31c6 100644 --- a/.github/workflows/rpm-build-and-test-report.yml +++ b/.github/workflows/rpm-build-and-test-report.yml @@ -93,7 +93,7 @@ jobs: esac echo "STAGE_NAME=Build RPM on $DISTRO_NAME $DISTRO_VERSION" >> $GITHUB_ENV - name: Test Report - uses: dorny/test-reporter@dc3a92680fcc15842eef52e8c4606ea7ce6bd3f3 # v2.1.1 + uses: dorny/test-reporter@b082adf0eced0765477756c2a610396589b8c637 # v2.5.0 with: artifact: ${{ env.STAGE_NAME }} test-results name: ${{ env.STAGE_NAME }} Test Results (dorny) @@ -112,7 +112,7 @@ jobs: - name: Set variables run: echo "STAGE_NAME=Functional Hardware ${{ matrix.stage }}" >> $GITHUB_ENV - name: Test Report - uses: dorny/test-reporter@dc3a92680fcc15842eef52e8c4606ea7ce6bd3f3 # v2.1.1 + uses: dorny/test-reporter@b082adf0eced0765477756c2a610396589b8c637 # v2.5.0 with: artifact: ${{ env.STAGE_NAME }} test-results name: ${{ env.STAGE_NAME }} Test Results (dorny) diff --git a/.github/workflows/rpm-build-and-test.yml b/.github/workflows/rpm-build-and-test.yml index 
8588f840b85..ff429771a44 100644 --- a/.github/workflows/rpm-build-and-test.yml +++ b/.github/workflows/rpm-build-and-test.yml @@ -118,7 +118,7 @@ jobs: matrix: ${{ steps.matrix.outputs.text }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Import commit pragmas @@ -210,7 +210,7 @@ jobs: env: CONFIG_POWER_ONLY: false PRAGMA_SUFFIX: -vm - OPERATIONS_EMAIL: john.malmberg@hpe.com + OPERATIONS_EMAIL: core-daos-devops@groups.int.hpe.com TEST_RPMS: true COMMIT_MESSAGE: ${{ needs.Call-RPM-Build.outputs.commit-message }} JENKINS_URL: https://jenkins-3.daos.hpc.amslabs.hpecorp.net/ @@ -244,7 +244,7 @@ jobs: COMMIT_STATUS_DISTRO_VERSION: steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 500 @@ -375,7 +375,7 @@ jobs: if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: check_name: ${{ env.STAGE_NAME }} Test Results github_token: ${{ secrets.GITHUB_TOKEN }} @@ -383,14 +383,14 @@ jobs: - name: Publish artifacts if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} artifacts path: ${{ env.STAGE_NAME }}/** - name: Upload test results if: (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} test-results path: ${{ env.STAGE_NAME }}/**/results.xml @@ -418,7 +418,7 @@ jobs: matrix: ${{ steps.matrix.outputs.text }} steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Import commit pragmas @@ -500,7 +500,7 @@ jobs: env: CONFIG_POWER_ONLY: false PRAGMA_SUFFIX: -vm - OPERATIONS_EMAIL: john.malmberg@hpe.com + OPERATIONS_EMAIL: core-daos-devops@groups.int.hpe.com TEST_RPMS: true COMMIT_MESSAGE: ${{ needs.Call-RPM-Build.outputs.commit-message }} JENKINS_URL: https://jenkins-3.daos.hpc.amslabs.hpecorp.net/ @@ -528,7 +528,7 @@ jobs: SIZE: steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 500 @@ -572,11 +572,13 @@ jobs: STAGE_TAGS+=",provider" if [[ '${{ matrix.stage }}' = *\ Verbs\ * ]]; then FTEST_ARG+=' --provider ofi+verbs' + INST_RPMS+=' mercury-libfabric' elif [[ '${{ matrix.stage }}' = *\ UCX\ * ]]; then FTEST_ARG+=' --provider ucx+dc_x' INST_RPMS+=' mercury-ucx' elif [[ '${{ matrix.stage }}' = *\ TCP\ * ]]; then FTEST_ARG+=' --provider ofi+tcp' + INST_RPMS+=' mercury-libfabric' else echo 'Unknown provider in ${{ matrix.stage }}' exit 1 @@ -643,7 +645,7 @@ jobs: if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' # yamllint disable-line rule:line-length - uses: EnricoMi/publish-unit-test-result-action@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action@c950f6fb443cb5af20a377fd0dfaa78838901040 # v2.23.0 with: check_name: ${{ env.STAGE_NAME }} Test Results github_token: ${{ secrets.GITHUB_TOKEN }} @@ -651,14 +653,14 @@ 
jobs: - name: Publish artifacts if: (!cancelled()) && (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} artifacts path: ${{ env.STAGE_NAME }}/** - name: Upload test results if: (success() || failure()) && steps.run-test.outcome != 'skipped' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: ${{ env.STAGE_NAME }} test-results path: ${{ env.STAGE_NAME }}/**/results.xml diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index a04be926bb7..21aa575975f 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -33,10 +33,10 @@ jobs: security-events: write steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Run Trivy vulnerability scanner in filesystem mode (table format) - uses: aquasecurity/trivy-action@b6643a29fecd7f34b3597bc6acb0a98b03d33ff8 # 0.33.1 + uses: aquasecurity/trivy-action@e368e328979b113139d6f9068e03accaed98a518 # 0.34.1 with: scan-type: 'fs' scan-ref: '.' 
@@ -49,7 +49,7 @@ jobs: cp utils/trivy/.trivyignore report/trivyignore.txt - name: Upload the report to the GitHub artifact store - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: path: report/* name: trivy-report-daos @@ -61,14 +61,14 @@ jobs: sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml - name: Run Trivy vulnerability scanner in filesystem mode (sarif format) - uses: aquasecurity/trivy-action@b6643a29fecd7f34b3597bc6acb0a98b03d33ff8 # 0.33.1 + uses: aquasecurity/trivy-action@e368e328979b113139d6f9068e03accaed98a518 # 0.34.1 with: scan-type: 'fs' scan-ref: '.' trivy-config: 'utils/trivy/trivy.yaml' - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@f443b600d91635bebf5b0d9ebc620189c0d6fba5 # v4.30.8 + uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 with: sarif_file: 'trivy-results.sarif' @@ -79,7 +79,7 @@ jobs: sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml - name: Run Trivy vulnerability scanner in filesystem mode (human readable format) - uses: aquasecurity/trivy-action@b6643a29fecd7f34b3597bc6acb0a98b03d33ff8 # 0.33.1 + uses: aquasecurity/trivy-action@e368e328979b113139d6f9068e03accaed98a518 # 0.34.1 with: scan-type: 'fs' scan-ref: '.' 
diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index ee64db399b9..ca7ed7a2733 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -15,7 +15,7 @@ jobs: runs-on: [self-hosted, docker] steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: 'recursive' - name: Build deps in Docker diff --git a/Jenkinsfile b/Jenkinsfile index b53f8f36315..da1b6f9672c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -5,7 +5,7 @@ /* groovylint-disable ParameterName, VariableName */ /* Copyright 2019-2024 Intel Corporation /* Copyright 2025 Google LLC - * Copyright 2025 Hewlett Packard Enterprise Development LP + * Copyright 2025-2026 Hewlett Packard Enterprise Development LP * All rights reserved. * * This file is part of the DAOS Project. It is subject to the license terms @@ -18,7 +18,7 @@ // To use a test branch (i.e. PR) until it lands to master // I.e. 
for testing library changes -//@Library(value='pipeline-lib@your_branch') _ +@Library(value='pipeline-lib@ryon-jensen/sles-15sp7') _ /* groovylint-disable-next-line CompileStatic */ job_status_internal = [:] @@ -152,9 +152,12 @@ String vm9_label(String distro) { } void rpm_test_post(String stageName, String node) { + // Extract first node from comma-delimited list + String firstNode = node.split(',')[0].trim() sh label: 'Fetch and stage artifacts', - script: 'hostname; ssh -i ci_key jenkins@' + node + ' ls -ltar /tmp; mkdir -p "' + env.STAGE_NAME + '/" && ' + - 'scp -i ci_key jenkins@' + node + + script: 'hostname; ssh -i ci_key jenkins@' + firstNode + + ' ls -ltar /tmp; mkdir -p "' + env.STAGE_NAME + '/" && ' + + 'scp -i ci_key jenkins@' + firstNode + ':/tmp/{{suite_dmg,daos_{server_helper,{control,agent}}}.log,daos_server.log.*} "' + stageName + '/"' archiveArtifacts artifacts: env.STAGE_NAME + '/**' @@ -197,14 +200,18 @@ Boolean skip_build_stage(String distro='', String compiler='gcc') { } } - // Skip the stage if any Skip-build-- pragmas are true - String pragma_names = ['build'] + // Skip the stage if any Skip-build[--] pragmas are true + List pragma_names = ['build'] if (distro && compiler) { pragma_names << "build-${distro}-${compiler}" } - Boolean any_pragma_skip = pragma_names.any { name -> skip_pragma_set(name) } + Boolean any_pragma_skip = pragma_names.any { name -> + if (skip_pragma_set(name)) { + println("[${env.STAGE_NAME}] Skipping build stage due to \"Skip-${name}: true\" pragma") + return true + } + } if (any_pragma_skip) { - println("[${env.STAGE_NAME}] Skipping build stage for due to Skip-[${pragma_names}] pragma") return true } @@ -448,69 +455,36 @@ pipeline { } } } - stage('Check PR') { - when { changeRequest() } - parallel { - stage('Branch name check') { - when { changeRequest() } - steps { - script { - if (env.CHANGE_ID.toInteger() > 9742 && !env.CHANGE_BRANCH.contains('/')) { - error('Your PR branch name does not follow the rules. 
Please rename it ' + - 'according to the rules described here: ' + - 'https://daosio.atlassian.net/l/cp/UP1sPTvc#branch_names. ' + - 'Once you have renamed your branch locally to match the ' + - 'format, close this PR and open a new one using the newly renamed ' + - 'local branch.') - } - } - } - } - } // parallel - } // stage('Check PR') - stage('Cancel Previous Builds') { - when { - beforeAgent true - expression { !paramsValue('CI_CANCEL_PREV_BUILD_SKIP', false) && !skipStage() } - } - steps { - cancelPreviousBuilds() - } - } - stage('Pre-build') { - when { - beforeAgent true - expression { !skipStage() } - } - parallel { - stage('Python Bandit check') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - dockerfile { - filename 'utils/docker/Dockerfile.code_scanning' - label 'docker_runner' - additionalBuildArgs dockerBuildArgs(add_repos: false) + - ' --build-arg FVERSION=37' - } - } - steps { - job_step_update(pythonBanditCheck()) - } - post { - always { - // Bandit will have empty results if it does not - // find any issues. - junit testResults: 'bandit.xml', - allowEmptyResults: true - job_status_update() - } - } - } // stage('Python Bandit check') - } - } +// stage('Check PR') { +// when { changeRequest() } +// parallel { +// stage('Branch name check') { +// when { changeRequest() } +// steps { +// script { +// if (env.CHANGE_ID.toInteger() > 9742 && !env.CHANGE_BRANCH.contains('/')) { +// error('Your PR branch name does not follow the rules. Please rename it ' + +// 'according to the rules described here: ' + +// 'https://daosio.atlassian.net/l/cp/UP1sPTvc#branch_names. 
' + +// 'Once you have renamed your branch locally to match the ' + +// 'format, close this PR and open a new one using the newly renamed ' + +// 'local branch.') +// } +// } +// } +// } +// } // parallel +// } // stage('Check PR') +// stage('Cancel Previous Builds') { +// when { +// beforeAgent true +// expression { !paramsValue('CI_CANCEL_PREV_BUILD_SKIP', false) && !skipStage() } +// } +// steps { +// cancelPreviousBuilds() +// } +// } + stage('Build') { /* Don't use failFast here as whilst it avoids using extra resources * and gives faster results for PRs it's also on for master where we @@ -522,110 +496,6 @@ pipeline { expression { !skip_build_stage() } } parallel { - stage('Build on EL 8.8') { - when { - beforeAgent true - expression { !skip_build_stage('el8') } - } - agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.8' - label 'docker_runner' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - deps_build: false, - parallel_build: true) + - " -t ${sanitized_JOB_NAME()}-el8 " + - ' --build-arg DAOS_PACKAGES_BUILD=no ' + - ' --build-arg DAOS_KEEP_SRC=yes ' + - ' --build-arg REPOS="' + prRepos() + '"' - } - } - steps { - script { - sh label: 'Install RPMs', - script: './ci/rpm/install_deps.sh el8 "' + env.DAOS_RELVAL + '"' - sh label: 'Build deps', - script: './ci/rpm/build_deps.sh' - job_step_update( - sconsBuild(parallel_build: true, - stash_files: 'ci/test_files_to_stash.txt', - build_deps: 'no', - stash_opt: true, - scons_args: sconsArgs() + - ' PREFIX=/opt/daos TARGET_TYPE=release')) - sh label: 'Generate RPMs', - script: './ci/rpm/gen_rpms.sh el8 "' + env.DAOS_RELVAL + '"' - } - } - post { - success { - uploadNewRPMs('el8', 'success') - } - unsuccessful { - sh '''if [ -f config.log ]; then - mv config.log config.log-el8-gcc - fi''' - archiveArtifacts artifacts: 'config.log-el8-gcc', - allowEmptyArchive: true - } - cleanup { - uploadNewRPMs('el8', 'cleanup') - job_status_update() - } - } - } - stage('Build on EL 9') { - when { - 
beforeAgent true - expression { !skip_build_stage('el9') } - } - agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.9' - label 'docker_runner' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - deps_build: false, - parallel_build: true) + - " -t ${sanitized_JOB_NAME()}-el9 " + - ' --build-arg DAOS_PACKAGES_BUILD=no ' + - ' --build-arg DAOS_KEEP_SRC=yes ' + - ' --build-arg REPOS="' + prRepos() + '"' - } - } - steps { - script { - sh label: 'Install RPMs', - script: './ci/rpm/install_deps.sh el9 "' + env.DAOS_RELVAL + '"' - sh label: 'Build deps', - script: './ci/rpm/build_deps.sh' - job_step_update( - sconsBuild(parallel_build: true, - stash_files: 'ci/test_files_to_stash.txt', - build_deps: 'no', - stash_opt: true, - scons_args: sconsArgs() + - ' PREFIX=/opt/daos TARGET_TYPE=release')) - sh label: 'Generate RPMs', - script: './ci/rpm/gen_rpms.sh el9 "' + env.DAOS_RELVAL + '"' - } - } - post { - success { - uploadNewRPMs('el9', 'success') - } - unsuccessful { - sh '''if [ -f config.log ]; then - mv config.log config.log-el9-gcc - fi''' - archiveArtifacts artifacts: 'config.log-el9-gcc', - allowEmptyArchive: true - } - cleanup { - uploadNewRPMs('el9', 'cleanup') - job_status_update() - } - } - } stage('Build on Leap 15.5') { when { beforeAgent true @@ -640,7 +510,8 @@ pipeline { deps_build: false) + ' --build-arg DAOS_PACKAGES_BUILD=no ' + ' --build-arg DAOS_KEEP_SRC=yes ' + - " -t ${sanitized_JOB_NAME()}-leap15-gcc" + " -t ${sanitized_JOB_NAME()}-leap15" + + ' --build-arg POINT_RELEASE=.5 ' } } steps { @@ -677,148 +548,34 @@ pipeline { } } } - stage('Unit Tests') { - when { - beforeAgent true - expression { !skipStage() } - } - parallel { - stage('Unit Test on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label cachedCommitPragma(pragma: 'VM1-label', def_val: params.CI_UNIT_VM1_LABEL) - } - steps { - job_step_update( - unitTest(timeout_time: 60, - unstash_opt: true, - inst_repos: daosRepos(), - inst_rpms: 
unitPackages())) - } - post { - always { - unitTestPost artifacts: ['unit_test_logs/'] - job_status_update() - } - } - } - stage('Unit Test bdev on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label params.CI_UNIT_VM1_NVME_LABEL - } - steps { - job_step_update( - unitTest(timeout_time: 60, - unstash_opt: true, - inst_repos: daosRepos(), - inst_rpms: unitPackages())) - } - post { - always { - unitTestPost artifacts: ['unit_test_bdev_logs/'] - job_status_update() - } - } - } - stage('NLT on EL 8.8') { - when { - beforeAgent true - expression { params.CI_NLT_TEST && !skipStage() } - } - agent { - label params.CI_NLT_1_LABEL - } - steps { - job_step_update( - unitTest(timeout_time: 60, - inst_repos: daosRepos(), - test_script: 'ci/unit/test_nlt.sh', - unstash_opt: true, - unstash_tests: false, - inst_rpms: unitPackages())) - // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']], - // skipPublishingChecks: true, - // id: 'tlc', name: 'Fault Injection Interim Report') - stash(name:'nltr', includes:'nltr.json', allowEmpty: true) - } - post { - always { - unitTestPost artifacts: ['nlt_logs/'], - testResults: 'nlt-junit.xml', - always_script: 'ci/unit/test_nlt_post.sh', - valgrind_stash: 'el8-gcc-nlt-memcheck' - recordIssues enabledForFailure: true, - failOnError: false, - ignoreQualityGate: true, - name: 'NLT server leaks', - qualityGates: [[threshold: 1, type: 'TOTAL', unstable: true]], - tool: issues(pattern: 'nlt-server-leaks.json', - name: 'NLT server results', - id: 'NLT_server'), - scm: 'daos-stack/daos' - job_status_update() - } - } - } - stage('Unit Test with memcheck on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label cachedCommitPragma(pragma: 'VM1-label', def_val: params.CI_UNIT_VM1_LABEL) - } - steps { - job_step_update( - unitTest(timeout_time: 160, - unstash_opt: true, - ignore_failure: true, - inst_repos: daosRepos(), - inst_rpms: unitPackages())) - } - post { - 
always { - unitTestPost artifacts: ['unit_test_memcheck_logs.tar.gz', - 'unit_test_memcheck_logs/**/*.log'], - valgrind_stash: 'el8-gcc-unit-memcheck' - job_status_update() - } - } - } // stage('Unit Test with memcheck on EL 8.8') - stage('Unit Test bdev with memcheck on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label params.CI_UNIT_VM1_NVME_LABEL - } - steps { - job_step_update( - unitTest(timeout_time: 180, - unstash_opt: true, - ignore_failure: true, - inst_repos: daosRepos(), - inst_rpms: unitPackages())) - } - post { - always { - unitTestPost artifacts: ['unit_test_memcheck_bdev_logs.tar.gz', - 'unit_test_memcheck_bdev_logs/**/*.log'], - valgrind_stash: 'el8-gcc-unit-memcheck-bdev' - job_status_update() - } - } - } // stage('Unit Test bdev with memcheck on EL 8') - } - } +// stage('Fetch RPMs from previous build') { +// steps { +// copyArtifacts( +// projectName: env.JOB_NAME, +// selector: specific('2'), +// filter: 'artifacts/leap15/daos/**, artifacts/leap15/deps/**', +// fingerprintArtifacts: true +// ) +// copyArtifacts( +// projectName: env.JOB_NAME, +// selector: specific('2'), +// filter: 'artifacts/leap15/repodata/**', +// fingerprintArtifacts: true +// ) +// sh ''' +// set -euxo pipefail +// ls -lah artifacts/leap15/daos || true +// ls -lah artifacts/leap15/deps || true +// ls -lah artifacts/leap15/repodata || true +// ''' +// +// archiveArtifacts artifacts: 'artifacts/leap15/**', +// fingerprint: true, +// allowEmptyArchive: true +// +// } +// } + stage('Test') { when { beforeAgent true @@ -827,77 +584,11 @@ pipeline { expression { !paramsValue('CI_FUNCTIONAL_TEST_SKIP', false) } } parallel { - stage('Functional on EL 8.8 with Valgrind') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label params.CI_FUNCTIONAL_VM9_LABEL - } - steps { - job_step_update( - functionalTest( - inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 'tests-internal'), - 
test_function: 'runTestFunctionalV2')) - } - post { - always { - functionalTestPostV2() - job_status_update() - } - } - } // stage('Functional on EL 8.8 with Valgrind') - stage('Functional on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label vm9_label('EL8') - } - steps { - job_step_update( - functionalTest( - inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 'tests-internal'), - test_function: 'runTestFunctionalV2')) - } - post { - always { - functionalTestPostV2() - job_status_update() - } - } - } // stage('Functional on EL 8.8') - stage('Functional on EL 9') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label vm9_label('EL9') - } - steps { - job_step_update( - functionalTest( - inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 'tests-internal'), - test_function: 'runTestFunctionalV2')) - } - post { - always { - functionalTestPostV2() - job_status_update() - } - } - } // stage('Functional on EL 9') - stage('Functional on Leap 15.6') { - when { - beforeAgent true - expression { !skipStage() } - } + stage('Functional on SLES 15.7') { +// when { +// beforeAgent true +// expression { !skipStage() } +// } agent { label vm9_label('Leap15') } @@ -905,31 +596,10 @@ pipeline { job_step_update( functionalTest( inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 'tests-internal'), + inst_rpms: functionalPackages(1, next_version(), 'tests-internal') + + ' mercury-libfabric', test_function: 'runTestFunctionalV2', - image_version: 'leap15.6')) - } - post { - always { - functionalTestPostV2() - job_status_update() - } - } // post - } // stage('Functional on Leap 15.6') - stage('Functional on Ubuntu 20.04') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - label vm9_label('Ubuntu') - } - steps { - job_step_update( - functionalTest( - inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 
'tests-internal'), - test_function: 'runTestFunctionalV2')) + image_version: 'sles15.7')) } post { always { @@ -937,88 +607,12 @@ pipeline { job_status_update() } } // post - } // stage('Functional on Ubuntu 20.04') - stage('Fault injection testing on EL 8.8') { - when { - beforeAgent true - expression { !skipStage() } - } - agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.8' - label 'docker_runner' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - parallel_build: true, - deps_build: true) - args '--tmpfs /mnt/daos_0' - } - } - steps { - job_step_update( - sconsBuild(parallel_build: true, - scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug', - build_deps: 'no')) - job_step_update(nlt_test()) - // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']], - // skipPublishingChecks: true, - // id: 'fir', name: 'Fault Injection Report') - } - post { - always { - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', - scm: 'daos-stack/daos', - requiredResult: hudson.model.Result.UNSTABLE - recordIssues enabledForFailure: true, - /* ignore warning/errors from PMDK logging system */ - filters: [excludeFile('pmdk/.+')], - failOnError: false, - ignoreQualityGate: true, - qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'], - [threshold: 1, type: 'TOTAL_HIGH'], - [threshold: 1, type: 'NEW_NORMAL', unstable: true], - [threshold: 1, type: 'NEW_LOW', unstable: true]], - tools: [issues(pattern: 'nlt-errors.json', - name: 'Fault injection issues', - id: 'Fault_Injection'), - issues(pattern: 'nlt-client-leaks.json', - name: 'Fault injection leaks', - id: 'NLT_client')], - scm: 'daos-stack/daos' - junit testResults: 'nlt-junit.xml' - stash name: 'fault-inject-valgrind', - includes: '*.memcheck.xml', - allowEmpty: true - archiveArtifacts artifacts: 'nlt_logs/el8.fault-injection/', - allowEmptyArchive: true - job_status_update() - } - } - } // stage('Fault injection testing on EL 8.8') - stage('Test RPMs on EL 8.6') { - 
when { - beforeAgent true - expression { params.CI_TEST_EL8_RPMs && !skipStage() } - } - agent { - label params.CI_UNIT_VM1_LABEL - } - steps { - job_step_update( - testRpm(inst_repos: daosRepos(), - daos_pkg_version: daosPackagesVersion(next_version())) - ) - } - post { - always { - rpm_test_post(env.STAGE_NAME, env.NODELIST) - } - } - } // stage('Test RPMs on EL 8.6') - stage('Test RPMs on Leap 15.5') { - when { - beforeAgent true - expression { params.CI_TEST_LEAP15_RPMs && !skipStage() } - } + } // stage('Functional on SLES 15.7') + stage('Test RPMs on Leap 15.6') { +// when { +// beforeAgent true +// expression { params.CI_TEST_LEAP15_RPMs && !skipStage() } +// } agent { label params.CI_UNIT_VM1_LABEL } @@ -1028,8 +622,8 @@ pipeline { * additionally for this use-case, can't override ftest_arg with this :-( script { - 'Test RPMs on Leap 15.5': getFunctionalTestStage( - name: 'Test RPMs on Leap 15.5', + 'Test RPMs on Leap 15.6': getFunctionalTestStage( + name: 'Test RPMs on Leap 15.6', pragma_suffix: '', label: params.CI_UNIT_VM1_LABEL, next_version: next_version(), @@ -1057,7 +651,8 @@ pipeline { } */ job_step_update( testRpm(inst_repos: daosRepos(), - daos_pkg_version: daosPackagesVersion(next_version())) + daos_pkg_version: daosPackagesVersion(next_version()), + inst_rpms: 'mercury-libfabric') ) } post { @@ -1065,141 +660,9 @@ pipeline { rpm_test_post(env.STAGE_NAME, env.NODELIST) } } - } // stage('Test RPMs on Leap 15.5') + } // stage('Test RPMs on Leap 15.6') } // parallel } // stage('Test') - stage('Test Storage Prep on EL 8.8') { - when { - beforeAgent true - expression { params.CI_STORAGE_PREP_LABEL != '' } - } - agent { - label params.CI_STORAGE_PREP_LABEL - } - steps { - job_step_update( - storagePrepTest( - inst_repos: daosRepos(), - inst_rpms: functionalPackages(1, next_version(), 'tests-internal'))) - } - post { - cleanup { - job_status_update() - } - } - } // stage('Test Storage Prep') - stage('Test Hardware') { - when { - beforeAgent true - 
expression { !paramsValue('CI_FUNCTIONAL_HARDWARE_TEST_SKIP', false) && !skipStage() } - } - steps { - script { - parallel( - 'Functional Hardware Medium': getFunctionalTestStage( - name: 'Functional Hardware Medium', - pragma_suffix: '-hw-medium', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, - next_version: next_version(), - stage_tags: 'hw,medium,-provider', - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - nvme: 'auto', - run_if_pr: false, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Medium MD on SSD': getFunctionalTestStage( - name: 'Functional Hardware Medium MD on SSD', - pragma_suffix: '-hw-medium-md-on-ssd', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, - next_version: next_version(), - stage_tags: 'hw,medium,-provider', - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - nvme: 'auto_md_on_ssd', - run_if_pr: true, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Medium VMD': getFunctionalTestStage( - name: 'Functional Hardware Medium VMD', - pragma_suffix: '-hw-medium-vmd', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL, - next_version: next_version(), - stage_tags: 'hw_vmd,medium', - /* groovylint-disable-next-line UnnecessaryGetter */ - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - nvme: 'auto', - run_if_pr: false, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Medium Verbs Provider': getFunctionalTestStage( - name: 'Functional Hardware Medium Verbs Provider', - pragma_suffix: '-hw-medium-verbs-provider', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, - next_version: next_version(), - stage_tags: 'hw,medium,provider', - default_tags: startedByTimer() ? 
'pr daily_regression' : 'pr', - default_nvme: 'auto', - provider: 'ofi+verbs;ofi_rxm', - run_if_pr: false, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Medium Verbs Provider MD on SSD': getFunctionalTestStage( - name: 'Functional Hardware Medium Verbs Provider MD on SSD', - pragma_suffix: '-hw-medium-verbs-provider-md-on-ssd', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, - next_version: next_version(), - stage_tags: 'hw,medium,provider', - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - default_nvme: 'auto_md_on_ssd', - provider: 'ofi+verbs;ofi_rxm', - run_if_pr: true, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Medium UCX Provider': getFunctionalTestStage( - name: 'Functional Hardware Medium UCX Provider', - pragma_suffix: '-hw-medium-ucx-provider', - label: params.FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL, - next_version: next_version(), - stage_tags: 'hw,medium,provider', - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - default_nvme: 'auto', - provider: cachedCommitPragma('Test-provider-ucx', 'ucx+ud_x'), - run_if_pr: false, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Large': getFunctionalTestStage( - name: 'Functional Hardware Large', - pragma_suffix: '-hw-large', - label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, - next_version: next_version(), - stage_tags: 'hw,large', - default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', - default_nvme: 'auto', - run_if_pr: false, - run_if_landing: false, - job_status: job_status_internal - ), - 'Functional Hardware Large MD on SSD': getFunctionalTestStage( - name: 'Functional Hardware Large MD on SSD', - pragma_suffix: '-hw-large-md-on-ssd', - label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, - next_version: next_version(), - stage_tags: 'hw,large', - default_tags: startedByTimer() ? 
'pr daily_regression' : 'pr', - default_nvme: 'auto_md_on_ssd', - run_if_pr: true, - run_if_landing: false, - job_status: job_status_internal - ), - ) - } - } - } // stage('Test Hardware') } // stages post { always { @@ -1213,4 +676,4 @@ pipeline { notifyBrokenBranch branches: target_branch } } // post -} +} \ No newline at end of file diff --git a/TAG b/TAG index 9071fdd003b..069a65ddf86 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.7.101-tb +2.7.104-tb diff --git a/VERSION b/VERSION index 9ab9777fabf..1b2efb48149 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.7.101 +2.7.104 diff --git a/ci/functional/launchable_analysis b/ci/functional/launchable_analysis index faf2ed43c43..9b40db9c396 100755 --- a/ci/functional/launchable_analysis +++ b/ci/functional/launchable_analysis @@ -57,5 +57,5 @@ if $notify; then env | sort | grep ^CHANGE_ echo echo "See details in $dir." - } | mail -r "$HOSTNAME"@hpe.com -s "Launchable prediction failure: ${not_predicted_percent}%" john.malmberg@hpe.com + } | mail -r "$HOSTNAME"@hpe.com -s "Launchable prediction failure: ${not_predicted_percent}%" core-daos-devops@groups.int.hpe.com fi diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh index adcd0f78be8..aa056248bcc 100755 --- a/ci/functional/test_main.sh +++ b/ci/functional/test_main.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2024 Intel Corporation. 
-# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -14,6 +14,13 @@ fi test_tag="$TEST_TAG" +: "${NODELIST:=localhost}" +: "${TEST_RPMS:=false}" +: "${STAGE_NAME:=unknown}" + +def_node_count="$(nodeset -c "$NODELIST")" +: "${NODE_COUNT:=$def_node_count}" + tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT") first_node=${NODELIST%%,*} @@ -42,14 +49,17 @@ cluster_reboot () { test_cluster() { # Test that all nodes in the cluster are healthy clush -B -S -o '-i ci_key' -l root -w "${tnodes}" \ - "OPERATIONS_EMAIL=${OPERATIONS_EMAIL} \ + "OPERATIONS_EMAIL=${OPERATIONS_EMAIL:-} \ FIRST_NODE=${first_node} \ TEST_RPMS=${TEST_RPMS} \ NODELIST=${tnodes} \ BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \ - STAGE_NAME=\"$STAGE_NAME\" \ + STAGE_NAME=\"${STAGE_NAME}\" \ JENKINS_URL=\"${JENKINS_URL:-}\" \ DAOS_DEVOPS_EMAIL=\"${DAOS_DEVOPS_EMAIL:-}\" \ + DAOS_INFINIBAND=${DAOS_INFINIBAND:-} \ + DAOS_NVME=${DAOS_NVME:-} \ + DAOS_PMEM=${DAOS_PMEM:-} \ $(cat ci/functional/test_main_prep_node.sh)" } @@ -65,7 +75,7 @@ if ! test_cluster; then echo "Hardware test failed again after reboot" fi else - echo "Cluster reboot failed" + echo "Cluster reboot failed" fi else hardware_ok=true @@ -88,6 +98,7 @@ trap 'clush -B -S -o "-i ci_key" -l root -w "${tnodes}" '\ # Setup the Jenkins build artifacts directory before running the tests to ensure # there is enough disk space to report the results. +# Even though STAGE_NAME forced to be set, shellcheck wants this syntax. 
rm -rf "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/" mkdir "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/" @@ -98,24 +109,23 @@ rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results if "$hardware_ok"; then - if $TEST_RPMS; then + if "$TEST_RPMS"; then # shellcheck disable=SC2029 - ssh -i ci_key -l jenkins "${first_node}" \ - "TEST_TAG=\"$test_tag\" \ - TNODES=\"$tnodes\" \ - FTEST_ARG=\"${FTEST_ARG:-}\" \ - WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ - STAGE_NAME=\"$STAGE_NAME\" \ - DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ + ssh -i ci_key -l jenkins "${first_node}" \ + "TEST_TAG=\"$test_tag\" \ + TNODES=\"$tnodes\" \ + FTEST_ARG=\"${FTEST_ARG:-}\" \ + WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ + STAGE_NAME=\"${STAGE_NAME}\" \ + DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ $(cat ci/functional/test_main_node.sh)" else - ./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG" + ./ftest.sh "$test_tag" "$tnodes" "${FTEST_ARG:-}" fi fi # Now rename the previously collected hardware test data for Jenkins # to use them for Junit processing. -: "${STAGE_NAME:=}" mkdir -p "${STAGE_NAME}/hardware_prep/" for node in ${tnodes//,/ }; do old_name="./hardware_prep_node_results.xml.$node" diff --git a/ci/functional/test_main_prep_node.sh b/ci/functional/test_main_prep_node.sh index fec054efb54..ab75841a54b 100755 --- a/ci/functional/test_main_prep_node.sh +++ b/ci/functional/test_main_prep_node.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -16,9 +16,16 @@ domain1="${JENKINS_URL#https://}" mail_domain="${domain1%%/*}" : "${EMAIL_DOMAIN:=$mail_domain}" : "${DAOS_DEVOPS_EMAIL:="$HOSTNAME"@"$EMAIL_DOMAIN"}" +: "${DAOS_INFINIBAND:=}" +: "${DAOS_PMEM:=0}" +: "${DAOS_NVME:=0}" + +#cn is for a cleaned up stage name. 
+cn=$(echo "$STAGE_NAME" | sed 's/[^a-zA-Z0-9_]/_/g' | sed 's/__*/_/g') result=0 mail_message='' +mail_type='warning' nl=" " @@ -48,7 +55,7 @@ function do_mail { fi # shellcheck disable=SC2059 build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl" - mail -s "Hardware check failed after reboot!" \ + mail -s "Hardware check $mail_type after reboot!" \ -r "$DAOS_DEVOPS_EMAIL" "$OPERATIONS_EMAIL" \ <<< "$build_info$mail_message" set -x @@ -58,7 +65,7 @@ if ! command -v lspci; then if command -v dnf; then dnf -y install pciutils else - echo "pciutils not installed, can not test for Infiniband devices" + echo "pciutils not installed, can not test for hardware devices" fi fi @@ -106,17 +113,27 @@ The Omni-Path adapters will not be used." mail_message+="${nl}${ib_message}${nl}" echo "$ib_message" fi +if [ -z "$DAOS_INFINIBAND" ]; then + DAOS_INFINIBAND=$ib_count +fi set -x # Wait for at least the expected IB devices to show up. -# in the case of dual port HBAs, not all IB devices will -# show up. +# in the case of dual port HBAs, only the ports that are connected may show up. # For some unknown reason, sometimes IB devices will not show up # except in the lspci output unless an ip link set up command for # at least one device that should be present shows up. good_ibs=() function do_wait_for_ib { - local ib_devs=("$@") + # The problem is that we do not know the actual device names + # ahead of time. So we try to bring up all possible devices + # and see if at least the expected number show up with IP + # addresses. + local ib_devs=("ib0" "ib1" "ib2" "ib3" "ib4") + # Udev rule convention, first digit is the numa node + # second digit should be an index of the HBA on that numa node. 
+ ib_devs+=("ib_00" "ib_01" "ib_02" "ib_03") + ib_devs+=("ib_10" "ib_11" "ib_12" "ib_13") local working_ib ib_timeout=300 # 5 minutes retry_wait=10 # seconds @@ -147,15 +164,14 @@ function do_wait_for_ib { return 1 } -# Migrating to using udev rules for network devices -if [ -e /etc/udev/rules.d/70-persistent-ipoib.rules ]; then - ib_list=('ib_cpu0_0' 'ib_cpu1_0') -else - ib_list=('ib0') - if [ "$ib_count" -gt 1 ]; then - ib_list+=('ib1') +# Get list of actual InfiniBand devices from /sys/class/net/ +ib_list=() +for iface in /sys/class/net/ib*; do + if [ -e "$iface" ]; then + iface_name=$(basename "$iface") + ib_list+=("$iface_name") fi -fi +done function check_ib_devices { local ib_devs=("$@") @@ -165,11 +181,10 @@ function check_ib_devices { set +x if ! ip addr show "$iface" | grep "inet "; then ib_message="$({ - echo "Found interface $iface down after reboot on $HOSTNAME." + echo "Found interface $iface with no ip address after reboot on $HOSTNAME." ip addr show "$iface" || true cat /sys/class/net/"$iface"/mode || true ip link set up "$iface" || true - cat /etc/sysconfig/network-scripts/ifcfg-"$iface" || true } 2>&1)" mail_message+="${nl}${ib_message}${nl}" echo "$ib_message" @@ -190,11 +205,10 @@ function check_ib_devices { done } - # First check for InfiniBand devices if [ "$ib_count" -gt 0 ]; then - if do_wait_for_ib "${ib_list[@]}"; then - echo "Found at least $ib_count working devices in" "${ib_list[@]}" + if do_wait_for_ib; then + echo "Found at least $ib_count working devices on $HOSTNAME" # All good, generate Junit report check_ib_devices "${good_ibs[@]}" else @@ -205,106 +219,111 @@ fi # having -x just makes the console log harder to read. # set +x -if [ "$ib_count" -ge 2 ]; then - # now check for pmem & NVMe drives when multiple ib are present. 
- # ipmctl show -dimm should show an even number of drives, all healthy - dimm_count=$(ipmctl show -dimm | grep Healthy -c) - if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then - # May not be fatal, the PMEM DIMM should be replaced when downtime can be - # scheduled for this system. - dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen" - dimm_message+=" on $HOSTNAME." +if [ "$ib_count" -ge 2 ] ; then + if [ "$DAOS_PMEM" -gt 0 ]; then + # now check for pmem & NVMe drives when multiple ib are present. + # ipmctl show -dimm should show an even number of drives, all healthy + dimm_count=$(ipmctl show -dimm | grep Healthy -c) + if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then + # May not be fatal, the PMEM DIMM should be replaced when downtime + # can be # scheduled for this system. + dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen" + dimm_message+=" on $HOSTNAME." - mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl" - else - echo "OK: Found $dimm_count PMEM DIMMs." - fi - # Should have 2 regions 0x0000 and 0x0001, type AppDirect - dimm_rcount=0 - while IFS= read -r line; do - if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi - ((dimm_rcount++)) || true - done < <(ipmctl show -region) + mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl" + else + echo "OK: Found $dimm_count PMEM DIMMs." + fi + # Should have 2 regions 0x0000 and 0x0001, type AppDirect + dimm_rcount=0 + while IFS= read -r line; do + if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi + ((dimm_rcount++)) || true + done < <(ipmctl show -region) - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$dimm_rcount" -ne 2 ]; then - pmem_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2" - pmem_message+=" on $HOSTNAME." 
- pmem_message+="$nl$(ipmctl show -region)" - mail_message+="$nl$pmem_message$nl" - ((testfails++)) || true - testcases+=" - + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$dimm_rcount" -ne 2 ]; then + pmem_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2" + pmem_message+=" on $HOSTNAME." + pmem_message+="$nl$(ipmctl show -region)" + mail_message+="$nl$pmem_message$nl" + ((testfails++)) || true + testcases+=" + $nl" result=3 - else - echo "OK: Found $dimm_rcount DIMM PMEM regions." - fi - testcases+=" $nl" - - # While this gets more data than needed, it is the same search that - # DAOS tests do and records it in the console log. - nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' | - grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)" - nvme_count=0 - while IFS= read -r line; do - if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then - continue + else + echo "OK: Found $dimm_rcount DIMM PMEM regions." fi - ((nvme_count++)) || true - done < <(printf %s "$nvme_devices") + testcases+=" $nl" + fi + if [ "$DAOS_NVME" -gt 0 ]; then + # While this gets more data than needed, it is the same search that + # DAOS tests do and records it in the console log. + nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' | + grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)" + nvme_count=0 + while IFS= read -r line; do + if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then + continue + fi + ((nvme_count++)) || true + done < <(printf %s "$nvme_devices") - ((testruns++)) || true - testcases+=" ${nl}" - if [ $((nvme_count%2)) -ne 0 ]; then - nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen." - mail_message+="$nl$nvme_message$nl$nvme_devices$nl" - ((testfails++)) || true - testcases+=" + ((testruns++)) || true + testcases+=" ${nl}" + if [ $((nvme_count%2)) -ne 0 ]; then + nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen." 
+ mail_message+="$nl$nvme_message$nl$nvme_devices$nl" + ((testfails++)) || true + testcases+=" $nl" - result=4 - else - echo "OK: Even number ($nvme_count) of NVMe devices seen." + result=4 + else + echo "OK: Even number ($nvme_count) of NVMe devices seen." + fi + testcases+=" $nl" fi - testcases+=" $nl" - # All storage found by lspci should also be in lsblk report lsblk_nvme=$(lsblk | grep nvme -c) lsblk_pmem=$(lsblk | grep pmem -c) - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$lsblk_nvme" -ne "$nvme_count" ]; then - lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen" - lsblk_nvme_msg+=" on $HOSTNAME." - mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl" - ((testfails++)) || true - testcases+=" + if [ "$DAOS_NVME" -gt 0 ]; then + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$lsblk_nvme" -ne "$nvme_count" ]; then + lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen" + lsblk_nvme_msg+=" on $HOSTNAME." + mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl" + ((testfails++)) || true + testcases+=" $nl" - result=5 - else - echo "OK: All $nvme_count NVMe devices are in lsblk report." + result=5 + else + echo "OK: All $nvme_count NVMe devices are in lsblk report." + fi + testcases+=" $nl" fi - testcases+=" $nl" - - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then - lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen" - lsblk_pmem_msg+=" on $HOSTNAME." - mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl" - ((testfails++)) || true - testcases+=" + if [ "$DAOS_PMEM" -gt 0 ]; then + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then + lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen" + lsblk_pmem_msg+=" on $HOSTNAME." + mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl" + ((testfails++)) || true + testcases+=" $nl" - result=6 - else - echo "OK: All $dimm_rcount PMEM devices are in lsblk report." 
+ result=6 + else + echo "OK: All $dimm_rcount PMEM devices are in lsblk report." + fi + testcases+=" $nl" fi - testcases+=" $nl" fi # Additional information if any check failed @@ -325,21 +344,23 @@ if [ -n "$FIRST_NODE" ] && ! grep /mnt/share /proc/mounts; then mount "$FIRST_NODE":/export/share /mnt/share fi -# Defaulting the package to "(root)" for now as then Jenkins -# will default to setting putting the outer stage name and -# inner stage name in the full test name. -ts="Hardware" +# The package name defaults to "(root)" unless there is a dot in the +# testsuite name, in which case the package name is the part before +# the last dot in the testsuite name. +pn="Hardware" tf="failures=\"$testfails\"" te="errors=\"0\"" tc="tests=\"$testruns\"" -# shellcheck disable=SC2089 -junit_xml="$nl +junit_xml="$nl $testcases$nl" # Each junit file needs the same name for when they are collected. echo "$junit_xml" > "./hardware_prep_node_results.xml" +if [ "$testfails" -gt 0 ]; then + mail_type='failed' +fi do_mail if [ "$result" -ne 0 ]; then diff --git a/ci/junit.sh b/ci/junit.sh index e0051b819cd..c7e3a2aee3b 100644 --- a/ci/junit.sh +++ b/ci/junit.sh @@ -24,7 +24,7 @@ report_junit() { clush -o '-i ci_key' -l root -w "$nodes" --rcopy "$results" local results_files - results_files=$(find . -maxdepth 1 -name "$results.*") + readarray -t results_files < <(find . 
-maxdepth 1 -name "$results.*") if [ ${#results_files[@]} -eq 0 ]; then echo "No results found to report as JUnit results" diff --git a/ci/parse_ci_envs.sh b/ci/parse_ci_envs.sh index 84cb0183f91..2d15d81b470 100755 --- a/ci/parse_ci_envs.sh +++ b/ci/parse_ci_envs.sh @@ -22,8 +22,12 @@ if [ -n "${STAGE_NAME:?}" ]; then : "${TARGET:=centos9}" : "${REPO_SPEC:=el-9}" ;; + *SLES\ 15.7*|*sles15.7*) + : "${CHROOT_NAME:=sles-15-sp7-x86_64}" + : "${TARGET:=sles15.7}" + ;; *Leap\ 15.6*|*leap15.6*|*opensuse15.6*|*sles15.6*) - : "${CHROOT_NAME:=opensuse-leap-15.5-x86_64}" + : "${CHROOT_NAME:=opensuse-leap-15.6-x86_64}" : "${TARGET:=leap15.6}" ;; *Leap\ 15.5*|*leap15.5*|*opensuse15.5*|*sles15.5*) diff --git a/ci/provisioning/post_provision_config.sh b/ci/provisioning/post_provision_config.sh index 0e98332a4d0..345d38b336f 100755 --- a/ci/provisioning/post_provision_config.sh +++ b/ci/provisioning/post_provision_config.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -29,7 +29,7 @@ source ci/junit.sh # Before running the script, environment variables may be needed for # the specific site. -: "${MLNX_VER_NUM:=3.0.0}" +: "${MLNX_VER_NUM:=3.2.1}" # This is tangled and needs a better fix as it has DISTRO being passed # as EL_8 for EL_9, yet other places expect DISTRO to really be EL_8 and diff --git a/ci/provisioning/post_provision_config_common.sh b/ci/provisioning/post_provision_config_common.sh index 257e6dfe2d9..3cac657efaf 100755 --- a/ci/provisioning/post_provision_config_common.sh +++ b/ci/provisioning/post_provision_config_common.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2021-2023 Intel Corporation. 
-# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -32,7 +32,7 @@ fi # shellcheck disable=SC1091 . /etc/os-release # shellcheck disable=SC2034 -EXCLUDE_UPGRADE=mercury,daos,daos-\* +EXCLUDE_UPGRADE=mercury,mercury-\*,daos,daos-\* if rpm -qa | grep mlnx; then # packages not to allow upgrading if MLNX OFED is installed EXCLUDE_UPGRADE+=,openmpi,\*mlnx\*,\*ucx\* diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index c3b11439d47..5c5e2a50fbd 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -2,7 +2,7 @@ # # Copyright 2022-2023 Intel Corporation. # Copyright 2025 Google LLC -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -323,7 +323,7 @@ post_provision_config_nodes() { rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ argobots cart daos daos-client daos-spdk dpdk \ - libisa-l libpmemobj mercury mpich \ + libisa-l libpmemobj libpmemobj1 mercury mpich \ pmix protobuf-c spdk libfabric libpmem \ munge-libs munge slurm \ slurm-example-configs slurmctld slurm-slurmmd @@ -438,3 +438,32 @@ post_provision_config_nodes() { return 0 } + +install_mofed() { + if [ -z "$MLNX_VER_NUM" ]; then + echo "MLNX_VER_NUM is not set" + env + exit 1 + fi + + : "${ARTIFACTORY_URL:=}" + if [ -z "$ARTIFACTORY_URL" ]; then + return + fi + + # Install Mellanox OFED or DOCA RPMS + install_mellanox="install_mellanox.sh" + script_url="${ARTIFACTORY_URL}/raw-internal/sre_tools/$install_mellanox" + install_target="/usr/local/sbin/$install_mellanox" + + if [ ! -e "$install_target" ]; then + if ! 
curl --silent --show-error --fail \ + -o "/usr/local/sbin/$install_mellanox" "$script_url"; then + echo "Failed to fetch $script_url" + return 1 + fi + chmod 0755 "$install_target" + fi + + MELLANOX_VERSION="$MLNX_VER_NUM" "$install_mellanox" +} diff --git a/ci/provisioning/post_provision_config_nodes.sh b/ci/provisioning/post_provision_config_nodes.sh index 37ac6f23aaa..c62c7064cad 100644 --- a/ci/provisioning/post_provision_config_nodes.sh +++ b/ci/provisioning/post_provision_config_nodes.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -78,79 +78,6 @@ if lspci | grep -i nvme; then daos_server nvme reset && rmmod vfio_pci && modprobe vfio_pci fi -# FOR now limit to 2 devices per CPU NUMA node -: "${DAOS_CI_NVME_NUMA_LIMIT:=2}" - -function mount_nvme_drive { - local drive="$1" - file_system=$(file -sL "/dev/$drive") - if [[ "$file_system" != *"ext4 filesystem"* ]]; then - yes | mkfs -t ext4 "/dev/$drive" - fi - mkdir -p "/mnt/$drive" - mount "/dev/$drive" "/mnt/$drive" -} - - -nvme_class="/sys/class/nvme/" -function nvme_limit { - set +x - if [ ! 
-d "${nvme_class}" ] || [ -z "$(ls -A "${nvme_class}")" ]; then - echo "No NVMe devices found" - return - fi - local numa0_devices=() - local numa1_devices=() - for nvme_path in "$nvme_class"*; do - nvme="$(basename "$nvme_path")n1" - numa_node="$(cat "${nvme_path}/numa_node")" - if mount | grep "$nvme"; then - continue - fi - if [ "$numa_node" -eq 0 ]; then - numa0_devices+=("$nvme") - else - numa1_devices+=("$nvme") - fi - done - echo numa0 "${numa0_devices[@]}" - echo numa1 "${numa1_devices[@]}" - if [ "${#numa0_devices[@]}" -gt 0 ] && [ "${#numa1_devices[@]}" -gt 0 ]; then - echo "balanced NVMe configuration possible" - nvme_count=0 - for nvme in "${numa0_devices[@]}"; do - if [ "$nvme_count" -ge "${DAOS_CI_NVME_NUMA_LIMIT}" ]; then - mount_nvme_drive "$nvme" - else - ((nvme_count++)) || true - fi - done - nvme_count=0 - for nvme in "${numa1_devices[@]}"; do - if [ "$nvme_count" -ge "${DAOS_CI_NVME_NUMA_LIMIT}" ]; then - mount_nvme_drive "$nvme" - else - ((nvme_count++)) || true - fi - done - else - echo "balanced NVMe configuration not possible" - for nvme in "${numa0_devices[@]}" "${numa1_devices[@]}"; do - ((needed = "$DAOS_CI_NVME_NUMA_LIMIT" + 1)) || true - nvme_count=0 - if [ "$nvme_count" -ge "$needed" ]; then - mount_nvme_drive "$nvme" - else - ((nvme_count++)) || true - fi - done - fi - set -x -} - -# Force only the desired number of NVMe devices to be seen by DAOS tests -# by mounting the extra ones. -nvme_limit systemctl enable nfs-server.service systemctl start nfs-server.service diff --git a/ci/provisioning/post_provision_config_nodes_EL.sh b/ci/provisioning/post_provision_config_nodes_EL.sh index 75e1d7934e3..c9257d87c22 100644 --- a/ci/provisioning/post_provision_config_nodes_EL.sh +++ b/ci/provisioning/post_provision_config_nodes_EL.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2021-2024 Intel Corporation. 
-# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent @@ -33,66 +33,3 @@ distro_custom() { dnf -y install python39 python39-devel dnf -y install python3.11 python3.11-devel } - -install_mofed() { - if [ -z "$MLNX_VER_NUM" ]; then - echo "MLNX_VER_NUM is not set" - env - exit 1 - fi - - # Remove Omni-Path software - # shellcheck disable=SC2046 - time dnf -y remove $(rpm -q opa-address-resolution \ - opa-basic-tools \ - opa-fastfabric \ - opa-libopamgt \ - compat-openmpi16 \ - compat-openmpi16-devel \ - openmpi \ - openmpi-devel \ - ompi \ - ompi-debuginfo \ - ompi-devel | grep -v 'is not installed') - - - stream=false - gversion="$VERSION_ID" - if [ "$gversion" == "8" ]; then - # Mellanox does not have a release for 8.9 yet. - gversion="8.8" - stream=true - elif [[ $gversion = *.*.* ]]; then - gversion="${gversion%.*}" - fi - - : "${ARTIFACTORY_URL:=}" - if [ -z "$ARTIFACTORY_URL" ]; then - return - fi - - # Install Mellanox OFED or DOCA RPMS - install_mellanox="install_mellanox.sh" - script_url="${ARTIFACTORY_URL}/raw-internal/sre_tools/$install_mellanox" - install_target="/usr/local/sbin/$install_mellanox" - - if [ ! -e "$install_target" ]; then - if ! 
curl --silent --show-error --fail \ - -o "/usr/local/sbin/$install_mellanox" "$script_url"; then - echo "Failed to fetch $script_url" - return 1 - fi - chmod 0755 "$install_target" - fi - - MELLANOX_VERSION="$MLNX_VER_NUM" "$install_mellanox" - - dnf list --showduplicates perftest - if [ "$gversion" == "8.5" ]; then - dnf remove -y perftest || true - fi - if $stream; then - dnf list --showduplicates ucx-knem - dnf remove -y ucx-knem || true - fi -} diff --git a/ci/provisioning/post_provision_config_nodes_LEAP.sh b/ci/provisioning/post_provision_config_nodes_LEAP.sh index 5a2a553e56f..f5603d9d2fa 100644 --- a/ci/provisioning/post_provision_config_nodes_LEAP.sh +++ b/ci/provisioning/post_provision_config_nodes_LEAP.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2021-2024 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent @@ -23,4 +23,23 @@ distro_custom() { sed -e '/MODULEPATH=/s/$/:\/usr\/share\/modules/' \ /etc/profile.d/lmod.sh; \ fi + + # Fix for no_pmix_multi_ctx tests on SLES/Leap 15.x + if [[ "${VERSION_ID:-}" == 15.* ]]; then + zypper rm -y -u mercury mercury-debuginfo || true + zypper rm -y -u libfabric libfabric1 libfabric-debuginfo || true + zypper clean --all + ldconfig +# zypper mr -e daos-stack-daos-sl-15-stable-local-artifactory || true +# zypper mr -p 90 daos-stack-daos-sl-15-stable-local-artifactory || true + zypper mr -p 90 daos-stack-deps-sl-15-stable-local-artifactory || true + + zypper in -y -f libfabric1 mercury-libfabric mercury + +# if [[ "${ID:-}" == "sles" ]]; then +# zypper in -y -f libfabric1 mercury-libfabric mercury daos-server daos-client daos-client-tests openmpi3 openmpi3-devel +# else +# zypper in -y -f libfabric1 mercury-libfabric mercury +# fi + fi } diff --git a/ci/rpm/test_daos_node.sh b/ci/rpm/test_daos_node.sh index 9968d1ec49d..36bd44fd4a3 100755 --- a/ci/rpm/test_daos_node.sh +++ 
b/ci/rpm/test_daos_node.sh @@ -105,6 +105,14 @@ FTEST=/usr/lib/daos/TESTING/ftest python3 -m venv venv # shellcheck disable=SC1091 source venv/bin/activate + +cat < venv/pip.conf +[global] + progress_bar = off + no_color = true + quiet = 1 +EOF + pip install --upgrade pip pip install -r $FTEST/requirements-ftest.txt diff --git a/ci/storage/test_main_storage_prepare_node.sh b/ci/storage/test_main_storage_prepare_node.sh index f87333327b8..489baa21006 100755 --- a/ci/storage/test_main_storage_prepare_node.sh +++ b/ci/storage/test_main_storage_prepare_node.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2021-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -44,6 +44,22 @@ if command -v ibv_devinfo; then ibv_devinfo || true; fi lspci | grep -i "Non-Volatile memory controller" || true +ib_count=0 +for ib_path in /sys/class/net/ib*; do + if [ ! -e "$ib_path" ]; then + continue + fi + ((ib_count++)) || true + ip addr show "$(basename "$ib_path")" +done + +# Skip test controller +if [ "$ib_count" -le 1 ]; then + echo "Less than 2 Infiniband devices found ($ib_count)." + echo "Assuming this is a test controller node. Skipping PMEM setup." + exit +fi + if ipmctl show -dimm; then ipmctl show -goal ipmctl show -region @@ -60,12 +76,7 @@ if ipmctl show -dimm; then fi fi else - counter=0 - for ib in /sys/class/net/ib*; do - ((counter++)) || true - ip addr show "$ib" - done - if "$counter" -ge 2; then + if [ "$ib_count" -ge 2 ]; then # All of our CI nodes with two ib adapters should have PMEM DIMMs echo 'No PMEM DIMM devices found on CI node!' 
exit 1 diff --git a/ci/test_files_to_stash.txt b/ci/test_files_to_stash.txt index feeb0e64992..7214ed71499 100755 --- a/ci/test_files_to_stash.txt +++ b/ci/test_files_to_stash.txt @@ -2,15 +2,16 @@ build/*/*/src/tests/ftest/cart/utest/test_linkage, build/*/*/src/tests/ftest/cart/utest/utest_hlc, build/*/*/src/tests/ftest/cart/utest/utest_protocol, build/*/*/src/tests/ftest/cart/utest/utest_swim, +build/*/*/src/gurt/tests/d_log_memory_ut, build/*/*/src/gurt/tests/test_gurt, build/*/*/src/gurt/tests/test_gurt_telem_producer, -build/*/*/src/gurt/tests/test_gurt_telem_consumer, build/*/*/src/rdb/raft/src/tests_main, build/*/*/src/common/tests/btree_direct, build/*/*/src/common/tests/btree, build/*/*/src/common/tests/common_test, build/*/*/src/common/tests/sched, build/*/*/src/common/tests/drpc_tests, +build/*/*/src/common/tests/control_tests, build/*/*/src/common/tests/acl_api_tests, build/*/*/src/common/tests/acl_valid_tests, build/*/*/src/common/tests/acl_util_tests, @@ -43,6 +44,7 @@ build/*/*/src/bio/smd/tests/smd_ut, build/*/*/src/tests/rpc/rpc_tests, build/*/*/src/engine/tests/abt_perf, build/*/*/src/engine/tests/abt_stack, +build/*/*/src/utils/dlck/tests/dlck_args_ut, src/common/tests/btree.sh, src/control/run_go_tests.sh, src/rdb/raft_tests/raft_tests.py, diff --git a/ci/unit/required_packages.sh b/ci/unit/required_packages.sh index bbd3313155a..5a57b0cb054 100755 --- a/ci/unit/required_packages.sh +++ b/ci/unit/required_packages.sh @@ -1,5 +1,10 @@ #!/bin/bash - +# +# (C) Copyright 2025 Google LLC +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP +# +# SPDX-License-Identifier: BSD-2-Clause-Patent +# set -eu # No longer used but provided by pipeline-lib @@ -24,6 +29,7 @@ pkgs="$(utils/rpms/package_version.sh argobots lib) \ $(utils/rpms/package_version.sh libfabric debug) \ $(utils/rpms/package_version.sh mercury dev) \ $(utils/rpms/package_version.sh mercury debug) \ + $(utils/rpms/package_version.sh mercury lib mercury_libfabric) \ 
$(utils/rpms/package_version.sh pmdk lib pmemobj) \ $(utils/rpms/package_version.sh pmdk debug pmemobj) \ $(utils/rpms/package_version.sh pmdk debug pmem) \ diff --git a/ci/unit/test_main_node.sh b/ci/unit/test_main_node.sh index 0afbf26fea6..ad03978ee84 100755 --- a/ci/unit/test_main_node.sh +++ b/ci/unit/test_main_node.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -84,13 +84,17 @@ rm -rf "$test_log_dir" python3 -m venv venv # shellcheck disable=SC1091 source venv/bin/activate -# touch venv/pip.conf -# pip config set global.progress_bar off -# pip config set global.no_color true + +cat < venv/pip.conf +[global] + progress_bar = off + no_color = true + quiet = 1 +EOF pip install --upgrade pip -pip install --requirement requirements-utest.txt +pip install --requirement requirements-utest.txt pip install /opt/daos/lib/daos/python/ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" utils/run_utest.py $RUN_TEST_VALGRIND \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index fa422586ad9..730dea30423 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -28,13 +28,17 @@ sudo bash -c ". 
./utils/sl/setup_local.sh; ./utils/setup_daos_server_helper.sh" python3.11 -m venv venv # shellcheck disable=SC1091 source venv/bin/activate -touch venv/pip.conf -pip config set global.progress_bar off -pip config set global.no_color true + +cat < venv/pip.conf +[global] + progress_bar = off + no_color = true + quiet = 1 +EOF pip install --upgrade pip -pip install --requirement requirements-utest.txt +pip install --requirement requirements-utest.txt pip install /opt/daos/lib/daos/python/ # set high open file limit in the shell to avoid extra warning diff --git a/deps/patches/mercury/0001_dep_versions.patch b/deps/patches/mercury/0001_dep_versions.patch new file mode 100644 index 00000000000..6b1d6a71a80 --- /dev/null +++ b/deps/patches/mercury/0001_dep_versions.patch @@ -0,0 +1,74 @@ +From 0a7756d4ef2f329fa7caa8e4052a099a91816f2f Mon Sep 17 00:00:00 2001 +From: Jerome Soumagne +Date: Mon, 24 Nov 2025 16:44:14 -0600 +Subject: [PATCH 1/3] NA OFI/UCX: print version infos + +NA OFI: fix log warning +--- + src/na/na_ofi.c | 12 +++++++----- + src/na/na_ucx.c | 10 ++++++++++ + 2 files changed, 17 insertions(+), 5 deletions(-) + +diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c +index c7c3e0b3..682efe65 100644 +--- a/src/na/na_ofi.c ++++ b/src/na/na_ofi.c +@@ -8048,8 +8048,10 @@ na_ofi_check_protocol(const char *protocol_name) + uint32_t runtime_version = fi_version(); + na_return_t na_ret; + +- NA_LOG_SUBSYS_DEBUG(cls, "Querying info on libfabric v%d.%d", +- FI_MAJOR(runtime_version), FI_MINOR(runtime_version)); ++ NA_LOG_SUBSYS_INFO(cls, ++ "Querying info on libfabric (runtime v%d.%d, API v%d.%d)", ++ FI_MAJOR(runtime_version), FI_MINOR(runtime_version), ++ FI_MAJOR(FI_COMPILE_VERSION), FI_MINOR(FI_COMPILE_VERSION)); + NA_CHECK_SUBSYS_ERROR(cls, FI_VERSION_LT(runtime_version, NA_OFI_VERSION), + out, accept, false, + "runtime libfabric version (v%d.%d) is lower than required version " +@@ -9105,7 +9107,7 @@ na_ofi_mem_handle_create(na_class_t NA_UNUSED *na_class, void *buf, 
+ + NA_LOG_SUBSYS_DEBUG(mem, + "Created mem handle %p (iov_base=%p, iov_len=%zu, iovcnt=1, " +- "flags=0x%lx, len=%zu)", ++ "flags=0x%" PRIx8 ", len=%" PRIu64 ")", + (void *) na_ofi_mem_handle, na_ofi_mem_handle->desc.iov.s[0].iov_base, + na_ofi_mem_handle->desc.iov.s[0].iov_len, + na_ofi_mem_handle->desc.info.flags, na_ofi_mem_handle->desc.info.len); +@@ -9444,8 +9446,8 @@ na_ofi_mem_handle_deserialize(na_class_t NA_UNUSED *na_class, + na_ofi_mem_handle->desc.info.iovcnt); + + NA_LOG_SUBSYS_DEBUG(mem, +- "Deserialized mem handle %p (iov_base=%p, iov_len=%zu, iovcnt=%zu, " +- "flags=0x%lx, len=%zu)", ++ "Deserialized mem handle %p (iov_base=%p, iov_len=%zu, iovcnt=%" PRIu64 ++ ", flags=0x%" PRIx8 ", len=%" PRIu64 ")", + (void *) na_ofi_mem_handle, na_ofi_mem_handle->desc.iov.s[0].iov_base, + na_ofi_mem_handle->desc.iov.s[0].iov_len, + na_ofi_mem_handle->desc.info.iovcnt, na_ofi_mem_handle->desc.info.flags, +diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c +index 96501b27..7c0ac8d4 100644 +--- a/src/na/na_ucx.c ++++ b/src/na/na_ucx.c +@@ -3433,8 +3433,18 @@ na_ucx_check_protocol(const char *protocol_name) + .field_mask = UCP_PARAM_FIELD_FEATURES, .features = NA_UCX_FEATURES}; + ucp_context_h context = NULL; + ucs_status_t status; ++ unsigned int runtime_major_version, runtime_minor_version, ++ runtime_patch_version; + bool accept = false; + ++ ucp_get_version( ++ &runtime_major_version, &runtime_minor_version, &runtime_patch_version); ++ ++ NA_LOG_SUBSYS_INFO(cls, ++ "Querying info on UCX (runtime v%u.%u.%u, API v%u.%u)", ++ runtime_major_version, runtime_minor_version, runtime_patch_version, ++ UCP_API_MAJOR, UCP_API_MINOR); ++ + status = ucp_config_read(NULL, NULL, &config); + NA_CHECK_SUBSYS_ERROR_NORET(cls, status != UCS_OK, done, + "ucp_config_read() failed (%s)", ucs_status_string(status)); +-- +2.52.0 + diff --git a/deps/patches/mercury/0001_na_ucx.patch b/deps/patches/mercury/0001_na_ucx.patch deleted file mode 100644 index 57b39feef9d..00000000000 --- 
a/deps/patches/mercury/0001_na_ucx.patch +++ /dev/null @@ -1,110 +0,0 @@ -diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c -index 84eb8b0..e4b6676 100644 ---- a/src/na/na_ucx.c -+++ b/src/na/na_ucx.c -@@ -614,7 +614,7 @@ na_ucx_addr_map_update(struct na_ucx_class *na_ucx_class, - */ - static na_return_t - na_ucx_addr_map_remove( -- struct na_ucx_map *na_ucx_map, ucs_sock_addr_t *addr_key); -+ struct na_ucx_map *na_ucx_map, struct na_ucx_addr *remove_addr); - - /** - * Hash connection ID. -@@ -1688,8 +1688,12 @@ na_ucp_listener_conn_cb(ucp_conn_request_h conn_request, void *arg) - .addr = (const struct sockaddr *) &conn_request_attrs.client_address, - .addrlen = sizeof(conn_request_attrs.client_address)}; - na_ucx_addr = na_ucx_addr_map_lookup(&na_ucx_class->addr_map, &addr_key); -- NA_CHECK_SUBSYS_ERROR_NORET(addr, na_ucx_addr != NULL, error, -- "An entry is already present for this address"); -+ -+ if (na_ucx_addr != NULL) { -+ NA_LOG_SUBSYS_WARNING(addr, -+ "An entry is already present for this address"); -+ na_ucx_addr_map_remove(&na_ucx_class->addr_map, na_ucx_addr); -+ } - - /* Insert new entry and create new address */ - na_ret = na_ucx_addr_map_insert(na_ucx_class, &na_ucx_class->addr_map, -@@ -1937,10 +1941,14 @@ na_ucp_ep_error_cb( - static void - na_ucp_ep_close(ucp_ep_h ep) - { -- ucs_status_ptr_t status_ptr = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FORCE); -+ const ucp_request_param_t close_params = { -+ .op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS, -+ .flags = UCP_EP_CLOSE_FLAG_FORCE}; -+ ucs_status_ptr_t status_ptr = ucp_ep_close_nbx(ep, &close_params); -+ - NA_CHECK_SUBSYS_ERROR_DONE(addr, - status_ptr != NULL && UCS_PTR_IS_ERR(status_ptr), -- "ucp_ep_close_nb() failed (%s)", -+ "ucp_ep_close_nbx() failed (%s)", - ucs_status_string(UCS_PTR_STATUS(status_ptr))); - } - -@@ -2722,7 +2730,7 @@ unlock: - - /*---------------------------------------------------------------------------*/ - static na_return_t --na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, 
ucs_sock_addr_t *addr_key) -+na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, struct na_ucx_addr *remove_addr) - { - struct na_ucx_addr *na_ucx_addr = NULL; - na_return_t ret = NA_SUCCESS; -@@ -2731,13 +2739,14 @@ na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, ucs_sock_addr_t *addr_key) - hg_thread_rwlock_wrlock(&na_ucx_map->lock); - - na_ucx_addr = hg_hash_table_lookup( -- na_ucx_map->key_map, (hg_hash_table_key_t) addr_key); -- if (na_ucx_addr == HG_HASH_TABLE_NULL) -+ na_ucx_map->key_map, (hg_hash_table_key_t) &remove_addr->addr_key); -+ -+ if (na_ucx_addr == HG_HASH_TABLE_NULL || na_ucx_addr->ucp_ep != remove_addr->ucp_ep) - goto unlock; - - /* Remove addr key from primary map */ - rc = hg_hash_table_remove( -- na_ucx_map->key_map, (hg_hash_table_key_t) addr_key); -+ na_ucx_map->key_map, (hg_hash_table_key_t) &na_ucx_addr->addr_key); - NA_CHECK_SUBSYS_ERROR(addr, rc != 1, unlock, ret, NA_NOENTRY, - "hg_hash_table_remove() failed"); - -@@ -2841,7 +2850,7 @@ na_ucx_addr_release(struct na_ucx_addr *na_ucx_addr) - NA_UCX_PRINT_ADDR_KEY_INFO("Removing address", &na_ucx_addr->addr_key); - - na_ucx_addr_map_remove( -- &na_ucx_addr->na_ucx_class->addr_map, &na_ucx_addr->addr_key); -+ &na_ucx_addr->na_ucx_class->addr_map, na_ucx_addr); - } - - if (na_ucx_addr->ucp_ep != NULL) { -@@ -3023,6 +3032,18 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context, - - /* There is no need to have a fully resolved address to start an RMA. - * This is only necessary for two-sided communication. */ -+ /* The above assumption is now in question, so the following will resolve -+ * the address if required. 
*/ -+ -+ /* Check addr to ensure the EP for that addr is still valid */ -+ if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) { -+ ret = na_ucx_addr_map_update( -+ na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr); -+ NA_CHECK_SUBSYS_NA_ERROR( -+ addr, error, ret, "Could not update NA UCX address"); -+ } -+ NA_CHECK_SUBSYS_ERROR(msg, na_ucx_addr->ucp_ep == NULL, error, ret, -+ NA_ADDRNOTAVAIL, "UCP endpoint is NULL for that address"); - - /* TODO UCX requires the remote key to be bound to the origin, do we need a - * new API? */ -@@ -3061,6 +3082,9 @@ na_ucx_rma_key_resolve(ucp_ep_h ep, struct na_ucx_mem_handle *na_ucx_mem_handle, - - hg_thread_mutex_lock(&na_ucx_mem_handle->rkey_unpack_lock); - -+ NA_CHECK_SUBSYS_ERROR( -+ mem, ep == NULL, error, ret, NA_INVALID_ARG, "Invalid endpoint (%p)", ep); -+ - switch (hg_atomic_get32(&na_ucx_mem_handle->type)) { - case NA_UCX_MEM_HANDLE_REMOTE_PACKED: { - ucs_status_t status = ucp_ep_rkey_unpack(ep, diff --git a/deps/patches/mercury/0002_na_ucx_ep_flush.patch b/deps/patches/mercury/0002_na_ucx_ep_flush.patch deleted file mode 100644 index f7b38d304aa..00000000000 --- a/deps/patches/mercury/0002_na_ucx_ep_flush.patch +++ /dev/null @@ -1,64 +0,0 @@ -diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c -index 6e9c3b0..2f157da 100644 ---- a/src/na/na_ucx.c -+++ b/src/na/na_ucx.c -@@ -441,6 +441,12 @@ na_ucp_ep_create(ucp_worker_h worker, ucp_ep_params_t *ep_params, - static void - na_ucp_ep_error_cb(void *arg, ucp_ep_h ep, ucs_status_t status); - -+/** -+ * Flush endpoint. -+ */ -+static ucs_status_ptr_t -+na_ucp_ep_flush(ucp_ep_h ep); -+ - /** - * Close endpoint. 
- */ -@@ -1940,6 +1946,21 @@ na_ucp_ep_error_cb( - na_ucx_addr_ref_decr(na_ucx_addr); - } - -+/*---------------------------------------------------------------------------*/ -+static ucs_status_ptr_t -+na_ucp_ep_flush(ucp_ep_h ep) -+{ -+ const ucp_request_param_t flush_params = { -+ .op_attr_mask = 0}; -+ ucs_status_ptr_t status_ptr = ucp_ep_flush_nbx(ep, &flush_params); -+ -+ NA_CHECK_SUBSYS_ERROR_DONE(addr, -+ status_ptr != NULL && UCS_PTR_IS_ERR(status_ptr), -+ "ucp_ep_flush_nb() failed (%s)", -+ ucs_status_string(UCS_PTR_STATUS(status_ptr))); -+ return status_ptr; -+} -+ - /*---------------------------------------------------------------------------*/ - static void - na_ucp_ep_close(ucp_ep_h ep) -@@ -2859,8 +2880,23 @@ na_ucx_addr_release(struct na_ucx_addr *na_ucx_addr) - if (na_ucx_addr->ucp_ep != NULL) { - /* NB. for deserialized addresses that are not "connected" addresses, do - * not close the EP */ -- if (na_ucx_addr->worker_addr == NULL) -+ if (na_ucx_addr->worker_addr == NULL) { -+ if (!na_ucx_addr->na_ucx_class->ucp_listener) { -+ ucs_status_ptr_t status_ptr = na_ucp_ep_flush(na_ucx_addr->ucp_ep); -+ -+ if (UCS_PTR_IS_PTR(status_ptr)) { -+ ucs_status_t status; -+ -+ do { -+ ucp_worker_progress(na_ucx_addr->na_ucx_class->ucp_worker); -+ status = ucp_request_check_status(status_ptr); -+ } while (status == UCS_INPROGRESS); -+ ucp_request_free(status_ptr); -+ } -+ } -+ - na_ucp_ep_close(na_ucx_addr->ucp_ep); -+ } - na_ucx_addr->ucp_ep = NULL; - } - diff --git a/deps/patches/mercury/0002_ofi_counters.patch b/deps/patches/mercury/0002_ofi_counters.patch new file mode 100644 index 00000000000..7cbdc543bd8 --- /dev/null +++ b/deps/patches/mercury/0002_ofi_counters.patch @@ -0,0 +1,1196 @@ +From 187759c6cd99205c52fc5b77fdb6c2d3536f9021 Mon Sep 17 00:00:00 2001 +From: Jerome Soumagne +Date: Thu, 18 Dec 2025 17:47:03 -0600 +Subject: [PATCH 2/3] NA OFI: add counters for monitoring tx/rx/rma/cq counts + +Monitor mr and addr counts + +NA: add NA_Diag_dump_counters() 
routine to dump counters +if HG_LOG_LEVEL>=min_debug is set + +HG Core: clean up counters + +HG util: add ability to remove counters + +NA OFI: finalize counters after fabric is closed + +HG util: bump minor version +--- + CMakeLists.txt | 2 +- + src/mercury.c | 1 + + src/mercury_core.c | 40 ++--- + src/na/CMakeLists.txt | 5 + + src/na/na.c | 9 + + src/na/na.h | 6 + + src/na/na_config.h.in | 1 + + src/na/na_ofi.c | 360 +++++++++++++++++++++++++++++++++------- + src/util/mercury_dlog.c | 37 ++++- + src/util/mercury_dlog.h | 18 ++ + src/util/mercury_log.c | 14 +- + src/util/mercury_log.h | 8 + + src/util/version.txt | 2 +- + 13 files changed, 413 insertions(+), 90 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index e4e79711..e71944f1 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -192,7 +192,7 @@ endif() + # Enable diagnostics counters separately from debug. + #------------------------------------------------------------------------------ + option(MERCURY_ENABLE_COUNTERS "Enable diagnostics counters." 
OFF) +-if(MERCURY_ENABLE_COUNTERS) ++if(MERCURY_ENABLE_COUNTERS AND NOT WIN32) + set(HG_HAS_DIAG 1) + else() + set(HG_HAS_DIAG 0) +diff --git a/src/mercury.c b/src/mercury.c +index 2c062384..6f0376ee 100644 +--- a/src/mercury.c ++++ b/src/mercury.c +@@ -1178,6 +1178,7 @@ HG_Diag_dump_counters(void) + #ifndef _WIN32 + hg_log_dump_counters(&HG_LOG_OUTLET(hg_diag)); + #endif ++ NA_Diag_dump_counters(); + } + + /*---------------------------------------------------------------------------*/ +diff --git a/src/mercury_core.c b/src/mercury_core.c +index 97cc4fb2..7abd8a47 100644 +--- a/src/mercury_core.c ++++ b/src/mercury_core.c +@@ -185,7 +185,7 @@ struct hg_core_private_class { + struct hg_core_map rpc_map; /* RPC Map */ + struct hg_core_more_data_cb more_data_cb; /* More data callbacks */ + na_tag_t request_max_tag; /* Max value for tag */ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + struct hg_core_counters counters; /* Diag counters */ + #endif + hg_atomic_int32_t n_contexts; /* Total number of contexts */ +@@ -369,7 +369,7 @@ struct hg_core_private_handle { + uint8_t cookie; /* Cookie */ + bool multi_recv_copy; /* Copy on multi-recv */ + bool reuse; /* Re-use handle once ref_count is 0 */ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + bool active; + #endif + }; +@@ -405,7 +405,7 @@ hg_core_op_type_to_string(enum hg_core_op_type op_type); + /** + * Init counters. + */ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + static void + hg_core_counters_init(struct hg_core_counters *hg_core_counters); + #endif +@@ -447,7 +447,7 @@ hg_core_finalize(struct hg_core_private_class *hg_core_class); + /** + * Get counters. 
+ */ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + static void + hg_core_class_get_counters(const struct hg_core_counters *counters, + struct hg_diag_counters *diag_counters); +@@ -1091,7 +1091,7 @@ hg_core_op_type_to_string(enum hg_core_op_type op_type) + #endif + + /*---------------------------------------------------------------------------*/ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + static void + hg_core_counters_init(struct hg_core_counters *hg_core_counters) + { +@@ -1325,7 +1325,7 @@ hg_core_init(const char *na_info_string, bool na_listen, unsigned int version, + hg_core_class->init_info.listen = na_listen; + + /* Stats / counters */ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + hg_core_counters_init(&hg_core_class->counters); + #endif + +@@ -1521,7 +1521,7 @@ error: + } + + /*---------------------------------------------------------------------------*/ +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + static void + hg_core_class_get_counters(const struct hg_core_counters *counters, + struct hg_diag_counters *diag_counters) +@@ -3445,7 +3445,7 @@ hg_core_destroy(struct hg_core_private_handle *hg_core_handle) + return HG_SUCCESS; /* Cannot free yet */ + } + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + if (hg_core_handle->active) { + hg_atomic_decr64(HG_CORE_HANDLE_CLASS(hg_core_handle) + ->counters.rpc_req_recv_active_count); +@@ -4048,7 +4048,7 @@ hg_core_forward(struct hg_core_private_handle *hg_core_handle, + hg_core_handle->request_callback = callback; + hg_core_handle->request_arg = arg; + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64( + HG_CORE_HANDLE_CLASS(hg_core_handle)->counters.rpc_req_sent_count); +@@ -4263,7 +4263,7 @@ hg_core_respond(struct hg_core_private_handle *hg_core_handle, + hg_core_handle->response_callback = callback; + hg_core_handle->response_arg = 
arg; + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64( + HG_CORE_HANDLE_CLASS(hg_core_handle)->counters.rpc_resp_sent_count); +@@ -4499,7 +4499,7 @@ hg_core_recv_input_cb(const struct na_cb_info *callback_info) + hg_thread_spin_lock(&hg_core_handle_pool->pending_list.lock); + LIST_REMOVE(hg_core_handle, pending); + hg_thread_spin_unlock(&hg_core_handle_pool->pending_list.lock); +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(HG_CORE_HANDLE_CLASS(hg_core_handle) + ->counters.rpc_req_recv_active_count); +@@ -4608,7 +4608,7 @@ hg_core_multi_recv_input_cb(const struct na_cb_info *callback_info) + ret = hg_core_handle_pool_get(context->handle_pool, &hg_core_handle); + HG_CHECK_SUBSYS_HG_ERROR( + rpc, error, ret, "Could not get handle from pool"); +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(HG_CORE_HANDLE_CLASS(hg_core_handle) + ->counters.rpc_req_recv_active_count); +@@ -4665,7 +4665,7 @@ hg_core_multi_recv_input_cb(const struct na_cb_info *callback_info) + "Copying multi-recv payload of size %zu for handle (%p)", + hg_core_handle->core_handle.in_buf_used, + (void *) hg_core_handle); +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(HG_CORE_CONTEXT_CLASS(context) + ->counters.rpc_multi_recv_copy_count); +@@ -4763,7 +4763,7 @@ hg_core_process_input(struct hg_core_private_handle *hg_core_handle) + uint32_t flags = (uint32_t) hg_atomic_get32(&hg_core_handle->flags); + hg_return_t ret; + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(hg_core_class->counters.rpc_req_recv_count); + #endif +@@ -4812,7 +4812,7 @@ hg_core_process_input(struct hg_core_private_handle *hg_core_handle) + "Handle (%p) expected_count incr to %" PRId32, + (void *) hg_core_handle, 
expected_count); + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(hg_core_class->counters.rpc_req_extra_count); + #endif +@@ -4936,7 +4936,7 @@ hg_core_process_output(struct hg_core_private_handle *hg_core_handle) + uint32_t flags = (uint32_t) hg_atomic_get32(&hg_core_handle->flags); + hg_return_t ret; + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(hg_core_class->counters.rpc_resp_recv_count); + #endif +@@ -4980,7 +4980,7 @@ hg_core_process_output(struct hg_core_private_handle *hg_core_handle) + "Handle (%p) expected_count incr to %" PRId32, + (void *) hg_core_handle, expected_count); + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + hg_atomic_incr64(hg_core_class->counters.rpc_resp_extra_count); + #endif +@@ -5319,7 +5319,7 @@ hg_core_completion_add(struct hg_core_context *core_context, + struct hg_core_completion_queue *backfill_queue = &context->backfill_queue; + int rc; + +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + /* Increment counter */ + if (hg_completion_entry->op_type == HG_BULK) + hg_atomic_incr64(HG_CORE_CONTEXT_CLASS(context)->counters.bulk_count); +@@ -6212,7 +6212,7 @@ hg_return_t + HG_Core_class_get_counters(const hg_core_class_t *hg_core_class, + struct hg_diag_counters *diag_counters) + { +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + const struct hg_core_private_class *private_class = + (const struct hg_core_private_class *) hg_core_class; + #endif +@@ -6222,7 +6222,7 @@ HG_Core_class_get_counters(const hg_core_class_t *hg_core_class, + HG_INVALID_ARG, "NULL HG core class"); + HG_CHECK_SUBSYS_ERROR(cls, diag_counters == NULL, error, ret, + HG_INVALID_ARG, "NULL pointer to diag_counters"); +-#if defined(HG_HAS_DIAG) && !defined(_WIN32) ++#ifdef HG_HAS_DIAG + hg_core_class_get_counters(&private_class->counters, 
diag_counters); + #else + HG_LOG_SUBSYS_ERROR(cls, "Counters not supported in current build, please " +diff --git a/src/na/CMakeLists.txt b/src/na/CMakeLists.txt +index 115e70bb..d99ed05d 100644 +--- a/src/na/CMakeLists.txt ++++ b/src/na/CMakeLists.txt +@@ -59,6 +59,11 @@ set(NA_BUILD_INCLUDE_DEPENDENCIES + #------------------------------------------------------------------------------ + # Internal dependencies + #------------------------------------------------------------------------------ ++# Diagnostics counters ++if(HG_HAS_DIAG) ++ set(NA_HAS_DIAG 1) ++endif() ++ + # Multi progress + if(NOT HG_ALLOW_MULTI_PROGRESS) + option(NA_ALLOW_MULTI_PROGRESS "Allow concurrent progress on single context." ON) +diff --git a/src/na/na.c b/src/na/na.c +index d54db626..61abf0a3 100644 +--- a/src/na/na.c ++++ b/src/na/na.c +@@ -1010,6 +1010,15 @@ NA_Set_log_level(const char *level) + hg_log_set_subsys_level(NA_SUBSYS_NAME_STRING, hg_log_name_to_level(level)); + } + ++/*---------------------------------------------------------------------------*/ ++void ++NA_Diag_dump_counters(void) ++{ ++#ifndef _WIN32 ++ hg_log_dump_counters(&HG_LOG_OUTLET(NA_SUBSYS_NAME)); ++#endif ++} ++ + /*---------------------------------------------------------------------------*/ + na_context_t * + NA_Context_create(na_class_t *na_class) +diff --git a/src/na/na.h b/src/na/na.h +index ff65ceb2..3d335fe7 100644 +--- a/src/na/na.h ++++ b/src/na/na.h +@@ -153,6 +153,12 @@ NA_Has_opt_feature( + NA_PUBLIC void + NA_Set_log_level(const char *level); + ++/** ++ * Dump diagnostic counters into the existing log stream. ++ */ ++NA_PUBLIC void ++NA_Diag_dump_counters(void); ++ + /** + * Return the name of the NA class. 
+ * +diff --git a/src/na/na_config.h.in b/src/na/na_config.h.in +index 8468de8c..1419bee6 100644 +--- a/src/na/na_config.h.in ++++ b/src/na/na_config.h.in +@@ -85,6 +85,7 @@ + + /* Build Options */ + #cmakedefine NA_HAS_DEBUG ++#cmakedefine NA_HAS_DIAG + #cmakedefine NA_HAS_MULTI_PROGRESS + + /* HWLOC */ +diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c +index 682efe65..d385d279 100644 +--- a/src/na/na_ofi.c ++++ b/src/na/na_ofi.c +@@ -262,6 +262,9 @@ static unsigned long const na_ofi_prov_flags[] = {NA_OFI_PROV_TYPES}; + /* Prov info array init count */ + #define NA_OFI_PROV_INFO_COUNT (32) + ++/* Max counter name length */ ++#define NA_OFI_MAX_COUNTER_NAME (64) ++ + /* Address / URI max len */ + #define NA_OFI_MAX_URI_LEN (128) + +@@ -787,7 +790,6 @@ struct na_ofi_domain { + hg_atomic_int64_t requested_key; /* Requested key if not FI_MR_PROV_KEY */ + int64_t max_key; /* Max key if not FI_MR_PROV_KEY */ + uint64_t max_tag; /* Max tag from CQ data size */ +- hg_atomic_int32_t mr_reg_count; /* Number of MR registered */ + bool no_wait; /* Wait disabled on domain */ + bool av_auth_key; /* Use FI_AV_AUTH_KEY */ + bool av_user_id; /* Use FI_AV_USER_ID */ +@@ -830,15 +832,26 @@ struct na_ofi_verify_info { + enum na_ofi_prov_type prov_type; /* Provider type */ + }; + +-/* OFI class */ +-struct na_ofi_class { +- struct na_ofi_addr_pool addr_pool; /* Addr pool */ +- struct fi_info *fi_info; /* OFI info */ +- struct na_ofi_fabric *fabric; /* Fabric pointer */ +- struct na_ofi_domain *domain; /* Domain pointer */ +- struct na_ofi_endpoint *endpoint; /* Endpoint pointer */ +- struct hg_mem_pool *send_pool; /* Msg send buf pool */ +- struct hg_mem_pool *recv_pool; /* Msg recv buf pool */ ++#ifdef NA_HAS_DIAG ++/* OFI counters */ ++struct na_ofi_counters { ++ char tx_count_string[NA_OFI_MAX_COUNTER_NAME]; /* TX count string */ ++ char rx_count_string[NA_OFI_MAX_COUNTER_NAME]; /* RX count string */ ++ char rma_count_string[NA_OFI_MAX_COUNTER_NAME]; /* RMA count string */ ++ char 
mr_count_string[NA_OFI_MAX_COUNTER_NAME]; /* MR count string */ ++ char addr_count_string[NA_OFI_MAX_COUNTER_NAME]; /* Addr count string */ ++ char cq_count_string[NA_OFI_MAX_COUNTER_NAME]; /* CQ count string */ ++ hg_atomic_int32_t *tx_count; /* Number of active sends */ ++ hg_atomic_int32_t *rx_count; /* Number of active receives */ ++ hg_atomic_int32_t *rma_count; /* Number of active RMAs */ ++ hg_atomic_int32_t *mr_count; /* Number of active MRs */ ++ hg_atomic_int32_t *addr_count; /* Number of addresses inserted */ ++ hg_atomic_int32_t *cq_count; /* Number of CQ events */ ++}; ++#endif ++ ++/* OFI ops */ ++struct na_ofi_ops { + na_return_t (*msg_send_unexpected)( + struct fid_ep *, const struct na_ofi_msg_info *, void *); + na_return_t (*msg_recv_unexpected)( +@@ -853,14 +866,29 @@ struct na_ofi_class { + const char *msg_recv_unexpected_string; /* Error log string */ + const char *msg_send_expected_string; /* Error log string */ + const char *msg_recv_expected_string; /* Error log string */ +- unsigned long opt_features; /* Optional feature flags */ +- hg_atomic_int32_t n_contexts; /* Number of context */ +- unsigned int op_retry_timeout; /* Retry timeout */ +- unsigned int op_retry_period; /* Time elapsed until next retry */ +- uint8_t context_max; /* Max number of contexts */ +- bool no_wait; /* Ignore wait object */ +- bool use_sep; /* Use scalable endpoints */ +- bool finalizing; /* Class being destroyed */ ++}; ++ ++/* OFI class */ ++struct na_ofi_class { ++ struct na_ofi_addr_pool addr_pool; /* Addr pool */ ++ struct fi_info *fi_info; /* OFI info */ ++ struct na_ofi_fabric *fabric; /* Fabric pointer */ ++ struct na_ofi_domain *domain; /* Domain pointer */ ++ struct na_ofi_endpoint *endpoint; /* Endpoint pointer */ ++ struct hg_mem_pool *send_pool; /* Msg send buf pool */ ++ struct hg_mem_pool *recv_pool; /* Msg recv buf pool */ ++ struct na_ofi_ops ops; /* OFI operations */ ++#ifdef NA_HAS_DIAG ++ struct na_ofi_counters counters; /* OFI counters */ 
++#endif ++ unsigned long opt_features; /* Optional feature flags */ ++ hg_atomic_int32_t n_contexts; /* Number of context */ ++ unsigned int op_retry_timeout; /* Retry timeout */ ++ unsigned int op_retry_period; /* Time elapsed until next retry */ ++ uint8_t context_max; /* Max number of contexts */ ++ bool no_wait; /* Ignore wait object */ ++ bool use_sep; /* Use scalable endpoints */ ++ bool finalizing; /* Class being destroyed */ + }; + + /********************/ +@@ -1145,6 +1173,20 @@ na_ofi_class_alloc(void); + static na_return_t + na_ofi_class_free(struct na_ofi_class *na_ofi_class); + ++#ifdef NA_HAS_DIAG ++/** ++ * Init counters. ++ */ ++static void ++na_ofi_counters_init(struct na_ofi_counters *counters, int class_id); ++ ++/** ++ * Finalize counters. ++ */ ++static void ++na_ofi_counters_finalize(struct na_ofi_counters *counters); ++#endif ++ + /** + * Configure class parameters from environment variables. + */ +@@ -1762,6 +1804,14 @@ static void + na_ofi_op_retry_abort_addr( + struct na_ofi_context *na_ofi_context, fi_addr_t fi_addr, na_return_t ret); + ++/** ++ * Process counters. ++ */ ++#ifdef NA_HAS_DIAG ++static void ++na_ofi_cq_process_counters(struct na_ofi_op_id *na_ofi_op_id); ++#endif ++ + /** + * Complete operation ID. 
+ */ +@@ -3307,6 +3357,12 @@ na_ofi_addr_map_insert(struct na_ofi_class *na_ofi_class, + na_ofi_errno_to_na(-rc), + "fi_av_remove(%" PRIu64 ") failed, rc: %d (%s)", + na_ofi_addr->fi_addr, rc, fi_strerror(-rc)); ++ ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_decr32(na_ofi_class->counters.addr_count); ++#endif ++ + addr_map_exist = true; + } + } else { +@@ -3353,6 +3409,11 @@ na_ofi_addr_map_insert(struct na_ofi_class *na_ofi_class, + addr_str, &addr_str_len), + rc); + ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_incr32(na_ofi_class->counters.addr_count); ++#endif ++ + #if FI_VERSION_GE(FI_COMPILE_VERSION, FI_VERSION(1, 20)) + if (na_ofi_class->domain->av_auth_key) { + size_t addrlen = sizeof(na_ofi_addr->addr_key.addr); +@@ -3458,6 +3519,11 @@ na_ofi_addr_map_remove( + "fi_av_remove(%" PRIu64 ") failed, rc: %d (%s)", na_ofi_addr->fi_addr, + rc, fi_strerror(-rc)); + ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_decr32(na_ofi_addr->class->counters.addr_count); ++#endif ++ + NA_LOG_SUBSYS_DEBUG( + addr, "Removed addr for FI addr %" PRIu64, na_ofi_addr->fi_addr); + +@@ -4229,6 +4295,9 @@ static struct na_ofi_class * + na_ofi_class_alloc(void) + { + struct na_ofi_class *na_ofi_class = NULL; ++#ifdef NA_HAS_DIAG ++ static int class_id = 0; ++#endif + int rc; + + /* Create private data */ +@@ -4237,6 +4306,10 @@ na_ofi_class_alloc(void) + "Could not allocate NA private data class"); + hg_atomic_init32(&na_ofi_class->n_contexts, 0); + ++#ifdef NA_HAS_DIAG ++ na_ofi_counters_init(&na_ofi_class->counters, class_id++); ++#endif ++ + /* Initialize addr pool */ + rc = hg_thread_spin_init(&na_ofi_class->addr_pool.lock); + NA_CHECK_SUBSYS_ERROR_NORET( +@@ -4301,6 +4374,11 @@ na_ofi_class_free(struct na_ofi_class *na_ofi_class) + na_ofi_class->fabric = NULL; + } + ++#ifdef NA_HAS_DIAG ++ /* Remove counters */ ++ na_ofi_counters_finalize(&na_ofi_class->counters); ++#endif ++ + /* Free info */ + if (na_ofi_class->fi_info) + 
na_ofi_freeinfo(na_ofi_class->fi_info); +@@ -4313,6 +4391,50 @@ out: + return ret; + } + ++/*---------------------------------------------------------------------------*/ ++#ifdef NA_HAS_DIAG ++static void ++na_ofi_counters_init(struct na_ofi_counters *counters, int class_id) ++{ ++ snprintf(counters->tx_count_string, sizeof(counters->tx_count_string), ++ "[%d] na_ofi_tx_count ", class_id); ++ snprintf(counters->rx_count_string, sizeof(counters->rx_count_string), ++ "[%d] na_ofi_rx_count ", class_id); ++ snprintf(counters->rma_count_string, sizeof(counters->rma_count_string), ++ "[%d] na_ofi_rma_count ", class_id); ++ snprintf(counters->mr_count_string, sizeof(counters->mr_count_string), ++ "[%d] na_ofi_mr_count ", class_id); ++ snprintf(counters->addr_count_string, sizeof(counters->addr_count_string), ++ "[%d] na_ofi_addr_count", class_id); ++ snprintf(counters->cq_count_string, sizeof(counters->cq_count_string), ++ "[%d] na_ofi_cq_count ", class_id); ++ HG_LOG_ADD_COUNTER32(na, &counters->tx_count, counters->tx_count_string, ++ "Number of active sends"); ++ HG_LOG_ADD_COUNTER32(na, &counters->rx_count, counters->rx_count_string, ++ "Number of active recvs"); ++ HG_LOG_ADD_COUNTER32(na, &counters->rma_count, counters->rma_count_string, ++ "Number of active RMAs"); ++ HG_LOG_ADD_COUNTER32(na, &counters->mr_count, counters->mr_count_string, ++ "Number of active MRs"); ++ HG_LOG_ADD_COUNTER32(na, &counters->addr_count, counters->addr_count_string, ++ "Number of addresses inserted"); ++ HG_LOG_ADD_COUNTER32(na, &counters->cq_count, counters->cq_count_string, ++ "Number of events still in CQ"); ++} ++ ++/*---------------------------------------------------------------------------*/ ++static void ++na_ofi_counters_finalize(struct na_ofi_counters *counters) ++{ ++ HG_LOG_DEL_COUNTER32(na, counters->tx_count); ++ HG_LOG_DEL_COUNTER32(na, counters->rx_count); ++ HG_LOG_DEL_COUNTER32(na, counters->rma_count); ++ HG_LOG_DEL_COUNTER32(na, counters->mr_count); ++ 
HG_LOG_DEL_COUNTER32(na, counters->addr_count); ++ HG_LOG_DEL_COUNTER32(na, counters->cq_count); ++} ++#endif ++ + /*---------------------------------------------------------------------------*/ + static na_return_t + na_ofi_class_env_config(struct na_ofi_class *na_ofi_class) +@@ -4323,26 +4445,26 @@ na_ofi_class_env_config(struct na_ofi_class *na_ofi_class) + /* Set unexpected msg callbacks */ + env = getenv("NA_OFI_UNEXPECTED_TAG_MSG"); + if (env == NULL || env[0] == '0' || tolower(env[0]) == 'n') { +- na_ofi_class->msg_send_unexpected = na_ofi_msg_send; +- na_ofi_class->msg_send_unexpected_string = "fi_senddata"; +- na_ofi_class->msg_recv_unexpected = na_ofi_msg_recv; +- na_ofi_class->msg_recv_unexpected_string = "fi_recv"; ++ na_ofi_class->ops.msg_send_unexpected = na_ofi_msg_send; ++ na_ofi_class->ops.msg_send_unexpected_string = "fi_senddata"; ++ na_ofi_class->ops.msg_recv_unexpected = na_ofi_msg_recv; ++ na_ofi_class->ops.msg_recv_unexpected_string = "fi_recv"; + } else { + NA_LOG_SUBSYS_DEBUG(cls, + "NA_OFI_UNEXPECTED_TAG_MSG set to %s, forcing unexpected messages " + "to use tagged recvs", + env); +- na_ofi_class->msg_send_unexpected = na_ofi_tag_send; +- na_ofi_class->msg_send_unexpected_string = "fi_tsend"; +- na_ofi_class->msg_recv_unexpected = na_ofi_tag_recv; +- na_ofi_class->msg_recv_unexpected_string = "fi_trecv"; ++ na_ofi_class->ops.msg_send_unexpected = na_ofi_tag_send; ++ na_ofi_class->ops.msg_send_unexpected_string = "fi_tsend"; ++ na_ofi_class->ops.msg_recv_unexpected = na_ofi_tag_recv; ++ na_ofi_class->ops.msg_recv_unexpected_string = "fi_trecv"; + } + + /* Set expected msg callbacks */ +- na_ofi_class->msg_send_expected = na_ofi_tag_send; +- na_ofi_class->msg_send_expected_string = "fi_tsend"; +- na_ofi_class->msg_recv_expected = na_ofi_tag_recv; +- na_ofi_class->msg_recv_expected_string = "fi_trecv"; ++ na_ofi_class->ops.msg_send_expected = na_ofi_tag_send; ++ na_ofi_class->ops.msg_send_expected_string = "fi_tsend"; ++ 
na_ofi_class->ops.msg_recv_expected = na_ofi_tag_recv; ++ na_ofi_class->ops.msg_recv_expected_string = "fi_trecv"; + + /* Default retry timeouts in ms */ + if ((env = getenv("NA_OFI_OP_RETRY_TIMEOUT")) != NULL) { +@@ -5073,7 +5195,6 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, + hg_atomic_init64(&na_ofi_domain->requested_key, 0); + /* No need to take a refcount on fabric */ + na_ofi_domain->fabric = na_ofi_fabric; +- hg_atomic_init32(&na_ofi_domain->mr_reg_count, 0); + + /* Dup name */ + na_ofi_domain->name = strdup(domain_attr->name); +@@ -6056,6 +6177,11 @@ na_ofi_mem_buf_register(const void *buf, size_t len, unsigned long flags, + + /* Register memory if FI_MR_LOCAL is set and provider uses it */ + if (na_ofi_class->fi_info->domain_attr->mr_mode & FI_MR_LOCAL) { ++#ifdef NA_HAS_DIAG ++ int32_t mr_cnt = hg_atomic_get32(na_ofi_class->counters.mr_count); ++#else ++ int32_t mr_cnt = -1; ++#endif + struct fid_mr *mr_hdl = NULL; + uint64_t access = 0; + int rc; +@@ -6072,10 +6198,11 @@ na_ofi_mem_buf_register(const void *buf, size_t len, unsigned long flags, + NA_CHECK_SUBSYS_ERROR(mem, rc != 0, out, ret, HG_UTIL_FAIL, + "fi_mr_reg(buf=%p, len=%zu, flags=%lu) failed, rc: %d (%s), " + "mr_reg_count: %d", +- buf, len, flags, rc, fi_strerror(-rc), +- hg_atomic_get32(&na_ofi_class->domain->mr_reg_count)); ++ buf, len, flags, rc, fi_strerror(-rc), mr_cnt); + +- hg_atomic_incr32(&na_ofi_class->domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ hg_atomic_incr32(na_ofi_class->counters.mr_count); ++#endif + *handle_p = (void *) mr_hdl; + } else + *handle_p = NULL; +@@ -6093,11 +6220,17 @@ na_ofi_mem_buf_deregister(void *handle, void *arg) + /* Release MR handle is there was any */ + if (handle) { + struct fid_mr *mr_hdl = (struct fid_mr *) handle; ++#ifdef NA_HAS_DIAG + struct na_ofi_class *na_ofi_class = (struct na_ofi_class *) arg; ++#else ++ (void) arg; ++#endif + int rc = fi_close(&mr_hdl->fid); + NA_CHECK_SUBSYS_ERROR(mem, rc != 0, out, ret, HG_UTIL_FAIL, 
+ "fi_close() mr_hdl failed, rc: %d (%s)", rc, fi_strerror(-rc)); +- hg_atomic_decr32(&na_ofi_class->domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_class->counters.mr_count); ++#endif + } + + out: +@@ -6159,6 +6292,11 @@ na_ofi_msg_send_common(struct na_ofi_class *na_ofi_class, + if ((int) na_ofi_class->fi_info->addr_format == FI_ADDR_OPX) + na_ofi_op_id->fi_ctx[0].internal[0] = &na_ofi_addr->addr_key.addr.opx; + ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_incr32(na_ofi_class->counters.tx_count); ++#endif ++ + ret = msg_op( + na_ofi_context->fi_tx, &na_ofi_op_id->info.msg, &na_ofi_op_id->fi_ctx); + if (ret != NA_SUCCESS) { +@@ -6166,8 +6304,12 @@ na_ofi_msg_send_common(struct na_ofi_class *na_ofi_class, + na_ofi_op_id->retry_op.msg = msg_op; + na_ofi_op_retry( + na_ofi_context, na_ofi_class->op_retry_timeout, na_ofi_op_id); +- } else ++ } else { ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_class->counters.tx_count); ++#endif + NA_GOTO_SUBSYS_ERROR_NORET(msg, release, "Could not post msg send"); ++ } + } + + return NA_SUCCESS; +@@ -6224,6 +6366,11 @@ na_ofi_msg_recv_common(struct na_ofi_class *na_ofi_class, + .tag = tag, + .tag_mask = tag_mask}; + ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_incr32(na_ofi_class->counters.rx_count); ++#endif ++ + ret = msg_op( + na_ofi_context->fi_rx, &na_ofi_op_id->info.msg, &na_ofi_op_id->fi_ctx); + if (ret != NA_SUCCESS) { +@@ -6231,8 +6378,12 @@ na_ofi_msg_recv_common(struct na_ofi_class *na_ofi_class, + na_ofi_op_id->retry_op.msg = msg_op; + na_ofi_op_retry( + na_ofi_context, na_ofi_class->op_retry_timeout, na_ofi_op_id); +- } else ++ } else { ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_class->counters.rx_count); ++#endif + NA_GOTO_SUBSYS_ERROR_NORET(msg, release, "Could not post msg recv"); ++ } + } + + return NA_SUCCESS; +@@ -6620,6 +6771,11 @@ na_ofi_rma_common(struct na_ofi_class *na_ofi_class, na_context_t *context, + NA_OFI_SEP_RX_CTX_BITS) + : na_ofi_addr->fi_addr; + ++#ifdef 
NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_incr32(na_ofi_class->counters.rma_count); ++#endif ++ + /* Post the OFI RMA operation */ + ret = + na_ofi_rma_post(na_ofi_context->fi_tx, rma_info, &na_ofi_op_id->fi_ctx); +@@ -6628,8 +6784,12 @@ na_ofi_rma_common(struct na_ofi_class *na_ofi_class, na_context_t *context, + na_ofi_op_id->retry_op.rma = na_ofi_rma_post; + na_ofi_op_retry( + na_ofi_context, na_ofi_class->op_retry_timeout, na_ofi_op_id); +- } else ++ } else { ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_class->counters.rma_count); ++#endif + NA_GOTO_SUBSYS_ERROR_NORET(rma, release, "Could not post RMA op"); ++ } + } + + return NA_SUCCESS; +@@ -6991,6 +7151,10 @@ na_ofi_cq_process_canceled(const struct na_ofi_class *na_ofi_class, + cq_err->err, fi_strerror(cq_err->err), (void *) na_ofi_op_id, + na_cb_type_to_string(na_ofi_op_id->type)); + ++#ifdef NA_HAS_DIAG ++ na_ofi_cq_process_counters(na_ofi_op_id); ++#endif ++ + /* When tearing down connections, it is possible that operations will be + canceled by libfabric itself. 
+ NA_CHECK_SUBSYS_WARNING(op, +@@ -7014,6 +7178,10 @@ na_ofi_cq_process_canceled(const struct na_ofi_class *na_ofi_class, + &na_ofi_op_id->completion_data->callback_info.info + .multi_recv_unexpected, + complete); ++#ifdef NA_HAS_DIAG ++ if (complete) ++ hg_atomic_decr32(na_ofi_class->counters.rx_count); ++#endif + } else + complete = true; + +@@ -7151,6 +7319,10 @@ na_ofi_cq_process_error( + NA_OFI_OP_CANCELED) + return NA_SUCCESS; /* already handled */ + ++#ifdef NA_HAS_DIAG ++ na_ofi_cq_process_counters(na_ofi_op_id); ++#endif ++ + /* Abort other retries if peer is unreachable */ + if (na_ret == NA_HOSTUNREACH && na_ofi_op_id->addr) + na_ofi_op_retry_abort_addr(na_ofi_op_id->na_ofi_context, +@@ -7172,6 +7344,10 @@ na_ofi_cq_process_error( + &na_ofi_op_id->completion_data->callback_info.info + .multi_recv_unexpected, + complete); ++#ifdef NA_HAS_DIAG ++ if (complete) ++ hg_atomic_decr32(na_ofi_class->counters.rx_count); ++#endif + } else + complete = true; + +@@ -7346,6 +7522,10 @@ na_ofi_cq_process_event(struct na_ofi_class *na_ofi_class, + cq_event->op_context, cq_event->flags, cq_event->len, cq_event->buf, + cq_event->data, cq_event->tag); + ++#ifdef NA_HAS_DIAG ++ na_ofi_cq_process_counters(na_ofi_op_id); ++#endif ++ + switch (na_ofi_op_id->type) { + case NA_CB_RECV_UNEXPECTED: + /* Default to cq_event->tag for backward compatibility */ +@@ -7361,6 +7541,10 @@ na_ofi_cq_process_event(struct na_ofi_class *na_ofi_class, + break; + case NA_CB_MULTI_RECV_UNEXPECTED: + complete = cq_event->flags & FI_MULTI_RECV; ++#ifdef HG_HAS_DIAG ++ if (complete) ++ hg_atomic_decr32(na_ofi_class->counters.rx_count); ++#endif + + ret = na_ofi_cq_process_multi_recv_unexpected(na_ofi_class, + &na_ofi_op_id->info.msg, +@@ -7628,6 +7812,10 @@ na_ofi_cq_process_retries( + NA_LOG_SUBSYS_ERROR(op, "retry operation of %p (%s) failed", + (void *) na_ofi_op_id, na_cb_type_to_string(cb_type)); + ++#ifdef NA_HAS_DIAG ++ na_ofi_cq_process_counters(na_ofi_op_id); ++#endif ++ + /* Force 
internal completion in error mode */ + hg_atomic_or32(&na_ofi_op_id->status, NA_OFI_OP_ERRORED); + na_ofi_op_id->complete(na_ofi_op_id, true, ret); +@@ -7690,6 +7878,32 @@ na_ofi_op_retry_abort_addr( + hg_thread_spin_unlock(&op_queue->lock); + } + ++/*---------------------------------------------------------------------------*/ ++#ifdef NA_HAS_DIAG ++static void ++na_ofi_cq_process_counters(struct na_ofi_op_id *na_ofi_op_id) ++{ ++ switch (na_ofi_op_id->type) { ++ case NA_CB_RECV_UNEXPECTED: ++ case NA_CB_RECV_EXPECTED: ++ hg_atomic_decr32(na_ofi_op_id->na_ofi_class->counters.rx_count); ++ break; ++ case NA_CB_SEND_UNEXPECTED: ++ case NA_CB_SEND_EXPECTED: ++ hg_atomic_decr32(na_ofi_op_id->na_ofi_class->counters.tx_count); ++ break; ++ case NA_CB_PUT: ++ case NA_CB_GET: ++ hg_atomic_decr32(na_ofi_op_id->na_ofi_class->counters.rma_count); ++ break; ++ case NA_CB_MULTI_RECV_UNEXPECTED: ++ /* TODO currently treated outside of switch */ ++ default: ++ break; ++ } ++} ++#endif ++ + /*---------------------------------------------------------------------------*/ + static NA_INLINE void + na_ofi_op_complete_single(struct na_ofi_op_id *na_ofi_op_id, +@@ -7711,6 +7925,9 @@ na_ofi_op_complete_single(struct na_ofi_op_id *na_ofi_op_id, + completion_data->plugin_callback = na_ofi_op_release_single; + + NA_LOG_SUBSYS_DEBUG(op, "Adding completion data to queue"); ++#ifdef NA_HAS_DIAG ++ hg_atomic_incr32(na_ofi_op_id->na_ofi_class->counters.cq_count); ++#endif + + /* Add OP to NA completion queue */ + na_cb_completion_add( +@@ -7728,6 +7945,9 @@ na_ofi_op_release_single(void *arg) + (!(hg_atomic_get32(&na_ofi_op_id->status) & NA_OFI_OP_COMPLETED)), + "Releasing resources from an uncompleted operation"); + ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_op_id->na_ofi_class->counters.cq_count); ++#endif + if (na_ofi_op_id->addr) { + na_ofi_addr_ref_decr(na_ofi_op_id->addr); + na_ofi_op_id->addr = NULL; +@@ -7775,6 +7995,10 @@ na_ofi_op_complete_multi( + op, 
na_ofi_op_id->completion_data == NULL, error, "Queue is full"); + + NA_LOG_SUBSYS_DEBUG(op, "Adding completion data to queue"); ++#ifdef NA_HAS_DIAG ++ hg_atomic_incr32(na_ofi_op_id->na_ofi_class->counters.cq_count); ++#endif ++ + /* Add OP to NA completion queue */ + na_cb_completion_add( + na_ofi_op_id->na_ofi_context->context, completion_data); +@@ -7789,6 +8013,9 @@ na_ofi_op_release_multi(void *arg) + { + struct na_ofi_op_id *na_ofi_op_id = (struct na_ofi_op_id *) arg; + ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_op_id->na_ofi_class->counters.cq_count); ++#endif + na_ofi_completion_multi_pop(&na_ofi_op_id->completion_data_storage.multi); + } + +@@ -8230,15 +8457,15 @@ na_ofi_initialize( + + /* Set/check optional features */ + if ((na_ofi_prov_extra_caps[prov_type] & FI_MULTI_RECV) && +- (na_ofi_class->msg_recv_unexpected == na_ofi_msg_recv)) { ++ (na_ofi_class->ops.msg_recv_unexpected == na_ofi_msg_recv)) { + NA_CHECK_SUBSYS_ERROR(cls, + !(na_ofi_class->fi_info->caps & FI_MULTI_RECV), error, ret, + NA_PROTONOSUPPORT, "FI_MULTI_RECV is not supported by provider"); + na_ofi_class->opt_features |= NA_OPT_MULTI_RECV; + } +- na_ofi_class->cq_poll = (na_ofi_class->fi_info->caps & FI_SOURCE_ERR) +- ? na_ofi_cq_poll_fi_source +- : na_ofi_cq_poll_no_source; ++ na_ofi_class->ops.cq_poll = (na_ofi_class->fi_info->caps & FI_SOURCE_ERR) ++ ? 
na_ofi_cq_poll_fi_source ++ : na_ofi_cq_poll_no_source; + + /* Open fabric */ + ret = na_ofi_fabric_open( +@@ -8959,8 +9186,8 @@ na_ofi_msg_send_unexpected(na_class_t *na_class, na_context_t *context, + { + return na_ofi_msg_send_common(NA_OFI_CLASS(na_class), + NA_OFI_CONTEXT(context), NA_CB_SEND_UNEXPECTED, callback, arg, +- NA_OFI_CLASS(na_class)->msg_send_unexpected, +- NA_OFI_CLASS(na_class)->msg_send_unexpected_string, buf, buf_size, ++ NA_OFI_CLASS(na_class)->ops.msg_send_unexpected, ++ NA_OFI_CLASS(na_class)->ops.msg_send_unexpected_string, buf, buf_size, + NA_OFI_CLASS(na_class)->endpoint->unexpected_msg_size_max, + (struct na_ofi_msg_buf_handle *) plugin_data, + (struct na_ofi_addr *) dest_addr, dest_id, +@@ -8975,8 +9202,8 @@ na_ofi_msg_recv_unexpected(na_class_t *na_class, na_context_t *context, + { + return na_ofi_msg_recv_common(NA_OFI_CLASS(na_class), + NA_OFI_CONTEXT(context), NA_CB_RECV_UNEXPECTED, callback, arg, +- NA_OFI_CLASS(na_class)->msg_recv_unexpected, +- NA_OFI_CLASS(na_class)->msg_recv_unexpected_string, buf, buf_size, ++ NA_OFI_CLASS(na_class)->ops.msg_recv_unexpected, ++ NA_OFI_CLASS(na_class)->ops.msg_recv_unexpected_string, buf, buf_size, + NA_OFI_CLASS(na_class)->endpoint->unexpected_msg_size_max, + (struct na_ofi_msg_buf_handle *) plugin_data, NULL, 0, + NA_OFI_UNEXPECTED_TAG, NA_OFI_TAG_MASK, (struct na_ofi_op_id *) op_id); +@@ -9027,9 +9254,17 @@ na_ofi_msg_multi_recv_unexpected(na_class_t *na_class, na_context_t *context, + .tag = 0 /* unused */, + .tag_mask = 0 /* unused */}; + ++#ifdef NA_HAS_DIAG ++ /* Counters */ ++ hg_atomic_incr32(na_ofi_class->counters.rx_count); ++#endif ++ + ret = na_ofi_msg_multi_recv( + na_ofi_context->fi_rx, &na_ofi_op_id->info.msg, &na_ofi_op_id->fi_ctx); + if (ret != NA_SUCCESS) { ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(na_ofi_class->counters.rx_count); ++#endif + if (ret == NA_AGAIN) { + na_ofi_op_id->retry_op.msg = na_ofi_msg_multi_recv; + na_ofi_op_retry( +@@ -9061,8 +9296,8 @@ 
na_ofi_msg_send_expected(na_class_t *na_class, na_context_t *context, + { + return na_ofi_msg_send_common(NA_OFI_CLASS(na_class), + NA_OFI_CONTEXT(context), NA_CB_SEND_EXPECTED, callback, arg, +- NA_OFI_CLASS(na_class)->msg_send_expected, +- NA_OFI_CLASS(na_class)->msg_send_expected_string, buf, buf_size, ++ NA_OFI_CLASS(na_class)->ops.msg_send_expected, ++ NA_OFI_CLASS(na_class)->ops.msg_send_expected_string, buf, buf_size, + NA_OFI_CLASS(na_class)->endpoint->expected_msg_size_max, + (struct na_ofi_msg_buf_handle *) plugin_data, + (struct na_ofi_addr *) dest_addr, dest_id, (uint64_t) tag, +@@ -9077,8 +9312,8 @@ na_ofi_msg_recv_expected(na_class_t *na_class, na_context_t *context, + { + return na_ofi_msg_recv_common(NA_OFI_CLASS(na_class), + NA_OFI_CONTEXT(context), NA_CB_RECV_EXPECTED, callback, arg, +- NA_OFI_CLASS(na_class)->msg_recv_expected, +- NA_OFI_CLASS(na_class)->msg_recv_expected_string, buf, buf_size, ++ NA_OFI_CLASS(na_class)->ops.msg_recv_expected, ++ NA_OFI_CLASS(na_class)->ops.msg_recv_expected_string, buf, buf_size, + NA_OFI_CLASS(na_class)->endpoint->expected_msg_size_max, + (struct na_ofi_msg_buf_handle *) plugin_data, + (struct na_ofi_addr *) source_addr, source_id, (uint64_t) tag, 0, +@@ -9214,7 +9449,11 @@ na_ofi_mem_register(na_class_t *na_class, na_mem_handle_t *mem_handle, + (struct na_ofi_mem_handle *) mem_handle; + struct na_ofi_domain *domain = NA_OFI_CLASS(na_class)->domain; + const struct fi_info *fi_info = NA_OFI_CLASS(na_class)->fi_info; +- int32_t mr_cnt = hg_atomic_get32(&domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ int32_t mr_cnt = hg_atomic_get32(NA_OFI_CLASS(na_class)->counters.mr_count); ++#else ++ int32_t mr_cnt = -1; ++#endif + struct fi_mr_attr fi_mr_attr = { + .mr_iov = NA_OFI_IOV( + na_ofi_mem_handle->desc.iov, na_ofi_mem_handle->desc.info.iovcnt), +@@ -9283,7 +9522,9 @@ na_ofi_mem_register(na_class_t *na_class, na_mem_handle_t *mem_handle, + fi_mr_attr.mr_iov[0].iov_base, fi_mr_attr.mr_iov[0].iov_len, + 
fi_mr_attr.iov_count, fi_mr_attr.access, fi_mr_attr.iface, + fi_mr_attr.requested_key, rc, fi_strerror(-rc), mr_cnt); +- mr_cnt = hg_atomic_incr32(&domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ mr_cnt = hg_atomic_incr32(NA_OFI_CLASS(na_class)->counters.mr_count); ++#endif + + /* Attach MR to endpoint when provider requests it */ + if (fi_info->domain_attr->mr_mode & FI_MR_ENDPOINT) { +@@ -9327,7 +9568,9 @@ na_ofi_mem_register(na_class_t *na_class, na_mem_handle_t *mem_handle, + error: + if (na_ofi_mem_handle->fi_mr) { + (void) fi_close(&na_ofi_mem_handle->fi_mr->fid); +- hg_atomic_decr32(&domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ hg_atomic_decr32(NA_OFI_CLASS(na_class)->counters.mr_count); ++#endif + } + return ret; + } +@@ -9336,7 +9579,6 @@ error: + static na_return_t + na_ofi_mem_deregister(na_class_t *na_class, na_mem_handle_t *mem_handle) + { +- struct na_ofi_domain *domain = NA_OFI_CLASS(na_class)->domain; + struct na_ofi_mem_handle *na_ofi_mem_handle = + (struct na_ofi_mem_handle *) mem_handle; + na_return_t ret; +@@ -9344,14 +9586,18 @@ na_ofi_mem_deregister(na_class_t *na_class, na_mem_handle_t *mem_handle) + + /* close MR handle */ + if (na_ofi_mem_handle->fi_mr != NULL) { +- int32_t NA_DEBUG_LOG_USED mr_cnt; ++ int32_t NA_DEBUG_LOG_USED mr_cnt = -1; + const struct iovec NA_DEBUG_LOG_USED *mr_iov = NA_OFI_IOV( + na_ofi_mem_handle->desc.iov, na_ofi_mem_handle->desc.info.iovcnt); + + rc = fi_close(&na_ofi_mem_handle->fi_mr->fid); + NA_CHECK_SUBSYS_ERROR(mem, rc != 0, error, ret, na_ofi_errno_to_na(-rc), + "fi_close() mr_hdl failed, rc: %d (%s)", rc, fi_strerror(-rc)); +- mr_cnt = hg_atomic_decr32(&domain->mr_reg_count); ++#ifdef NA_HAS_DIAG ++ mr_cnt = hg_atomic_decr32(NA_OFI_CLASS(na_class)->counters.mr_count); ++#else ++ (void) na_class; ++#endif + + NA_LOG_SUBSYS_DEBUG(mem, + "Deregistered memory region: mr_iov[0].iov_base=%p, " +@@ -9575,7 +9821,7 @@ na_ofi_poll(na_class_t *na_class, na_context_t *context, unsigned int *count_p) + return 
NA_SUCCESS; + + /* Read from CQ and process events */ +- ret = na_ofi_class->cq_poll(na_ofi_class, na_ofi_context, &count); ++ ret = na_ofi_class->ops.cq_poll(na_ofi_class, na_ofi_context, &count); + NA_CHECK_SUBSYS_NA_ERROR(poll, error, ret, "Could not poll context CQ"); + + /* Attempt to process retries */ +diff --git a/src/util/mercury_dlog.c b/src/util/mercury_dlog.c +index 042a0157..fbdc114f 100644 +--- a/src/util/mercury_dlog.c ++++ b/src/util/mercury_dlog.c +@@ -138,6 +138,40 @@ hg_dlog_mkcount64(struct hg_dlog *d, hg_atomic_int64_t **cptr, const char *name, + hg_thread_mutex_unlock(&d->dlock); + } + ++/*---------------------------------------------------------------------------*/ ++void ++hg_dlog_rmcount32(struct hg_dlog *d, hg_atomic_int32_t *cptr) ++{ ++ struct hg_dlog_dcount32 *dcnt; ++ ++ hg_thread_mutex_lock(&d->dlock); ++ TAILQ_FOREACH (dcnt, &d->cnts32, l) { ++ if (&dcnt->c == cptr) { ++ TAILQ_REMOVE(&d->cnts32, dcnt, l); ++ free(dcnt); ++ break; ++ } ++ } ++ hg_thread_mutex_unlock(&d->dlock); ++} ++ ++/*---------------------------------------------------------------------------*/ ++void ++hg_dlog_rmcount64(struct hg_dlog *d, hg_atomic_int64_t *cptr) ++{ ++ struct hg_dlog_dcount64 *dcnt; ++ ++ hg_thread_mutex_lock(&d->dlock); ++ TAILQ_FOREACH (dcnt, &d->cnts64, l) { ++ if (&dcnt->c == cptr) { ++ TAILQ_REMOVE(&d->cnts64, dcnt, l); ++ free(dcnt); ++ break; ++ } ++ } ++ hg_thread_mutex_unlock(&d->dlock); ++} ++ + /*---------------------------------------------------------------------------*/ + unsigned int + hg_dlog_addlog(struct hg_dlog *d, const char *file, unsigned int line, +@@ -210,8 +244,9 @@ hg_dlog_dump(struct hg_dlog *d, int (*log_func)(FILE *, const char *, ...), + "### (%s) debug log summary\n" + "### ----------------------\n", + (d->dlog_magic + strlen(HG_DLOG_STDMAGIC))); +- if (!TAILQ_EMPTY(&d->cnts32) && !TAILQ_EMPTY(&d->cnts64)) { ++ if (!TAILQ_EMPTY(&d->cnts32) || !TAILQ_EMPTY(&d->cnts64)) { + log_func(stream, "# Counters\n"); ++ + 
TAILQ_FOREACH (dc32, &d->cnts32, l) { + log_func(stream, "# %s: %" PRId32 " [%s]\n", dc32->name, + hg_atomic_get32(&dc32->c), dc32->descr); +diff --git a/src/util/mercury_dlog.h b/src/util/mercury_dlog.h +index 88944b05..43c30a4f 100644 +--- a/src/util/mercury_dlog.h ++++ b/src/util/mercury_dlog.h +@@ -155,6 +155,15 @@ HG_UTIL_PUBLIC void + hg_dlog_mkcount32(struct hg_dlog *d, hg_atomic_int32_t **cptr, const char *name, + const char *descr); + ++/** ++ * remove a 32-bit counter from a dlog. ++ * ++ * \param d [IN] dlog to remove the counter from ++ * \param cptr [IN] pointer to counter to remove ++ */ ++HG_UTIL_PUBLIC void ++hg_dlog_rmcount32(struct hg_dlog *d, hg_atomic_int32_t *cptr); ++ + /** + * make a named atomic64 counter in a dlog and return a pointer to + * it. we use the dlock to ensure a counter under a given name only +@@ -178,6 +187,15 @@ HG_UTIL_PUBLIC void + hg_dlog_mkcount64(struct hg_dlog *d, hg_atomic_int64_t **cptr, const char *name, + const char *descr); + ++/** ++ * remove a 64-bit counter from a dlog. ++ * ++ * \param d [IN] dlog to remove the counter from ++ * \param cptr [IN] pointer to counter to remove ++ */ ++HG_UTIL_PUBLIC void ++hg_dlog_rmcount64(struct hg_dlog *d, hg_atomic_int64_t *cptr); ++ + /** + * attempt to add a log record to a dlog. the id and msg should point + * to static strings that are valid throughout the life of the program +diff --git a/src/util/mercury_log.c b/src/util/mercury_log.c +index 483922bf..2fc13548 100644 +--- a/src/util/mercury_log.c ++++ b/src/util/mercury_log.c +@@ -500,16 +500,9 @@ hg_log_outlet_deregister(struct hg_log_outlet *hg_log_outlet) + + if (hg_log_outlet->debug_log && + !(hg_log_outlet->parent && +- hg_log_outlet->parent->debug_log == hg_log_outlet->debug_log)) { +- if (hg_log_outlet->level >= HG_LOG_LEVEL_MIN_DEBUG) { +- FILE *stream = hg_log_streams_g[hg_log_outlet->level] +- ? 
hg_log_streams_g[hg_log_outlet->level] +- : *hg_log_std_streams_g[hg_log_outlet->level]; +- hg_dlog_dump_counters( +- hg_log_outlet->debug_log, hg_log_func_g, stream, 0); +- } ++ hg_log_outlet->parent->debug_log == hg_log_outlet->debug_log)) + hg_dlog_free(hg_log_outlet->debug_log); +- } ++ + STAILQ_REMOVE(&hg_log_outlets_g, hg_log_outlet, hg_log_outlet, entry); + hg_log_outlet->registered = false; + } +@@ -593,7 +586,8 @@ hg_log_vwrite(struct hg_log_outlet *hg_log_outlet, enum hg_log_level log_level, + no_return ? "" : "\n"); + #endif + +- if (log_level == HG_LOG_LEVEL_ERROR && hg_log_outlet->debug_log && ++ if ((log_level == HG_LOG_LEVEL_ERROR || log_level == HG_LOG_LEVEL_FATAL) && ++ hg_log_outlet->debug_log && + hg_log_outlet->level >= HG_LOG_LEVEL_MIN_DEBUG) { + hg_dlog_dump(hg_log_outlet->debug_log, hg_log_func_g, stream, 0); + hg_dlog_resetlog(hg_log_outlet->debug_log); +diff --git a/src/util/mercury_log.h b/src/util/mercury_log.h +index ceba0c7a..1ed01429 100644 +--- a/src/util/mercury_log.h ++++ b/src/util/mercury_log.h +@@ -308,11 +308,19 @@ + hg_dlog_mkcount32(HG_LOG_OUTLET(name).debug_log, counter_ptr, \ + counter_name, counter_desc) + ++/* HG_LOG_DEL_COUNTER32: delete 32-bit debug log counter */ ++#define HG_LOG_DEL_COUNTER32(name, counter_ptr) \ ++ hg_dlog_rmcount32(HG_LOG_OUTLET(name).debug_log, counter_ptr) ++ + /* HG_LOG_ADD_COUNTER64: add 64-bit debug log counter */ + #define HG_LOG_ADD_COUNTER64(name, counter_ptr, counter_name, counter_desc) \ + hg_dlog_mkcount64(HG_LOG_OUTLET(name).debug_log, counter_ptr, \ + counter_name, counter_desc) + ++/* HG_LOG_DEL_COUNTER64: delete 64-bit debug log counter */ ++#define HG_LOG_DEL_COUNTER64(name, counter_ptr) \ ++ hg_dlog_rmcount64(HG_LOG_OUTLET(name).debug_log, counter_ptr) ++ + /*************************************/ + /* Public Type and Struct Definition */ + /*************************************/ +diff --git a/src/util/version.txt b/src/util/version.txt +index fcdb2e10..ee74734a 100644 +--- 
a/src/util/version.txt ++++ b/src/util/version.txt +@@ -1 +1 @@ +-4.0.0 ++4.1.0 +-- +2.52.0 + diff --git a/deps/patches/mercury/0003_combined_plugin_path.patch b/deps/patches/mercury/0003_combined_plugin_path.patch deleted file mode 100644 index 476598dbc84..00000000000 --- a/deps/patches/mercury/0003_combined_plugin_path.patch +++ /dev/null @@ -1,260 +0,0 @@ -diff --git a/src/na/CMakeLists.txt b/src/na/CMakeLists.txt -index 5c4ec2f..625c07c 100644 ---- a/src/na/CMakeLists.txt -+++ b/src/na/CMakeLists.txt -@@ -82,9 +82,10 @@ if(NA_USE_DYNAMIC_PLUGINS) - if(NOT BUILD_SHARED_LIBS) - message(FATAL_ERROR "Using dynamic plugins requires BUILD_SHARED_LIBS to be ON.") - endif() -+ cmake_path(SET NA_PLUGIN_RELATIVE_PATH ${NA_INSTALL_PLUGIN_DIR}) -+ cmake_path(RELATIVE_PATH NA_PLUGIN_RELATIVE_PATH BASE_DIRECTORY ${NA_INSTALL_LIB_DIR}) -+ message(STATUS "NA plugin install directory: ${NA_INSTALL_PLUGIN_DIR} (relative path to libraries: ${NA_PLUGIN_RELATIVE_PATH})") - set(NA_HAS_DYNAMIC_PLUGINS 1) -- set(NA_DEFAULT_PLUGIN_PATH ${NA_INSTALL_PLUGIN_DIR} CACHE PATH "Default path used to load plugins.") -- mark_as_advanced(NA_DEFAULT_PLUGIN_PATH) - endif() - - # BMI -diff --git a/src/na/na.c b/src/na/na.c -index 781f4c8..b60d305 100644 ---- a/src/na/na.c -+++ b/src/na/na.c -@@ -20,6 +20,9 @@ - # include - # else - # include -+# include -+# include -+# include - # endif - #endif - -@@ -138,6 +141,10 @@ na_plugin_check_protocol(const struct na_class_ops *const class_ops[], - const struct na_class_ops **ops_p); - - #ifdef NA_HAS_DYNAMIC_PLUGINS -+/* Resolve plugin search path */ -+static na_return_t -+na_plugin_resolve_path(const char *offset, char *path, size_t path_size); -+ - /* Scan a given path and return a list of plugins */ - static na_return_t - na_plugin_scan_path(const char *path, struct na_plugin_entry **entries_p); -@@ -265,15 +272,26 @@ static void - na_initialize(void) - { - const char *plugin_path = getenv("NA_PLUGIN_PATH"); -+ char resolved_path[NA_PLUGIN_PATH_MAX]; 
- na_return_t ret; - -- if (plugin_path == NULL) -- plugin_path = NA_DEFAULT_PLUGIN_PATH; -+ if (plugin_path == NULL) { -+ ret = na_plugin_resolve_path( -+ NA_PLUGIN_RELATIVE_PATH, resolved_path, sizeof(resolved_path)); -+ NA_CHECK_SUBSYS_NA_ERROR(cls, done, ret, -+ "Could not resolve plugin path using offset (%s)", -+ NA_PLUGIN_RELATIVE_PATH); -+ plugin_path = resolved_path; -+ } - - ret = na_plugin_scan_path(plugin_path, &na_plugin_dynamic_g); - NA_CHECK_SUBSYS_WARNING(fatal, ret != NA_SUCCESS, -- "No plugin found in path (%s), consider setting NA_PLUGIN_PATH.", -+ "No usable plugin found in path (%s), consider setting NA_PLUGIN_PATH " -+ "if path indicated is not valid.", - plugin_path); -+ -+done: -+ return; - } - - /*---------------------------------------------------------------------------*/ -@@ -472,6 +490,44 @@ error: - /*---------------------------------------------------------------------------*/ - #ifdef NA_HAS_DYNAMIC_PLUGINS - # ifdef _WIN32 -+# define PATH_MAX MAX_PATH -+# define realpath(N, R) _fullpath((R), (N), PATH_MAX) -+# endif -+static na_return_t -+na_plugin_resolve_path(const char *offset, char *path, size_t path_size) -+{ -+ static int placeholder; -+ char libpath[PATH_MAX]; -+ char *slash; -+ na_return_t ret; -+ int rc; -+ -+ rc = hg_dl_get_path(&placeholder, path, path_size); -+ NA_CHECK_SUBSYS_ERROR( -+ cls, rc != 0, error, ret, NA_NOENTRY, "hg_dl_get_path() failed"); -+ -+ NA_CHECK_SUBSYS_ERROR(cls, realpath(path, libpath) == NULL, error, ret, -+ NA_NOENTRY, "realpath() failed, %s", strerror(errno)); -+ -+ slash = strrchr(libpath, '/'); -+ NA_CHECK_SUBSYS_ERROR(cls, slash == NULL, error, ret, NA_INVALID_ARG, -+ "Could not find last '/' in %s", libpath); -+ *slash = '\0'; -+ -+ rc = snprintf(path, path_size, "%s/%s", libpath, offset); -+ NA_CHECK_SUBSYS_ERROR(cls, rc < 0 || rc > (int) path_size, error, ret, -+ NA_OVERFLOW, -+ "snprintf() failed or name truncated, rc: %d (expected %zu)", rc, -+ path_size); -+ -+ return NA_SUCCESS; -+ 
-+error: -+ return ret; -+} -+ -+/*---------------------------------------------------------------------------*/ -+# ifdef _WIN32 - static na_return_t - na_plugin_scan_path(const char *path, struct na_plugin_entry **entries_p) - { -@@ -494,7 +550,7 @@ na_plugin_scan_path(const char *path, struct na_plugin_entry **entries_p) - struct dirent **plugin_list; - struct na_plugin_entry *entries = NULL; - na_return_t ret; -- int n, n_entries = 0; -+ int n, opened_plugins = 0; - - n = scandir(path, &plugin_list, na_plugin_filter, alphasort); - NA_CHECK_SUBSYS_ERROR( -@@ -504,16 +560,20 @@ na_plugin_scan_path(const char *path, struct na_plugin_entry **entries_p) - (struct na_plugin_entry *) calloc((size_t) n + 1, sizeof(*entries)); - NA_CHECK_SUBSYS_ERROR(cls, entries == NULL, error, ret, NA_NOMEM, - "Could not allocate %d plugin entries", n); -- n_entries = n; - - while (n--) { - ret = na_plugin_open(path, plugin_list[n]->d_name, &entries[n]); - free(plugin_list[n]); -- NA_CHECK_SUBSYS_NA_ERROR(cls, error, ret, "Could not open plugin (%s)", -- plugin_list[n]->d_name); -+ if (ret == NA_SUCCESS) -+ opened_plugins++; -+ else -+ NA_CHECK_SUBSYS_NA_ERROR( -+ cls, error, ret, "Could not open plugin (%s)", plugin_list[n]->d_name); - } - - free(plugin_list); -+ NA_CHECK_SUBSYS_ERROR(cls, opened_plugins == 0, error, ret, NA_NOENTRY, -+ "No usable plugin found in path (%s)", path); - - *entries_p = entries; - -@@ -521,19 +581,11 @@ na_plugin_scan_path(const char *path, struct na_plugin_entry **entries_p) - - error: - if (n > 0) { -- if (entries != NULL) { -- int i; -- -- /* close entry */ -- for (i = n + 1; i < n_entries; i++) -- na_plugin_close(&entries[i]); -- free(entries); -- } -- - while (n--) - free(plugin_list[n]); - free(plugin_list); - } -+ free(entries); - - return ret; - } -diff --git a/src/na/na_config.h.in b/src/na/na_config.h.in -index 30d0e08..50dd443 100644 ---- a/src/na/na_config.h.in -+++ b/src/na/na_config.h.in -@@ -80,7 +80,7 @@ - # define NA_PLUGIN - # define 
NA_PLUGIN_VISIBILITY NA_PRIVATE - #endif --#cmakedefine NA_DEFAULT_PLUGIN_PATH "@NA_DEFAULT_PLUGIN_PATH@" -+#cmakedefine NA_PLUGIN_RELATIVE_PATH "@NA_PLUGIN_RELATIVE_PATH@" - - /* Build Options */ - #cmakedefine NA_HAS_DEBUG -diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt -index 44fe6b9..32f51d1 100644 ---- a/src/util/CMakeLists.txt -+++ b/src/util/CMakeLists.txt -@@ -193,6 +193,7 @@ configure_file( - #------------------------------------------------------------------------------ - set(MERCURY_UTIL_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/mercury_atomic_queue.c -+ ${CMAKE_CURRENT_SOURCE_DIR}/mercury_dl.c - ${CMAKE_CURRENT_SOURCE_DIR}/mercury_dlog.c - ${CMAKE_CURRENT_SOURCE_DIR}/mercury_event.c - ${CMAKE_CURRENT_SOURCE_DIR}/mercury_hash_table.c -diff --git a/src/util/mercury_dl.h b/src/util/mercury_dl.h -index b86932a..0cda094 100644 ---- a/src/util/mercury_dl.h -+++ b/src/util/mercury_dl.h -@@ -62,6 +62,18 @@ hg_dl_close(HG_DL_HANDLE handle); - static HG_UTIL_INLINE void * - hg_dl_sym(HG_DL_HANDLE handle, const char *name); - -+/** -+ * Retrieve library path. -+ * -+ * \param addr [IN] address of the symbol -+ * \param path [OUT] buffer to store the path -+ * \param path_size [IN] size of the buffer -+ * -+ * \return Non-negative on success or negative on failure -+ */ -+HG_UTIL_PUBLIC int -+hg_dl_get_path(const void *addr, char *path, size_t path_size); -+ - /*---------------------------------------------------------------------------*/ - static HG_UTIL_INLINE const char * - hg_dl_error(void) -diff --git a/src/util/mercury_dl.c b/src/util/mercury_dl.c -new file mode 100644 -index 0000000..6ed4666 ---- /dev/null -+++ b/src/util/mercury_dl.c -@@ -0,0 +1,35 @@ -+/** -+ * Copyright (c) 2013-2022 UChicago Argonne, LLC and The HDF Group. -+ * Copyright (c) 2022-2023 Intel Corporation. 
-+ * -+ * SPDX-License-Identifier: BSD-3-Clause -+ */ -+ -+#if !defined(_WIN32) && !defined(_GNU_SOURCE) -+# define _GNU_SOURCE -+#endif -+#include "mercury_dl.h" -+ -+#include -+ -+/*---------------------------------------------------------------------------*/ -+int -+hg_dl_get_path(const void *addr, char *path, size_t path_size) -+{ -+#ifdef _WIN32 -+ HMODULE module; -+ if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | -+ GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, -+ (LPCSTR) addr, &module)) -+ return GetModuleFileNameA(module, path, (DWORD) path_size); -+#else -+ Dl_info info; -+ if (dladdr(addr, &info) && info.dli_fname) { -+ strncpy(path, info.dli_fname, path_size); -+ path[path_size - 1] = '\0'; -+ return HG_UTIL_SUCCESS; -+ } -+#endif -+ -+ return HG_UTIL_FAIL; -+} diff --git a/deps/patches/mercury/0003_ofi_auth_key.patch b/deps/patches/mercury/0003_ofi_auth_key.patch new file mode 100644 index 00000000000..480f6518dd9 --- /dev/null +++ b/deps/patches/mercury/0003_ofi_auth_key.patch @@ -0,0 +1,25 @@ +From cd678a20fb21b6e5a5b8f05619427dc79aa1246b Mon Sep 17 00:00:00 2001 +From: Jerome Soumagne +Date: Mon, 26 Jan 2026 12:27:27 -0600 +Subject: [PATCH 3/3] NA OFI: ensure domain auth key remains valid + +--- + src/na/na_ofi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c +index d385d279..68a443e8 100644 +--- a/src/na/na_ofi.c ++++ b/src/na/na_ofi.c +@@ -5220,7 +5220,7 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, + NA_NOMEM, "Could not allocate auth_key"); + memcpy(na_ofi_domain->auth_key, &base_auth_key, sizeof(base_auth_key)); + +- domain_attr->auth_key = (void *) &base_auth_key; ++ domain_attr->auth_key = (void *) na_ofi_domain->auth_key; + domain_attr->auth_key_size = auth_key_size; + } + +-- +2.52.0 + diff --git a/deps/patches/spdk/0001_3428322b812fe31cc3e1d0308a7f5bd4b06b9886.diff b/deps/patches/spdk/0001_3428322b812fe31cc3e1d0308a7f5bd4b06b9886.diff new file mode 
100644 index 00000000000..f427d33d2d8 --- /dev/null +++ b/deps/patches/spdk/0001_3428322b812fe31cc3e1d0308a7f5bd4b06b9886.diff @@ -0,0 +1,51 @@ +diff --git a/module/bdev/aio/bdev_aio.c b/module/bdev/aio/bdev_aio.c +index 075459b1564..b51d6c83a3f 100644 +--- a/module/bdev/aio/bdev_aio.c ++++ b/module/bdev/aio/bdev_aio.c +@@ -64,7 +64,9 @@ struct file_disk { + struct spdk_bdev disk; + char *filename; + int fd; ++#ifdef RWF_NOWAIT + bool use_nowait; ++#endif + TAILQ_ENTRY(file_disk) link; + bool block_size_override; + bool readonly; +@@ -114,7 +116,9 @@ bdev_aio_open(struct file_disk *disk) + { + int fd; + int io_flag = disk->readonly ? O_RDONLY : O_RDWR; ++#ifdef RWF_NOWAIT + struct stat st; ++#endif + + fd = open(disk->filename, io_flag | O_DIRECT); + if (fd < 0) { +@@ -129,11 +133,14 @@ bdev_aio_open(struct file_disk *disk) + } + + disk->fd = fd; ++ ++#ifdef RWF_NOWAIT + /* Some aio operations can block, for example if number outstanding + * I/O exceeds number of block layer tags. But not all files can + * support RWF_NOWAIT flag. So use RWF_NOWAIT on block devices only. 
+ */ + disk->use_nowait = fstat(fd, &st) == 0 && S_ISBLK(st.st_mode); ++#endif + + return 0; + } +@@ -205,9 +212,11 @@ bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, + io_set_eventfd(iocb, aio_ch->group_ch->efd); + } + iocb->data = aio_task; ++#ifdef RWF_NOWAIT + if (fdisk->use_nowait) { + iocb->aio_rw_flags = RWF_NOWAIT; + } ++#endif + aio_task->len = nbytes; + aio_task->ch = aio_ch; + diff --git a/deps/patches/spdk/0001_b0aba3fcd5aceceea530a702922153bc75664978.diff b/deps/patches/spdk/0001_b0aba3fcd5aceceea530a702922153bc75664978.diff deleted file mode 100644 index 9186e715e2b..00000000000 --- a/deps/patches/spdk/0001_b0aba3fcd5aceceea530a702922153bc75664978.diff +++ /dev/null @@ -1,61 +0,0 @@ -diff --git a/scripts/setup.sh b/scripts/setup.sh -index d0c09430a6f..a56c74dd686 100755 ---- a/scripts/setup.sh -+++ b/scripts/setup.sh -@@ -141,6 +141,10 @@ function linux_bind_driver() { - - pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" - -+ if [[ $driver_name == "none" ]]; then -+ return 0 -+ fi -+ - echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true - echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true - -@@ -248,6 +252,17 @@ function collect_devices() { - if [[ $PCI_ALLOWED != *"$bdf"* ]]; then - pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" - in_use=1 -+ elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then -+ if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then -+ if [ "$mode" == "config" ]; then -+ cat <<- MESSAGE -+ Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint -+ which are attached to the kernel NVMe driver,the binding process may go faster -+ if you first run this script with DRIVER_OVERRIDE="none" to unbind only the -+ NVMe SSDs, and then run again to unbind the VMD devices." 
-+ MESSAGE -+ fi -+ fi - fi - fi - fi -@@ -305,7 +320,9 @@ function configure_linux_pci() { - fi - fi - -- if [[ -n "${DRIVER_OVERRIDE}" ]]; then -+ if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then -+ driver_name=none -+ elif [[ -n "${DRIVER_OVERRIDE}" ]]; then - driver_path="$DRIVER_OVERRIDE" - driver_name="${DRIVER_OVERRIDE##*/}" - # modprobe and the sysfs don't use the .ko suffix. -@@ -337,10 +354,12 @@ function configure_linux_pci() { - fi - - # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod -- if [[ -n "$driver_path" ]]; then -- insmod $driver_path || true -- else -- modprobe $driver_name -+ if [[ $driver_name != "none" ]]; then -+ if [[ -n "$driver_path" ]]; then -+ insmod $driver_path || true -+ else -+ modprobe $driver_name -+ fi - fi - - for bdf in "${!all_devices_d[@]}"; do diff --git a/deps/patches/spdk/0002_445a4c808badbad3942696ecf16fa60e8129a747.diff b/deps/patches/spdk/0002_445a4c808badbad3942696ecf16fa60e8129a747.diff deleted file mode 100644 index 11bd483eb89..00000000000 --- a/deps/patches/spdk/0002_445a4c808badbad3942696ecf16fa60e8129a747.diff +++ /dev/null @@ -1,70 +0,0 @@ -diff --git a/CONFIG b/CONFIG -index 5f552fe81df..481643dcc3b 100644 ---- a/CONFIG -+++ b/CONFIG -@@ -195,3 +195,6 @@ CONFIG_USDT=n - # Build with IDXD kernel support. - # In this mode, SPDK shares the DSA device with the kernel. 
- CONFIG_IDXD_KERNEL=n -+ -+# arc4random is available in stdlib.h -+CONFIG_HAVE_ARC4RANDOM=n -diff --git a/configure b/configure -index a18f34a004d..688d72bfbf6 100755 ---- a/configure -+++ b/configure -@@ -850,6 +850,11 @@ if [[ "${CONFIG[TSAN]}" = "y" ]]; then - fi - fi - -+if echo -e '#include \nint main(void) { arc4random(); return 0; }\n' \ -+ | "${BUILD_CMD[@]}" - 2> /dev/null; then -+ CONFIG[HAVE_ARC4RANDOM]="y" -+fi -+ - if [[ "${CONFIG[OCF]}" = "y" ]]; then - # If OCF_PATH is a file, assume it is a library and use it to compile with - if [ -f ${CONFIG[OCF_PATH]} ]; then -diff --git a/lib/iscsi/iscsi.c b/lib/iscsi/iscsi.c -index 00b1d62e26b..3c403b972f3 100644 ---- a/lib/iscsi/iscsi.c -+++ b/lib/iscsi/iscsi.c -@@ -62,7 +62,6 @@ - - #ifdef __FreeBSD__ - #define HAVE_SRANDOMDEV 1 --#define HAVE_ARC4RANDOM 1 - #endif - - struct spdk_iscsi_globals g_iscsi = { -@@ -97,7 +96,7 @@ srandomdev(void) - } - #endif /* HAVE_SRANDOMDEV */ - --#ifndef HAVE_ARC4RANDOM -+#ifndef SPDK_CONFIG_HAVE_ARC4RANDOM - static int g_arc4random_initialized = 0; - - static uint32_t -@@ -115,7 +114,7 @@ arc4random(void) - r = (r1 << 16) | r2; - return r; - } --#endif /* HAVE_ARC4RANDOM */ -+#endif /* SPDK_CONFIG_HAVE_ARC4RANDOM */ - - static void - gen_random(uint8_t *buf, size_t len) -diff --git a/scripts/check_format.sh b/scripts/check_format.sh -index 1dbc25d205e..e2e47131537 100755 ---- a/scripts/check_format.sh -+++ b/scripts/check_format.sh -@@ -270,7 +270,7 @@ function check_posix_includes() { - local rc=0 - - echo -n "Checking for POSIX includes..." 
-- git grep -I -i -f scripts/posix.txt -- './*' ':!include/spdk/stdinc.h' ':!include/linux/**' ':!lib/rte_vhost*/**' ':!scripts/posix.txt' ':!*.patch' > scripts/posix.log || true -+ git grep -I -i -f scripts/posix.txt -- './*' ':!include/spdk/stdinc.h' ':!include/linux/**' ':!lib/rte_vhost*/**' ':!scripts/posix.txt' ':!*.patch' ':!configure' > scripts/posix.log || true - if [ -s scripts/posix.log ]; then - echo "POSIX includes detected. Please include spdk/stdinc.h instead." - cat scripts/posix.log diff --git a/deps/patches/spdk/0002_spdk_rwf_nowait.patch b/deps/patches/spdk/0002_spdk_rwf_nowait.patch new file mode 100644 index 00000000000..e65bb55e32b --- /dev/null +++ b/deps/patches/spdk/0002_spdk_rwf_nowait.patch @@ -0,0 +1,78 @@ +diff --git a/CONFIG b/CONFIG +index 89c34e90b..02ce04692 100644 +--- a/CONFIG ++++ b/CONFIG +@@ -256,3 +256,6 @@ CONFIG_COPY_FILE_RANGE=n + + # liblz4 is available + CONFIG_HAVE_LZ4=n ++ ++# aio_rw_flags are enabled ++CONFIG_HAVE_AIO_RW_FLAGS=n +diff --git a/configure b/configure +index 26c9b0f4d..d8daedc37 100755 +--- a/configure ++++ b/configure +@@ -860,6 +860,22 @@ if [[ $sys_name != "Linux" ]]; then + fi + fi + ++if echo -e '#include \n' \ ++ '#include \n' \ ++ '#include \n' \ ++ '#ifndef RWF_NOWAIT\n' \ ++ '#error "No RWF_NOWAIT is defined"\n' \ ++ '#endif\n' \ ++ 'int main(int argc, char **argv) {\n' \ ++ 'return offsetof(struct iocb, aio_rw_flags);\n}\n' \ ++ | "${BUILD_CMD[@]}" -c - ; then ++ echo HAVE_AIO_RW_FLAGS=YES ++ CONFIG[HAVE_AIO_RW_FLAGS]="y" ++else ++ echo HAVE_AIO_RW_FLAGS=NO ++ CONFIG[HAVE_AIO_RW_FLAGS]="n" ++fi ++ + if [ "${CONFIG[RDMA]}" = "y" ]; then + if [[ ! "${CONFIG[RDMA_PROV]}" == "verbs" ]] && [[ ! 
"${CONFIG[RDMA_PROV]}" == "mlx5_dv" ]]; then + echo "Invalid RDMA provider specified, must be \"verbs\" or \"mlx5_dv\"" +diff --git a/module/bdev/aio/bdev_aio.c b/module/bdev/aio/bdev_aio.c +index b51d6c83a..01914fb9d 100644 +--- a/module/bdev/aio/bdev_aio.c ++++ b/module/bdev/aio/bdev_aio.c +@@ -64,7 +64,7 @@ struct file_disk { + struct spdk_bdev disk; + char *filename; + int fd; +-#ifdef RWF_NOWAIT ++#ifdef SPDK_CONFIG_HAVE_AIO_RW_FLAGS + bool use_nowait; + #endif + TAILQ_ENTRY(file_disk) link; +@@ -116,7 +116,7 @@ bdev_aio_open(struct file_disk *disk) + { + int fd; + int io_flag = disk->readonly ? O_RDONLY : O_RDWR; +-#ifdef RWF_NOWAIT ++#ifdef SPDK_CONFIG_HAVE_AIO_RW_FLAGS + struct stat st; + #endif + +@@ -134,7 +134,7 @@ bdev_aio_open(struct file_disk *disk) + + disk->fd = fd; + +-#ifdef RWF_NOWAIT ++#ifdef SPDK_CONFIG_HAVE_AIO_RW_FLAGS + /* Some aio operations can block, for example if number outstanding + * I/O exceeds number of block layer tags. But not all files can + * support RWF_NOWAIT flag. So use RWF_NOWAIT on block devices only. 
+@@ -212,7 +212,7 @@ bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, + io_set_eventfd(iocb, aio_ch->group_ch->efd); + } + iocb->data = aio_task; +-#ifdef RWF_NOWAIT ++#ifdef SPDK_CONFIG_HAVE_AIO_RW_FLAGS + if (fdisk->use_nowait) { + iocb->aio_rw_flags = RWF_NOWAIT; + } diff --git a/deps/patches/spdk/0003_external_isal.patch b/deps/patches/spdk/0003_external_isal.patch new file mode 100644 index 00000000000..3a4a9be7824 --- /dev/null +++ b/deps/patches/spdk/0003_external_isal.patch @@ -0,0 +1,362 @@ +diff --git a/CONFIG b/CONFIG +index 89c34e90b..086db27a4 100644 +--- a/CONFIG ++++ b/CONFIG +@@ -170,9 +170,11 @@ CONFIG_CUSTOMOCF=n + + # Build ISA-L library + CONFIG_ISAL=y ++CONFIG_ISAL_PATH= + + # Build ISA-L-crypto library + CONFIG_ISAL_CRYPTO=y ++CONFIG_ISAL_CRYPTO_PATH= + + # Build with IO_URING support + CONFIG_URING=n +diff --git a/Makefile b/Makefile +index 3aeae41ad..5e249aaa3 100644 +--- a/Makefile ++++ b/Makefile +@@ -18,8 +18,16 @@ DIRS-$(CONFIG_EXAMPLES) += examples + DIRS-$(CONFIG_APPS) += app + DIRS-y += test + DIRS-$(CONFIG_IPSEC_MB) += ipsecbuild ++ifeq ($(CONFIG_ISAL),y) ++ifeq ($(CONFIG_ISAL_PATH),) + DIRS-$(CONFIG_ISAL) += isalbuild ++endif ++endif ++ifeq ($(CONFIG_ISAL_CRYPTO),y) ++ifeq ($(CONFIG_ISAL_CRYPTO_PATH),) + DIRS-$(CONFIG_ISAL_CRYPTO) += isalcryptobuild ++endif ++endif + DIRS-$(CONFIG_VFIO_USER) += vfiouserbuild + DIRS-$(CONFIG_SMA) += proto + DIRS-$(CONFIG_XNVME) += xnvmebuild +@@ -63,14 +71,18 @@ DPDK_DEPS += ipsecbuild + endif + + ifeq ($(CONFIG_ISAL),y) ++ifeq ($(CONFIG_ISAL_PATH),) + ISALBUILD = isalbuild + LIB += isalbuild + DPDK_DEPS += isalbuild + ifeq ($(CONFIG_ISAL_CRYPTO),y) ++ifeq ($(CONFIG_ISAL_CRYPTO_PATH),) + ISALCRYPTOBUILD = isalcryptobuild + LIB += isalcryptobuild + endif + endif ++endif ++endif + + ifeq ($(CONFIG_VFIO_USER),y) + VFIOUSERBUILD = vfiouserbuild +diff --git a/configure b/configure +index 26c9b0f4d..8ef548fa8 100755 +--- a/configure ++++ b/configure +@@ -62,6 +62,8 @@ function 
usage() { + echo " --without-idxd Disabled while experimental. Only built for x86 when enabled." + echo " --with-crypto Build isa-l-crypto and vbdev crypto module. No path required." + echo " --without-crypto Disable isa-l-crypto and vbdev crypto module." ++ echo " --with-isal[=DIR] Don't build isal, use external library" ++ echo " --with-isal-crypto[=DIR] Don't build isal-crypto, use external library" + echo " --with-fio[=DIR] Build fio_plugin." + echo " --without-fio default: /usr/src/fio" + echo " --with-xnvme Build xNVMe bdev module." +@@ -581,6 +583,26 @@ for i in "$@"; do + --without-fio) + CONFIG[FIO_PLUGIN]=n + ;; ++ --with-isal) ;& ++ --with-isal=*) ++ # if specified, set the default so we don't build it ++ CONFIG[ISAL_PATH]="/usr" ++ if [[ -n ${i#*=} ]] && [[ ${i#*=} != "$i" ]]; then ++ CONFIG[ISAL_PATH]=${i#*=} ++ fi ++ check_dir "--with-isal=${CONFIG[ISAL_PATH]}" ++ CONFIG[ISAL]=y ++ ;; ++ --with-isal-crypto) ;& ++ --with-isal-crypto=*) ++ # if specified, set the default so we don't build it ++ CONFIG[ISAL_CRYPTO_PATH]="/usr" ++ if [[ -n ${i#*=} ]] && [[ ${i#*=} != "$i" ]]; then ++ CONFIG[ISAL_CRYPTO_PATH]=${i#*=} ++ fi ++ check_dir "--with-isal-crypto=${CONFIG[ISAL_CRYPTO_PATH]}" ++ CONFIG[ISAL_CRYPTO]=y ++ ;; + --with-vtune=*) + check_dir "$i" + CONFIG[VTUNE_DIR]="${i#*=}" +@@ -1228,7 +1250,10 @@ if [[ "${CONFIG[FUZZER]}" = "y" && "$CC_TYPE" != "clang" ]]; then + exit 1 + fi + +-if [[ $arch == x86_64* ]] || [[ $arch == aarch64* ]]; then ++if [[ -d "${CONFIG[ISAL_PATH]}" ]]; then ++ echo "Using ISA-L from ${CONFIG[ISAL_PATH]}" ++ CONFIG[ISAL]=y ++elif [[ $arch == x86_64* ]] || [[ $arch == aarch64* ]]; then + CONFIG[ISAL]=y + # make sure the submodule is initialized + if [ ! 
-f "$rootdir"/isa-l/autogen.sh ]; then +@@ -1266,35 +1291,40 @@ else + fi + + # now either configure ISA-L or disable unavailable features +-if [[ "${CONFIG[ISAL]}" = "y" ]]; then +- cd $rootdir/isa-l +- ISAL_LOG=$rootdir/.spdk-isal.log +- if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then +- ISAL_OPTS=("--host=${CONFIG[CROSS_PREFIX]}") +- else +- ISAL_OPTS=() +- fi +- if [[ "${CONFIG[SHARED]}" = "y" ]]; then +- ISAL_OPTS+=("--enable-shared=yes") ++if [[ ! -d "${CONFIG[ISAL_PATH]}" ]]; then ++ if [[ "${CONFIG[ISAL]}" = "y" ]]; then ++ cd $rootdir/isa-l ++ ISAL_LOG=$rootdir/.spdk-isal.log ++ if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then ++ ISAL_OPTS=("--host=${CONFIG[CROSS_PREFIX]}") ++ else ++ ISAL_OPTS=() ++ fi ++ if [[ "${CONFIG[SHARED]}" = "y" ]]; then ++ ISAL_OPTS+=("--enable-shared=yes") ++ else ++ ISAL_OPTS+=("--enable-shared=no") ++ fi ++ ISAL_OPTS+=("--prefix=${CONFIG[PREFIX]}") ++ echo -n "Configuring ISA-L (logfile: $ISAL_LOG)..." ++ ./autogen.sh &> $ISAL_LOG ++ ./configure CFLAGS="-fPIC -g -O2 -fuse-ld=$LD_TYPE -Wno-unused-command-line-argument" "${ISAL_OPTS[@]}" --enable-shared=no >> $ISAL_LOG 2>&1 ++ echo "done." ++ cd $rootdir + else +- ISAL_OPTS+=("--enable-shared=no") ++ echo "Without ISA-L, there is no software support for crypto or compression," ++ echo "so these features will be disabled." ++ CONFIG[CRYPTO]=n ++ CONFIG[VBDEV_COMPRESS]=n ++ CONFIG[DPDK_COMPRESSDEV]=n + fi +- ISAL_OPTS+=("--prefix=${CONFIG[PREFIX]}") +- echo -n "Configuring ISA-L (logfile: $ISAL_LOG)..." +- ./autogen.sh &> $ISAL_LOG +- ./configure CFLAGS="-fPIC -g -O2 -fuse-ld=$LD_TYPE -Wno-unused-command-line-argument" "${ISAL_OPTS[@]}" --enable-shared=no >> $ISAL_LOG 2>&1 +- echo "done." +- cd $rootdir +-else +- echo "Without ISA-L, there is no software support for crypto or compression," +- echo "so these features will be disabled." 
+- CONFIG[CRYPTO]=n +- CONFIG[VBDEV_COMPRESS]=n +- CONFIG[DPDK_COMPRESSDEV]=n + fi + + # ISA-L-crypto complements ISA-L functionality, it is only enabled together with ISA-L +-if [[ "${CONFIG[ISAL]}" = "y" ]]; then ++if [[ -d "${CONFIG[ISAL_CRYPTO_PATH]}" ]]; then ++ echo "Using isa-l_crypto from ${CONFIG[ISAL_CRYPTO_PATH]}" ++ CONFIG[ISAL_CRYPTO]=y ++elif [[ "${CONFIG[ISAL]}" = "y" ]]; then + if [ ! -f "$rootdir"/isa-l-crypto/autogen.sh ]; then + echo "ISA-L-crypto is required but was not found, please init the submodule with:" + echo " git submodule update --init" +diff --git a/dpdkbuild/Makefile b/dpdkbuild/Makefile +index 64da6cc32..a88c8a6ec 100644 +--- a/dpdkbuild/Makefile ++++ b/dpdkbuild/Makefile +@@ -108,8 +108,8 @@ DPDK_DRIVERS += compress compress/isal + ifeq ($(CONFIG_VBDEV_COMPRESS_MLX5),y) + DPDK_DRIVERS += compress/mlx5 + endif +-DPDK_CFLAGS += -I$(ISAL_DIR) -I$(ISAL_BUILD_DIR) +-DPDK_LDFLAGS += -L$(ISAL_DIR)/.libs -lisal ++DPDK_CFLAGS += -I$(ISAL_DIR) -I$(ISAL_DIR)/include -I$(ISAL_BUILD_DIR) ++DPDK_LDFLAGS += -L$(ISAL_DIR)/.libs -L$(ISAL_DIR)/lib64 -lisal + endif + + DPDK_ENABLED_DRIVERS = $(shell echo $(DPDK_DRIVERS) | sed -E "s/ +/,/g") +diff --git a/lib/accel/Makefile b/lib/accel/Makefile +index 0d4cb1239..840a031a1 100644 +--- a/lib/accel/Makefile ++++ b/lib/accel/Makefile +@@ -18,6 +18,8 @@ ifeq ($(CONFIG_HAVE_LZ4),y) + LOCAL_SYS_LIBS += -llz4 + endif + ++LOCAL_SYS_LIBS += -L$(ISAL_CRYPTO_DIR)/lib64 -lisal_crypto ++ + SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_accel.map) + + include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/lib/accel/accel_sw.c b/lib/accel/accel_sw.c +index d7e2dfff9..03b1dcb4c 100644 +--- a/lib/accel/accel_sw.c ++++ b/lib/accel/accel_sw.c +@@ -24,12 +24,21 @@ + #endif + + #ifdef SPDK_CONFIG_ISAL ++#ifdef SPDK_CONFIG_ISAL_PATH ++#include ++#else + #include "../isa-l/include/igzip_lib.h" ++#endif + #ifdef SPDK_CONFIG_ISAL_CRYPTO ++#ifdef SPDK_CONFIG_ISAL_CRYPTO_PATH ++#include "isa-l-crypto/aes_xts.h" ++#include 
"isa-l-crypto/isal_crypto_api.h" ++#else + #include "../isa-l-crypto/include/aes_xts.h" + #include "../isa-l-crypto/include/isal_crypto_api.h" + #endif + #endif ++#endif + + /* Per the AES-XTS spec, the size of data unit cannot be bigger than 2^20 blocks, 128b each block */ + #define ACCEL_AES_XTS_MAX_BLOCK_SIZE (1 << 24) +diff --git a/lib/env_dpdk/env.mk b/lib/env_dpdk/env.mk +index f71de7f48..a45a019df 100644 +--- a/lib/env_dpdk/env.mk ++++ b/lib/env_dpdk/env.mk +@@ -171,7 +171,7 @@ endif + endif + + ifeq ($(CONFIG_VBDEV_COMPRESS),y) +-DPDK_PRIVATE_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs ++DPDK_PRIVATE_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs -L$(ISAL_DIR)/lib64 + ifeq ($(CONFIG_VBDEV_COMPRESS_MLX5),y) + DPDK_PRIVATE_LINKER_ARGS += -lmlx5 -libverbs + endif +diff --git a/lib/util/Makefile b/lib/util/Makefile +index e9daa2623..c2fa28734 100644 +--- a/lib/util/Makefile ++++ b/lib/util/Makefile +@@ -22,6 +22,8 @@ ifeq ($(CONFIG_HAVE_UUID_GENERATE_SHA1), n) + LOCAL_SYS_LIBS += -lssl + endif + ++LOCAL_SYS_LIBS += -L$(ISAL_DIR)/lib64 -lisal ++ + CFLAGS += -Wpointer-arith + + SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_util.map) +diff --git a/lib/util/crc16.c b/lib/util/crc16.c +index f085a2851..a5e6937ca 100644 +--- a/lib/util/crc16.c ++++ b/lib/util/crc16.c +@@ -11,7 +11,12 @@ + */ + + #ifdef SPDK_CONFIG_ISAL ++#ifdef SPDK_CONFIG_ISAL_PATH ++#include ++#else + #include "isa-l/include/crc.h" ++#endif ++ + + uint16_t + spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len) +diff --git a/lib/util/crc64.c b/lib/util/crc64.c +index b1a37af35..31bd7bd3c 100644 +--- a/lib/util/crc64.c ++++ b/lib/util/crc64.c +@@ -7,7 +7,11 @@ + #include "spdk/crc64.h" + + #ifdef SPDK_CONFIG_ISAL ++#ifdef SPDK_CONFIG_ISAL_PATH ++#include ++#else + #include "isa-l/include/crc64.h" ++#endif + + uint64_t + spdk_crc64_nvme(const void *buf, size_t len, uint64_t crc) +diff --git a/lib/util/crc_internal.h b/lib/util/crc_internal.h +index b432d0d7b..f9979249f 100644 +--- 
a/lib/util/crc_internal.h ++++ b/lib/util/crc_internal.h +@@ -10,7 +10,11 @@ + + #ifdef SPDK_CONFIG_ISAL + #define SPDK_HAVE_ISAL +-#include ++#ifdef SPDK_CONFIG_ISAL_PATH ++#include ++#else ++#include "isa-l/include/crc.h" ++#endif + #elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + #define SPDK_HAVE_ARM_CRC + #include +diff --git a/lib/util/xor.c b/lib/util/xor.c +index 07eca5f50..2b15aea3b 100644 +--- a/lib/util/xor.c ++++ b/lib/util/xor.c +@@ -85,7 +85,11 @@ xor_gen_basic(void *dest, void **sources, uint32_t n, uint32_t len) + } + + #ifdef SPDK_CONFIG_ISAL ++#ifdef SPDK_CONFIG_ISAL_PATH ++#include ++#else + #include "isa-l/include/raid.h" ++#endif + + #define SPDK_XOR_BUF_ALIGN 32 + +diff --git a/mk/spdk.common.mk b/mk/spdk.common.mk +index 19f0192c2..65ef68f50 100644 +--- a/mk/spdk.common.mk ++++ b/mk/spdk.common.mk +@@ -179,23 +179,31 @@ endif + + IPSEC_MB_DIR=$(CONFIG_IPSEC_MB_DIR) + ++ifeq ($(CONFIG_ISAL_PATH),) + ISAL_DIR=$(SPDK_ROOT_DIR)/isa-l ++else ++ISAL_DIR=$(CONFIG_ISAL_PATH) ++endif ++ifeq ($(CONFIG_ISAL_CRYPTO_PATH),) + ISAL_CRYPTO_DIR=$(SPDK_ROOT_DIR)/isa-l-crypto ++else ++ISAL_CRYPTO_DIR=$(CONFIG_ISAL_CRYPTO_PATH) ++endif + ISAL_BUILD_DIR=$(SPDK_ROOT_DIR)/isalbuild + ISAL_CRYPTO_BUILD_DIR=$(SPDK_ROOT_DIR)/isalcryptobuild +-ifeq ($(CONFIG_ISAL), y) +-COMMON_CFLAGS += -I$(ISAL_DIR)/.. -I$(ISAL_BUILD_DIR) ++ifeq ($(CONFIG_ISAL),y) ++COMMON_CFLAGS += -I$(ISAL_DIR)/.. -I$(ISAL_DIR)/include -I$(ISAL_BUILD_DIR) + ifeq ($(CONFIG_SHARED),y) +-SYS_LIBS += -L$(ISAL_DIR)/.libs -lisal +-LDFLAGS += -Wl,-rpath=$(ISAL_DIR)/.libs ++SYS_LIBS += -L$(ISAL_DIR)/.libs -L$(ISAL_DIR)/lib64 -lisal ++LDFLAGS += -Wl,-rpath=$(ISAL_DIR)/.lib -Wl,-rpath=$(ISAL_DIR)/lib64 + else + SYS_LIBS += $(ISAL_DIR)/.libs/libisal.a + endif +-ifeq ($(CONFIG_ISAL_CRYPTO), y) ++ifeq ($(CONFIG_ISAL_CRYPTO),y) + COMMON_CFLAGS += -I$(ISAL_CRYPTO_DIR)/.. 
-I$(ISAL_CRYPTO_BUILD_DIR) + ifeq ($(CONFIG_SHARED),y) +-SYS_LIBS += -L$(ISAL_CRYPTO_DIR)/.libs -lisal_crypto +-LDFLAGS += -Wl,-rpath=$(ISAL_CRYPTO_DIR)/.libs ++SYS_LIBS += -L$(ISAL_CRYPTO_DIR)/.libs -L$(ISAL_CRYPTO_DIR)/lib64 -lisal_crypto ++LDFLAGS += -Wl,-rpath=$(ISAL_CRYPTO_DIR)/.libs -Wl,-rpath=$(ISAL_CRYPTO_DIR)/lib64 + else + SYS_LIBS += $(ISAL_CRYPTO_DIR)/.libs/libisal_crypto.a + endif diff --git a/docs/QSG/setup_rhel.md b/docs/QSG/setup_rhel.md index 8ce61c12b6f..053cb1c0cd3 100644 --- a/docs/QSG/setup_rhel.md +++ b/docs/QSG/setup_rhel.md @@ -377,8 +377,8 @@ Examples are available on [github](https://github.com/daos-stack/daos/tree/maste pdsh -S -w $SERVER_NODES "sudo systemctl status daos_server" # if you see following format messages (depending on number of servers), proceed to storage format - server-1: server-1.test.hpdd.intel.com INFO 2023/04/11 23:14:06 SCM format required on instance 1 - server-1: server-1.test.hpdd.intel.com INFO 2023/04/11 23:14:06 SCM format required on instance 0 + server-1: server-1.test.example.com INFO 2023/04/11 23:14:06 SCM format required on instance 1 + server-1: server-1.test.example.com INFO 2023/04/11 23:14:06 SCM format required on instance 0 # format storage dmg storage format -l $SERVER_NODES # can use --force if needed @@ -391,10 +391,10 @@ Examples are available on [github](https://github.com/daos-stack/daos/tree/maste # all the server ranks should show 'Joined' STATE Rank UUID Control Address Fault Domain State Reason ---- ---- --------------- ------------ ----- ------ - 0 604c4ffa-563a-49dc-b702-3c87293dbcf3 10.8.1.179:10001 /server-1.test.hpdd.intel.com Joined - 1 f0791f98-4379-4ace-a083-6ca3ffa65756 10.8.1.179:10001 /server-1.test.hpdd.intel.com Joined - 2 745d2a5b-46dd-42c5-b90a-d2e46e178b3e 10.8.1.189:10001 /server-2.test.hpdd.intel.com Joined - 3 ba6a7800-3952-46ce-af92-bba9daa35048 10.8.1.189:10001 /server-2.test.hpdd.intel.com Joined + 0 604c4ffa-563a-49dc-b702-3c87293dbcf3 10.8.1.179:10001 
/server-1.test.example.com Joined + 1 f0791f98-4379-4ace-a083-6ca3ffa65756 10.8.1.179:10001 /server-1.test.example.com Joined + 2 745d2a5b-46dd-42c5-b90a-d2e46e178b3e 10.8.1.189:10001 /server-2.test.example.com Joined + 3 ba6a7800-3952-46ce-af92-bba9daa35048 10.8.1.189:10001 /server-2.test.example.com Joined ## Start the DAOS Agents diff --git a/docs/QSG/setup_suse.md b/docs/QSG/setup_suse.md index ddd964efe43..b1a29dd263d 100644 --- a/docs/QSG/setup_suse.md +++ b/docs/QSG/setup_suse.md @@ -394,7 +394,7 @@ Examples are available on [github](https://github.com/daos-stack/daos/tree/maste pdsh -S -w $SERVER_NODES "sudo systemctl status daos_server" # if you see following format messages (depending on number of servers), proceed to storage format - node-4: node-1.test.hpdd.intel.com INFO 2023/04/11 23:14:06 SCM format required on instance 0 + node-4: node-1.test.example.com INFO 2023/04/11 23:14:06 SCM format required on instance 0 # format storage dmg storage format -l $SERVER_NODES # can use --force if needed @@ -407,10 +407,10 @@ Examples are available on [github](https://github.com/daos-stack/daos/tree/maste # all the server ranks should show 'Joined' STATE Rank UUID Control Address Fault Domain State Reason ---- ---- --------------- ------------ ----- ------ - 0 604c4ffa-563a-49dc-b702-3c87293dbcf3 10.8.1.179:10001 /node-4.test.hpdd.intel.com Joined - 1 f0791f98-4379-4ace-a083-6ca3ffa65756 10.8.1.179:10001 /node-4.test.hpdd.intel.com Joined - 2 745d2a5b-46dd-42c5-b90a-d2e46e178b3e 10.8.1.189:10001 /node-5.test.hpdd.intel.com Joined - 3 ba6a7800-3952-46ce-af92-bba9daa35048 10.8.1.189:10001 /node-5.test.hpdd.intel.com Joined + 0 604c4ffa-563a-49dc-b702-3c87293dbcf3 10.8.1.179:10001 /node-4.test.example.com Joined + 1 f0791f98-4379-4ace-a083-6ca3ffa65756 10.8.1.179:10001 /node-4.test.example.com Joined + 2 745d2a5b-46dd-42c5-b90a-d2e46e178b3e 10.8.1.189:10001 /node-5.test.example.com Joined + 3 ba6a7800-3952-46ce-af92-bba9daa35048 10.8.1.189:10001 
/node-5.test.example.com Joined ## Start the DAOS Agents diff --git a/docs/QSG/tour.md b/docs/QSG/tour.md index 78dd290cc03..130b89b5650 100644 --- a/docs/QSG/tour.md +++ b/docs/QSG/tour.md @@ -153,7 +153,7 @@ bring-up DAOS servers and clients. IOR-3.4.0+dev: MPI Coordinated Test of Parallel I/O Began : Fri Apr 16 18:07:56 2021 Command line : ior -a POSIX -b 26214400 -v -w -k -i 1 -o /tmp/daos_test1/testfile -t 25M - Machine : Linux boro-8.boro.hpdd.intel.com + Machine : Linux boro-8.boro.example.com Start time skew across all tasks: 0.00 sec TestID : 0 StartTime : Fri Apr 16 18:07:56 2021 @@ -358,19 +358,19 @@ bring-up DAOS servers and clients. $ dmg system query -v Rank UUID Control Address Fault Domain State Reason ---- --------------- ------------ ----- ------ - 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined - 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.hpdd.intel.com Joined - 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Joined - 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Evicted system stop + 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.example.com Joined + 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.example.com Joined + 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.example.com Joined + 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.example.com Evicted system stop # Restart, after evicted server restarted, verify the server joined $ /usr/bin/dmg system query -v Rank UUID Control Address Fault Domain State Reason ---- --------------- ------------ ----- ------ - 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined - 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined - 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined - 3 
a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com Joined + 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.example.com Joined + 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.example.com Joined + 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.example.com Joined + 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.example.com Joined # Unmount after test completed $ fusermount -u /tmp/daos_test1/ @@ -415,19 +415,19 @@ bring-up DAOS servers and clients. $ dmg system query -v Rank UUID Control Address Fault Domain State Reason ---- --------------- ------------ ----- ------ - 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined - 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.hpdd.intel.com Joined - 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Evicted system stop - 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Joined + 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.example.com Joined + 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.example.com Joined + 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.example.com Evicted system stop + 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.example.com Joined # Restart, after evicted server restarted, verify the server joined $ /usr/bin/dmg system query -v Rank UUID Control Address Fault Domain State Reason ---- --------------- ------------ ----- ------ - 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined - 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined - 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined - 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com 
Joined + 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.example.com Joined + 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.example.com Joined + 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.example.com Joined + 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.example.com Joined ## Clean-Up diff --git a/docs/admin/administration.md b/docs/admin/administration.md index 61fa6ba4179..ca61456d7be 100644 --- a/docs/admin/administration.md +++ b/docs/admin/administration.md @@ -34,7 +34,7 @@ Below is an example of a RAS event signaling an exclusion of an unresponsive engine: ``` -&&& RAS EVENT id: [swim_rank_dead] ts: [2021-11-21T13:32:31.747408+0000] host: [wolf-112.wolf.hpdd.intel.com] type: [STATE_CHANGE] sev: [NOTICE] msg: [SWIM marked rank as dead.] pid: [253454] tid: [1] rank: [6] inc: [63a058833280000] +&&& RAS EVENT id: [swim_rank_dead] ts: [2021-11-21T13:32:31.747408+0000] host: [wolf-112.wolf.example.com] type: [STATE_CHANGE] sev: [NOTICE] msg: [SWIM marked rank as dead.] 
pid: [253454] tid: [1] rank: [6] inc: [63a058833280000] ``` ### Event List @@ -580,7 +580,7 @@ following `daos_server` log entries to indicate the parameters are written to the engine's NVMe config: ```bash -DEBUG 13:59:29.229795 provider.go:592: BdevWriteConfigRequest: &{ForwardableRequest:{Forwarded:false} ConfigOutputPath:/mnt/daos0/daos_nvme.conf OwnerUID:10695475 OwnerGID:10695475 TierProps:[{Class:nvme DeviceList:0000:5e:00.0 DeviceFileSize:0 Tier:1 DeviceRoles:{OptionBits:0}}] HotplugEnabled:false HotplugBusidBegin:0 HotplugBusidEnd:0 Hostname:wolf-310.wolf.hpdd.intel.com AccelProps:{Engine: Options:0} SpdkRpcSrvProps:{Enable:false SockAddr:} AutoFaultyProps:{Enable:true MaxIoErrs:1 MaxCsumErrs:2} VMDEnabled:false ScannedBdevs:} +DEBUG 13:59:29.229795 provider.go:592: BdevWriteConfigRequest: &{ForwardableRequest:{Forwarded:false} ConfigOutputPath:/mnt/daos0/daos_nvme.conf OwnerUID:10695475 OwnerGID:10695475 TierProps:[{Class:nvme DeviceList:0000:5e:00.0 DeviceFileSize:0 Tier:1 DeviceRoles:{OptionBits:0}}] HotplugEnabled:false HotplugBusidBegin:0 HotplugBusidEnd:0 Hostname:wolf-310.wolf.example.com AccelProps:{Engine: Options:0} SpdkRpcSrvProps:{Enable:false SockAddr:} AutoFaultyProps:{Enable:true MaxIoErrs:1 MaxCsumErrs:2} VMDEnabled:false ScannedBdevs:} Writing NVMe config file for engine instance 0 to "/mnt/daos0/daos_nvme.conf" ``` @@ -642,41 +642,18 @@ This LED activity visually indicates a fault and that the device needs to be rep longer in use by DAOS. The LED of the VMD device will remain in this state until replaced by a new device. -!!! note - Full NVMe hot plug capability will be available and supported in DAOS 2.6 release. - Use is currently intended for testing only and is not supported for production. - -- To use a newly added (hot-inserted) SSD it needs to be unbound from the kernel driver -and bound instead to a user-space driver so that the device can be used with DAOS. 
- -To rebind a SSD on a single host, run the following command (replace SSD PCI address and -hostname with appropriate values): +- If VMD is not enabled, then in order to use a newly added (hot-inserted) SSD it needs to be +unbound from the kernel driver and bound instead to a user-space driver so that the device can be +used with DAOS. To rebind an SSD on a single host, run the following command (replace SSD PCI +address and hostname with appropriate values): ```bash $ dmg storage nvme-rebind -a 0000:84:00.0 -l wolf-167 Command completed successfully ``` The device will now be bound to a user-space driver (e.g. VFIO) and can be accessed by -DAOS I/O engine processes (and used in the following `dmg storage replace nvme` command -as a new device). - -- Once an engine is using a newly added (hot-inserted) SSD it can be added to the persistent -NVMe config (stored on SCM) so that on engine restart the new device will be used. - -To update the engine's persistent NVMe config with the new SSD transport address, run the -following command (replace SSD PCI address, engine index and hostname with appropriate values): -```bash -$ dmg storage nvme-add-device -a 0000:84:00.0 -e 0 -l wolf-167 -Command completed successfully -``` - -The optional [--tier-index|-t] command parameter can be used to specify which storage tier to -insert the SSD into, if specified then the server will attempt to insert the device into the tier -specified by the index, if not specified then the server will attempt to insert the device into -the bdev tier with the lowest index value (the first bdev tier). - -The device will now be registered in the engine's persistent NVMe config so that when restarted, -the newly added SSD will be used. +DAOS I/O engine processes. Now the new device can be used in the following +`dmg storage replace nvme` command. - Replace an excluded SSD with a New Device: ```bash @@ -1047,6 +1024,12 @@ An examples workflow would be: rank will be created). 
- Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware. +!!! note + `dmg storage format --replace` can be used to replace a rank in `AdminExcluded` state. The + subsequent state of the rank will then no longer be `AdminExcluded`. This special case reduces + a chance that a duplicate rank entry is introduced inadvertently because the rank to be replaced + is in the `AdminExcluded` state and so is recreated rather than replaced. + ### System Erase To erase the DAOS sorage configuration, the `dmg system erase` diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md index 879c415092d..88e0c4bd16c 100644 --- a/docs/admin/deployment.md +++ b/docs/admin/deployment.md @@ -438,7 +438,7 @@ per engine. The command redirects stderr to /dev/null and stdout to a temporary installation is from a source build. ```bash -[user@wolf-226 daos]$ install/bin/daos_server config generate -p ofi+tcp --use-tmpfs-scm 2>/dev/null | tee ~/configs/tmp.yml +$ daos_server config generate -p ofi+tcp --use-tmpfs-scm 2>/dev/null | tee ~/configs/tmp.yml port: 10001 transport_config: allow_insecure: false @@ -496,6 +496,7 @@ disable_vmd: false enable_hotplug: false nr_hugepages: 16384 disable_hugepages: false +allow_thp: false control_log_mask: INFO control_log_file: /var/log/daos/daos_server.log core_dump_filter: 19 @@ -512,7 +513,7 @@ Now we start the `daos_server` service from the generated config which loads suc and runs until the point where a storage format is required, as expected. ```bash -[user@wolf-226 daos]$ install/bin/daos_server start -i -o ~/configs/tmp.yml +$ daos_server start -i -o ~/configs/tmp.yml DAOS Server config loaded from /home/user/configs/tmp.yml install/bin/daos_server logging to file /tmp/daos_server.log NOTICE: Configuration includes only one MS replica. This provides no redundancy in the event of a MS replica failure. 
@@ -534,12 +535,12 @@ Note the subsequent system query command may not show ranks started immediately format command returns so it is recommended to leave a short delay (~5s) before invoking. ```bash -[user@wolf-226 daos]$ install/bin/dmg storage format -i +$ dmg storage format -i Format Summary: Hosts SCM Devices NVMe Devices ----- ----------- ------------ localhost 2 16 -[user@wolf-226 daos]$ install/bin/dmg system query -i +$ dmg system query -i Rank State ---- ----- [0-1] Joined @@ -564,17 +565,17 @@ daos_engine:1 Using NUMA core allocation algorithm SCM @ /mnt/daos0: 91 GB Total/91 GB Avail Starting I/O Engine instance 0: /home/user/projects/daos/install/bin/daos_engine daos_engine:0 Using NUMA core allocation algorithm -MS leader running on wolf-226.wolf.hpdd.intel.com -daos_engine:1 DAOS I/O Engine (v2.3.101) process 1215202 started on rank 1 with 16 target, 4 helper XS, firstcore 0, host wolf-226.wolf.hpdd.intel.com. +MS leader running on wolf-226.domain +daos_engine:1 DAOS I/O Engine (v2.3.101) process 1215202 started on rank 1 with 16 target, 4 helper XS, firstcore 0, host wolf-226.domain. Using NUMA node: 1 -daos_engine:0 DAOS I/O Engine (v2.3.101) process 1215209 started on rank 0 with 16 target, 4 helper XS, firstcore 0, host wolf-226.wolf.hpdd.intel.com. +daos_engine:0 DAOS I/O Engine (v2.3.101) process 1215209 started on rank 0 with 16 target, 4 helper XS, firstcore 0, host wolf-226.domain. Using NUMA node: 0 ``` For reference, the hardware scan results for the target storage server are included below. ```bash -[user@wolf-226 daos]$ install/bin/daos_server nvme scan +$ daos_server nvme scan Scan locally-attached NVMe storage... 
NVMe PCI Model FW Revision Socket ID Capacity -------- ----- ----------- --------- -------- @@ -595,7 +596,7 @@ NVMe PCI Model FW Revision Socket ID Capacity 0000:e0:00.0 MZXLR3T8HBLS-000H3 MPK7525Q 1 3.8 TB 0000:e1:00.0 MZXLR3T8HBLS-000H3 MPK7525Q 1 3.8 TB -[user@wolf-226 daos]$ install/bin/daos_server network scan +$ daos_server network scan --------- localhost --------- @@ -807,6 +808,32 @@ configuration file with a populated per-engine section can be stored in `/etc/daos/daos_server.yml`, and after reestarting the `daos_server` service it is then ready for the storage to be formatted. + +### Transparent HugePage (THP) support + +DAOS relies on the use of hugepages in a dedicated manner and turning on transparent hugepages means +the hugepage memory pool gets used in a model more like a cache. This can have adverse effects on +DAOS behavior and may cause OOM and DMA buffer allocation failures at high load. + +By default the server will fail to start and exit when the server is started with THP enabled. + +```bash +DEBUG 2025/12/14 09:54:32.537839 main.go:87: server: code = 623 description = "transparent hugepage (THP) enabled on storage server, DAOS requires THP to be disabled" +ERROR: server: code = 623 description = "transparent hugepage (THP) enabled on storage server, DAOS requires THP to be disabled" +ERROR: server: code = 623 resolution = "disable THP by adding 'transparent_hugepage=never' kernel parameter in the grub configuration file then reboot and restart daos_server" +``` + +The following command can be used to verify whether THP is enabled: + +```bash +cat /sys/kernel/mm/transparent_hugepage/enabled +[always] madvise never +``` + +If `allow_thp: true` parameter is set in server config file global section, the behavior will change +and the server will start with THP enabled. 
+ + ## DAOS Server Remote Access Remote tasking of the DAOS system and individual DAOS Server processes can be @@ -895,7 +922,7 @@ resetting modules into "MemoryMode" through resource allocations. A subsequent reboot is required for BIOS to read the new resource allocations. -#### Multiple PMem namespaces per socket (Experimental) +#### Multiple PMem namespaces per socket By default the `daos_server scm prepare` command will create one PMem namespace on each PMem region. @@ -968,12 +995,12 @@ fallback to using UIO user-space driver with SPDK instead. The output will be equivalent running `dmg storage scan --verbose` remotely. ```bash -bash-4.2$ dmg storage scan +$ dmg storage scan Hosts SCM Total NVMe Total ----- --------- ---------- wolf-[71-72] 6.4 TB (2 namespaces) 3.1 TB (3 controllers) -bash-4.2$ dmg storage scan --verbose +$ dmg storage scan --verbose ------------ wolf-[71-72] ------------ @@ -1018,7 +1045,7 @@ manual reset to do so. SSD health state can be verified via `dmg storage scan --nvme-health`: ```bash -bash-4.2$ dmg storage scan --nvme-health +$ dmg storage scan --nvme-health ------- wolf-71 ------- @@ -1298,7 +1325,7 @@ To illustrate, assume a cluster with homogeneous hardware configurations that returns the following from scan for each host: ```bash -[daos@wolf-72 daos_m]$ dmg -l wolf-7[1-2] storage scan --verbose +$ dmg -l wolf-7[1-2] storage scan --verbose ------- wolf-7[1-2] ------- @@ -1544,7 +1571,7 @@ Upon successful format, DAOS Control Servers will start DAOS I/O engines that have been specified in the server config file. 
Successful start-up is indicated by the following on stdout: -`DAOS I/O Engine (v2.0.1) process 433456 started on rank 1 with 8 target, 2 helper XS, firstcore 0, host wolf-72.wolf.hpdd.intel.com.` +`DAOS I/O Engine (v2.0.1) process 433456 started on rank 1 with 8 target, 2 helper XS, firstcore 0, host wolf-72.domain.` ### SCM Format diff --git a/docs/admin/hardware.md b/docs/admin/hardware.md index 3f4658cf40b..939ce9b4829 100644 --- a/docs/admin/hardware.md +++ b/docs/admin/hardware.md @@ -21,7 +21,7 @@ servers. DAOS requires a 64-bit processor architecture and is primarily developed on Intel x86\_64 architecture. The DAOS software and the libraries it depends on (e.g., [ISA-L](https://github.com/intel/isa-l), -[SPDK](https://spdk.io/), [PMDK](https://github.com/pmem/pmdk/), and +[SPDK](https://spdk.io/), [PMDK](https://github.com/daos-stack/pmdk/), and [DPDK](https://www.dpdk.org/) can take advantage of Intel Streaming SIMD (SSE) and Intel Advanced Vector Extensions (AVX). @@ -35,9 +35,10 @@ validated on a regular basis. An RDMA-capable fabric is preferred for best performance. The DAOS data plane relies on [OFI libfabric](https://ofiwg.github.io/libfabric/) and supports OFI providers for Ethernet/tcp and InfiniBand/verbs. -Starting with a Technology Preview in DAOS 2.2, [UCX](https://www.openucx.org/) -is also supported as an alternative network stack for DAOS. -Refer to [UCX Fabric Support (DAOS 2.2 Technology Preview)](./ucx.md) +[UCX](https://www.openucx.org/) +is also supported as an alternative network stack for DAOS on InfiniBand/verbs +platforms. +Refer to [UCX Fabric Support](./ucx.md) for details on setting up DAOS with UCX support. 
DAOS supports multiple network interfaces on the servers diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index dc5c3bc12e4..1879f060502 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -783,7 +783,7 @@ $ dmg pool get-prop pool1 Checksum scrubbing frequency (scrub_freq) not set Checksum scrubbing threshold (scrub_thresh) not set Self-healing policy (self_heal) exclude - Rebuild space ratio (space_rb) 0% + Rebuild space ratio (space_rb) 5% Pool service replica list (svc_list) [0] Pool service redundancy factor (svc_rf) not set Upgrade Status (upgrade_status) not started @@ -812,7 +812,7 @@ $ dmg pool get-prop pool1 Checksum scrubbing frequency (scrub_freq) 604800 Checksum scrubbing threshold (scrub_thresh) 0 Self-healing policy (self_heal) exclude - Rebuild space ratio (space_rb) 0% + Rebuild space ratio (space_rb) 5% Pool service replica list (svc_list) [0] Pool service redundancy factor (svc_rf) 2 Upgrade Status (upgrade_status) in progress @@ -876,7 +876,7 @@ $ dmg pool get-prop tank Checksum scrubbing frequency (scrub_freq) 604800 Checksum scrubbing threshold (scrub_thresh) 0 Self-healing policy (self_heal) exclude,rebuild - Rebuild space ratio (space_rb) 0% + Rebuild space ratio (space_rb) 5% Pool service replica list (svc_list) [0] Pool service redundancy factor (svc_rf) 2 Upgrade Status (upgrade_status) not started @@ -918,7 +918,7 @@ $ dmg pool get-prop tank2 Checksum scrubbing frequency (scrub_freq) 604800 Checksum scrubbing threshold (scrub_thresh) 0 Self-healing policy (self_heal) exclude,rebuild - Rebuild space ratio (space_rb) 0% + Rebuild space ratio (space_rb) 5% Pool service replica list (svc_list) [0] Pool service redundancy factor (svc_rf) 2 Upgrade Status (upgrade_status) not started @@ -968,7 +968,7 @@ Two options are supported: "exclude" (default strategy) and "rebuild". This property defines the percentage of total space reserved on each storage engine for self-healing purpose. 
The reserved space cannot be consumed by -applications. Valid values are 0% to 100%, the default is 0%. +applications. Valid values are 0% to 100%, the default is 5%. When setting this property, specifying the percentage symbol is optional: `space_rb:2%` and `space_rb:2` both specify two percent of storage capacity. diff --git a/docs/admin/predeployment_check.md b/docs/admin/predeployment_check.md index 8b5a391d8a9..6a0d632a650 100644 --- a/docs/admin/predeployment_check.md +++ b/docs/admin/predeployment_check.md @@ -530,3 +530,30 @@ Current LBA Format: LBA Format #03 Displayed details for controller show LBA format is now "#03". Perform the above process for all SSDs that will be used by DAOS. + + +## Hugepage allocation and memory fragmentation + +DAOS uses linux hugepages for DMA buffer allocation. If hugepage memory becomes fragmented, DMA +buffer allocations may fail because of insufficient contiguous memory availability. + +By default DAOS will allocate necessary hugepages at runtime based on supplied server file +configuration details (mainly the number of engine targets). Runtime allocation of hugepages +may cause fragmentation over time. + +To reduce the chance of memory fragmentation, hugepages can be allocated on the kernel boot +command line by specifying the "hugepages=N" parameter, where 'N' = the number of huge pages +requested. + +[See here for details of allocating hugepages at +boot](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/7/html/performance_tuning_guide/sect-red_hat_enterprise_linux-performance_tuning_guide-memory-configuring-huge-pages) + + +## Disabling transparent hugepage (THP) feature at boot-time + +Linux transparent hugepages feature can increase the likelihood of hugepage memory fragmentation +and should be disabled for optimal performance of DAOS. 
+ +[See here for details of how to disable THP on +boot](https://docs.kernel.org/admin-guide/mm/transhuge.html#boot-parameters) + diff --git a/docs/admin/telemetry_guide.md b/docs/admin/telemetry_guide.md new file mode 100644 index 00000000000..eae31acdc8d --- /dev/null +++ b/docs/admin/telemetry_guide.md @@ -0,0 +1,365 @@ +# DAOS Telemetry Example + +This document will help to run daos metrics command and collect some key metrics from the +server to help debug the issues and analyze the system behavior. + +## How to run telemetry command: + +### Directly on server using daos_metrics command as sudo user + +- Example of collecting the pool query metrics on the servers using daos_metrics command. +- daos_metrics -S will show telemetry data from First I/O Engine (default 0) +- daos_metrics -S 1 will show telemetry data from Second I/O Engine, in case multiple engines are running per node. + +``` +$ sudo daos_metrics -C -S 0 | grep pool_query +ID: 0/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/pool_query,0 +ID: 0/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/pool_query_space,0 +$ sudo daos_metrics -C -S 1 | grep pool_query +ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/pool_query,12 +ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/pool_query_space,10 +``` + +### Dmg command on admin node (dmg telemetry metrics query) + +- Example of collecting the pool query metrics from individual servers using dmg command + +``` +$ sudo dmg telemetry metrics query -m engine_pool_ops_pool_query -l brd-221 +connecting to brd-221:9191... +- Metric Set: engine_pool_ops_pool_query (Type: Counter) + Total number of processed pool query operations + Metric Labels Value + ------ ------ ----- + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=0) 0 + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=1) 0 + +$ sudo dmg telemetry metrics query -m engine_pool_ops_pool_query -l brd-222 +connecting to brd-222:9191... 
+- Metric Set: engine_pool_ops_pool_query (Type: Counter) + Total number of processed pool query operations + Metric Labels Value + ------ ------ ----- + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=2) 0 + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=4) 0 +``` + +### Identify the pool UUID and leader rank: + + - Some metrics are only available on pool leader rank so identify the leader rank for that pool from the pool query command. + - Below is the example of pool query where leader rank is 1 + - Pool 55cc96d8-5c46-41f4-af29-881d293b6f6f, ntarget=48, disabled=0, `leader=1`, version=1, state=Ready + +``` +#sudo dmg pool query samir_pool +``` +``` +Pool 55cc96d8-5c46-41f4-af29-881d293b6f6f, ntarget=48, disabled=0, leader=1, version=1, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +Pool space info: +- Target count:48 +- Total memory-file size: 151 GB +- Metadata storage: + Total size: 151 GB + Free: 118 GB, min:2.5 GB, max:2.5 GB, mean:2.5 GB +- Data storage: + Total size: 600 GB + Free: 598 GB, min:12 GB, max:12 GB, mean:12 GB +``` + + - Find the leader rank address so that daos_metrics command can be run on that specific server. 
+ In this example Rank 1 is on `brd-221.daos.hpc.amslabs.hpecorp.net` (`10.214.213.41`) + +``` +#sudo dmg system query -v +``` +``` +Rank UUID Control Address Fault Domain State Reason +---- ---- --------------- ------------ ----- ------ +0 6c481fea-b820-4b50-9845-6a5a04b4cfcf 10.214.213.41:10001 /brd-221.daos.hpc.amslabs.hpecorp.net Joined +1 43865b12-86d3-4107-afe8-3921f19bc9ff 10.214.213.41:10001 /brd-221.daos.hpc.amslabs.hpecorp.net Joined +2 eb413873-c13c-43ea-8bdf-21b691e169c9 10.214.212.229:10001 /brd-222.daos.hpc.amslabs.hpecorp.net Joined +3 607ad987-a55a-4365-ad6b-c4160ac5ff67 10.214.214.190:10001 /brd-223.daos.hpc.amslabs.hpecorp.net Joined +4 6c3d9b9a-2fff-4874-a7f0-309c4126a8e6 10.214.212.229:10001 /brd-222.daos.hpc.amslabs.hpecorp.net Joined +5 6884e5c9-b38b-46aa-b042-7fad9b37cf45 10.214.214.190:10001 /brd-223.daos.hpc.amslabs.hpecorp.net Joined +``` + - dmg command example based on leader Fault Domain (hostname) `-l brd-221` + +``` +$ sudo dmg telemetry metrics query -m engine_pool_ops_pool_query -l brd-221 +connecting to brd-221:9191... +- Metric Set: engine_pool_ops_pool_query (Type: Counter) + Total number of processed pool query operations + Metric Labels Value + ------ ------ ----- + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=0) 0 + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=1) 2 +``` + +OR + + - dmg command example based on Control Address `-l 10.214.213.41` + +``` +$ sudo dmg telemetry metrics query -m engine_pool_ops_pool_query -l 10.214.213.41 +connecting to 10.214.213.41:9191... +- Metric Set: engine_pool_ops_pool_query (Type: Counter) + Total number of processed pool query operations + Metric Labels Value + ------ ------ ----- + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=0) 0 + Counter (pool=8259d3ff-523e-4a43-9248-26aba2a62f4c, rank=1) 2 + +``` + +## Engine Metrics: + +Admin can set environment variable for pool and container name for below example. 
+ +``` +export MY_POOL=Test_pool +export MY_CONT=Test_cont +export MY_MOUNT=/tmp/daos_mount +``` + + +|Operation| Description | DAOS Command | Metrics Command | Output | +|:---:| :---: | :---: | :---: |:------------: | +|When engine started | Timestamp of last engine startup | None | `sudo daos_metrics -S 1 -C \| grep 'started_at' \| grep -v pool`|ID: 0/started_at,Tue Oct 28 23:21:24 2025| +|When engine become ready | Timestamp when the engine became ready | None | `sudo daos_metrics -S 1 -C \| grep 'servicing_at'`|ID: 0/servicing_at,Tue Oct 28 23:21:33 2025| +|Find Engine Rank ID | Rank ID of this engine | None | `sudo daos_metrics -S 1 -C \| grep '/rank' \| grep -v pool`|ID: 1/rank,276| +|check if Engine is dead | engine_events_dead_ranks | None | `sudo daos_metrics -S 1 -C \| grep '/dead'`| ID: 0/events/dead_ranks,1 | +|last event on rank | Timestamp of last received event | None | `sudo daos_metrics -S 1 -C \| grep '/last_event'`| ID: 1/events/last_event_ts,Thu Jan 1 00:00:00 1970 | + +## Pool Metrics: + +|Operation| Description | DAOS Command | Metrics Command | Output | +|:---:| :---: | :---: | :---: |:------------: | +|With No Pools| Total number of processed pool connect operations | None | `sudo daos_metrics -C -S 0 \| grep 'ops/pool'`|None| +|After creating single pool| | dmg pool create $MY_POOL | `sudo daos_metrics -C -S 0 \| grep 'ops/pool'`| ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_evict,0
ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_connect,0
ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_disconnect,0
ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_query,0
ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_query_space,0| +|After querying the single pool without storage| Total number of processed pool query operations | dmg pool query $MY_POOL -t | `sudo daos_metrics -C -S 0 \| grep 'ops/pool_query'` | ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_query,1| +|After querying the single pool with storage | Total number of processed pool query (with operation) operations | dmg pool query $MY_POOL | `sudo daos_metrics -C -S 0 \| grep 'ops/pool_query_space'` | ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_query_space,1| +|After Creating Container| Total number of processed pool connect operations | daos cont create $MY_POOL $MY_CONT| `sudo daos_metrics -C -S 0 \| grep 'ops/pool_connect'`| ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_connect,1| +|After Creating Container| Total number of processed pool disconnect operations | daos cont create $MY_POOL $MY_CONT| `sudo daos_metrics -C -S 0 \| grep 'ops/pool_disconnect'`|ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_disconnect,1| +|After Mounting FUSE Container| Total number of processed pool connect operations | dfuse -m $MY_MOUNT -p $MY_POOL -c $MY_CONT| `sudo daos_metrics -C -S 0 \| grep 'ops/pool_connect'`| ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_connect,2| +|After Unmounting FUSE Container| Total number of processed pool disconnect operations | fusermount3 -u -m $MY_MOUNT | `sudo daos_metrics -C -S 0 \| grep 'ops/pool_disconnect'`| ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_disconnect,2| +|After Pool evict | Total number of pool handle evict operations | dmg pool evict $MY_POOL | `sudo daos_metrics -C -S 0 \| grep 'ops/pool_evict'`| ID: 1/pool/55cc96d8-5c46-41f4-af29-881d293b6f6f/ops/pool_evict,2| + +## Container Metrics: + +|Operation| Description | DAOS Command | Metrics Command | Output | +|:---:| :---: | :---: | :---: |:------------: | +| Container creation | Total number of successful 
container create operations | daos cont create $MY_POOL $MY_CONT --type='POSIX' | `sudo daos_metrics -C -S 0 \| grep cont_create \| grep `|ID: 0/pool/c22c6a6c-7e31-4788-90a4-a55d1083d57b/ops/cont_create,1| +| Container query | Total number of successful container query operations | daos container query $MY_POOL $MY_CONT | `sudo daos_metrics -C -S 0 \| grep cont_query \| grep `|ID: 0/pool/c22c6a6c-7e31-4788-90a4-a55d1083d57b/ops/cont_query,4| +| Container open | Total number of successful container open operations | dfuse -m $MY_MOUNT -p $MY_POOL -c $MY_CONT | `sudo daos_metrics -C -S 0 \| grep cont_open \| grep `|ID: 0/pool/c22c6a6c-7e31-4788-90a4-a55d1083d57b/ops/cont_open,3| +| Container destroy | Total number of successful container destroy operations | daos cont destroy $MY_POOL $MY_CONT | `sudo daos_metrics -C -S 0 \| grep cont_destroy \| grep `|ID: 0/pool/c22c6a6c-7e31-4788-90a4-a55d1083d57b/ops/cont_destroy,1| + +## I/O Metrics: + +|Operation| Description | DAOS Command | Metrics Command | Output | +|:---:| :---: | :---: | :---: |:------------: | +| data written | Total number of bytes updated/written |Write the Data using any IO | `sudo daos_metrics -C -S 1 \| grep \| grep 'xferred/update'`|ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_0,1335885824
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_7,1337983064
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_2,1342177280
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_4,1325400064
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_6,1337982976
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_5,1384120320
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_1,1332740096
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/update/tgt_3,1341128828| +| data read | Total number of bytes fetched/read | Read the Data using any IO | `sudo daos_metrics -C -S 1 \| grep \| grep 'xferred/fetch'`|ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_0,1335885824
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_7,1337983240
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_2,1342177280
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_4,1325400064
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_6,1337982976
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_5,1384120320
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_1,1332740096
ID: 1/pool/e63e81dd-7d5d-4622-8196-83256b12326c/xferred/fetch/tgt_3,1341129076| +| Write IOPS operation | Total number of processed object RPCs | Write the Data using any IO | `sudo daos_metrics -S 1 -C \| grep \| grep 'ops/update'`|ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_6,222
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_2,204
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_5,210
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_7,223
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_4,224
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_0,196
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_1,198
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/update/tgt_3,206| +| Read IOPS operation | Total number of processed object RPCs | Read the Data using any IO | `sudo daos_metrics -S 1 -C \| grep \| grep 'ops/fetch'`|ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_6,234
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_2,206
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_5,215
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_7,214
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_4,225
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_0,192
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_1,202
ID: 1/pool/8259d3ff-523e-4a43-9248-26aba2a62f4c/ops/fetch/tgt_3,221| +| IO latency Update | update RPC processing time | Write the Data using 1MiB xfersize | `sudo daos_metrics -S 1 -C \| grep 'io/latency/update'`|ID: 1/io/latency/update/1MB/tgt_0,34423,9173,239956,24216.092267,1875,45405173,35287.151469
ID: 1/io/latency/update/1MB/tgt_1,34824,9195,224337,24619.489373,1882,46333879,35692.908836
ID: 1/io/latency/update/1MB/tgt_2,17586,9187,246820,25627.308223,1885,48307476,37184.782868
ID: 1/io/latency/update/1MB/tgt_3,60684,9182,264286,25998.202265,1943,50514507,38227.372221
ID: 1/io/latency/update/1MB/tgt_4,83487,9193,235707,26626.855799,1914,50963802,37382.179815
ID: 1/io/latency/update/1MB/tgt_5,26402,9200,235859,24656.685802,1951,48105194,34931.529382
ID: 1/io/latency/update/1MB/tgt_6,107294,9190,244975,26761.485861,1945,52051090,38022.684882
ID: 1/io/latency/update/1MB/tgt_7,79041,9213,219362,25710.023921,1923,49440376,36611.272385| +| IO latency Fetch | fetch RPC processing time | Read the Data using 1MiB xfersize | `sudo daos_metrics -S 1 -C \| grep 'io/latency/fetch'`|ID: 1/io/latency/fetch/1MB/tgt_0,29630,9419,225908,19060.848723,1527,29105916,26764.971072
ID: 1/io/latency/fetch/1MB/tgt_1,18329,9406,343931,17769.093144,1546,27471018,23882.809783
ID: 1/io/latency/fetch/1MB/tgt_2,9887,9385,131315,18075.996768,1547,27963567,22973.594024
ID: 1/io/latency/fetch/1MB/tgt_3,39508,9411,155136,19332.508228,1580,30545363,25593.694908
ID: 1/io/latency/fetch/1MB/tgt_4,22616,9413,412206,19062.688062,1558,29699668,27359.057624
ID: 1/io/latency/fetch/1MB/tgt_5,22280,9418,126520,17382.379032,1612,28020395,20937.262665
ID: 1/io/latency/fetch/1MB/tgt_6,40743,9409,207370,18697.681472,1576,29467546,23236.574768
ID: 1/io/latency/fetch/1MB/tgt_7,24048,9417,112182,17725.164955,1558,27615807,21375.496411| + +## Troubleshooting: + +### No response to pool query or any I/O operation + +In case of no response to any dmg pool command or any I/O operation means any one of the single xstream might have stuck. Either ULT is stuck or NVMe cannot respond to I/O operation. +To check if ULT is stuck, run metrics command on each server and check for sched/cycle_duration and sched/cycle_size. +In this case sched/cycle_duration and sched/cycle_size for stuck xstream counter value is higher (outlier) compared to other xstream and ULT count. + +- sched/cycle_duration: Schedule cycle duration, units: ms +- sched/cycle_size: Schedule cycle size, units: ULT + +Below is the example on real system where ULT was stuck and not responding. You can see the outlier for xs_3\ +**xs_3: 87624970 ms**\ +**xs_3: 72508 ULT** +``` +# sudo daos_metrics -C -S 0 | grep -e cycle + + cycle_duration + xs_0: 4 ms [min: 0, max: 736, avg: 1, sum: 4374707, stddev: 2, samples: 4337768] + xs_1: 0 ms [min: 0, max: 4, avg: 0, sum: 12, stddev: 1, samples: 57] + xs_2: 1000 ms [min: 0, max: 1008, avg: 1001, sum: 170805474, stddev: 5, samples: 170636] + xs_3: 87624970 ms [min: 0, max: 87624970, avg: 0, sum: 155215898, stddev: 677, samples: 16729436166] + xs_4: 0 ms [min: 0, max: 480, avg: 0, sum: 170775422, stddev: 0, samples: 52866364909] + xs_5: 0 ms [min: 0, max: 532, avg: 0, sum: 170774266, stddev: 0, samples: 53277804155] + xs_6: 0 ms [min: 0, max: 457, avg: 0, sum: 170775310, stddev: 0, samples: 52654423202] + xs_7: 0 ms [min: 0, max: 449, avg: 0, sum: 170774942, stddev: 0, samples: 53289078146] + xs_8: 0 ms [min: 0, max: 696, avg: 0, sum: 170779578, stddev: 0, samples: 53348599756] + xs_9: 0 ms [min: 0, max: 444, avg: 0, sum: 170775582, stddev: 0, samples: 53085628214] + xs_10: 0 ms [min: 0, max: 456, avg: 0, sum: 170775386, stddev: 0, samples: 53361992047] + xs_11: 0 ms [min: 0, max: 668, avg: 0, sum: 170779354, 
stddev: 0, samples: 52868332788] + xs_12: 0 ms [min: 0, max: 664, avg: 0, sum: 170779222, stddev: 0, samples: 53207853905] + xs_13: 0 ms [min: 0, max: 484, avg: 0, sum: 170778230, stddev: 0, samples: 53161107629] + xs_14: 0 ms [min: 0, max: 452, avg: 0, sum: 170778690, stddev: 0, samples: 54026864334] + xs_15: 0 ms [min: 0, max: 664, avg: 0, sum: 170779106, stddev: 0, samples: 53240085110] + xs_16: 0 ms [min: 0, max: 588, avg: 0, sum: 170778746, stddev: 0, samples: 53324006952] + xs_17: 0 ms [min: 0, max: 664, avg: 0, sum: 170778646, stddev: 0, samples: 53244261876] + xs_18: 0 ms [min: 0, max: 452, avg: 0, sum: 170779198, stddev: 0, samples: 53498338576] + xs_19: 4 ms [min: 0, max: 108, avg: 0, sum: 30913, stddev: 1, samples: 461742] + xs_20: 0 ms [min: 0, max: 112, avg: 0, sum: 30832, stddev: 1, samples: 460370] + xs_21: 0 ms [min: 0, max: 112, avg: 0, sum: 31340, stddev: 1, samples: 461099] + xs_22: 0 ms [min: 0, max: 116, avg: 0, sum: 105495174, stddev: 0, samples: 92074321933] + cycle_size + xs_0: 1 ULT [min: 1, max: 672, avg: 1, sum: 4486893, stddev: 1, samples: 4337768] + xs_1: 1 ULT [min: 1, max: 15, avg: 2, sum: 116, stddev: 2, samples: 57] + xs_2: 1 ULT [min: 1, max: 1, avg: 1, sum: 170636, stddev: 0, samples: 170636] + xs_3: 72508 ULT [min: 1, max: 72508, avg: 1, sum: 16944980993, stddev: 1, samples: 16729436166] + xs_4: 1 ULT [min: 1, max: 253, avg: 1, sum: 53106562082, stddev: 0, samples: 52866364919] + xs_5: 1 ULT [min: 1, max: 293, avg: 1, sum: 53517385848, stddev: 0, samples: 53277804166] + xs_6: 1 ULT [min: 1, max: 262, avg: 1, sum: 52893882375, stddev: 0, samples: 52654423213] + xs_7: 1 ULT [min: 1, max: 263, avg: 1, sum: 53529014337, stddev: 0, samples: 53289078157] + xs_8: 1 ULT [min: 1, max: 269, avg: 1, sum: 53588382832, stddev: 0, samples: 53348599768] + xs_9: 1 ULT [min: 1, max: 538, avg: 1, sum: 53325349666, stddev: 0, samples: 53085628225] + xs_10: 1 ULT [min: 1, max: 440, avg: 1, sum: 53601721471, stddev: 0, samples: 53361992058] + xs_11: 
1 ULT [min: 1, max: 365, avg: 1, sum: 53108191221, stddev: 0, samples: 52868332799] + xs_12: 1 ULT [min: 1, max: 268, avg: 1, sum: 53447917652, stddev: 0, samples: 53207853917] + xs_13: 1 ULT [min: 1, max: 258, avg: 1, sum: 53400854712, stddev: 0, samples: 53161107641] + xs_14: 1 ULT [min: 1, max: 265, avg: 1, sum: 54266784187, stddev: 0, samples: 54026864345] + xs_15: 1 ULT [min: 1, max: 440, avg: 1, sum: 53480318341, stddev: 0, samples: 53240085122] + xs_16: 1 ULT [min: 1, max: 270, avg: 1, sum: 53564352374, stddev: 0, samples: 53324006963] + xs_17: 1 ULT [min: 1, max: 273, avg: 1, sum: 53484431253, stddev: 0, samples: 53244261888] + xs_18: 1 ULT [min: 1, max: 275, avg: 1, sum: 53738248689, stddev: 0, samples: 53498338588] + xs_19: 1 ULT [min: 1, max: 2, avg: 1, sum: 461743, stddev: 0, samples: 461742] + xs_20: 1 ULT [min: 1, max: 1, avg: 1, sum: 460370, stddev: 0, samples: 460370] + xs_21: 1 ULT [min: 1, max: 1, avg: 1, sum: 461099, stddev: 0, samples: 461099] + xs_22: 1 ULT [min: 1, max: 3, avg: 1, sum: 92074426829, stddev: 0, samples: 92074321962] +``` + +### Slow performance + +If DAOS system is performing slower, check write(update) & read(fetch) metrics to indicate the source of the problem across all engines. + +For example, mention below, one of the NVMe was impacting the overall IO performance because write BW on that specific drive was slower. As you can see two targets (tgt_0 & tgt_8) latency for 4MB write were too high compared to other targets. That indicate that specific drive is having lower write BW which increase the update latency too high. +This metrics are available in different IO size ranges from 256B to 4GB so looks for matching IO size used for testing the performance. Below example we used IOR write size 4MB. 
+ +``` +#sudo daos_metrics -C -S 0 | grep 'io/latency/update' + +ID: 0/io/latency/update/4MB/tgt_0,16349826,733843,16349826,7329515.976190,42,307839671,4196687.177444 +ID: 0/io/latency/update/4MB/tgt_1,1260,1147,2191,1463.423077,52,76098,273.640909 +ID: 0/io/latency/update/4MB/tgt_2,1252,1122,2275,1452.000000,62,90024,272.896966 +ID: 0/io/latency/update/4MB/tgt_3,1637,1179,2639,1558.844444,45,70148,302.601219 +ID: 0/io/latency/update/4MB/tgt_4,1155,1119,2280,1496.857143,49,73346,281.746857 +ID: 0/io/latency/update/4MB/tgt_5,1804,1139,1920,1493.767442,43,64232,234.072520 +ID: 0/io/latency/update/4MB/tgt_6,1160,1136,2550,1560.862745,51,79604,293.899440 +ID: 0/io/latency/update/4MB/tgt_7,1399,1126,1969,1411.929825,57,80480,195.942125 +ID: 0/io/latency/update/4MB/tgt_8,15264368,857936,19645847,9109087.453125,64,582981597,5094157.829112 +ID: 0/io/latency/update/4MB/tgt_9,1601,1146,2455,1437.038462,52,74726,262.549712 +ID: 0/io/latency/update/4MB/tgt_10,1366,1138,2094,1459.828125,64,93429,228.692526 +ID: 0/io/latency/update/4MB/tgt_11,1118,1113,2742,1475.378788,66,97375,309.820731 +ID: 0/io/latency/update/4MB/tgt_12,1169,1158,2531,1492.392857,56,83574,270.312323 +ID: 0/io/latency/update/4MB/tgt_13,1477,1148,2204,1485.853659,41,60920,244.983118 +ID: 0/io/latency/update/4MB/tgt_14,1159,1159,2390,1523.333333,48,73120,318.466026 +ID: 0/io/latency/update/4MB/tgt_15,1511,1165,2318,1447.608696,46,66590,253.351094 + +#sudo daos_metrics -C -S 0 | grep 'io/latency/fetch' + +ID: 0/io/latency/fetch/4MB/tgt_0,1390,1099,2169,1380.785714,42,57993,202.810200 +ID: 0/io/latency/fetch/4MB/tgt_1,1902,1413,2956,1845.769231,52,95980,313.043041 +ID: 0/io/latency/fetch/4MB/tgt_2,1741,1395,2493,1783.983871,62,110607,226.501945 +ID: 0/io/latency/fetch/4MB/tgt_3,1543,1241,2568,1824.800000,45,82116,281.414092 +ID: 0/io/latency/fetch/4MB/tgt_4,1705,1506,2426,1850.020408,49,90651,232.079413 +ID: 0/io/latency/fetch/4MB/tgt_5,1579,1251,2396,1754.139535,43,75428,213.314275 +ID: 
0/io/latency/fetch/4MB/tgt_6,1566,1262,2403,1747.823529,51,89139,260.631134 +ID: 0/io/latency/fetch/4MB/tgt_7,1663,1354,2912,1853.631579,57,105657,287.610267 +ID: 0/io/latency/fetch/4MB/tgt_8,1508,1051,2276,1417.562500,64,90724,271.118956 +ID: 0/io/latency/fetch/4MB/tgt_9,1508,1404,2468,1791.788462,52,93173,251.042324 +ID: 0/io/latency/fetch/4MB/tgt_10,1746,1453,2645,1796.203125,64,114957,230.458630 +ID: 0/io/latency/fetch/4MB/tgt_11,1695,1394,2416,1761.151515,66,116236,220.046376 +ID: 0/io/latency/fetch/4MB/tgt_12,1966,1396,2654,1740.464286,56,97466,238.501684 +ID: 0/io/latency/fetch/4MB/tgt_13,1915,1341,2613,1774.536585,41,72756,237.038298 +ID: 0/io/latency/fetch/4MB/tgt_14,1861,1337,2543,1807.625000,48,86766,279.890680 +ID: 0/io/latency/fetch/4MB/tgt_15,1740,1326,2420,1733.521739,46,79742,238.393674 + +``` + + +### NVMe Device Error + +Many times, NVMe device has error which can also be an indication for slow performance or system stuck issue. + +``` +#sudo daos_metrics -S 0 -M | grep errs + media_errs: 0 errs, desc: Number of unrecovered data integrity error, units: errs + read_errs: 0 errs, desc: Number of errors reported to the engine on read commands, units: errs + write_errs: 0 errs, desc: Number of errors reported to the engine on write commands, units: errs + unmap_errs: 0 errs, desc: Number of errors reported to the engine on unmap/trim commands, units: errs + checksum_mismatch: 0 errs, desc: Number of checksum mismatch detected by the engine, units: errs + +#sudo daos_metrics -C -S 0 | grep nvm | grep err +ID: 0/nvme/0000:83:00.0/commands/media_errs,0 +ID: 0/nvme/0000:83:00.0/commands/read_errs,0 +ID: 0/nvme/0000:83:00.0/commands/write_errs,0 +ID: 0/nvme/0000:83:00.0/commands/unmap_errs,0 +ID: 0/nvme/0000:83:00.0/vendor/endtoend_err_cnt_raw,0 +ID: 0/nvme/0000:83:00.0/vendor/crc_err_cnt_raw,0 +``` + +## Metrics Unit Type + +daos_metrics output is available in multiple units. for example, Counters, Gauge. 
It can display the data based on different unit type. + +### Display Counter type metrics +A counter is a cumulative metric that represents a single monotonically increasing counter whose value can only increase or be reset or to zero on restart. + +``` +sudo daos_metrics -c -S 0 -M -C +name,value,min,max,mean,sample_size,sum,std_dev,description,units +ID: 0/events/dead_ranks,0,,,,,,Number of dead rank events received,events +ID: 0/net/uri/lookup_self,0,,,,,,total number of URI requests for self +ID: 0/net/uri/lookup_other,0,,,,,,total number of URI requests for other ranks +ID: 0/net/ofi+tcp;ofi_rxm/hg/bulks/ctx_0,0,,,,,,Mercury-layer count of bulk transfers,bulks +ID: 0/net/ofi+tcp;ofi_rxm/hg/bulks/ctx_1,0,,,,,,Mercury-layer count of bulk transfers,bulks +``` + +### Display Gauge type metrics +A gauge is a metric that represents a single numerical value that can arbitrarily go up and down. + +``` +sudo daos_metrics -S 0 -g -M -C | more +name,value,min,max,mean,sample_size,sum,std_dev,description,units +ID: 0/rank,0,,,,,,Rank ID of this engine +ID: 0/net/ofi+tcp;ofi_rxm/hg/active_rpcs/ctx_0,0,,,,,,Mercury-layer count of active RPCs,rpcs +ID: 0/net/ofi+tcp;ofi_rxm/hg/active_rpcs/ctx_1,0,,,,,,Mercury-layer count of active RPCs,rpcs +ID: 0/net/ofi+tcp;ofi_rxm/hg/active_rpcs/ctx_2,0,,,,,,Mercury-layer count of active RPCs,rpcs +ID: 0/net/ofi+tcp;ofi_rxm/hg/active_rpcs/ctx_3,0,,,,,,Mercury-layer count of active RPCs,rpcs +``` + +## Metrics Unit output format + +Gauge metrics units are in format where multiple values are display for number of samples. For example, update/fetch latency output. 
+ +``` + latency + update + 256B + tgt_0: 118 us [min: 15, max: 3703, avg: 100, sum: 200968, stddev: 124, samples: 2010] +``` + +|metrics type| definition|| +|:---:|:-------------------:|:---:| +|value|Current value|118 us| +|min|The minimum value from all data samples|15 us| +|max|The maximum value from all data samples|3703 us| +|avg|The average value based on all data samples|100 us| +|sum|The total value of all data samples|200968 us| +|stddev| Standard deviation |124 us| +|samples|Total number of data samples used for metrics at given point|2010| + +## Reset the metrics counter + +Metrics counter will be reset when system restarts or it can be reset using below command on individual servers. + +For Engine 0 & 1 (In case multiple engines are running on same node) +``` +sudo daos_metrics -S 0 -e; sudo daos_metrics -S 1 -e +``` + diff --git a/docs/admin/troubleshooting.md b/docs/admin/troubleshooting.md index 78b54340071..8d615e4991f 100644 --- a/docs/admin/troubleshooting.md +++ b/docs/admin/troubleshooting.md @@ -523,7 +523,7 @@ updated its system fabric provider. Example `system_fabric_provider_changed` RAS event from syslog: ``` -daos_server[3302185]: id: [system_fabric_provider_changed] ts: [2024-02-13T20:08:50.956+00:00] host: [boro-74.boro.hpdd.intel.com] type: [INFO] sev: [NOTICE] msg: [system fabric provider has changed: ofi+tcp -> ofi+tcp;ofi_rxm] pid: [3302185] +daos_server[3302185]: id: [system_fabric_provider_changed] ts: [2024-02-13T20:08:50.956+00:00] host: [boro-74.boro.example.com] type: [INFO] sev: [NOTICE] msg: [system fabric provider has changed: ofi+tcp -> ofi+tcp;ofi_rxm] pid: [3302185] ``` To resolve the issue: @@ -1003,7 +1003,7 @@ Please refer the [ndctl list](https://docs.pmem.io/ndctl-user-guide/ndctl-man-pa The pmempool is a management tool for Persistent Memory pool files created by PMDK libraries. DAOS uses the PMDK library to manage persistence inside ext4 files. 
-[pmempool](https://github.com/pmem/pmdk/blob/stable-2.0/doc/pmempool/pmempool-check.1.md) can check consistency of a given pool file. +[pmempool](https://github.com/daos-stack/pmdk/blob/stable-2.1/doc/pmempool/pmempool-check.1.md) can check consistency of a given pool file. It can be run with -r (repair) option which can fix some of the issues with pool file. DAOS will have more number of such pool file (vos-*), based on number of targets mention per daos engine. User may need to check each vos pool file for corruption on faulty pool. @@ -1083,11 +1083,11 @@ running under systemd run the following command: CGroup: /system.slice/rsyslog.service └─1962 /usr/sbin/rsyslogd -n -May 23 16:12:31 wolf-164.wolf.hpdd.intel.com systemd[1]: Starting System Logging Service... -May 23 16:12:31 wolf-164.wolf.hpdd.intel.com rsyslogd[1962]: [origin software="rsyslogd" swVersion="8.21> -May 23 16:12:31 wolf-164.wolf.hpdd.intel.com systemd[1]: Started System Logging Service. -May 23 16:12:31 wolf-164.wolf.hpdd.intel.com rsyslogd[1962]: imjournal: journal files changed, reloading> -May 29 03:18:01 wolf-164.wolf.hpdd.intel.com rsyslogd[1962]: [origin software="rsyslogd" swVersion="8.21> +May 23 16:12:31 wolf-164.wolf.example.com systemd[1]: Starting System Logging Service... +May 23 16:12:31 wolf-164.wolf.example.com rsyslogd[1962]: [origin software="rsyslogd" swVersion="8.21> +May 23 16:12:31 wolf-164.wolf.example.com systemd[1]: Started System Logging Service. 
+May 23 16:12:31 wolf-164.wolf.example.com rsyslogd[1962]: imjournal: journal files changed, reloading> +May 29 03:18:01 wolf-164.wolf.example.com rsyslogd[1962]: [origin software="rsyslogd" swVersion="8.21> ``` To configure a Syslog daemon to resolve the delivery errors and receive messages from 'daos_server' diff --git a/docs/admin/ucx.md b/docs/admin/ucx.md index b6529c7f1ee..c366fb1cf06 100644 --- a/docs/admin/ucx.md +++ b/docs/admin/ucx.md @@ -16,33 +16,26 @@ the following steps are needed: for information about supported MLNX\_OFED releases. * The `mercury-ucx` RPM package needs to be **manually** selected for - installation: - - - The base `mercury` RPM package ships with the libfabric plugin. - This RPM will be installed by default and is a dependency of the - `mercury-ucx` RPM. - - - The additional `mercury-ucx` RPM is also provided. This RPM contains - the UCX plugin that is required for enabling UCX support. - This RPM **must** be used in - InfiniBand environments when the intention is to use - UCX. - Attempts to install this RPM in non-Infiniband environments - will fail, because it has a dependency on UCX packages. + installation. The base `mercury` RPM package ships with no plugins. + The `mercury-ucx` RPM contains the UCX plugin that is required for + enabling UCX support. + This RPM **must** be used in InfiniBand environments when the intention + is to use UCX. Attempts to install this RPM in non-Infiniband environments + will fail, because it has a dependency on UCX packages. * At DAOS **installation** time, to enable UCX support the `mercury-ucx` RPM package must be explicitly listed. 
- For example, using the `yum`/`dnf` package manager on EL8: + For example, using the `dnf`/`yum` package manager on EL8: ```bash # on DAOS_ADMIN nodes: - yum install mercury-ucx daos-admin + dnf install mercury-ucx daos-admin # on DAOS_SERVER nodes: - yum install mercury-ucx daos-server + dnf install mercury-ucx daos-server # on DAOS_CLIENT nodes: - yum install mercury-ucx daos-client + dnf install mercury-ucx daos-client ``` After UCX support has been enabled by installing the `mercury-ucx` diff --git a/docs/dev/development.md b/docs/dev/development.md index db0bcc0dbf7..d267a2ede76 100644 --- a/docs/dev/development.md +++ b/docs/dev/development.md @@ -371,27 +371,33 @@ can be created using the daos admin tool (see next section). For more advanced configurations involving SCM, SSD or a real fabric, please refer to the next section. -## Updating a 3rd party component +## DAOS RPMs build process The DAOS build process now covers building RPMs for both DAOS and dependencies -specified in [`utils/build.config`](../../utils/build.config) (or those that we build regularly with -`--build-deps=yes`). The RPM (and deb) build process uses +specified in [`utils/build.config`](../../utils/build.config) (or those that we +build regularly with `--build-deps=yes`). The complete list of RPMs is defined +in the [`utils/rpms/build_packages.sh`](../../utils/rpms/build_packages.sh) +script. The RPM (and deb) build process uses [FPM](https://fpm.readthedocs.io/en/latest/getting-started.html). Essentially, it creates rpm packages after a DAOS build. Regardless of how that build is done, it will put files in the right places in the final packages. Most of the magic -is in [`utils/rpms/fpm_common.sh`](../../utils/rpms/fpm_common.sh) with component specific code in -`utils/rpms/.sh`. +is in [`utils/rpms/fpm_common.sh`](../../utils/rpms/fpm_common.sh) with +component specific code in `utils/rpms/.sh`. 
+ +### Updating a 3rd party component In order to properly upgrade a 3rd party component, do all of the following: -1. Change the `utils/build.config` to point to the new version or to add a new patch. Patches should - be stored in `deps/patches/`. -1. Update [`utils/rpms/daos.spec`](../../utils/rpms/daos.spec) changelog and release iteration. - This is important to document the change. +1. Change the [`utils/build.config`](../../utils/build.config) to point to + the new version or to add a new patch. Patches should be stored in + `deps/patches/`. +1. Update (increase) the `Release` variable in [`utils/rpms/daos.spec`](../../utils/rpms/daos.spec). +1. Update [`utils/rpms/daos.changelog`](../../utils/rpms/daos.changelog) + changelog and release iteration. This is important to document the change. 1. Update the `_release` and/or `_version` in [`utils/rpms/package_info.sh`](../../utils/rpms/package_info.sh) 1. Make any necessary changes to `utils/rpms/.sh` such as adding new files to various packages. -1. Update the `utils/rpms/.changelog` file to document the change and make sure - the changelog file is referenced by the `RPM_CHANGELOG=".changelog"` variable - in `utils/rpms/.sh`. +1. Update the `utils/rpms/.changelog` file to document the change and + make sure the file is referenced by the + `RPM_CHANGELOG=".changelog"` variable in `utils/rpms/.sh`. 
diff --git a/docs/overview/terminology.md b/docs/overview/terminology.md index 677e03f579c..4815199e6cc 100644 --- a/docs/overview/terminology.md +++ b/docs/overview/terminology.md @@ -32,7 +32,7 @@ |[OFI](https://ofiwg.github.io/libfabric/)|Open Fabrics Interfaces| |OS|Operating System| |PM|Persistent Memory| -|[PMDK](https://github.com/pmem/pmdk)|Persistent Memory Devevelopment Kit| +|[PMDK](https://github.com/daos-stack/pmdk)|Persistent Memory Devevelopment Kit| |[RAFT](https://raft.github.io/)|Raft is a consensus algorithm used to distribute state transitions among DAOS server nodes.| |RAS|Reliability, Availability & Serviceability| |RDB|Replicated Database, containing pool metadata and maintained across DAOS servers using the Raft algorithm.| diff --git a/docs/testing/ior.md b/docs/testing/ior.md index 15137866d5a..d52ed1e0226 100644 --- a/docs/testing/ior.md +++ b/docs/testing/ior.md @@ -54,7 +54,7 @@ $ mpirun -hostfile /path/to/hostfile_clients -np 30 /bin/ior -a POSIX IOR-3.4.0+dev: MPI Coordinated Test of Parallel I/O Began : Thu Apr 29 23:23:09 2021 Command line : ior -a POSIX -b 5G -t 1M -v -W -w -r -R -i 1 -o /tmp/daos_dfuse/testfile -Machine : Linux wolf-86.wolf.hpdd.intel.com +Machine : Linux wolf-86.wolf.example.com Start time skew across all tasks: 0.00 sec TestID : 0 StartTime : Thu Apr 29 23:23:09 2021 diff --git a/docs/user/container.md b/docs/user/container.md index a6c851506e4..b1538b58537 100644 --- a/docs/user/container.md +++ b/docs/user/container.md @@ -435,17 +435,13 @@ during container create. - cksum (`DAOS_PROP_CO_CSUM`): the type of checksum algorithm to use. Supported values are adler32, crc[16|32|64] or sha[1|256|512]. By default, - checksum is disabled for new containers. + checksum is enabled for new containers using crc32. - cksum\_size (`DAOS_PROP_CO_CSUM_CHUNK_SIZE`): defines the chunk size used for creating checksums of array types. (default is 32K). 
-- srv\_cksum (`DAOS_PROP_CO_CSUM_SERVER_VERIFY`): Because of the probable decrease to - IOPS, in most cases, it is not desired to verify checksums on an object - update on the server side. It is sufficient for the client to verify on - a fetch because any data corruption, whether on the object update, - storage, or fetch, will be caught. However, there is an advantage to - knowing if corruption happens on an update. The update would fail - right away, indicating to the client to retry the RPC or report an - error to upper levels. +- srv\_cksum (`DAOS_PROP_CO_CSUM_SERVER_VERIFY`): verify the checksum on an + object update on the server side. This is enabled by default. Verifying checksums + on update allows to pro-actively detect corruption over the wire and retry the RPC + from the client, but has an impact on IOPS. For instance, to create a new container with crc64 checksum enabled and checksum verification on the server side, one can use the following command @@ -471,9 +467,8 @@ The DAOS erasure code implementation uses a fixed cell size that applies to all objects in the container. The cell size in DAOS is the size of a single data and parity fragment. By default, a container's `ec_cell_sz` property is inherited from the pool's -default `ec_cell_sz`, which was 1MiB in DAOS 2.0 and has been reduced to -64kiB in DAOS 2.2. The container cell size can also be set at -container creation time via the `--property` option: +default `ec_cell_sz`, which is 128kiB. 
The container cell size can also be set +at container creation time via the `--property` option: ```bash $ daos cont create tank mycont5 --type POSIX --properties rd_fac:1,cell_size:131072 diff --git a/docs/user/filesystem.md b/docs/user/filesystem.md index ca07a638637..77459efd166 100644 --- a/docs/user/filesystem.md +++ b/docs/user/filesystem.md @@ -486,7 +486,7 @@ $ systemctl status scratch_fs-root_dfuse.mount Docs: man:fstab(5) man:systemd-fstab-generator(8) -Sep 23 15:55:33 wolf-170.wolf.hpdd.intel.com systemd[1]: scratch_fs-root_dfuse.mount: Succeeded. +Sep 23 15:55:33 wolf-170.wolf.example.com systemd[1]: scratch_fs-root_dfuse.mount: Succeeded. $ systemctl start scratch_fs-root_dfuse.mount $ df -h | grep fuse dfuse 537G 5.1G 532G 1% /scratch_fs/root_dfuse @@ -505,8 +505,8 @@ $ systemctl status scratch_fs-root_dfuse.mount CGroup: /system.slice/scratch_fs-root_dfuse.mount └─4173 dfuse /scratch_fs/root_dfuse -o rw pool=admin_pool container=admin_cont dev suid -Sep 23 15:57:52 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounting /scratch_fs/root_dfuse... -Sep 23 15:57:53 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounted /scratch_fs/root_dfuse. +Sep 23 15:57:52 wolf-170.wolf.example.com systemd[1]: Mounting /scratch_fs/root_dfuse... +Sep 23 15:57:53 wolf-170.wolf.example.com systemd[1]: Mounted /scratch_fs/root_dfuse. $ systemctl stop scratch_fs-root_dfuse.mount $ systemctl status scratch_fs-root_dfuse.mount ● scratch_fs-root_dfuse.mount - /scratch_fs/root_dfuse @@ -520,11 +520,11 @@ $ systemctl status scratch_fs-root_dfuse.mount Memory: 540.0K CGroup: /system.slice/scratch_fs-root_dfuse.mount -Sep 23 15:57:52 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounting /scratch_fs/root_dfuse... -Sep 23 15:57:53 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounted /scratch_fs/root_dfuse. -Sep 23 15:58:32 wolf-170.wolf.hpdd.intel.com systemd[1]: Unmounting /scratch_fs/root_dfuse... -Sep 23 15:58:32 wolf-170.wolf.hpdd.intel.com systemd[1]: scratch_fs-root_dfuse.mount: Succeeded. 
-Sep 23 15:58:32 wolf-170.wolf.hpdd.intel.com systemd[1]: Unmounted /scratch_fs/root_dfuse. +Sep 23 15:57:52 wolf-170.wolf.example.com systemd[1]: Mounting /scratch_fs/root_dfuse... +Sep 23 15:57:53 wolf-170.wolf.example.com systemd[1]: Mounted /scratch_fs/root_dfuse. +Sep 23 15:58:32 wolf-170.wolf.example.com systemd[1]: Unmounting /scratch_fs/root_dfuse... +Sep 23 15:58:32 wolf-170.wolf.example.com systemd[1]: scratch_fs-root_dfuse.mount: Succeeded. +Sep 23 15:58:32 wolf-170.wolf.example.com systemd[1]: Unmounted /scratch_fs/root_dfuse. $ ``` @@ -580,8 +580,8 @@ $ systemctl status scratch_fs-root_dfuse.mount CGroup: /system.slice/scratch_fs-root_dfuse.mount └─2346 dfuse /scratch_fs/root_dfuse -o rw pool=admin_pool container=admin_cont dev suid -Sep 23 16:13:34 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounting /scratch_fs/root_dfuse... -Sep 23 16:13:35 wolf-170.wolf.hpdd.intel.com systemd[1]: Mounted /scratch_fs/root_dfuse. +Sep 23 16:13:34 wolf-170.wolf.example.com systemd[1]: Mounting /scratch_fs/root_dfuse... +Sep 23 16:13:35 wolf-170.wolf.example.com systemd[1]: Mounted /scratch_fs/root_dfuse. 
$ ``` diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py index 18adfca2f1f..f6ab99d6245 100644 --- a/site_scons/components/__init__.py +++ b/site_scons/components/__init__.py @@ -1,6 +1,6 @@ # Copyright 2016-2024 Intel Corporation # Copyright 2025 Google LLC -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -32,9 +32,7 @@ # Check if this is an ARM platform PROCESSOR = platform.machine() ARM_LIST = ["ARMv7", "armeabi", "aarch64", "arm64"] -ARM_PLATFORM = False -if PROCESSOR.lower() in [x.lower() for x in ARM_LIST]: - ARM_PLATFORM = True +ARM_PLATFORM = PROCESSOR.lower() in [x.lower() for x in ARM_LIST] class InstalledComps(): @@ -95,10 +93,12 @@ def check(reqs, name, built_str, installed_str=""): def ofi_config(config): """Check ofi version""" if not GetOption('silent'): - print('Checking for libfabric > 1.11...', end=' ') + print('Checking for libfabric >= 1.20...', end=' ') code = """#include -_Static_assert(FI_MAJOR_VERSION == 1 && FI_MINOR_VERSION >= 11, - "libfabric must be >= 1.11");""" +_Static_assert(FI_VERSION_GE( + FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + FI_VERSION(1, 20)), + "libfabric must be >= 1.20");""" rc = config.TryCompile(code, ".c") if not GetOption('silent'): print('yes' if rc else 'no') @@ -202,17 +202,13 @@ def define_mercury(reqs): '-DMERCURY_USE_SYSTEM_BOOST:BOOL=ON', '-DMERCURY_USE_CHECKSUMS:BOOL=OFF', '-DMERCURY_ENABLE_COUNTERS:BOOL=ON', + '-DMERCURY_ENABLE_DEBUG:BOOL=ON', '-DNA_USE_DYNAMIC_PLUGINS:BOOL=ON', '-DNA_USE_SM:BOOL=ON', '-DNA_USE_OFI:BOOL=ON', '-DNA_USE_UCX:BOOL=ON', '../mercury'] - if reqs.target_type == 'debug': - mercury_build.append('-DMERCURY_ENABLE_DEBUG:BOOL=ON') - else: - mercury_build.append('-DMERCURY_ENABLE_DEBUG:BOOL=OFF') - 
reqs.define('mercury', retriever=GitRepoRetriever(True), commands=[mercury_build, @@ -273,14 +269,15 @@ def define_components(reqs): reqs.define('isal', retriever=GitRepoRetriever(), commands=[['./autogen.sh'], - ['./configure', '--prefix=$ISAL_PREFIX', '--libdir=$ISAL_PREFIX/lib64'], + ['./configure', '--disable-static', '--prefix=$ISAL_PREFIX', + '--libdir=$ISAL_PREFIX/lib64'], ['make'], ['make', 'install']], libs=['isal']) reqs.define('isal_crypto', retriever=GitRepoRetriever(), - commands=[['./autogen.sh'], - ['./configure', + commands=[['./autogen.sh', '--no-oshmem'], + ['./configure', '--disable-static', '--prefix=$ISAL_CRYPTO_PREFIX', '--libdir=$ISAL_CRYPTO_PREFIX/lib64'], ['make'], @@ -357,8 +354,12 @@ def define_components(reqs): # https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html dist = distro.linux_distribution() + spdk_reqs = ['isal', 'isal_crypto'] + spdk_conf = ['--with-isal=$ISAL_PREFIX', '--with-isal-crypto=$ISAL_CRYPTO_PREFIX'] if ARM_PLATFORM: spdk_arch = 'native' + spdk_reqs = [] + spdk_conf = [] elif dist[0] == 'CentOS Linux' and dist[1] == '7': spdk_arch = 'native' elif dist[0] == 'Ubuntu' and dist[1] == '20.04': @@ -374,25 +375,24 @@ def define_components(reqs): '--prefix=$SPDK_PREFIX', '--disable-tests', '--disable-unit-tests', - '--disable-apps', '--without-vhost', - '--without-crypto', - '--without-pmdk', '--without-rbd', '--without-iscsi-initiator', - '--without-isal', '--without-vtune', '--with-shared', - f'--target-arch={spdk_arch}'], + '--without-nvme-cuse', + '--without-crypto', + f'--target-arch={spdk_arch}'] + spdk_conf, ['make', f'CONFIG_ARCH={spdk_arch}'], ['make', 'libdir=$SPDK_PREFIX/lib64/daos_srv', 'includedir=$SPDK_PREFIX/include/daos_srv', 'install'], [copy_files, 'dpdk/build/lib', '$SPDK_PREFIX/lib64/daos_srv'], + ['rm', '-rf', '$SPDK_PREFIX/lib'], [copy_files, 'dpdk/build/include', '$SPDK_PREFIX/include/daos_srv/dpdk'], [copy_files, 'include', '$SPDK_PREFIX/share/daos/spdk/include'], [copy_files, 'scripts', 
'$SPDK_PREFIX/share/daos/spdk/scripts'], - ['mv', '$SPDK_PREFIX/bin/spdk_nvme_discovery_aer', - '$SPDK_PREFIX/bin/daos_spdk_nvme_discovery_aer'], + ['mv', '$SPDK_PREFIX/bin/spdk_nvme_discover', + '$SPDK_PREFIX/bin/daos_spdk_nvme_discover'], ['cp', 'build/examples/lsvmd', '$SPDK_PREFIX/bin/daos_spdk_nvme_lsvmd'], ['cp', 'build/examples/nvme_manage', '$SPDK_PREFIX/bin/daos_spdk_nvme_manage'], @@ -404,7 +404,8 @@ def define_components(reqs): extra_lib_path=['lib64/daos_srv'], headers=['spdk/nvme.h'], pkgconfig='daos_spdk', - patch_rpath=['lib64/daos_srv', 'bin']) + patch_rpath=['lib64/daos_srv', 'bin'], + requires=spdk_reqs) reqs.define('protobufc', retriever=GitRepoRetriever(), diff --git a/site_scons/prereq_tools/base.py b/site_scons/prereq_tools/base.py index 207bdbc0f8b..a1cd84fab2a 100644 --- a/site_scons/prereq_tools/base.py +++ b/site_scons/prereq_tools/base.py @@ -34,6 +34,7 @@ import traceback from copy import deepcopy +from SCons.Errors import InternalError from SCons.Script import BUILD_TARGETS, Dir, Exit, GetOption, SetOption, WhereIs from SCons.Variables import BoolVariable, EnumVariable, ListVariable, PathVariable @@ -517,6 +518,7 @@ def __init__(self, env, opts): opts.Add(EnumVariable('WARNING_LEVEL', "Set default warning level", 'error', ['warning', 'warn', 'error'], ignorecase=2)) opts.Add(('SANITIZERS', 'Instrument C code with google sanitizers', None)) + opts.Add(BoolVariable('CMOCKA_FILTER_SUPPORTED', 'Allows to filter cmocka tests', False)) opts.Update(self.__env) @@ -637,6 +639,33 @@ def _setup_build_type(self): return self.__env.subst("$BUILD_ROOT/$BUILD_TYPE/$COMPILER") + def _setup_intelc(self): + """Setup environment to use Intel compilers""" + try: + env = self.__env.Clone(tools=['doneapi']) + self._has_icx = True + except InternalError: + print("No oneapi compiler, trying legacy") + env = self.__env.Clone(tools=['intelc']) + self.__env["ENV"]["PATH"] = env["ENV"]["PATH"] + self.__env["ENV"]["LD_LIBRARY_PATH"] = env["ENV"]["LD_LIBRARY_PATH"] 
+ self.__env.Replace(AR=env.get("AR")) + self.__env.Replace(ENV=env.get("ENV")) + self.__env.Replace(CC=env.get("CC")) + self.__env.Replace(CXX=env.get("CXX")) + version = env.get("INTEL_C_COMPILER_VERSION") + self.__env.Replace(INTEL_C_COMPILER_VERSION=version) + self.__env.Replace(LINK=env.get("LINK")) + # disable the warning about Cilk since we don't use it + if not self._has_icx: + self.__env.AppendUnique(LINKFLAGS=["-static-intel", + "-diag-disable=10237"]) + self.__env.AppendUnique(CCFLAGS=["-diag-disable:2282", + "-diag-disable:188", + "-diag-disable:2405", + "-diag-disable:1338"]) + return {'CC': env.get("CC"), "CXX": env.get("CXX")} + def _setup_compiler(self): """Setup the compiler to use""" compiler_map = {'gcc': {'CC': 'gcc', 'CXX': 'g++'}, @@ -650,6 +679,8 @@ def _setup_compiler(self): return compiler = self.__env.get('COMPILER') + if compiler == 'icc': + compiler_map['icc'] = self._setup_intelc() if self.__env.get('WARNING_LEVEL') == 'error': if compiler == 'icc' and not self._has_icx: diff --git a/site_scons/site_tools/compiler_setup.py b/site_scons/site_tools/compiler_setup.py index 51909b8cfe6..812ef0632dd 100644 --- a/site_scons/site_tools/compiler_setup.py +++ b/site_scons/site_tools/compiler_setup.py @@ -92,7 +92,8 @@ def _base_setup(env): if build_type == 'debug': if compiler == 'gcc': - env.AppendUnique(CCFLAGS=['-Og']) + env['CCFLAGS'].remove('-g') + env.AppendUnique(CCFLAGS=['-g3', '-Og']) else: env.AppendUnique(CCFLAGS=['-O0']) else: @@ -106,7 +107,10 @@ def _base_setup(env): env.AppendUnique(CPPDEFINES={'FAULT_INJECTION': '1'}) env.AppendUnique(CPPDEFINES={'BUILD_PIPELINE': '1'}) - env.AppendUnique(CPPDEFINES={'CMOCKA_FILTER_SUPPORTED': '0'}) + if env['CMOCKA_FILTER_SUPPORTED']: + env.AppendUnique(CPPDEFINES={'CMOCKA_FILTER_SUPPORTED': '1'}) + else: + env.AppendUnique(CPPDEFINES={'CMOCKA_FILTER_SUPPORTED': '0'}) env.AppendUnique(CPPDEFINES='_GNU_SOURCE') diff --git a/src/bio/SConscript b/src/bio/SConscript index 93866e078e8..444a243eb3c 
100644 --- a/src/bio/SConscript +++ b/src/bio/SConscript @@ -32,7 +32,7 @@ def scons(): libs += ['spdk_vmd', 'spdk_event_bdev', 'spdk_init'] # Other libs - libs += ['numa', 'dl', 'smd', 'abt'] + libs += ['numa', 'dl', 'smd', 'abt', 'ssl'] tgts = FILES + control_tgts bio = denv.d_library("bio", tgts, install_off="../..", LIBS=libs) diff --git a/src/bio/bio_buffer.c b/src/bio/bio_buffer.c index b043de2f00b..55b67988cff 100644 --- a/src/bio/bio_buffer.c +++ b/src/bio/bio_buffer.c @@ -280,6 +280,7 @@ bio_iod_alloc(struct bio_io_context *ctxt, struct umem_instance *umem, return NULL; D_ASSERT(type < BIO_IOD_TYPE_MAX); + bio_io_lug_init(&biod->bd_io_lug); biod->bd_umem = umem; biod->bd_ctxt = ctxt; biod->bd_type = type; @@ -336,6 +337,7 @@ bio_iod_free(struct bio_desc *biod) bio_sgl_fini(&biod->bd_sgls[i]); D_FREE(biod->bd_bulk_hdls); + bio_io_lug_fini(&biod->bd_io_lug); D_FREE(biod); } @@ -1041,8 +1043,7 @@ rw_completion(void *cb_arg, int err) bxb = biod->bd_ctxt->bic_xs_blobstore; D_ASSERT(bxb != NULL); - D_ASSERT(bxb->bxb_blob_rw > 0); - bxb->bxb_blob_rw--; + bio_io_lug_dequeue(bxb, &biod->bd_io_lug); io_ctxt = biod->bd_ctxt; D_ASSERT(io_ctxt != NULL); @@ -1184,7 +1185,7 @@ nvme_rw(struct bio_desc *biod, struct bio_rsrvd_region *rg) biod->bd_dma_issued = 1; biod->bd_inflights++; - bxb->bxb_blob_rw++; + bio_io_lug_enqueue(xs_ctxt, bxb, &biod->bd_io_lug); biod->bd_ctxt->bic_inflight_dmas++; rw_cnt = (pg_cnt > bio_chk_sz) ? 
bio_chk_sz : pg_cnt; @@ -1982,3 +1983,42 @@ bio_copy(struct bio_io_context *ioctxt, struct umem_instance *umem, return rc; } + +#define IO_MONITOR_INTVL 1000000 /* us, 1 second */ + +void +bio_io_monitor(struct bio_xs_context *xs_ctxt, uint64_t now) +{ + enum smd_dev_type st; + struct bio_xs_blobstore *bxb; + struct bio_io_lug *io_lug; + struct media_error_msg *mem; + + if ((xs_ctxt->bxc_io_monitor_ts + IO_MONITOR_INTVL) > now) + return; + + xs_ctxt->bxc_io_monitor_ts = now; + + for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { + bxb = xs_ctxt->bxc_xs_blobstores[st]; + + if (!bxb || d_list_empty(&bxb->bxb_pending_ios)) + continue; + + io_lug = d_list_entry(bxb->bxb_pending_ios.next, struct bio_io_lug, bil_link); + D_ASSERT(io_lug->bil_submit_ts != 0); + + if ((io_lug->bil_submit_ts + bio_io_timeout) >= now) + continue; + + D_ALLOC_PTR(mem); + if (mem == NULL) { + D_ERROR("Out of memory: NVMe stalled I/O report is skipped\n"); + continue; + } + mem->mem_err_type = MET_IO_STALLED; + mem->mem_bs = bxb->bxb_blobstore; + mem->mem_tgt_id = xs_ctxt->bxc_tgt_id; + spdk_thread_send_msg(owner_thread(mem->mem_bs), bio_media_error, mem); + } +} diff --git a/src/bio/bio_config.c b/src/bio/bio_config.c index 80485070c83..f4a33fd1f21 100644 --- a/src/bio/bio_config.c +++ b/src/bio/bio_config.c @@ -960,10 +960,7 @@ bio_read_auto_faulty_criteria(const char *nvme_conf, bool *enable, uint32_t *max if (rc != 0) { if (rc == JSON_NOT_FOUND) { rc = 0; - *enable = false; - *max_io_errs = UINT32_MAX; - *max_csum_errs = UINT32_MAX; - D_DEBUG(DB_MGMT, "bdev auto-faulty criteria disabled as not configured\n"); + D_DEBUG(DB_MGMT, "bdev auto-faulty criteria not set in config\n"); } return rc; } diff --git a/src/bio/bio_context.c b/src/bio/bio_context.c index be015df2959..0a93de41696 100644 --- a/src/bio/bio_context.c +++ b/src/bio/bio_context.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2018-2025 Intel Corporation. 
+ * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -12,6 +13,7 @@ struct blob_cp_arg { spdk_blob_id bca_id; struct spdk_blob *bca_blob; + struct bio_io_lug bca_io_lug; /* * Completion could run on different xstream when NVMe * device is shared by multiple xstreams. @@ -35,6 +37,7 @@ blob_cp_arg_init(struct blob_cp_arg *ba) { int rc; + bio_io_lug_init(&ba->bca_io_lug); rc = ABT_eventual_create(0, &ba->bca_eventual); if (rc != ABT_SUCCESS) return dss_abterr2der(rc); @@ -45,6 +48,7 @@ blob_cp_arg_init(struct blob_cp_arg *ba) static inline void blob_cp_arg_fini(struct blob_cp_arg *ba) { + bio_io_lug_fini(&ba->bca_io_lug); ABT_eventual_free(&ba->bca_eventual); } @@ -163,8 +167,7 @@ blob_unmap_cb(void *arg, int rc) bxb = bma->bma_ioc->bic_xs_blobstore; D_ASSERT(bxb != NULL); - D_ASSERT(bxb->bxb_blob_rw > 0); - bxb->bxb_blob_rw--; + bio_io_lug_dequeue(bxb, &ba->bca_io_lug); blob_common_cb(ba, rc); } @@ -904,6 +907,10 @@ int bio_mc_open(struct bio_xs_context *xs_ctxt, uuid_t pool_id, D_ASSERT(xs_ctxt != NULL); + if (DAOS_FAIL_CHECK(DAOS_FAULT_POOL_OPEN_BIO)) { /** fault injection */ + return daos_errno2der(daos_fail_value_get()); + } + *mc = NULL; if (!bio_nvme_configured(SMD_DEV_TYPE_META)) { /* No data blob for RDB */ @@ -1227,7 +1234,7 @@ blob_unmap_sgl(struct bio_io_context *ioctxt, d_sg_list_t *unmap_sgl, uint32_t b drain_inflight_ios(xs_ctxt, bxb); ba->bca_inflights++; - bxb->bxb_blob_rw++; + bio_io_lug_enqueue(xs_ctxt, bxb, &ba->bca_io_lug); pg_off = (uint64_t)unmap_iov->iov_buf; pg_cnt = unmap_iov->iov_len; diff --git a/src/bio/bio_device.c b/src/bio/bio_device.c index 07deb6c7b10..3725210d1fe 100644 --- a/src/bio/bio_device.c +++ b/src/bio/bio_device.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -15,6 +15,7 @@ #include #include #include +#include #include "smd.pb-c.h" @@ -48,13 +49,10 @@ revive_dev(struct bio_xs_context *xs_ctxt, struct bio_bdev *d_bdev) d_bdev->bb_trigger_reint = 1; spdk_thread_send_msg(owner_thread(bbs), setup_bio_bdev, d_bdev); - /** - * Reset the LED of the VMD device once revived, a DER_NOTSUPPORTED indicates that VMD-LED - * control is not enabled on device. - */ + /* Reset the LED of the VMD device once revived */ rc = bio_led_manage(xs_ctxt, NULL, d_bdev->bb_uuid, (unsigned int)CTL__LED_ACTION__RESET, NULL, 0); - if ((rc != 0) && (rc != -DER_NOTSUPPORTED)) + if (rc != 0) DL_ERROR(rc, "Reset LED on device:" DF_UUID " failed", DP_UUID(d_bdev->bb_uuid)); return 0; @@ -695,11 +693,12 @@ static void led_device_action(void *ctx, struct spdk_pci_device *pci_device) { struct led_opts *opts = ctx; - enum spdk_vmd_led_state cur_led_state; - Ctl__LedState d_led_state; + enum spdk_vmd_led_state cur_led_state = SPDK_VMD_LED_STATE_UNKNOWN; + Ctl__LedState d_led_state = CTL__LED_STATE__NA; const char *pci_dev_type = NULL; char addr_buf[ADDR_STR_MAX_LEN + 1]; int rc; + bool vmd_on; if (opts->status != 0) return; @@ -726,41 +725,45 @@ led_device_action(void *ctx, struct spdk_pci_device *pci_device) return; } - if (strncmp(pci_dev_type, NVME_PCI_DEV_TYPE_VMD, strlen(NVME_PCI_DEV_TYPE_VMD)) != 0) { - D_DEBUG(DB_MGMT, "Found non-VMD device type (%s:%s), can't manage LED\n", - pci_dev_type, addr_buf); - opts->status = -DER_NOTSUPPORTED; - return; - } + vmd_on = strncmp(pci_dev_type, NVME_PCI_DEV_TYPE_VMD, strlen(NVME_PCI_DEV_TYPE_VMD)) == 0; - /* First check the current state of the VMD LED */ - rc = spdk_vmd_get_led_state(pci_device, &cur_led_state); - if (spdk_unlikely(rc != 0)) { - D_ERROR("Failed to retrieve the state of the LED on %s (%s)\n", addr_buf, - 
spdk_strerror(-rc)); - opts->status = -DER_NOSYS; - return; - } + D_DEBUG(DB_MGMT, "led_device_action addr:%s, action:%s", addr_buf, + LED_ACTION_NAME(opts->action)); + + if (vmd_on) { + /* First check the current state of the VMD LED */ + rc = spdk_vmd_get_led_state(pci_device, &cur_led_state); + if (spdk_unlikely(rc != 0)) { + D_ERROR("Failed to retrieve the state of the LED on %s (%s)\n", addr_buf, + spdk_strerror(-rc)); + opts->status = -DER_NOSYS; + return; + } - /* Convert state to Ctl__LedState from SPDK led_state */ - d_led_state = led_state_spdk2daos(cur_led_state); + /* Convert state to Ctl__LedState from SPDK led_state */ + d_led_state = led_state_spdk2daos(cur_led_state); - D_DEBUG(DB_MGMT, "led on dev %s has state: %s (action: %s, new state: %s)\n", addr_buf, - LED_STATE_NAME(d_led_state), LED_ACTION_NAME(opts->action), - LED_STATE_NAME(opts->led_state)); + D_DEBUG(DB_MGMT, "vmd led on dev %s has state: %s (action: %s, new state: %s)\n", + addr_buf, LED_STATE_NAME(d_led_state), LED_ACTION_NAME(opts->action), + LED_STATE_NAME(opts->led_state)); + } switch (opts->action) { case CTL__LED_ACTION__GET: - /* Return early with current device state set */ - opts->led_state = d_led_state; + if (vmd_on) + /* Return early with current device state set */ + opts->led_state = d_led_state; + else + /* Leave state as NA */ + D_ERROR("LED state GET not supported for non-VMD device (type %s:%s)\n", + pci_dev_type, addr_buf); return; case CTL__LED_ACTION__SET: break; case CTL__LED_ACTION__RESET: /* Reset intercepted earlier in call-stack and converted to set */ - D_ERROR("Reset action is not supported\n"); - opts->status = -DER_INVAL; - return; + D_ERROR("Reset action unsupported in this code path\n"); + D_ASSERT(false); default: D_ERROR("Unrecognized LED action requested\n"); opts->status = -DER_INVAL; @@ -773,30 +776,44 @@ led_device_action(void *ctx, struct spdk_pci_device *pci_device) return; } - /* Set the LED to the new state */ - rc = 
spdk_vmd_set_led_state(pci_device, led_state_daos2spdk(opts->led_state)); - if (spdk_unlikely(rc != 0)) { - D_ERROR("Failed to set the VMD LED state on %s (%s)\n", addr_buf, - spdk_strerror(-rc)); - opts->status = -DER_NOSYS; - return; - } + if (vmd_on) { + /* Set the LED to the new state */ + rc = spdk_vmd_set_led_state(pci_device, led_state_daos2spdk(opts->led_state)); + if (spdk_unlikely(rc != 0)) { + D_ERROR("Failed to set the VMD LED state on %s (%s)\n", addr_buf, + spdk_strerror(-rc)); + opts->status = -DER_NOSYS; + return; + } - rc = spdk_vmd_get_led_state(pci_device, &cur_led_state); - if (rc != 0) { - D_ERROR("Failed to get the VMD LED state on %s (%s)\n", addr_buf, - spdk_strerror(-rc)); - opts->status = -DER_NOSYS; - return; + rc = spdk_vmd_get_led_state(pci_device, &cur_led_state); + if (rc != 0) { + D_ERROR("Failed to get the VMD LED state on %s (%s)\n", addr_buf, + spdk_strerror(-rc)); + opts->status = -DER_NOSYS; + return; + } + d_led_state = led_state_spdk2daos(cur_led_state); + } else { + /* Set current state to expected if no VMD */ + d_led_state = opts->led_state; } - d_led_state = led_state_spdk2daos(cur_led_state); /* Verify the correct state is set */ if (d_led_state != opts->led_state) { D_ERROR("Unexpected LED state on %s, want %s got %s\n", addr_buf, LED_STATE_NAME(opts->led_state), LED_STATE_NAME(d_led_state)); - opts->status = -DER_INVAL; + opts->status = -DER_MISC; + return; } + + /** + * Print RAS event for LED change. If no VMD, the RAS events may be used to trigger LED + * control mechanisms outside of SPDK and/or DAOS. 
+ */ + ras_notify_eventf(RAS_DEVICE_LED_SET, RAS_TYPE_INFO, RAS_SEV_NOTICE, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "LED on device %s set to state %s", + addr_buf, LED_STATE_NAME(opts->led_state)); } static int @@ -893,7 +910,7 @@ led_manage(struct bio_xs_context *xs_ctxt, struct spdk_pci_addr pci_addr, Ctl__L case CTL__LED_ACTION__SET: opts.action = action; if (state == NULL) { - D_ERROR("LED state not set for SET action\n"); + D_ERROR("LED state not set, missing state field\n"); return -DER_INVAL; } opts.led_state = *state; @@ -920,15 +937,12 @@ led_manage(struct bio_xs_context *xs_ctxt, struct spdk_pci_addr pci_addr, Ctl__L spdk_pci_for_each_device(&opts, led_device_action); if (opts.status != 0) { - if (opts.status != -DER_NOTSUPPORTED) { - if (state != NULL) - D_ERROR("LED %s failed (target state: %s): %s\n", - LED_ACTION_NAME(action), LED_STATE_NAME(*state), - spdk_strerror(opts.status)); - else - D_ERROR("LED %s failed: %s\n", LED_ACTION_NAME(action), - spdk_strerror(opts.status)); - } + if (state != NULL) + D_ERROR("LED %s failed (target state: %s): %s\n", LED_ACTION_NAME(action), + LED_STATE_NAME(*state), spdk_strerror(opts.status)); + else + D_ERROR("LED %s failed: %s\n", LED_ACTION_NAME(action), + spdk_strerror(opts.status)); return opts.status; } @@ -999,15 +1013,20 @@ dev_uuid2pci_addr(struct spdk_pci_addr *pci_addr, uuid_t dev_uuid) } rc = fill_in_traddr(&b_info, d_bdev->bb_name); - if (rc || b_info.bdi_traddr == NULL) { - D_DEBUG(DB_MGMT, "Unable to get traddr for device %s\n", d_bdev->bb_name); + if (rc) { + D_ERROR("Unable to get traddr for device %s\n", d_bdev->bb_name); return -DER_INVAL; } + if (b_info.bdi_traddr == NULL) { + D_DEBUG(DB_MGMT, "Skipping get traddr for device %s (not NVMe?)\n", + d_bdev->bb_name); + return -DER_NOTSUPPORTED; + } rc = spdk_pci_addr_parse(pci_addr, b_info.bdi_traddr); if (rc != 0) { - D_DEBUG(DB_MGMT, "Unable to parse PCI address for device %s (%s)\n", - b_info.bdi_traddr, spdk_strerror(-rc)); + 
D_ERROR("Unable to parse PCI address for device %s (%s)\n", b_info.bdi_traddr, + spdk_strerror(-rc)); rc = -DER_INVAL; } @@ -1015,18 +1034,22 @@ dev_uuid2pci_addr(struct spdk_pci_addr *pci_addr, uuid_t dev_uuid) return rc; } +static bool +is_pci_addr_valid(const struct spdk_pci_addr *addr) +{ + struct spdk_pci_addr zero = {0}; + + return spdk_pci_addr_compare(addr, &zero) != 0; +} + int bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, unsigned int action, unsigned int *state, uint64_t duration) { - struct spdk_pci_addr pci_addr; + struct spdk_pci_addr pci_addr = {0}; int addr_len = 0; int rc; - /* LED management on NVMe devices currently only supported when VMD is enabled. */ - if (!bio_vmd_enabled) - return -DER_NOTSUPPORTED; - /** * If tr_addr is already provided, convert to a PCI address. If tr_addr is NULL or empty, * derive PCI address from the provided UUID and if tr_addr is an empty string buffer then @@ -1035,14 +1058,21 @@ bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, u if (tr_addr != NULL) { addr_len = strnlen(tr_addr, SPDK_NVMF_TRADDR_MAX_LEN + 1); - if (addr_len == SPDK_NVMF_TRADDR_MAX_LEN + 1) + if (addr_len == SPDK_NVMF_TRADDR_MAX_LEN + 1) { + D_ERROR("Address string too long"); return -DER_INVAL; + } } if (addr_len == 0) { rc = dev_uuid2pci_addr(&pci_addr, dev_uuid); + if (rc == -DER_NOTSUPPORTED) { + /* Skip LED action for device without valid PCI address */ + return 0; + } if (rc != 0) { - DL_ERROR(rc, "Failed to read PCI addr from dev UUID"); + DL_ERROR(rc, "Failed to read PCI addr from device " DF_UUID, + DP_UUID(dev_uuid)); return rc; } @@ -1050,7 +1080,7 @@ bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, u /* Populate tr_addr buffer to return address */ rc = spdk_pci_addr_fmt(tr_addr, addr_len, &pci_addr); if (rc != 0) { - D_ERROR("Failed to write VMD's PCI address (%s)\n", + D_ERROR("Failed to write VMD's PCI address (%s)", spdk_strerror(-rc)); return 
-DER_INVAL; } @@ -1058,12 +1088,135 @@ bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, u } else { rc = spdk_pci_addr_parse(&pci_addr, tr_addr); if (rc != 0) { - D_ERROR("Unable to parse PCI address for device %s (%s)\n", tr_addr, + D_ERROR("Unable to parse PCI address for device %s (%s)", tr_addr, spdk_strerror(-rc)); return -DER_INVAL; } } + if (!is_pci_addr_valid(&pci_addr)) { + D_ERROR("No valid PCI address found for device"); + return -DER_INVAL; + } + return led_manage(xs_ctxt, pci_addr, (Ctl__LedAction)action, (Ctl__LedState *)state, duration); } + +struct power_mgmt_context_t { + const char *bdev_name; + unsigned int set_val; + unsigned int inflights; +}; + +static void +set_power_mgmt_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct power_mgmt_context_t *pm_ctx = cb_arg; + int sc; + int sct; + uint32_t cdw0; + + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + if (sc) { + D_ERROR("Set power management failed for device %s (value: 0x%x), NVMe status " + "code/type: 0x%x/0x%x", + pm_ctx->bdev_name, pm_ctx->set_val, sc, sct); + if (sc == SPDK_NVME_SC_INVALID_FIELD && sct == 0) + D_ERROR(" - INVALID_FIELD: Device may not support requested power state\n"); + else + D_ERROR("\n"); + } else { + D_INFO("Power management value set to 0x%x on device %s\n", pm_ctx->set_val, + pm_ctx->bdev_name); + } + + D_ASSERT(pm_ctx->inflights == 1); + pm_ctx->inflights--; + spdk_bdev_free_io(bdev_io); +} + +int +bio_set_power_mgmt(struct bio_xs_context *ctxt, const char *bdev_name) +{ + struct power_mgmt_context_t pm_ctx = {0}; + struct spdk_nvme_cmd cmd = {0}; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + int rc = 0; + + /* If default has not been overwritten, skip setting the value */ + if (bio_spdk_power_mgmt_val == NVME_POWER_MGMT_UNINIT) + goto out; + + /* Validate power state value is in valid range (5-bit field) */ + if 
(bio_spdk_power_mgmt_val > 0x1F) { + D_ERROR("bio_spdk_power_mgmt_val %u exceeds 5-bit limit (0x1F)\n", + bio_spdk_power_mgmt_val); + rc = -DER_INVAL; + goto out; + } + + D_ASSERT(bdev_name != NULL); + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + D_ERROR("No bdev associated with device name %s\n", bdev_name); + rc = -DER_INVAL; + goto out; + } + + if (get_bdev_type(bdev) != BDEV_CLASS_NVME) { + D_DEBUG(DB_MGMT, "Device %s is not NVMe, skipping power management\n", bdev_name); + rc = -DER_NOTSUPPORTED; + goto out; + } + + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) { + D_DEBUG(DB_MGMT, "Bdev NVMe admin passthru not supported for %s\n", bdev_name); + rc = -DER_NOTSUPPORTED; + goto out; + } + + /* Writable descriptor required for applying power management settings */ + rc = spdk_bdev_open_ext(bdev_name, true, bio_bdev_event_cb, NULL, &bdev_desc); + if (rc != 0) { + D_ERROR("Failed to open bdev %s, %d\n", bdev_name, rc); + rc = daos_errno2der(-rc); + goto out; + } + + bdev_io_channel = spdk_bdev_get_io_channel(bdev_desc); + D_ASSERT(bdev_io_channel != NULL); + + /* Build NVMe Set Features command for Power Management */ + cmd.opc = SPDK_NVME_OPC_SET_FEATURES; + cmd.nsid = 0; /* 0 = controller-level feature */ + cmd.cdw10_bits.set_features.fid = SPDK_NVME_FEAT_POWER_MANAGEMENT; + cmd.cdw10_bits.set_features.sv = 0; /* Don't save across resets */ + cmd.cdw11_bits.feat_power_management.bits.ps = bio_spdk_power_mgmt_val; + cmd.cdw11_bits.feat_power_management.bits.wh = 0; /* Workload hint = 0 */ + + pm_ctx.bdev_name = bdev_name; + pm_ctx.set_val = bio_spdk_power_mgmt_val; + pm_ctx.inflights = 1; + + rc = spdk_bdev_nvme_admin_passthru(bdev_desc, bdev_io_channel, &cmd, NULL, 0, + set_power_mgmt_completion, &pm_ctx); + if (rc != 0) { + D_ERROR("Failed to submit power management command to set 0x%x on %s, rc:%d\n", + bio_spdk_power_mgmt_val, bdev_name, rc); + rc = daos_errno2der(-rc); + goto out_chan; + } + + rc = 
xs_poll_completion(ctxt, &pm_ctx.inflights, 0); + D_ASSERT(rc == 0); + +out_chan: + spdk_put_io_channel(bdev_io_channel); + spdk_bdev_close(bdev_desc); +out: + return rc; +} diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index 408c02fa297..810c337942e 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -32,6 +32,8 @@ #define NVME_MONITOR_PERIOD (60ULL * (NSEC_PER_SEC / NSEC_PER_USEC)) #define NVME_MONITOR_SHORT_PERIOD (3ULL * (NSEC_PER_SEC / NSEC_PER_USEC)) +#define NVME_POWER_MGMT_UNINIT UINT32_MAX + struct bio_bulk_args { void *ba_bulk_ctxt; unsigned int ba_bulk_perm; @@ -278,7 +280,7 @@ struct bio_dev_health { void *bdh_intel_smart_buf; /*Intel SMART attributes*/ uint64_t bdh_stat_age; unsigned int bdh_inflights; - unsigned int bdh_stopping:1; + unsigned int bdh_stopping : 1, bdh_io_stalled : 1; uint16_t bdh_vendor_id; /* PCI vendor ID */ /** @@ -365,10 +367,21 @@ struct bio_blobstore { bb_faulty_done:1; /* Faulty reaction is done */ }; +struct bio_io_lug { + /* Link to bio_xs_blobstore::bxb_pending_ios */ + d_list_t bil_link; + /* When the I/O is submitted */ + uint64_t bil_submit_ts; + /* Reference count */ + uint32_t bil_ref; +}; + /* Per-xstream blobstore */ struct bio_xs_blobstore { /* In-flight blob read/write */ unsigned int bxb_blob_rw; + /* Pending I/Os */ + d_list_t bxb_pending_ios; /* spdk io channel */ struct spdk_io_channel *bxb_io_channel; /* per bio blobstore */ @@ -381,6 +394,7 @@ struct bio_xs_blobstore { /* Per-xstream NVMe context */ struct bio_xs_context { int bxc_tgt_id; + uint64_t bxc_io_monitor_ts; struct spdk_thread *bxc_thread; struct bio_xs_blobstore *bxc_xs_blobstores[SMD_DEV_TYPE_MAX]; struct bio_dma_buffer *bxc_dma_buf; @@ -388,6 +402,52 @@ struct 
bio_xs_context { unsigned int bxc_skip_draining : 1; }; +static inline void +bio_io_lug_init(struct bio_io_lug *io_lug) +{ + D_INIT_LIST_HEAD(&io_lug->bil_link); + io_lug->bil_submit_ts = 0; + io_lug->bil_ref = 0; +} + +static inline void +bio_io_lug_fini(struct bio_io_lug *io_lug) +{ + D_ASSERT(io_lug->bil_ref == 0); + D_ASSERT(d_list_empty(&io_lug->bil_link)); +} + +static inline void +bio_io_lug_dequeue(struct bio_xs_blobstore *bxb, struct bio_io_lug *io_lug) +{ + D_ASSERT(bxb->bxb_blob_rw > 0); + bxb->bxb_blob_rw--; + + D_ASSERT(!d_list_empty(&io_lug->bil_link)); + D_ASSERT(io_lug->bil_submit_ts != 0); + D_ASSERT(io_lug->bil_ref > 0); + io_lug->bil_ref--; + if (io_lug->bil_ref == 0) + d_list_del_init(&io_lug->bil_link); +} + +static inline void +bio_io_lug_enqueue(struct bio_xs_context *xs_ctxt, struct bio_xs_blobstore *bxb, + struct bio_io_lug *io_lug) +{ + bxb->bxb_blob_rw++; + if (io_lug->bil_ref == 0) { + if (xs_ctxt->bxc_io_monitor_ts) + io_lug->bil_submit_ts = xs_ctxt->bxc_io_monitor_ts; + else + io_lug->bil_submit_ts = d_timeus_secdiff(0); + + D_ASSERT(d_list_empty(&io_lug->bil_link)); + d_list_add_tail(&io_lug->bil_link, &bxb->bxb_pending_ios); + } + io_lug->bil_ref++; +} + /* Per VOS instance I/O context */ struct bio_io_context { d_list_t bic_link; /* link to bxb_io_ctxts */ @@ -437,6 +497,7 @@ struct bio_rsrvd_dma { /* I/O descriptor */ struct bio_desc { + struct bio_io_lug bd_io_lug; struct umem_instance *bd_umem; struct bio_io_context *bd_ctxt; /* DMA buffers reserved by this io descriptor */ @@ -539,13 +600,14 @@ extern struct bio_faulty_criteria glb_criteria; /* bio_xstream.c */ extern bool bio_scm_rdma; -extern bool bio_spdk_inited; -extern bool bio_vmd_enabled; +extern bool bio_spdk_inited; extern unsigned int bio_chk_sz; extern unsigned int bio_chk_cnt_max; extern unsigned int bio_numa_node; extern unsigned int bio_spdk_max_unmap_cnt; extern unsigned int bio_max_async_sz; +extern unsigned int bio_io_timeout; +extern unsigned int 
bio_spdk_power_mgmt_val; int xs_poll_completion(struct bio_xs_context *ctxt, unsigned int *inflights, uint64_t timeout); @@ -583,6 +645,8 @@ int iod_add_region(struct bio_desc *biod, struct bio_dma_chunk *chk, uint64_t end, uint8_t media); int dma_buffer_grow(struct bio_dma_buffer *buf, unsigned int cnt); void iod_dma_wait(struct bio_desc *biod); +void +bio_io_monitor(struct bio_xs_context *xs_ctxt, uint64_t now); static inline struct bio_dma_buffer * iod_dma_buf(struct bio_desc *biod) @@ -658,6 +722,8 @@ void trigger_faulty_reaction(struct bio_blobstore *bbs); int fill_in_traddr(struct bio_dev_info *b_info, char *dev_name); struct bio_dev_info * alloc_dev_info(uuid_t dev_id, struct bio_bdev *d_bdev, struct smd_dev_info *s_info); +int +bio_set_power_mgmt(struct bio_xs_context *ctxt, const char *bdev_name); /* bio_config.c */ int diff --git a/src/bio/bio_monitor.c b/src/bio/bio_monitor.c index 22e20a4f9f5..272e0dc80c5 100644 --- a/src/bio/bio_monitor.c +++ b/src/bio/bio_monitor.c @@ -680,7 +680,8 @@ get_spdk_health_info_completion(struct spdk_bdev_io *bdev_io, bool success, static bool is_bbs_faulty(struct bio_blobstore *bbs) { - struct nvme_stats *dev_stats = &bbs->bb_dev_health.bdh_health_state; + struct bio_dev_health *bdh = &bbs->bb_dev_health; + struct nvme_stats *dev_stats = &bdh->bdh_health_state; /* * Used for DAOS NVMe Recovery Tests. 
Will trigger bs faulty reaction @@ -708,6 +709,12 @@ is_bbs_faulty(struct bio_blobstore *bbs) } } + /* Auto-faulty for stalled I/O stalled is always enabled */ + if (bdh->bdh_io_stalled) { + D_ERROR("I/O stalled on NVMe device " DF_UUID "\n", DP_UUID(bbs->bb_dev->bb_uuid)); + return true; + } + if (!glb_criteria.fc_enabled) return false; diff --git a/src/bio/bio_recovery.c b/src/bio/bio_recovery.c index bcdc24830ec..c717301abe7 100644 --- a/src/bio/bio_recovery.c +++ b/src/bio/bio_recovery.c @@ -514,6 +514,10 @@ bio_xsctxt_health_check(struct bio_xs_context *xs_ctxt, bool log_err, bool updat if (xs_ctxt == NULL) return 0; + if (DAOS_FAIL_CHECK(DAOS_FAULT_POOL_NVME_HEALTH)) { /** fault injection */ + return daos_errno2der(daos_fail_value_get()); + } + for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { bxb = xs_ctxt->bxc_xs_blobstores[st]; @@ -693,11 +697,20 @@ bio_media_error(void *msg_arg) "Device: "DF_UUID" csum error logged from tgt_id:%d\n", DP_UUID(mem->mem_bs->bb_dev->bb_uuid), mem->mem_tgt_id); break; + case MET_IO_STALLED: + /* I/O stalling has been reported for this device */ + if (bdh->bdh_io_stalled) + goto out; + bdh->bdh_io_stalled = 1; + snprintf(err_str, DAOS_RAS_STR_FIELD_SIZE, + "Device: " DF_UUID " stalled I/O logged from tgt_id:%d\n", + DP_UUID(mem->mem_bs->bb_dev->bb_uuid), mem->mem_tgt_id); + break; } ras_notify_event(RAS_DEVICE_MEDIA_ERROR, err_str, RAS_TYPE_INFO, RAS_SEV_ERROR, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +out: auto_faulty_detect(mem->mem_bs); - D_FREE(mem); } diff --git a/src/bio/bio_wal.c b/src/bio/bio_wal.c index 8e482b9f146..c7fc1821cf3 100644 --- a/src/bio/bio_wal.c +++ b/src/bio/bio_wal.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -728,10 +728,6 @@ wal_tx_completion(struct wal_tx_desc *wal_tx, bool complete_next) biod_tx->bd_result = wal_tx->td_error; if (wal_tx->td_error) { - /* Rollback unused ID */ - if (wal_id_cmp(si, wal_tx->td_id, si->si_unused_id) < 0) - si->si_unused_id = wal_tx->td_id; - if (next != NULL) { /* Propagate error to depended transactions, block incoming transactions */ si->si_tx_failed = 1; diff --git a/src/bio/bio_xstream.c b/src/bio/bio_xstream.c index 9785e038d31..043ba121ece 100644 --- a/src/bio/bio_xstream.c +++ b/src/bio/bio_xstream.c @@ -1,6 +1,7 @@ /** * (C) Copyright 2018-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,6 +22,7 @@ #include #include #include +#include #include #include "bio_internal.h" #include @@ -29,8 +31,8 @@ /* These Macros should be turned into DAOS configuration in the future */ #define DAOS_MSG_RING_SZ 4096 -/* SPDK blob parameters */ -#define DAOS_BS_CLUSTER_SZ (1ULL << 25) /* 32MB */ +/* Default cluster size in MB */ +#define DAOS_DEFAULT_CLUSTER_MB 128 /* DMA buffer parameters */ #define DAOS_DMA_CHUNK_INIT_PCT 50 /* Default per-xstream init chunks, in percentage */ #define DAOS_DMA_CHUNK_CNT_MAX 128 /* Default per-xstream max chunks, 1GB */ @@ -54,14 +56,15 @@ static unsigned int bio_chk_init_pct; /* Diret RDMA over SCM */ bool bio_scm_rdma; /* Whether SPDK inited */ -bool bio_spdk_inited; -/* Whether VMD is enabled */ -bool bio_vmd_enabled; +bool bio_spdk_inited; /* SPDK subsystem fini timeout */ unsigned int bio_spdk_subsys_timeout = 25000; /* ms */ +/* SPDK NVMe power management value, use bits 0-4 as per NVMe spec */ +unsigned int 
bio_spdk_power_mgmt_val = NVME_POWER_MGMT_UNINIT; /* How many blob unmap calls can be called in a row */ unsigned int bio_spdk_max_unmap_cnt = 32; unsigned int bio_max_async_sz = (1UL << 15) /* 32k */; +unsigned int bio_io_timeout = 120000000; /* us, 120 seconds */ struct bio_nvme_data { ABT_mutex bd_mutex; @@ -106,7 +109,6 @@ bio_spdk_conf_read(struct spdk_env_opts *opts) return rc; } nvme_glb.bd_nvme_roles = roles; - bio_vmd_enabled = vmd_enabled && (nvme_glb.bd_bdev_class == BDEV_CLASS_NVME); rc = bio_set_hotplug_filter(nvme_glb.bd_nvme_conf); if (rc != 0) { @@ -149,14 +151,44 @@ static int bio_spdk_env_init(void) { struct spdk_env_opts opts; + const char *dpdk_opts; + unsigned int spdk_level = DAOS_SPDK_LOG_DEFAULT; + unsigned int dpdk_level = DAOS_DPDK_LOG_DEFAULT; int rc; - /* Only print error and more severe to stderr. */ - spdk_log_set_print_level(SPDK_LOG_ERROR); + /* Check for SPDK log level from environment */ + d_getenv_uint("DAOS_SPDK_LOG_LEVEL", &spdk_level); + if (spdk_level > DAOS_SPDK_LOG_MAX) { + D_WARN("Invalid DAOS_DPDK_LOG_LEVEL=%u, using default (%u)\n", dpdk_level, + DAOS_SPDK_LOG_DEFAULT); + spdk_level = DAOS_SPDK_LOG_DEFAULT; + } + + /* Check for DPDK log level from environment */ + d_getenv_uint("DAOS_DPDK_LOG_LEVEL", &dpdk_level); + if (dpdk_level < DAOS_DPDK_LOG_MIN || dpdk_level > DAOS_DPDK_LOG_MAX) { + D_WARN("Invalid DAOS_DPDK_LOG_LEVEL=%u, using default (%u)\n", dpdk_level, + DAOS_DPDK_LOG_DEFAULT); + dpdk_level = DAOS_DPDK_LOG_DEFAULT; + } + + D_INFO("SPDK log level: %u, DPDK log level: %u\n", spdk_level, dpdk_level); + + /* Set SPDK log print level to configured value */ + spdk_log_set_print_level(spdk_level); + + /* Build DPDK options with specified log level for all DPDK log facilities */ + dpdk_opts = dpdk_cli_build_opts(dpdk_level, dpdk_level); + if (dpdk_opts == NULL) { + D_ERROR("Failed to build DPDK options\n"); + rc = -DER_NOMEM; + goto out; + } + opts.opts_size = sizeof(opts); spdk_env_opts_init(&opts); opts.name = 
"daos_engine"; - opts.env_context = (char *)dpdk_cli_override_opts; + opts.env_context = (char *)dpdk_opts; /** * TODO: Set opts.mem_size to nvme_glb.bd_mem_size @@ -219,7 +251,8 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, { char *env; int rc, fd; - unsigned int size_mb = BIO_DMA_CHUNK_MB; + unsigned int size_mb = BIO_DMA_CHUNK_MB, io_timeout_secs = 0; + unsigned int cluster_mb = DAOS_DEFAULT_CLUSTER_MB; if (tgt_nr <= 0) { D_ERROR("tgt_nr: %u should be > 0\n", tgt_nr); @@ -266,6 +299,11 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, d_getenv_bool("DAOS_SCM_RDMA_ENABLED", &bio_scm_rdma); D_INFO("RDMA to SCM is %s\n", bio_scm_rdma ? "enabled" : "disabled"); + d_getenv_uint("DAOS_NVME_POWER_MGMT", &bio_spdk_power_mgmt_val); + if (bio_spdk_power_mgmt_val != NVME_POWER_MGMT_UNINIT) + D_INFO("NVMe power management setting to be applied is %u\n", + bio_spdk_power_mgmt_val); + d_getenv_uint("DAOS_SPDK_SUBSYS_TIMEOUT", &bio_spdk_subsys_timeout); D_INFO("SPDK subsystem fini timeout is %u ms\n", bio_spdk_subsys_timeout); @@ -277,6 +315,16 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, d_getenv_uint("DAOS_MAX_ASYNC_SZ", &bio_max_async_sz); D_INFO("Max async data size is set to %u bytes\n", bio_max_async_sz); + d_getenv_uint("DAOS_SPDK_IO_TIMEOUT", &io_timeout_secs); + if (io_timeout_secs > 0) { + if (io_timeout_secs < 30 || io_timeout_secs > 300) + D_WARN("DAOS_SPDK_IO_TIMEOUT(%u) is invalid. 
Min:30,Max:300,Default:120\n", + io_timeout_secs); + else + bio_io_timeout = io_timeout_secs * 1000000; /* convert to us */ + } + D_INFO("SPDK IO timeout set to %u us\n", bio_io_timeout); + /* Hugepages disabled */ if (mem_size == 0) { D_INFO("Set per-xstream DMA buffer upper bound to %u %uMB chunks\n", @@ -286,7 +334,7 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, } if (nvme_conf && strlen(nvme_conf) > 0) { - fd = open(nvme_conf, O_RDONLY, 0600); + fd = open(nvme_conf, O_RDONLY); if (fd < 0) D_WARN("Open %s failed, skip DAOS NVMe setup "DF_RC"\n", nvme_conf, DP_RC(daos_errno2der(errno))); @@ -309,8 +357,14 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, D_INFO("Set per-xstream DMA buffer upper bound to %u %uMB chunks, prealloc %u chunks\n", bio_chk_cnt_max, size_mb, init_chk_cnt()); + d_getenv_uint("DAOS_BS_CLUSTER_MB", &cluster_mb); + if (cluster_mb < 32 || cluster_mb > 1024) { + D_WARN("DAOS_BS_CLUSTER_MB %u is invalid, default %u is used\n", cluster_mb, + DAOS_DEFAULT_CLUSTER_MB); + cluster_mb = DAOS_DEFAULT_CLUSTER_MB; + } spdk_bs_opts_init(&nvme_glb.bd_bs_opts, sizeof(nvme_glb.bd_bs_opts)); - nvme_glb.bd_bs_opts.cluster_sz = DAOS_BS_CLUSTER_SZ; + nvme_glb.bd_bs_opts.cluster_sz = (cluster_mb << 20); nvme_glb.bd_bs_opts.max_channel_ops = BIO_BS_MAX_CHANNEL_OPS; d_agetenv_str(&env, "VOS_BDEV_CLASS"); @@ -354,8 +408,9 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size, if (!bio_nvme_configured(SMD_DEV_TYPE_META)) nvme_glb.bd_bs_opts.cluster_sz = (1UL << 30); /* 1GB */ - D_INFO("MD on SSD is %s\n", - bio_nvme_configured(SMD_DEV_TYPE_META) ? "enabled" : "disabled"); + D_INFO("MD on SSD is %s, %u cluster size is used\n", + bio_nvme_configured(SMD_DEV_TYPE_META) ? 
"enabled" : "disabled", + nvme_glb.bd_bs_opts.cluster_sz); bio_spdk_inited = true; @@ -496,10 +551,30 @@ common_init_cb(void *arg, int rc) cp_arg->cca_rc = daos_errno2der(-rc); } +struct subsystem_init_arg { + struct common_cp_arg *cp_arg; + void *json_data; + ssize_t json_data_size; +}; + static void subsys_init_cb(int rc, void *arg) { - common_init_cb(arg, rc); + struct subsystem_init_arg *init_arg = arg; + + if (init_arg->json_data != NULL) { + free(init_arg->json_data); + init_arg->json_data = NULL; + } + + if (rc) + D_ERROR("subsystem init failed: %d\n", rc); + + common_init_cb(init_arg->cp_arg, rc); + + D_FREE(init_arg); + + return; } static void @@ -901,8 +976,8 @@ create_bio_bdev(struct bio_xs_context *ctxt, const char *bdev_name, unsigned int * Hold the SPDK bdev by an open descriptor, otherwise, the bdev * could be deconstructed by SPDK on device hot remove. */ - rc = spdk_bdev_open_ext(d_bdev->bb_name, false, bio_bdev_event_cb, - d_bdev, &d_bdev->bb_desc); + rc = + spdk_bdev_open_ext(d_bdev->bb_name, false, bio_bdev_event_cb, d_bdev, &d_bdev->bb_desc); if (rc != 0) { D_ERROR("Failed to hold bdev %s, %d\n", d_bdev->bb_name, rc); rc = daos_errno2der(-rc); @@ -910,6 +985,7 @@ create_bio_bdev(struct bio_xs_context *ctxt, const char *bdev_name, unsigned int } D_ASSERT(d_bdev->bb_desc != NULL); + /* Try to load blobstore without specifying 'bstype' first */ bs = load_blobstore(ctxt, d_bdev->bb_name, NULL, false, false, NULL, NULL); @@ -1011,6 +1087,12 @@ init_bio_bdevs(struct bio_xs_context *ctxt) bdev_name = spdk_bdev_get_name(bdev); + /* Apply NVMe power management settings */ + rc = bio_set_power_mgmt(ctxt, bdev_name); + if (rc != 0 && rc != -DER_NOTSUPPORTED) + D_WARN("Failed to set power management for device %s: " DF_RC "\n", + bdev_name, DP_RC(rc)); + rc = bdev_name2roles(bdev_name); if (rc < 0) { D_ERROR("Failed to get role from bdev name '%s', "DF_RC"\n", bdev_name, @@ -1033,10 +1115,10 @@ init_bio_bdevs(struct bio_xs_context *ctxt) return -DER_EXIST; 
} - /* A DER_NOTSUPPORTED RC indicates that VMD-LED control not possible */ + /* Clear any pre-existing VMD-LED state */ rc = bio_led_manage(ctxt, NULL, d_bdev->bb_uuid, (unsigned int)CTL__LED_ACTION__RESET, NULL, 0); - if ((rc != 0) && (rc != -DER_NOTSUPPORTED)) { + if (rc != 0) { DL_ERROR(rc, "Reset LED on device:" DF_UUID " failed", DP_UUID(d_bdev->bb_uuid)); return rc; @@ -1241,6 +1323,7 @@ alloc_xs_blobstore(void) if (bxb == NULL) return NULL; + D_INIT_LIST_HEAD(&bxb->bxb_pending_ios); D_INIT_LIST_HEAD(&bxb->bxb_io_ctxts); return bxb; @@ -1584,6 +1667,76 @@ bio_xsctxt_free(struct bio_xs_context *ctxt) D_FREE(ctxt); } +static void +subsystem_init_cb(int rc, void *arg) +{ + struct subsystem_init_arg *init_arg; + + if (rc) { + subsys_init_cb(rc, arg); + return; + } + + init_arg = arg; + + /* Set RUNTIME state and load config again for RUNTIME methods */ + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + spdk_subsystem_load_config(init_arg->json_data, init_arg->json_data_size, subsys_init_cb, + init_arg, true); +} + +static void +load_config_cb(int rc, void *arg) +{ + if (rc) { + subsys_init_cb(rc, arg); + return; + } + + /* init subsystem */ + spdk_subsystem_init(subsystem_init_cb, arg); +} + +static int +bio_xsctxt_init_by_config(struct common_cp_arg *cp_arg) +{ + struct subsystem_init_arg *init_arg; + void *json_data; + size_t json_data_size; + + json_data = spdk_posix_file_load_from_name(nvme_glb.bd_nvme_conf, &json_data_size); + if (json_data == NULL) { + D_ERROR("failed to load nvme conf %s\n", nvme_glb.bd_nvme_conf); + return -DER_NOMEM; + } + + /** + * Initially, this was called internally spdk_subsystem_load_config() -> ... -> + * spdk_rpc_initialize(). However, since commit + * https://github.com/spdk/spdk/commit/fba209c7324a11b9230533144c02e7a66bc738ea (>=v24.01) + * SPDK_RPC_STARTUP has become the initial value of the underlying global variable and it + * is no longer reset automatically. 
This makes no difference for applications that + * initialize SPDK only once during the lifetime of the process. But some BIO module + * consumers—such as DDB—expect to be able to initialize, finalize, and then reinitialize + * SPDK multiple times within the same process, for example when inspecting multiple pools + * sequentially. For those use cases, the RPC state must now be reset explicitly. + */ + spdk_rpc_set_state(SPDK_RPC_STARTUP); + + D_ALLOC_PTR(init_arg); + if (init_arg == NULL) { + free(json_data); + return -DER_NOMEM; + } + + init_arg->cp_arg = cp_arg; + init_arg->json_data = json_data; + init_arg->json_data_size = (ssize_t)json_data_size; + spdk_subsystem_load_config(json_data, (ssize_t)json_data_size, load_config_cb, init_arg, + true); + return 0; +} + int bio_xsctxt_alloc(struct bio_xs_context **pctxt, int tgt_id, bool self_polling) { @@ -1647,13 +1800,14 @@ bio_xsctxt_alloc(struct bio_xs_context **pctxt, int tgt_id, bool self_polling) /* Initialize all registered subsystems: bdev, vmd, copy. 
*/ common_prep_arg(&cp_arg); - spdk_subsystem_init_from_json_config(nvme_glb.bd_nvme_conf, - SPDK_DEFAULT_RPC_ADDR, - subsys_init_cb, &cp_arg, - true); + rc = bio_xsctxt_init_by_config(&cp_arg); + if (rc != 0) { + D_ERROR("failed to load nvme conf %s\n", nvme_glb.bd_nvme_conf); + goto out; + } + rc = xs_poll_completion(ctxt, &cp_arg.cca_inflights, 0); D_ASSERT(rc == 0); - if (cp_arg.cca_rc != 0) { rc = cp_arg.cca_rc; DL_ERROR(rc, "failed to init bdevs"); @@ -1683,7 +1837,7 @@ bio_xsctxt_alloc(struct bio_xs_context **pctxt, int tgt_id, bool self_polling) if ((!nvme_glb.bd_rpc_srv_addr) || (strlen(nvme_glb.bd_rpc_srv_addr) == 0)) nvme_glb.bd_rpc_srv_addr = SPDK_DEFAULT_RPC_ADDR; - rc = spdk_rpc_initialize(nvme_glb.bd_rpc_srv_addr); + rc = spdk_rpc_initialize(nvme_glb.bd_rpc_srv_addr, NULL); if (rc != 0) { D_ERROR("failed to start SPDK JSON-RPC server at %s, "DF_RC"\n", nvme_glb.bd_rpc_srv_addr, DP_RC(daos_errno2der(-rc))); @@ -1761,8 +1915,10 @@ bio_nvme_ctl(unsigned int cmd, void *arg) static inline void reset_media_errors(struct bio_blobstore *bbs) { - struct nvme_stats *dev_stats = &bbs->bb_dev_health.bdh_health_state; + struct bio_dev_health *bdh = &bbs->bb_dev_health; + struct nvme_stats *dev_stats = &bdh->bdh_health_state; + bdh->bdh_io_stalled = 0; dev_stats->bio_read_errs = 0; dev_stats->bio_write_errs = 0; dev_stats->bio_unmap_errs = 0; @@ -1921,22 +2077,15 @@ bio_led_event_monitor(struct bio_xs_context *ctxt, uint64_t now) struct bio_bdev *d_bdev; int rc; - if (!bio_vmd_enabled) - return; - /* Scan all devices present in bio_bdev list */ d_list_for_each_entry(d_bdev, bio_bdev_list(), bb_link) { if ((d_bdev->bb_led_expiry_time != 0) && (d_bdev->bb_led_expiry_time < now)) { - /** - * LED will be reset to faulty or normal state based on SSDs bio_bdevs. - * A DER_NOTSUPPORTED RC indicates that VMD-LED control not possible. - */ + /* LED will be reset to faulty or normal state based on SSDs bio_bdevs. 
*/ rc = bio_led_manage(ctxt, NULL, d_bdev->bb_uuid, (unsigned int)CTL__LED_ACTION__RESET, NULL, 0); if (rc != 0) { - if (rc != -DER_NOTSUPPORTED) - DL_ERROR(rc, "Reset LED on device:" DF_UUID " failed", - DP_UUID(d_bdev->bb_uuid)); + DL_ERROR(rc, "Reset LED on device:" DF_UUID " failed", + DP_UUID(d_bdev->bb_uuid)); continue; } @@ -1992,5 +2141,8 @@ bio_nvme_poll(struct bio_xs_context *ctxt) bio_led_event_monitor(ctxt, now); } + /* Detect stalled I/Os */ + bio_io_monitor(ctxt, now); + return rc; } diff --git a/src/bio/smd/smd_pool.c b/src/bio/smd/smd_pool.c index 169e548b1c8..a2d8c86faf0 100644 --- a/src/bio/smd/smd_pool.c +++ b/src/bio/smd/smd_pool.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2025 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -179,7 +179,7 @@ smd_rdb_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, enum smd_dev_ } static int -pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) +pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name, int *tgt_cnt) { struct smd_pool pool; struct d_uuid id; @@ -226,6 +226,9 @@ pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) rc = 1; /* Inform caller that last target is deleted */ } + if (tgt_cnt) + *tgt_cnt = pool.sp_tgt_cnt; + return rc; } @@ -234,15 +237,15 @@ smd_pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, enum smd_dev_type st) { struct smd_pool_meta meta = { 0 }; struct d_uuid id; - int rc; + int rc, remaining = 0; smd_db_lock(); - rc = pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st]); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st], &remaining); if (rc <= 0) goto out; rc = 0; - if (st == SMD_DEV_TYPE_META) { + if (st == SMD_DEV_TYPE_META && !remaining) { uuid_copy(id.uuid, pool_id); rc = smd_db_fetch(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); @@ -269,7 +272,7 @@ smd_rdb_del_tgt(uuid_t 
pool_id, uint32_t tgt_id, enum smd_dev_type st) int rc; smd_db_lock(); - rc = pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st]); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st], NULL); smd_db_unlock(); return rc < 0 ? rc : 0; diff --git a/src/cart/README.env b/src/cart/README.env index 93e1801c65e..befc0fd8f4f 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -226,3 +226,8 @@ This file lists the environment variables used in CaRT. D_PROGRESS_BUSY Force busy polling when progressing, preventing from sleeping when waiting for new messages. + + D_MEM_DEVICE + Enable detection and use of memory devices (GPU, etc) to perform RMA transfers to/from. + Be wary of potential performance impacts if this variable is set and memory devices + are not used. diff --git a/src/cart/crt_context.c b/src/cart/crt_context.c index 1b88bab56b7..ccfa40ea9c6 100644 --- a/src/cart/crt_context.c +++ b/src/cart/crt_context.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -665,7 +665,7 @@ crt_ctx_epi_abort(struct crt_ep_inflight *epi, int flags) break; } ts_now = d_timeus_secdiff(0); - if (ts_now - ts_start > 2 * CRT_DEFAULT_TIMEOUT_US) { + if (ts_now - ts_start > 2 * CRT_TIMEOUT_DEFAULT * 1e6) { D_ERROR("stop progress due to timed out.\n"); d_list_for_each_entry(rpc_priv, &epi->epi_req_q, crp_epi_link) RPC_ERROR(rpc_priv, diff --git a/src/cart/crt_corpc.c b/src/cart/crt_corpc.c index abced7310c3..422a44618b9 100644 --- a/src/cart/crt_corpc.c +++ b/src/cart/crt_corpc.c @@ -784,8 +784,13 @@ crt_corpc_req_hdlr(struct crt_rpc_priv *rpc_priv) opc_info = rpc_priv->crp_opc_info; co_ops = opc_info->coi_co_ops; - if (rpc_priv->crp_fail_hlc) - D_GOTO(forward_done, rc = -DER_HLC_SYNC); + if (rpc_priv->crp_fail_hlc) { + rc = -DER_HLC_SYNC; + RPC_ERROR(rpc_priv, 
"crp_fail_hlc (group %s) failed: " DF_RC "\n", + co_info->co_grp_priv->gp_pub.cg_grpid, DP_RC(rc)); + crt_corpc_fail_parent_rpc(rpc_priv, rc); + D_GOTO(forward_done, rc); + } /* Invoke pre-forward callback first if it is registered */ if (co_ops && co_ops->co_pre_forward) { @@ -899,20 +904,28 @@ crt_corpc_req_hdlr(struct crt_rpc_priv *rpc_priv) } forward_done: - if (rc != 0 && rpc_priv->crp_flags & CRT_RPC_FLAG_CO_FAILOUT) - co_failout = true; + if (rc != 0) { + /* reset rc to 0 as it already failed the parent/child RPC and + * will be replied/completed by crt_corpc_complete(). + */ + rc = 0; + if (rpc_priv->crp_flags & CRT_RPC_FLAG_CO_FAILOUT) + co_failout = true; + } - /* NOOP bcast (no child and root excluded) */ - if (co_info->co_child_num == 0 && (co_info->co_root_excluded || co_failout)) - crt_corpc_complete(rpc_priv); + /* need not call local RPC handler */ + if (co_info->co_root_excluded || co_failout) { + /* NOOP bcast (no child and root excluded) */ + if (co_info->co_child_num == 0) + crt_corpc_complete(rpc_priv); - if (co_info->co_root_excluded == 1 || co_failout) { - if (co_info->co_grp_priv->gp_self == co_info->co_root) { - /* don't return error for root to avoid RPC_DECREF in - * fail case in crt_req_send. - */ - rc = 0; - } + /* Corresponding the initial ref 1 in crt_rpc_handler_common() -> + * crt_rpc_priv_init(rpc_priv, crt_ctx, true). + * That ref commonly will be released by crt_rpc_common_hdlr() -> crt_handle_rpc(), + * here as will not call crt_rpc_common_hdlr() so drop it explicitly. + */ + if (rpc_priv->crp_srv) + RPC_DECREF(rpc_priv); D_GOTO(out, rc); } diff --git a/src/cart/crt_ctl.c b/src/cart/crt_ctl.c index 4be89d72e0c..91b50125cf3 100644 --- a/src/cart/crt_ctl.c +++ b/src/cart/crt_ctl.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -151,6 +152,28 @@ crt_hdlr_ctl_get_hostname(crt_rpc_t *rpc_req) D_ERROR("crt_reply_send() failed with rc %d\n", rc); } +void +crt_hdlr_ctl_dump_counters(crt_rpc_t *rpc_req) +{ + char old_dlog_mask[1024]; + + /* HG counters require log levels to be at debug to be printed */ + + /* store current log mask */ + d_log_getmasks(old_dlog_mask, 0, sizeof(old_dlog_mask), 0); + d_log_setmasks("DEBUG", -1); + HG_Set_log_level("debug"); + + HG_Diag_dump_counters(); + + /* restore log masks */ + /* Note: we cannot query log level from HG today so we restore back to 'warning' */ + HG_Set_log_level("warning"); + d_log_setmasks(old_dlog_mask, -1); + + crt_reply_send(rpc_req); +} + void crt_hdlr_ctl_get_pid(crt_rpc_t *rpc_req) { diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 62d6c171f89..082e1de7bd2 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -846,7 +846,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.na_init_info.auth_key = prov_data->cpg_na_config.noc_auth_key; - if (crt_provider_is_block_mode(provider) && !prov_data->cpg_progress_busy) + if (crt_provider_is_block_mode(provider) && !crt_gdata.cg_progress_busy) init_info.na_init_info.progress_mode = 0; else init_info.na_init_info.progress_mode = NA_NO_BLOCK; @@ -872,6 +872,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; if (thread_mode_single) init_info.na_init_info.thread_mode = NA_THREAD_MODE_SINGLE; + init_info.na_init_info.request_mem_device = crt_gdata.cg_mem_device; retry: hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { @@ -1189,33 +1190,24 @@ crt_rpc_handler_common(hg_handle_t hg_hdl) HG_Destroy(rpc_tmp.crp_hg_hdl); D_GOTO(out, hg_ret = HG_SUCCESS); } - D_ASSERT(proc != NULL); - opc = rpc_tmp.crp_req_hdr.cch_opc; - /** - * Set the opcode in the temp RPC so that it can be correctly logged. - */ + D_ASSERT(proc != NULL); + opc = rpc_tmp.crp_req_hdr.cch_opc; rpc_tmp.crp_pub.cr_opc = opc; + /* allocate rpc struct for a given opcode; in/out size will vary per opc */ rc = crt_rpc_priv_alloc(opc, &rpc_priv, false /* forward */); if (unlikely(rc != 0)) { - if (rc == -DER_UNREG) { - D_ERROR("opc: %#x, lookup failed.\n", opc); - /* - * The RPC is not registered on the server, we don't know how to - * process the RPC request, so we send a CART - * level error message to the client. 
- */ - crt_hg_reply_error_send(&rpc_tmp, rc); - crt_hg_unpack_cleanup(proc); - HG_Destroy(rpc_tmp.crp_hg_hdl); - D_GOTO(out, hg_ret = HG_SUCCESS); - } else if (rc == -DER_NOMEM) { - crt_hg_reply_error_send(&rpc_tmp, -DER_DOS); - crt_hg_unpack_cleanup(proc); - HG_Destroy(rpc_tmp.crp_hg_hdl); - D_GOTO(out, hg_ret = HG_SUCCESS); - } + /* set client rc to denial of service if server is out of mem */ + if (rc == -DER_NOMEM) + rc = -DER_DOS; /* don't log as we are oom already */ + else + D_ERROR("crt_rpc_priv_alloc() failed, rc: %d.\n", rc); + + crt_hg_reply_error_send(&rpc_tmp, rc); + crt_hg_unpack_cleanup(proc); + HG_Destroy(rpc_tmp.crp_hg_hdl); + D_GOTO(out, hg_ret = HG_SUCCESS); } opc_info = rpc_priv->crp_opc_info; @@ -1610,6 +1602,7 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) D_ASSERT(rpc_priv != NULL); + /* corresponds to decref in crt_hg_reply_send_cb */ RPC_ADDREF(rpc_priv); hg_ret = HG_Respond(rpc_priv->crp_hg_hdl, crt_hg_reply_send_cb, rpc_priv, &rpc_priv->crp_pub.cr_output); diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 21f9ea08891..1b926465ac3 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -24,6 +24,19 @@ static bool g_prov_settings_applied[CRT_PROV_COUNT]; static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; #undef X +#define CRT_ENV_OPT_GET(opt, x, env) \ + do { \ + if (opt != NULL && opt->cio_##x) \ + x = opt->cio_##x; \ + else \ + crt_env_get(env, &x); \ + } while (0) + +static int +crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, + const char *interface, const char *domain, const char *port, const char *auth_key, + bool port_auto_adjust, crt_init_options_t *opt); + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -77,31 +90,27 @@ dump_opt(crt_init_options_t *opt) D_INFO("options:\n"); D_INFO("crt_timeout = %d\n", opt->cio_crt_timeout); D_INFO("max_ctx_num = %d\n", opt->cio_ctx_max_num); - D_INFO("swim_idx = %d\n", opt->cio_swim_crt_idx); - D_INFO("provider = %s\n", opt->cio_provider); - D_INFO("interface = %s\n", opt->cio_interface); - D_INFO("domain = %s\n", opt->cio_domain); - D_INFO("port = %s\n", opt->cio_port); - D_INFO("Flags: fi: %d, use_credits: %d, use_sensors: %d\n", opt->cio_fault_inject, - opt->cio_use_credits, opt->cio_use_sensors); + D_INFO("swim_idx = %d\n", opt->cio_swim_crt_idx); + D_INFO("provider = %s\n", opt->cio_provider); + D_INFO("interface = %s\n", opt->cio_interface); + D_INFO("domain = %s\n", opt->cio_domain); + D_INFO("port = %s\n", opt->cio_port); + D_INFO("auth_key = %s\n", opt->cio_auth_key); + D_INFO("ep_credits = %d\n", opt->cio_ep_credits); + D_INFO("Flags: fault_inject = %d, use_sensors = %d, thread_mode_single = %d, " + "progress_busy = %d, mem_device = %d\n", + opt->cio_fault_inject, opt->cio_use_sensors, opt->cio_thread_mode_single, + opt->cio_progress_busy, opt->cio_mem_device); if (opt->cio_use_expected_size) 
D_INFO("max_expected_size = %d\n", opt->cio_max_expected_size); if (opt->cio_use_unexpected_size) D_INFO("max_unexpect_size = %d\n", opt->cio_max_unexpected_size); - - /* Handle similar to D_PROVIDER_AUTH_KEY */ - if (opt->cio_auth_key) - D_INFO("auth_key is set\n"); - if (opt->cio_thread_mode_single) - D_INFO("thread mode single is set\n"); - if (opt->cio_progress_busy) - D_INFO("progress busy mode is set\n"); } static int -crt_na_config_init(bool primary, crt_provider_t provider, char *interface, char *domain, char *port, - char *auth_key, bool port_auto_adjust); +crt_na_config_init(bool primary, crt_provider_t provider, const char *interface, const char *domain, + const char *port, const char *auth_key, bool port_auto_adjust); /* Workaround for CART-890 */ static void @@ -152,10 +161,9 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, bool p crt_init_options_t *opt) { - uint32_t ctx_num = 0; uint32_t max_expect_size = 0; uint32_t max_unexpect_size = 0; - uint32_t max_num_ctx = CRT_SRV_CONTEXT_NUM; + uint32_t ctx_max_num = 0; int i; int rc; @@ -164,27 +172,29 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, bool p return rc; if (crt_is_service()) { - ctx_num = CRT_SRV_CONTEXT_NUM; - max_num_ctx = CRT_SRV_CONTEXT_NUM; + ctx_max_num = CRT_SRV_CONTEXT_NUM; } else { /* Only limit the number of contexts for clients */ - crt_env_get(CRT_CTX_NUM, &ctx_num); + CRT_ENV_OPT_GET(opt, ctx_max_num, CRT_CTX_NUM); /* Default setting to the number of cores */ - if (opt) - max_num_ctx = - ctx_num ? ctx_num : max(crt_gdata.cg_num_cores, opt->cio_ctx_max_num); - else - max_num_ctx = ctx_num ? 
ctx_num : crt_gdata.cg_num_cores; - } + if (!ctx_max_num) + ctx_max_num = crt_gdata.cg_num_cores; - if (max_num_ctx > CRT_SRV_CONTEXT_NUM) - max_num_ctx = CRT_SRV_CONTEXT_NUM; - /* To be able to run on VMs */ - if (max_num_ctx < CRT_SRV_CONTEXT_NUM_MIN) - max_num_ctx = CRT_SRV_CONTEXT_NUM_MIN; + if (ctx_max_num > CRT_SRV_CONTEXT_NUM) { + D_WARN("ctx_max_num %u exceeds max %u, using max\n", ctx_max_num, + CRT_SRV_CONTEXT_NUM); + ctx_max_num = CRT_SRV_CONTEXT_NUM; + } + /* To be able to run on VMs */ + if (ctx_max_num < CRT_SRV_CONTEXT_NUM_MIN) { + D_INFO("ctx_max_num %u is less than min %u, using min\n", ctx_max_num, + CRT_SRV_CONTEXT_NUM_MIN); + ctx_max_num = CRT_SRV_CONTEXT_NUM_MIN; + } + } - D_DEBUG(DB_ALL, "Max number of contexts set to %d\n", max_num_ctx); + D_DEBUG(DB_ALL, "Max number of contexts set to %u\n", ctx_max_num); if (opt && opt->cio_use_expected_size) max_expect_size = opt->cio_max_expected_size; @@ -197,19 +207,11 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, bool p prov_data->cpg_ctx_num = 0; prov_data->cpg_sep_mode = false; prov_data->cpg_contig_ports = true; - prov_data->cpg_ctx_max_num = max_num_ctx; + prov_data->cpg_ctx_max_num = ctx_max_num; prov_data->cpg_max_exp_size = max_expect_size; prov_data->cpg_max_unexp_size = max_unexpect_size; prov_data->cpg_primary = primary; - if (opt && opt->cio_progress_busy) { - prov_data->cpg_progress_busy = opt->cio_progress_busy; - } else { - bool progress_busy = false; - crt_env_get(D_PROGRESS_BUSY, &progress_busy); - prov_data->cpg_progress_busy = progress_busy; - } - for (i = 0; i < CRT_SRV_CONTEXT_NUM; i++) prov_data->cpg_used_idx[i] = false; @@ -218,7 +220,7 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, bool p prov_data->cpg_last_remote_tag = 0; D_DEBUG(DB_ALL, "prov_idx: %d primary: %d sizes: (%d/%d) max_ctx: %d\n", provider, primary, - max_expect_size, max_unexpect_size, max_num_ctx); + max_expect_size, max_unexpect_size, ctx_max_num); 
D_INIT_LIST_HEAD(&prov_data->cpg_ctx_list); @@ -261,8 +263,8 @@ crt_str_to_tc(const char *str) static int data_init(int server, crt_init_options_t *opt) { - uint32_t timeout = 0; - uint32_t credits; + uint32_t crt_timeout = 0; + uint32_t ep_credits = CRT_DEFAULT_CREDITS_PER_EP_CTX; uint32_t fi_univ_size = 0; uint32_t mem_pin_enable = 0; uint32_t is_secondary; @@ -274,6 +276,9 @@ data_init(int server, crt_init_options_t *opt) crt_env_dump(); + if (!crt_env_list_valid()) + return -DER_INVAL; + /* Set context post init / post incr to tune number of pre-posted recvs */ crt_env_get(D_POST_INIT, &post_init); crt_gdata.cg_post_init = post_init; @@ -307,24 +312,25 @@ data_init(int server, crt_init_options_t *opt) } crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; - if (opt && opt->cio_crt_timeout != 0) - timeout = opt->cio_crt_timeout; - else - crt_env_get(CRT_TIMEOUT, &timeout); + CRT_ENV_OPT_GET(opt, crt_timeout, CRT_TIMEOUT); + if (crt_timeout == 0) + crt_gdata.cg_timeout = CRT_TIMEOUT_DEFAULT; + else if (crt_timeout > CRT_TIMEOUT_MAX) { + D_WARN("crt_timeout %u exceeds max %u, using max\n", crt_timeout, CRT_TIMEOUT_MAX); + crt_gdata.cg_timeout = CRT_TIMEOUT_MAX; + } else + crt_gdata.cg_timeout = crt_timeout; - if (timeout == 0 || timeout > 3600) - crt_gdata.cg_timeout = CRT_DEFAULT_TIMEOUT_S; - else - crt_gdata.cg_timeout = timeout; crt_gdata.cg_swim_ctx_idx = CRT_DEFAULT_PROGRESS_CTX_IDX; /* Override defaults and environment if option is set */ - if (opt && opt->cio_use_credits) { - credits = opt->cio_ep_credits; - } else { - credits = CRT_DEFAULT_CREDITS_PER_EP_CTX; - crt_env_get(CRT_CREDIT_EP_CTX, &credits); + CRT_ENV_OPT_GET(opt, ep_credits, CRT_CREDIT_EP_CTX); + if (ep_credits > CRT_MAX_CREDITS_PER_EP_CTX) { + D_WARN("ep_credits %u exceeds max %u, using max\n", ep_credits, + CRT_MAX_CREDITS_PER_EP_CTX); + ep_credits = CRT_MAX_CREDITS_PER_EP_CTX; } + crt_gdata.cg_credit_ep_ctx = ep_credits; /* Enable quotas by default only on clients */ 
crt_gdata.cg_rpc_quota = server ? 0 : CRT_QUOTA_RPCS_DEFAULT; @@ -349,10 +355,6 @@ data_init(int server, crt_init_options_t *opt) d_setenv("FI_UNIVERSE_SIZE", "2048", 1); } - if (credits > CRT_MAX_CREDITS_PER_EP_CTX) - credits = CRT_MAX_CREDITS_PER_EP_CTX; - crt_gdata.cg_credit_ep_ctx = credits; - /** enable sensors if requested */ crt_gdata.cg_use_sensors = (opt && opt->cio_use_sensors); @@ -424,38 +426,6 @@ crt_plugin_fini(void) D_MUTEX_DESTROY(&crt_plugin_gdata.cpg_mutex); } -static int -__split_arg(char *s_arg_to_split, const char *delim, char **first_arg, char **second_arg) -{ - char *save_ptr = NULL; - char *arg_to_split; - - D_ASSERT(first_arg != NULL); - D_ASSERT(second_arg != NULL); - - /* no-op, not an error case */ - if (s_arg_to_split == NULL) { - *first_arg = NULL; - *second_arg = NULL; - return DER_SUCCESS; - } - - D_STRNDUP(arg_to_split, s_arg_to_split, 255); - if (!arg_to_split) { - *first_arg = NULL; - *second_arg = NULL; - return -DER_NOMEM; - } - - *first_arg = 0; - *second_arg = 0; - - *first_arg = strtok_r(arg_to_split, delim, &save_ptr); - *second_arg = save_ptr; - - return DER_SUCCESS; -} - crt_provider_t crt_str_to_provider(const char *str_provider) { @@ -481,8 +451,13 @@ crt_str_to_provider(const char *str_provider) if (len > strlen(CRT_UCX_STR) && strchr(str_provider, '+')) { D_STRNDUP(p, str_provider, len); if (!p) { + /* Return provider unknown if allocation fails. */ return prov; } else { + /* Store the default UCX provider string in the alt_str + * to allow it to be restored if finalize is called. 
+ */ + crt_na_dict[i].nad_alt_str = crt_na_dict[i].nad_str; crt_na_dict[i].nad_str = p; crt_na_dict[i].nad_str_alloc = true; } @@ -627,33 +602,16 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) { bool server = flags & CRT_FLAG_BIT_SERVER; int rc = 0; - crt_provider_t primary_provider; - crt_provider_t secondary_provider; - crt_provider_t tmp_prov; - char *provider = NULL; - char *provider_env = NULL; - char *interface = NULL; - char *interface_env = NULL; - char *domain = NULL; - char *domain_env = NULL; - char *auth_key = NULL; - char *auth_key_env = NULL; - char *path = NULL; - char *provider_str0 = NULL; - char *provider_str1 = NULL; - char *port = NULL; - char *port_env = NULL; - char *port0 = NULL; - char *port1 = NULL; - char *iface0 = NULL; - char *iface1 = NULL; - char *domain0 = NULL; - char *domain1 = NULL; - char *auth_key0 = NULL; - char *auth_key1 = NULL; - int num_secondaries = 0; - bool port_auto_adjust = false; - int i; + crt_provider_t prov; + char *provider = NULL, *interface = NULL, *domain = NULL, *port = NULL, *auth_key = NULL; + char *path = NULL; + char *provider_str = NULL, *interface_str = NULL, *domain_str = NULL, *port_str = NULL, + *auth_key_str = NULL; + char *save_provider_str = NULL, *save_interface_str = NULL, *save_domain_str = NULL, + *save_port_str = NULL, *save_auth_key_str = NULL; + bool port_auto_adjust = false, thread_mode_single = false, progress_busy = false, + mem_device = false; + int i; d_signal_register(); @@ -669,7 +627,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_INFO("libcart (%s) v%s initializing\n", server ? 
"server" : "client", CART_VERSION); crt_env_init(); - if (opt) + if (opt != NULL) dump_opt(opt); /* d_fault_inject_init() is reference counted */ @@ -694,246 +652,228 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_ASSERT(gdata_init_flag == 1); D_RWLOCK_WRLOCK(&crt_gdata.cg_rwlock); - if (crt_gdata.cg_inited == 0) { - crt_gdata.cg_server = server; - crt_gdata.cg_auto_swim_disable = (flags & CRT_FLAG_BIT_AUTO_SWIM_DISABLE) ? 1 : 0; - - crt_env_get(CRT_ATTACH_INFO_PATH, &path); - if (path != NULL && strlen(path) > 0) { - rc = crt_group_config_path_set(path); - if (rc != 0) - D_ERROR("Got %s from ENV CRT_ATTACH_INFO_PATH, " - "but crt_group_config_path_set failed " - "rc: %d, ignore the ENV.\n", - path, rc); - else - D_DEBUG(DB_ALL, "set group_config_path as %s.\n", path); + if (crt_gdata.cg_inited) { + if (!crt_gdata.cg_server && server) { + D_ERROR("CRT initialized as client, cannot set as server again.\n"); + D_GOTO(unlock, rc = -DER_INVAL); } + crt_gdata.cg_refcount++; + D_GOTO(unlock, rc); + } - if (opt && opt->cio_thread_mode_single) { - crt_gdata.cg_thread_mode_single = opt->cio_thread_mode_single; - } else { - bool thread_mode_single = false; - crt_env_get(D_THREAD_MODE_SINGLE, &thread_mode_single); - crt_gdata.cg_thread_mode_single = thread_mode_single; - } + crt_gdata.cg_server = server; + crt_gdata.cg_auto_swim_disable = (flags & CRT_FLAG_BIT_AUTO_SWIM_DISABLE) ? 
1 : 0; - if (opt && opt->cio_auth_key) - auth_key = opt->cio_auth_key; - else { - crt_env_get(D_PROVIDER_AUTH_KEY, &auth_key_env); - auth_key = auth_key_env; - } + crt_env_get(CRT_ATTACH_INFO_PATH, &path); + if (path != NULL && strlen(path) > 0) { + rc = crt_group_config_path_set(path); + if (rc != 0) + D_ERROR("Got %s from ENV CRT_ATTACH_INFO_PATH, " + "but crt_group_config_path_set failed " + "rc: %d, ignore the ENV.\n", + path, rc); + else + D_DEBUG(DB_ALL, "set group_config_path as %s.\n", path); + } + + CRT_ENV_OPT_GET(opt, provider, D_PROVIDER); + CRT_ENV_OPT_GET(opt, interface, D_INTERFACE); + CRT_ENV_OPT_GET(opt, domain, D_DOMAIN); + CRT_ENV_OPT_GET(opt, port, D_PORT); + CRT_ENV_OPT_GET(opt, auth_key, D_PROVIDER_AUTH_KEY); + + crt_env_get(D_PORT_AUTO_ADJUST, &port_auto_adjust); + + /* TODO kept as unique globals but may want to distinguish for multi-provider case */ + CRT_ENV_OPT_GET(opt, thread_mode_single, D_THREAD_MODE_SINGLE); + crt_gdata.cg_thread_mode_single = thread_mode_single; + + CRT_ENV_OPT_GET(opt, progress_busy, D_PROGRESS_BUSY); + crt_gdata.cg_progress_busy = progress_busy; + + CRT_ENV_OPT_GET(opt, mem_device, D_MEM_DEVICE); + crt_gdata.cg_mem_device = mem_device; - if (opt && opt->cio_provider) - provider = opt->cio_provider; - else { - crt_env_get(D_PROVIDER, &provider_env); - provider = provider_env; + if (provider == NULL) { + D_ERROR("No provider specified\n"); + D_GOTO(unlock, rc = -DER_INVAL); + } + /* + * A coma-separated list of arguments for interfaces, domains, ports, keys is + * interpreted differently, depending whether it is on a client or on a server side. + * + * On a client, a coma-separated list means multi-interface selection, while on a + * server it means a multi-provider selection. 
+ */ + if (!crt_is_service()) { + if (strchr(provider, ',') != NULL) { + D_ERROR("Multiple providers specified in provider string, but secondary " + "provider only supported on server side\n"); + D_GOTO(unlock, rc = -DER_INVAL); + } + } else if (strchr(provider, ',') != NULL) { + D_STRNDUP(provider_str, provider, CRT_ENV_STR_MAX_SIZE); + if (provider_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + provider = strtok_r(provider_str, ",", &save_provider_str); + + if (interface != NULL) { + D_STRNDUP(interface_str, interface, CRT_ENV_STR_MAX_SIZE); + if (interface_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + interface = strtok_r(interface_str, ",", &save_interface_str); } - if (opt && opt->cio_interface) - interface = opt->cio_interface; - else { - crt_env_get(D_INTERFACE, &interface_env); - interface = interface_env; + if (domain != NULL) { + D_STRNDUP(domain_str, domain, CRT_ENV_STR_MAX_SIZE); + if (domain_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + domain = strtok_r(domain_str, ",", &save_domain_str); } - if (opt && opt->cio_domain) - domain = opt->cio_domain; - else { - crt_env_get(D_DOMAIN, &domain_env); - domain = domain_env; + if (port != NULL) { + D_STRNDUP(port_str, port, CRT_ENV_STR_MAX_SIZE); + if (port_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + port = strtok_r(port_str, ",", &save_port_str); } - if (opt && opt->cio_port) - port = opt->cio_port; - else { - crt_env_get(D_PORT, &port_env); - port = port_env; + if (auth_key != NULL) { + D_STRNDUP(auth_key_str, auth_key, CRT_ENV_STR_MAX_SIZE); + if (auth_key_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + auth_key = strtok_r(auth_key_str, ",", &save_auth_key_str); } + } - crt_env_get(D_PORT_AUTO_ADJUST, &port_auto_adjust); - rc = __split_arg(provider, ",", &provider_str0, &provider_str1); - if (rc != 0) - D_GOTO(unlock, rc); + prov = crt_str_to_provider(provider); + if (prov == CRT_PROV_UNKNOWN) { + D_ERROR("Requested provider %s not found\n", provider); + D_GOTO(unlock, rc = 
-DER_NONEXIST); + } + + /* CXI doesn't use interface value, instead uses domain */ + if (interface == NULL && prov != CRT_PROV_OFI_CXI) + D_WARN("No interface specified\n"); - primary_provider = crt_str_to_provider(provider_str0); - secondary_provider = crt_str_to_provider(provider_str1); + /* For PALS-enabled environments, auto-detect svc ID / VNI and use DAOS VNI */ + if (prov == CRT_PROV_OFI_CXI && auth_key == NULL && crt_env_is_set(SLINGSHOT_VNIS)) + auth_key = "0:0:2"; /* format is svc_id:vni:vni_idx, use hard-coded value to tell + mercury to detect svc_id and vni from the env vars and use + the DAOS VNI at index 2 */ - if (primary_provider == CRT_PROV_UNKNOWN) { - D_ERROR("Requested provider %s not found\n", provider); - D_GOTO(unlock, rc = -DER_NONEXIST); + crt_gdata.cg_primary_prov = prov; + /* + * Note: If on the client the 'interface' contains a + * coma-separated list then it will be later parsed out + * and processed in crt_na_config_init(). + */ + rc = crt_init_prov(prov, true, &crt_gdata.cg_prov_gdata_primary, interface, domain, port, + auth_key, port_auto_adjust, opt); + if (rc != 0) + D_GOTO(unlock, rc); + + if (provider_str != NULL) { /* multi-provider case */ + int num_secondaries = 1; + const char *provider_ptr = save_provider_str; + + while (provider_ptr = strchr(provider_ptr, ','), provider_ptr != NULL) { + num_secondaries++; + provider_ptr++; } + crt_gdata.cg_num_secondary_provs = num_secondaries; - /* - * A coma-separated list of arguments for interfaces, domains, ports, keys is - * interpreted differently, depending whether it is on a client or on a server side. - * - * On a client, a coma-separated list means multi-interface selection, while on a - * server it means a multi-provider selection. 
- */ - if (crt_is_service()) { - rc = __split_arg(interface, ",", &iface0, &iface1); - if (rc != 0) - D_GOTO(unlock, rc); - rc = __split_arg(domain, ",", &domain0, &domain1); - if (rc != 0) - D_GOTO(unlock, rc); - rc = __split_arg(port, ",", &port0, &port1); - if (rc != 0) - D_GOTO(unlock, rc); - rc = __split_arg(auth_key, ",", &auth_key0, &auth_key1); - if (rc != 0) - D_GOTO(unlock, rc); - } else { - /* - * Note: If on the client the 'interface' contains a - * coma-separated list then it will be later parsed out - * and processed in crt_na_config_init(). - */ - if (interface) { - D_STRNDUP(iface0, interface, 255); - if (!iface0) - D_GOTO(unlock, rc = -DER_NOMEM); - } + D_ALLOC_ARRAY(crt_gdata.cg_secondary_provs, num_secondaries); + if (crt_gdata.cg_secondary_provs == NULL) + D_GOTO(cleanup, rc = -DER_NOMEM); - if (domain) { - D_STRNDUP(domain0, domain, 255); - if (!domain0) - D_GOTO(unlock, rc = -DER_NOMEM); - } + D_ALLOC_ARRAY(crt_gdata.cg_prov_gdata_secondary, num_secondaries); + if (crt_gdata.cg_prov_gdata_secondary == NULL) + D_GOTO(cleanup, rc = -DER_NOMEM); - if (port) { - D_STRNDUP(port0, port, 255); - if (!port0) - D_GOTO(unlock, rc = -DER_NOMEM); + for (i = 0; i < num_secondaries; i++) { + provider = strtok_r(NULL, ",", &save_provider_str); + if (provider == NULL) { + D_ERROR("Failed to parse secondary provider\n"); + D_GOTO(cleanup, rc = -DER_INVAL); } - if (auth_key) { - D_STRNDUP(auth_key0, auth_key, 255); - if (!auth_key0) - D_GOTO(unlock, rc = -DER_NOMEM); + prov = crt_str_to_provider(provider); + if (prov == CRT_PROV_UNKNOWN) { + D_ERROR("Requested secondary provider %s not found\n", provider); + D_GOTO(cleanup, rc = -DER_NONEXIST); } - } + crt_gdata.cg_secondary_provs[i] = prov; - /* Secondary provider is specified */ - if (secondary_provider != CRT_PROV_UNKNOWN) { - /* Multi provider mode only supported on the server side */ - if (!crt_is_service()) { - D_ERROR("Secondary provider only supported on the server side\n"); - D_GOTO(unlock, rc = 
-DER_INVAL); - } + if (interface != NULL) + interface = strtok_r(NULL, ",", &save_interface_str); + if (domain != NULL) + domain = strtok_r(NULL, ",", &save_domain_str); + if (port != NULL) + port = strtok_r(NULL, ",", &save_port_str); + if (auth_key != NULL) + auth_key = strtok_r(NULL, ",", &save_auth_key_str); /* Secondary provider needs its own interface or domain */ - if (iface1 == NULL && domain1 == NULL) { + if (interface == NULL && domain == NULL) { D_ERROR( "Either a secondary domain or interface must be specified\n"); D_GOTO(unlock, rc = -DER_INVAL); } - /* Note: secondary ports and auth keys are optional */ - } - - /* CXI doesn't use interface value, instead uses domain */ - if (iface0 == NULL && primary_provider != CRT_PROV_OFI_CXI) - D_WARN("No interface specified\n"); + if (port == NULL || port[0] == '\0') + D_WARN("No port specified for secondary provider\n"); - rc = prov_data_init(&crt_gdata.cg_prov_gdata_primary, primary_provider, true, opt); - if (rc != 0) - D_GOTO(unlock, rc); - - prov_settings_apply(true, primary_provider, opt); - crt_gdata.cg_primary_prov = primary_provider; - - rc = crt_na_config_init(true, primary_provider, iface0, domain0, port0, auth_key0, - port_auto_adjust); - if (rc != 0) { - D_ERROR("crt_na_config_init() failed, " DF_RC "\n", DP_RC(rc)); - D_GOTO(unlock, rc); - } - - if (secondary_provider != CRT_PROV_UNKNOWN) { - num_secondaries = 1; - crt_gdata.cg_num_secondary_provs = num_secondaries; - - if (port1 == NULL || port1[0] == '\0') { - port1 = port0; - } - - D_ALLOC_ARRAY(crt_gdata.cg_secondary_provs, num_secondaries); - if (crt_gdata.cg_secondary_provs == NULL) - D_GOTO(cleanup, rc = -DER_NOMEM); - - D_ALLOC_ARRAY(crt_gdata.cg_prov_gdata_secondary, num_secondaries); - if (crt_gdata.cg_prov_gdata_secondary == NULL) - D_GOTO(cleanup, rc = -DER_NOMEM); - - crt_gdata.cg_secondary_provs[0] = secondary_provider; - } - - for (i = 0; i < num_secondaries; i++) { - tmp_prov = crt_gdata.cg_secondary_provs[i]; - - rc = 
prov_data_init(&crt_gdata.cg_prov_gdata_secondary[i], tmp_prov, false, - opt); - if (rc != 0) - D_GOTO(cleanup, rc); - - prov_settings_apply(false, tmp_prov, opt); - - rc = crt_na_config_init(false, tmp_prov, iface1, domain1, port1, auth_key1, - port_auto_adjust); + rc = crt_init_prov(crt_gdata.cg_secondary_provs[i], false, + &crt_gdata.cg_prov_gdata_secondary[i], interface, domain, + port, auth_key, port_auto_adjust, opt); if (rc != 0) { - D_ERROR("crt_na_config_init() failed, " DF_RC "\n", DP_RC(rc)); + D_ERROR("crt_init_prov() failed for secondary provider, " DF_RC + "\n", + DP_RC(rc)); D_GOTO(cleanup, rc); } } + } - rc = crt_hg_init(); - if (rc != 0) { - D_ERROR("crt_hg_init() failed, " DF_RC "\n", DP_RC(rc)); - D_GOTO(cleanup, rc); - } + rc = crt_hg_init(); + if (rc != 0) { + D_ERROR("crt_hg_init() failed, " DF_RC "\n", DP_RC(rc)); + D_GOTO(cleanup, rc); + } + + rc = crt_grp_init(grpid); + if (rc != 0) { + D_ERROR("crt_grp_init() failed, " DF_RC "\n", DP_RC(rc)); + D_GOTO(cleanup, rc); + } - rc = crt_grp_init(grpid); + if (crt_plugin_gdata.cpg_inited == 0) { + rc = crt_plugin_init(); if (rc != 0) { - D_ERROR("crt_grp_init() failed, " DF_RC "\n", DP_RC(rc)); + D_ERROR("crt_plugin_init() failed, " DF_RC "\n", DP_RC(rc)); D_GOTO(cleanup, rc); } + } - if (crt_plugin_gdata.cpg_inited == 0) { - rc = crt_plugin_init(); - if (rc != 0) { - D_ERROR("crt_plugin_init() failed, " DF_RC "\n", DP_RC(rc)); - D_GOTO(cleanup, rc); - } - } - - crt_self_test_init(); + crt_self_test_init(); - crt_iv_init(opt); - rc = crt_opc_map_create(); - if (rc != 0) { - D_ERROR("crt_opc_map_create() failed, " DF_RC "\n", DP_RC(rc)); - D_GOTO(self_test, rc); - } + crt_iv_init(opt); + rc = crt_opc_map_create(); + if (rc != 0) { + D_ERROR("crt_opc_map_create() failed, " DF_RC "\n", DP_RC(rc)); + D_GOTO(self_test, rc); + } - rc = crt_internal_rpc_register(server); - if (rc != 0) { - D_ERROR("crt_internal_rpc_register() failed, " DF_RC "\n", DP_RC(rc)); - D_GOTO(self_test, rc); - } + rc = 
crt_internal_rpc_register(server); + if (rc != 0) { + D_ERROR("crt_internal_rpc_register() failed, " DF_RC "\n", DP_RC(rc)); + D_GOTO(self_test, rc); + } - D_ASSERT(crt_gdata.cg_opc_map != NULL); + D_ASSERT(crt_gdata.cg_opc_map != NULL); - crt_gdata.cg_inited = 1; - } else { - if (crt_gdata.cg_server == false && server == true) { - D_ERROR("CRT initialized as client, cannot set as server again.\n"); - D_GOTO(unlock, rc = -DER_INVAL); - } - } + crt_gdata.cg_inited = 1; crt_gdata.cg_refcount++; @@ -960,15 +900,11 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_RWLOCK_UNLOCK(&crt_gdata.cg_rwlock); out: - /* - * We don't need to free port1, iface1 and domain1 as - * they occupy the same original string as port0, iface0 and domain0 - */ - D_FREE(port0); - D_FREE(iface0); - D_FREE(domain0); - D_FREE(provider_str0); - D_FREE(auth_key0); + D_FREE(provider_str); + D_FREE(interface_str); + D_FREE(domain_str); + D_FREE(port_str); + D_FREE(auth_key_str); if (rc != 0) { D_ERROR("failed, " DF_RC "\n", DP_RC(rc)); @@ -978,6 +914,29 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) return rc; } +static int +crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, + const char *interface, const char *domain, const char *port, const char *auth_key, + bool port_auto_adjust, crt_init_options_t *opt) +{ + int rc; + + rc = prov_data_init(prov_gdata, provider, primary, opt); + if (rc != 0) + return rc; + + prov_settings_apply(primary, provider, opt); + + rc = crt_na_config_init(primary, provider, interface, domain, port, auth_key, + port_auto_adjust); + if (rc != 0) { + D_ERROR("crt_na_config_init() failed, " DF_RC "\n", DP_RC(rc)); + return rc; + } + + return 0; +} + bool crt_initialized() { @@ -1049,8 +1008,12 @@ crt_finalize(void) } for (i = 0; crt_na_dict[i].nad_str != NULL; i++) - if (crt_na_dict[i].nad_str_alloc) + if (crt_na_dict[i].nad_str_alloc) { D_FREE(crt_na_dict[i].nad_str); + 
crt_na_dict[i].nad_str = crt_na_dict[i].nad_alt_str; + crt_na_dict[i].nad_alt_str = NULL; + crt_na_dict[i].nad_str_alloc = false; + } D_FREE(crt_gdata.cg_secondary_provs); D_FREE(crt_gdata.cg_prov_gdata_secondary); @@ -1076,7 +1039,7 @@ crt_finalize(void) } static inline bool -is_integer_str(char *str) +is_integer_str(const char *str) { const char *p; @@ -1170,8 +1133,8 @@ crt_port_range_verify(int port) } static int -crt_na_config_init(bool primary, crt_provider_t provider, char *interface, char *domain, - char *port_str, char *auth_key, bool port_auto_adjust) +crt_na_config_init(bool primary, crt_provider_t provider, const char *interface, const char *domain, + const char *port_str, const char *auth_key, bool port_auto_adjust) { struct crt_na_config *na_cfg; int rc = 0; @@ -1190,24 +1153,34 @@ crt_na_config_init(bool primary, crt_provider_t provider, char *interface, char } if (interface) { - D_STRNDUP(na_cfg->noc_interface, interface, 64); + D_STRNDUP(na_cfg->noc_interface, interface, CRT_ENV_STR_MAX_SIZE); if (!na_cfg->noc_interface) D_GOTO(out, rc = -DER_NOMEM); } if (domain) { - D_STRNDUP(na_cfg->noc_domain, domain, 64); + D_STRNDUP(na_cfg->noc_domain, domain, CRT_ENV_STR_MAX_SIZE); if (!na_cfg->noc_domain) D_GOTO(out, rc = -DER_NOMEM); } if (auth_key) { - D_STRNDUP(na_cfg->noc_auth_key, auth_key, 255); + D_STRNDUP(na_cfg->noc_auth_key, auth_key, CRT_ENV_STR_MAX_SIZE); if (!na_cfg->noc_auth_key) D_GOTO(out, rc = -DER_NOMEM); } if (na_cfg->noc_interface) { + /* + * env checks limit strings to CRT_ENV_STR_MAX_SIZE, but an interface can + * be passed as an init argument + */ + if (strlen(na_cfg->noc_interface) + 1 >= CRT_ENV_STR_MAX_SIZE) { + D_ERROR("Interface value '%s' exceeds limit of %d characters\n", + na_cfg->noc_interface, CRT_ENV_STR_MAX_SIZE); + D_GOTO(out, rc = -DER_INVAL); + } + /* count number of ','-separated interfaces */ count = 1; save_ptr = na_cfg->noc_interface; @@ -1227,6 +1200,7 @@ crt_na_config_init(bool primary, crt_provider_t provider, 
char *interface, char idx = 0; token = strtok_r(na_cfg->noc_interface, ",", &save_ptr); while (token != NULL) { + /* TODO: If needed add filtering for duplicate interfaces here */ na_cfg->noc_iface_str[idx] = token; token = strtok_r(NULL, ",", &save_ptr); idx++; @@ -1238,6 +1212,16 @@ crt_na_config_init(bool primary, crt_provider_t provider, char *interface, char count = 0; if (na_cfg->noc_domain) { + /* + * env checks limit strings to CRT_ENV_STR_MAX_SIZE, but a domain can + * be passed as an init argument + */ + if (strlen(na_cfg->noc_domain) + 1 >= CRT_ENV_STR_MAX_SIZE) { + D_ERROR("Domain value '%s' exceeds limit of %d characters\n", + na_cfg->noc_domain, CRT_ENV_STR_MAX_SIZE); + D_GOTO(out, rc = -DER_INVAL); + } + /* count number of ','-separated domains */ count = 1; save_ptr = na_cfg->noc_domain; @@ -1267,8 +1251,9 @@ crt_na_config_init(bool primary, crt_provider_t provider, char *interface, char na_cfg->noc_domain_total = count; if (na_cfg->noc_domain_total > 0 && na_cfg->noc_domain_total != na_cfg->noc_iface_total) { - D_ERROR("Mismatched number of domains (%d) and interfaces (%d) specified\n", - na_cfg->noc_domain_total, na_cfg->noc_iface_total); + D_ERROR("Mismatched # of domains [%d]='%s' and interfaces [%d]='%s' specified\n", + na_cfg->noc_domain_total, na_cfg->noc_domain, na_cfg->noc_iface_total, + na_cfg->noc_interface); D_GOTO(out, rc = -DER_INVAL); } diff --git a/src/cart/crt_internal.h b/src/cart/crt_internal.h index 69d50fe31f0..d62be93af78 100644 --- a/src/cart/crt_internal.h +++ b/src/cart/crt_internal.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -153,6 +153,8 @@ void crt_hdlr_ctl_get_hostname(crt_rpc_t *rpc_req); void crt_hdlr_ctl_get_pid(crt_rpc_t *rpc_req); +void +crt_hdlr_ctl_dump_counters(crt_rpc_t *rpc_req); void crt_iv_init(crt_init_options_t *ops); diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 472d266fb06..02fbe3eea0c 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -86,7 +86,6 @@ struct crt_prov_gdata { bool cpg_primary; bool cpg_contig_ports; bool cpg_inited; - bool cpg_progress_busy; /** Mutext to protect fields above */ pthread_mutex_t cpg_mutex; @@ -153,6 +152,12 @@ struct crt_gdata { /** use single thread to access context */ bool cg_thread_mode_single; + /** use busy polling for progress */ + bool cg_progress_busy; + + /** use memory device */ + bool cg_mem_device; + ATOMIC uint64_t cg_rpcid; /* rpc id */ /* protects crt_gdata (see the lock order comment on crp_mutex) */ @@ -190,10 +195,12 @@ struct crt_event_cb_priv { #define CRT_CALLBACKS_NUM (4) /* start number of CBs */ #endif +#define CRT_ENV_STR_MAX_SIZE 1024 + /* * List of environment variables to read at CaRT library load time. 
* for integer envs use ENV() - * for string ones ENV_STR() or ENV_STR_NO_PRINT() + * for string ones ENV_STR() **/ #define CRT_ENV_LIST \ ENV_STR(CRT_ATTACH_INFO_PATH) \ @@ -230,12 +237,14 @@ struct crt_event_cb_priv { ENV(D_PORT_AUTO_ADJUST) \ ENV(D_THREAD_MODE_SINGLE) \ ENV(D_PROGRESS_BUSY) \ + ENV(D_MEM_DEVICE) \ ENV(D_POST_INCR) \ ENV(D_POST_INIT) \ ENV(D_MRECV_BUF) \ ENV(D_MRECV_BUF_COPY) \ ENV_STR(D_PROVIDER) \ - ENV_STR_NO_PRINT(D_PROVIDER_AUTH_KEY) \ + ENV_STR(D_PROVIDER_AUTH_KEY) \ + ENV_STR(SLINGSHOT_VNIS) \ ENV(D_QUOTA_RPCS) \ ENV(D_QUOTA_BULKS) \ ENV(FI_OFI_RXM_USE_SRX) \ @@ -250,16 +259,12 @@ struct crt_event_cb_priv { /* uint env */ #define ENV(x) \ unsigned int _##x; \ - int _rc_##x; \ - int _no_print_##x; + int _rc_##x; /* char* env */ #define ENV_STR(x) \ char *_##x; \ - int _rc_##x; \ - int _no_print_##x; - -#define ENV_STR_NO_PRINT(x) ENV_STR(x) + int _rc_##x; struct crt_envs { CRT_ENV_LIST; @@ -268,13 +273,13 @@ struct crt_envs { #undef ENV #undef ENV_STR -#undef ENV_STR_NO_PRINT extern struct crt_envs crt_genvs; static inline void crt_env_fini(void); +/* init cart env structure */ static inline void crt_env_init(void) { @@ -284,47 +289,37 @@ crt_env_init(void) #define ENV(x) \ do { \ - crt_genvs._rc_##x = d_getenv_uint(#x, &crt_genvs._##x); \ - crt_genvs._no_print_##x = 0; \ + crt_genvs._rc_##x = d_getenv_uint(#x, &crt_genvs._##x); \ } while (0); #define ENV_STR(x) \ do { \ - crt_genvs._rc_##x = d_agetenv_str(&crt_genvs._##x, #x); \ - crt_genvs._no_print_##x = 0; \ - } while (0); - -#define ENV_STR_NO_PRINT(x) \ - do { \ - crt_genvs._rc_##x = d_agetenv_str(&crt_genvs._##x, #x); \ - crt_genvs._no_print_##x = 1; \ + crt_genvs._rc_##x = d_agetenv_str(&crt_genvs._##x, #x); \ } while (0); CRT_ENV_LIST; #undef ENV #undef ENV_STR -#undef ENV_STR_NO_PRINT crt_genvs.inited = true; } +/* fini cart envs */ static inline void crt_env_fini(void) { #define ENV(x) (void) #define ENV_STR(x) d_freeenv_str(&crt_genvs._##x); -#define ENV_STR_NO_PRINT ENV_STR 
CRT_ENV_LIST #undef ENV #undef ENV_STR -#undef ENV_STR_NO_PRINT crt_genvs.inited = false; } -/* Returns value if env was present at load time */ +/* Returns value if env was present at load time and is part of CRT_ENV_LIST */ #define crt_env_get(name, val) \ do { \ D_ASSERT(crt_genvs.inited); \ @@ -332,6 +327,33 @@ crt_env_fini(void) *val = crt_genvs._##name; \ } while (0) +/* Check if the env is set */ +#define crt_env_is_set(name) (crt_genvs._rc_##name == 0) + +/* Check envs that contain strings to not exceed CRT_ENV_STR_MAX_SIZE */ +static inline bool +crt_env_list_valid(void) +{ +/* Ignore non-string envs in this check */ +#define ENV(x) + +/* if string env exceeds CRT_ENV_STR_MAX_SIZE - return false */ +#define ENV_STR(x) \ + if (crt_genvs._rc_##x == 0 && strlen(crt_genvs._##x) + 1 > CRT_ENV_STR_MAX_SIZE) { \ + D_ERROR("env '%s' (value='%s') exceeded max size %d\n", #x, crt_genvs._##x, \ + CRT_ENV_STR_MAX_SIZE); \ + return false; \ + } + + /* expand env list using the above ENV_* definitions */ + CRT_ENV_LIST; + return true; + +#undef ENV +#undef ENV_STR +} + +/* dump environment variables from the CRT_ENV_LIST */ static inline void crt_env_dump(void) { @@ -339,20 +361,17 @@ crt_env_dump(void) /* Only dump envariables that were set */ #define ENV(x) \ - if (!crt_genvs._rc_##x && crt_genvs._no_print_##x == 0) \ + if (!crt_genvs._rc_##x) \ D_INFO("%s = %d\n", #x, crt_genvs._##x); #define ENV_STR(x) \ if (!crt_genvs._rc_##x) \ - D_INFO("%s = %s\n", #x, crt_genvs._no_print_##x ? "****" : crt_genvs._##x); - -#define ENV_STR_NO_PRINT ENV_STR + D_INFO("%s = %s\n", #x, crt_genvs._##x); CRT_ENV_LIST; #undef ENV #undef ENV_STR -#undef ENV_STR_NO_PRINT } /* structure of global fault tolerance data */ diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index da2a8383908..b858964cdbd 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -165,7 +165,7 @@ crt_proc_struct_crt_grp_cache(crt_proc_t proc, crt_proc_op_t proc_op, return crt_proc_crt_grp_cache(proc, data); } -/* !! All of the following 4 RPC definition should have the same input fields !! +/* !! All of the following 5 RPC definition should have the same input fields !! * All of them are verified in one function: * int verify_ctl_in_args(struct crt_ctl_ep_ls_in *in_args) */ @@ -173,6 +173,7 @@ CRT_RPC_DEFINE(crt_ctl_get_uri_cache, CRT_ISEQ_CTL, CRT_OSEQ_CTL_GET_URI_CACHE) CRT_RPC_DEFINE(crt_ctl_ep_ls, CRT_ISEQ_CTL, CRT_OSEQ_CTL_EP_LS) CRT_RPC_DEFINE(crt_ctl_get_host, CRT_ISEQ_CTL, CRT_OSEQ_CTL_GET_HOST) CRT_RPC_DEFINE(crt_ctl_get_pid, CRT_ISEQ_CTL, CRT_OSEQ_CTL_GET_PID) +CRT_RPC_DEFINE(crt_ctl_dump_counters, CRT_ISEQ_CTL, CRT_OSEQ_CTL_DUMP_COUNTERS) CRT_RPC_DEFINE(crt_proto_query, CRT_ISEQ_PROTO_QUERY, CRT_OSEQ_PROTO_QUERY) diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index c99a80f4e26..64fffdcc9e4 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -15,9 +15,9 @@ #include #include -/* default RPC timeout 60 seconds */ -#define CRT_DEFAULT_TIMEOUT_S (60) /* second */ -#define CRT_DEFAULT_TIMEOUT_US (CRT_DEFAULT_TIMEOUT_S * 1e6) /* micro-second */ +/* default RPC timeout */ +#define CRT_TIMEOUT_DEFAULT (60U) /* 60 seconds */ +#define CRT_TIMEOUT_MAX (3600U) /* 1 hour */ #define CRT_QUOTA_RPCS_DEFAULT 64 #define CRT_QUOTA_BULKS_DEFAULT 64 @@ -262,76 +262,44 @@ crt_rpc_unlock(struct crt_rpc_priv *rpc_priv) * this to ping the server waiting for start so needs to work before * proto_query() can be called. */ -#define CRT_INTERNAL_RPCS_LIST \ - X(CRT_OPC_URI_LOOKUP, \ - 0, &CQF_crt_uri_lookup, \ - crt_hdlr_uri_lookup, NULL) \ - X(CRT_OPC_PROTO_QUERY, \ - 0, &CQF_crt_proto_query, \ - crt_hdlr_proto_query, NULL) \ - X(CRT_OPC_CTL_LS, \ - 0, &CQF_crt_ctl_ep_ls, \ - crt_hdlr_ctl_ls, NULL) \ - -#define CRT_FI_RPCS_LIST \ - X(CRT_OPC_CTL_FI_TOGGLE, \ - 0, &CQF_crt_ctl_fi_toggle, \ - crt_hdlr_ctl_fi_toggle, NULL) \ - X(CRT_OPC_CTL_FI_SET_ATTR, \ - 0, &CQF_crt_ctl_fi_attr_set, \ - crt_hdlr_ctl_fi_attr_set, NULL) \ - -#define CRT_ST_RPCS_LIST \ - X(CRT_OPC_SELF_TEST_BOTH_EMPTY, \ - 0, NULL, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_SEND_ID_REPLY_IOV, \ - 0, &CQF_crt_st_send_id_reply_iov, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_SEND_IOV_REPLY_EMPTY, \ - 0, &CQF_crt_st_send_iov_reply_empty, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_BOTH_IOV, \ - 0, &CQF_crt_st_both_iov, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_SEND_BULK_REPLY_IOV, \ - 0, &CQF_crt_st_send_bulk_reply_iov, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_SEND_IOV_REPLY_BULK, \ - 0, &CQF_crt_st_send_iov_reply_bulk, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_BOTH_BULK, \ 
- 0, &CQF_crt_st_both_bulk, \ - crt_self_test_msg_handler, NULL) \ - X(CRT_OPC_SELF_TEST_OPEN_SESSION, \ - 0, &CQF_crt_st_open_session, \ - crt_self_test_open_session_handler, NULL) \ - X(CRT_OPC_SELF_TEST_CLOSE_SESSION, \ - 0, &CQF_crt_st_close_session, \ - crt_self_test_close_session_handler, NULL) \ - X(CRT_OPC_SELF_TEST_START, \ - 0, &CQF_crt_st_start, \ - crt_self_test_start_handler, NULL) \ - X(CRT_OPC_SELF_TEST_STATUS_REQ, \ - 0, &CQF_crt_st_status_req, \ - crt_self_test_status_req_handler, NULL) \ - -#define CRT_CTL_RPCS_LIST \ - X(CRT_OPC_CTL_LOG_SET, \ - 0, &CQF_crt_ctl_log_set, \ - crt_hdlr_ctl_log_set, NULL) \ - X(CRT_OPC_CTL_LOG_ADD_MSG, \ - 0, &CQF_crt_ctl_log_add_msg, \ - crt_hdlr_ctl_log_add_msg, NULL) \ - X(CRT_OPC_CTL_GET_URI_CACHE, \ - 0, &CQF_crt_ctl_get_uri_cache, \ - crt_hdlr_ctl_get_uri_cache, NULL) \ - X(CRT_OPC_CTL_GET_HOSTNAME, \ - 0, &CQF_crt_ctl_get_host, \ - crt_hdlr_ctl_get_hostname, NULL) \ - X(CRT_OPC_CTL_GET_PID, \ - 0, &CQF_crt_ctl_get_pid, \ - crt_hdlr_ctl_get_pid, NULL) \ +#define CRT_INTERNAL_RPCS_LIST \ + X(CRT_OPC_URI_LOOKUP, 0, &CQF_crt_uri_lookup, crt_hdlr_uri_lookup, NULL) \ + X(CRT_OPC_PROTO_QUERY, 0, &CQF_crt_proto_query, crt_hdlr_proto_query, NULL) \ + X(CRT_OPC_CTL_LS, 0, &CQF_crt_ctl_ep_ls, crt_hdlr_ctl_ls, NULL) + +#define CRT_FI_RPCS_LIST \ + X(CRT_OPC_CTL_FI_TOGGLE, 0, &CQF_crt_ctl_fi_toggle, crt_hdlr_ctl_fi_toggle, NULL) \ + X(CRT_OPC_CTL_FI_SET_ATTR, 0, &CQF_crt_ctl_fi_attr_set, crt_hdlr_ctl_fi_attr_set, NULL) + +#define CRT_ST_RPCS_LIST \ + X(CRT_OPC_SELF_TEST_BOTH_EMPTY, 0, NULL, crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_SEND_ID_REPLY_IOV, 0, &CQF_crt_st_send_id_reply_iov, \ + crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_SEND_IOV_REPLY_EMPTY, 0, &CQF_crt_st_send_iov_reply_empty, \ + crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_BOTH_IOV, 0, &CQF_crt_st_both_iov, crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_SEND_BULK_REPLY_IOV, 0, &CQF_crt_st_send_bulk_reply_iov, 
\ + crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_SEND_IOV_REPLY_BULK, 0, &CQF_crt_st_send_iov_reply_bulk, \ + crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_BOTH_BULK, 0, &CQF_crt_st_both_bulk, crt_self_test_msg_handler, NULL) \ + X(CRT_OPC_SELF_TEST_OPEN_SESSION, 0, &CQF_crt_st_open_session, \ + crt_self_test_open_session_handler, NULL) \ + X(CRT_OPC_SELF_TEST_CLOSE_SESSION, 0, &CQF_crt_st_close_session, \ + crt_self_test_close_session_handler, NULL) \ + X(CRT_OPC_SELF_TEST_START, 0, &CQF_crt_st_start, crt_self_test_start_handler, NULL) \ + X(CRT_OPC_SELF_TEST_STATUS_REQ, 0, &CQF_crt_st_status_req, \ + crt_self_test_status_req_handler, NULL) + +#define CRT_CTL_RPCS_LIST \ + X(CRT_OPC_CTL_LOG_SET, 0, &CQF_crt_ctl_log_set, crt_hdlr_ctl_log_set, NULL) \ + X(CRT_OPC_CTL_LOG_ADD_MSG, 0, &CQF_crt_ctl_log_add_msg, crt_hdlr_ctl_log_add_msg, NULL) \ + X(CRT_OPC_CTL_GET_URI_CACHE, 0, &CQF_crt_ctl_get_uri_cache, crt_hdlr_ctl_get_uri_cache, \ + NULL) \ + X(CRT_OPC_CTL_GET_HOSTNAME, 0, &CQF_crt_ctl_get_host, crt_hdlr_ctl_get_hostname, NULL) \ + X(CRT_OPC_CTL_GET_PID, 0, &CQF_crt_ctl_get_pid, crt_hdlr_ctl_get_pid, NULL) \ + X(CRT_OPC_CTL_DUMP_COUNTERS, 0, &CQF_crt_ctl_dump_counters, crt_hdlr_ctl_dump_counters, \ + NULL) #define CRT_IV_RPCS_LIST \ X(CRT_OPC_IV_FETCH, \ @@ -580,6 +548,10 @@ CRT_RPC_DECLARE(crt_ctl_get_uri_cache, CRT_ISEQ_CTL, CRT_OSEQ_CTL_GET_URI_CACHE) CRT_RPC_DECLARE(crt_ctl_get_host, CRT_ISEQ_CTL, CRT_OSEQ_CTL_GET_HOST) +#define CRT_OSEQ_CTL_DUMP_COUNTERS /* output fields */ ((uint32_t)(rc)CRT_VAR) + +CRT_RPC_DECLARE(crt_ctl_dump_counters, CRT_ISEQ_CTL, CRT_OSEQ_CTL_DUMP_COUNTERS) + #define CRT_OSEQ_CTL_GET_PID /* output fields */ \ ((int32_t) (cgp_pid) CRT_VAR) \ ((int32_t) (cgp_rc) CRT_VAR) diff --git a/src/cart/swim/swim.c b/src/cart/swim/swim.c index 4073dcf9ad3..60d077d2ac5 100644 --- a/src/cart/swim/swim.c +++ b/src/cart/swim/swim.c @@ -2,7 +2,7 @@ * Copyright (c) 2016 UChicago Argonne, LLC * (C) Copyright 2018-2024 Intel 
Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1093,6 +1093,7 @@ swim_updates_parse(struct swim_context *ctx, swim_id_t from_id, swim_id_t id, struct swim_member_state id_state; swim_id_t self_id = swim_self_get(ctx); swim_id_t upd_id; + bool from_untrustable = false; size_t i; int rc = 0; @@ -1106,10 +1107,9 @@ swim_updates_parse(struct swim_context *ctx, swim_id_t from_id, swim_id_t id, rc = ctx->sc_ops->get_member_state(ctx, from_id, &id_state); if (rc == -DER_NONEXIST || id_state.sms_status == SWIM_MEMBER_DEAD) { - swim_ctx_unlock(ctx); - SWIM_DEBUG("%lu: skip untrustable update from %lu, rc = %d\n", self_id, from_id, - rc); - D_GOTO(out, rc = -DER_NONEXIST); + SWIM_DEBUG("%lu: 'untrustable' updates from %lu: " DF_RC "\n", self_id, from_id, + DP_RC(rc)); + from_untrustable = true; } else if (rc != 0) { swim_ctx_unlock(ctx); SWIM_ERROR("get_member_state(%lu): " DF_RC "\n", from_id, DP_RC(rc)); @@ -1182,6 +1182,13 @@ swim_updates_parse(struct swim_context *ctx, swim_id_t from_id, swim_id_t id, break; } + /* + * If from_id is "untrustable", react to its SUSPECT and DEAD updates about + * me (above), but ignore those about others. 
+ */ + if (from_untrustable) + break; + if (upds[i].smu_state.sms_status == SWIM_MEMBER_SUSPECT) swim_member_suspect(ctx, from_id, upd_id, upds[i].smu_state.sms_incarnation); diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 4771f022302..2cb7347ed55 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -486,6 +486,13 @@ fun:_ZN6__tsan9ShadowSetEPNS_9RawShadowES1_S0_ fun:racecall } +{ + MemoryRangeSet ShadowSet + Memcheck:Value8 + fun:ShadowSet + fun:_ZN6__tsanL14MemoryRangeSetEmmNS_9RawShadowE + fun:racecall +} { FI leak 8 Memcheck:Leak @@ -505,6 +512,14 @@ fun:hg_dlog_mkcount32 ... } +{ + + Memcheck:Leak + match-leak-kinds: reachable + fun:malloc + fun:hg_dlog_mkcount64 + ... +} { FI leak 9 Memcheck:Leak @@ -520,18 +535,6 @@ fun:HG_Init_opt fun:crt_hg_class_init } -{ - Tcp provider - Memcheck:Param - sendmsg(msg.msg_iov[1]) - ... - fun:sendmsg - fun:ofi_sockapi_sendv_socket - fun:ofi_bsock_sendv - ... - fun:fi_senddata - ... -} { Tcp provider with ofi rxm Memcheck:Param @@ -539,20 +542,14 @@ ... fun:ofi_bsock_sendv ... - fun:fi_tsend - ... } { Tcp provider with ofi rxm 2 Memcheck:Param sendmsg(msg.msg_iov[2]) ... - fun:sendmsg - fun:ofi_sockapi_sendv_socket fun:ofi_bsock_sendv ... - fun:fi_tsend - ... } { Go syscall. @@ -609,18 +606,21 @@ { __tsan_go_atomic64_load Memcheck:Addr8 + ... fun:__tsan_go_atomic64_load fun:racecall } { __tsan_go_atomic64_store Memcheck:Addr8 + ... fun:__tsan_go_atomic64_store fun:racecall } { __tsan_go_atomic64_compare_exchange Memcheck:Addr8 + ... fun:__tsan_go_atomic64_compare_exchange fun:racecall } @@ -637,18 +637,21 @@ { __tsan_write_pc Memcheck:Value8 + ... fun:__tsan_write_pc fun:racecall } { __tsan_read_pc Memcheck:Value8 + ... fun:__tsan_read_pc fun:racecall } { tsan::MemoryAccessRange Memcheck:Value8 + ... fun:_ZN6__tsan18MemoryAccessRangeTILb0EEEvPNS_11ThreadStateEmmm ... fun:racecall @@ -656,6 +659,7 @@ { tsan::MemoryAccessRange Memcheck:Value8 + ... 
fun:_ZN6__tsan18MemoryAccessRangeTILb1EEEvPNS_11ThreadStateEmmm ... fun:racecall @@ -663,6 +667,7 @@ { tsan::TraceRestartMemoryAccess Memcheck:Value8 + ... fun:_ZN6__tsan24TraceRestartMemoryAccessEPNS_11ThreadStateEmmmm ... fun:racecall @@ -670,12 +675,14 @@ { __tsan_read Memcheck:Value8 + ... fun:__tsan_read fun:racecall } { __tsan_write Memcheck:Value8 + ... fun:__tsan_write fun:racecall } @@ -698,18 +705,21 @@ { __tsan_go_atomic32_load Memcheck:Addr4 + ... fun:__tsan_go_atomic32_load fun:racecall } { __tsan_go_atomic32_store Memcheck:Addr4 + ... fun:__tsan_go_atomic32_store fun:racecall } { __tsan_go_atomic32_compare_exchange Memcheck:Addr4 + ... fun:__tsan_go_atomic32_compare_exchange fun:racecall } @@ -761,30 +771,35 @@ { __tsan_go_atomic32_fetch_add Memcheck:Addr4 + ... fun:__tsan_go_atomic32_fetch_add fun:racecall } { __tsan_go_atomic64_fetch_add Memcheck:Addr8 + ... fun:__tsan_go_atomic64_fetch_add fun:racecall } { - DAOS-17006 - mercury leak + getpwnam_r() leak + Memcheck:Leak + fun:*alloc + ... + fun:getpwnam_r* +} +{ + getpwuid_r() leak + Memcheck:Leak + fun:calloc + ... + fun:getpwuid_r* +} +{ + localtime() leak Memcheck:Leak - match-leak-kinds: reachable fun:malloc - fun:hg_dlog_mkcount64 - fun:hg_core_counters_init - fun:hg_core_init - fun:HG_Core_init_opt2 - fun:HG_Init_opt2 - fun:crt_hg_class_init - fun:crt_hg_ctx_init - fun:crt_context_provider_create - fun:daos_eq_lib_init - fun:daos_init - fun:_cgo_b590e4e2531a_Cfunc_daos_init - fun:runtime.asmcgocall.abi0 + ... 
+ fun:__tz_convert } diff --git a/src/chk/chk.pb-c.c b/src/chk/chk.pb-c.c index aaf178ff90a..318081afc3a 100644 --- a/src/chk/chk.pb-c.c +++ b/src/chk/chk.pb-c.c @@ -379,7 +379,7 @@ const ProtobufCEnumDescriptor chk__check_inconsist_class__descriptor = chk__check_inconsist_class__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; -static const ProtobufCEnumValue chk__check_inconsist_action__enum_values_by_number[13] = +static const ProtobufCEnumValue chk__check_inconsist_action__enum_values_by_number[14] = { { "CIA_DEFAULT", "CHK__CHECK_INCONSIST_ACTION__CIA_DEFAULT", 0 }, { "CIA_INTERACT", "CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT", 1 }, @@ -394,17 +394,19 @@ static const ProtobufCEnumValue chk__check_inconsist_action__enum_values_by_numb { "CIA_TRUST_OLDEST", "CHK__CHECK_INCONSIST_ACTION__CIA_TRUST_OLDEST", 10 }, { "CIA_TRUST_EC_PARITY", "CHK__CHECK_INCONSIST_ACTION__CIA_TRUST_EC_PARITY", 11 }, { "CIA_TRUST_EC_DATA", "CHK__CHECK_INCONSIST_ACTION__CIA_TRUST_EC_DATA", 12 }, + { "CIA_STALE", "CHK__CHECK_INCONSIST_ACTION__CIA_STALE", 65535 }, }; static const ProtobufCIntRange chk__check_inconsist_action__value_ranges[] = { -{0, 0},{0, 13} +{0, 0},{65535, 13},{0, 14} }; -static const ProtobufCEnumValueIndex chk__check_inconsist_action__enum_values_by_name[13] = +static const ProtobufCEnumValueIndex chk__check_inconsist_action__enum_values_by_name[14] = { { "CIA_DEFAULT", 0 }, { "CIA_DISCARD", 3 }, { "CIA_IGNORE", 2 }, { "CIA_INTERACT", 1 }, { "CIA_READD", 4 }, + { "CIA_STALE", 13 }, { "CIA_TRUST_EC_DATA", 12 }, { "CIA_TRUST_EC_PARITY", 11 }, { "CIA_TRUST_LATEST", 9 }, @@ -421,11 +423,11 @@ const ProtobufCEnumDescriptor chk__check_inconsist_action__descriptor = "CheckInconsistAction", "Chk__CheckInconsistAction", "chk", - 13, + 14, chk__check_inconsist_action__enum_values_by_number, - 13, + 14, chk__check_inconsist_action__enum_values_by_name, - 1, + 2, chk__check_inconsist_action__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; diff --git 
a/src/chk/chk.pb-c.h b/src/chk/chk.pb-c.h index 1f1bdaf0367..0ab35624431 100644 --- a/src/chk/chk.pb-c.h +++ b/src/chk/chk.pb-c.h @@ -196,7 +196,11 @@ typedef enum _Chk__CheckInconsistAction { /* * Trust EC data shard. */ - CHK__CHECK_INCONSIST_ACTION__CIA_TRUST_EC_DATA = 12 + CHK__CHECK_INCONSIST_ACTION__CIA_TRUST_EC_DATA = 12, + /* + * Stale unresolved interaction. The checker can no longer address this report without re-running on affected pool. + */ + CHK__CHECK_INCONSIST_ACTION__CIA_STALE = 65535 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(CHK__CHECK_INCONSIST_ACTION) } Chk__CheckInconsistAction; /* diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index 0e270926e79..21d0a8ceba7 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -291,7 +291,7 @@ chk_pending_free(struct btr_instance *tins, struct btr_record *rec, void *args) ABT_mutex_unlock(cpr->cpr_mutex); } else { ABT_mutex_unlock(cpr->cpr_mutex); - chk_pending_destroy(cpr); + chk_pending_destroy(NULL, cpr); } } @@ -930,6 +930,27 @@ chk_pool_shard_cleanup(struct chk_instance *ins) } } +int +chk_pending_lookup(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec **cpr) +{ + d_iov_t kiov; + d_iov_t riov; + int rc; + + d_iov_set(&riov, NULL, 0); + d_iov_set(&kiov, &seq, sizeof(seq)); + + ABT_rwlock_rdlock(ins->ci_abt_lock); + rc = dbtree_lookup(ins->ci_pending_hdl, &kiov, &riov); + ABT_rwlock_unlock(ins->ci_abt_lock); + if (rc == 0) + *cpr = (struct chk_pending_rec *)riov.iov_buf; + else + *cpr = NULL; + + return rc; +} + int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, uint64_t seq, uint32_t rank, uint32_t cla, uint32_t option_nr, uint32_t *options, @@ -985,12 +1006,14 @@ 
chk_pending_del(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec * d_iov_set(&kiov, &seq, sizeof(seq)); ABT_rwlock_wrlock(ins->ci_abt_lock); - rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, &riov); + rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, cpr == NULL ? NULL : &riov); ABT_rwlock_unlock(ins->ci_abt_lock); - if (rc == 0) - *cpr = (struct chk_pending_rec *)riov.iov_buf; - else - *cpr = NULL; + if (cpr != NULL) { + if (rc == 0) + *cpr = (struct chk_pending_rec *)riov.iov_buf; + else + *cpr = NULL; + } D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG, "Del pending record with gen "DF_X64", seq "DF_X64": "DF_RC"\n", @@ -1028,29 +1051,13 @@ chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr) ABT_mutex_unlock(cpr->cpr_mutex); } else { ABT_mutex_unlock(cpr->cpr_mutex); - chk_pending_destroy(cpr); + chk_pending_destroy(ins, cpr); } } return rc; } -void -chk_pending_destroy(struct chk_pending_rec *cpr) -{ - D_ASSERT(d_list_empty(&cpr->cpr_pool_link)); - D_ASSERT(d_list_empty(&cpr->cpr_rank_link)); - D_ASSERT(d_list_empty(&cpr->cpr_ins_link)); - - if (cpr->cpr_cond != ABT_COND_NULL) - ABT_cond_free(&cpr->cpr_cond); - - if (cpr->cpr_mutex != ABT_MUTEX_NULL) - ABT_mutex_free(&cpr->cpr_mutex); - - D_FREE(cpr); -} - int chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_property *prop) { @@ -1073,8 +1080,7 @@ chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_p } int -chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, +chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop) { int rc = 0; @@ -1086,11 +1092,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT; if (flags & CHK__CHECK_FLAG__CF_NO_AUTO) prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO; - prop->cp_flags 
|= flags & ~(CHK__CHECK_FLAG__CF_RESET | - CHK__CHECK_FLAG__CF_ORPHAN_POOL | - CHK__CHECK_FLAG__CF_NO_FAILOUT | - CHK__CHECK_FLAG__CF_NO_AUTO); - prop->cp_phase = phase; + prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL | + CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO); if (ranks != NULL) prop->cp_rank_nr = ranks->rl_nr; @@ -1234,6 +1237,15 @@ chk_ins_merge_info(uint32_t *status_dst, uint32_t status_src, uint32_t *phase_ds *status_dst = status_src; } +void +chk_ins_cleanup(struct chk_instance *ins) +{ + chk_stop_sched(ins); + ins->ci_inited = 0; + + chk_iv_ns_destroy(ins); +} + int chk_ins_init(struct chk_instance **p_ins) { @@ -1246,7 +1258,8 @@ chk_ins_init(struct chk_instance **p_ins) if (ins == NULL) D_GOTO(out_init, rc = -DER_NOMEM); - ins->ci_sched = ABT_THREAD_NULL; + ins->ci_sched = ABT_THREAD_NULL; + ins->ci_dead_rank_ult = ABT_THREAD_NULL; ins->ci_rank_hdl = DAOS_HDL_INVAL; D_INIT_LIST_HEAD(&ins->ci_rank_list); @@ -1300,11 +1313,8 @@ chk_ins_fini(struct chk_instance **p_ins) if (ins == NULL) return; - ins->ci_inited = 0; - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) - crt_group_secondary_destroy(ins->ci_iv_group); + D_ASSERT(ins->ci_iv_ns == NULL); + D_ASSERT(ins->ci_iv_group == NULL); d_rank_list_free(ins->ci_ranks); D_ASSERT(d_list_empty(&ins->ci_dead_ranks)); @@ -1321,6 +1331,8 @@ chk_ins_fini(struct chk_instance **p_ins) D_ASSERT(d_list_empty(&ins->ci_interaction_filter_list)); D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list)); + D_ASSERT(ins->ci_dead_rank_ult == ABT_THREAD_NULL); + if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index a0ab820d9bd..b7374acb806 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -225,8 +225,7 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc = %d\n", DP_ENGINE(ins), dss_self_rank(), ins_status, rc); @@ -247,6 +246,7 @@ chk_engine_post_repair(struct chk_pool_rec *cpr, int *result, bool update) *result = 0; if (*result != 0) { + chk_ins_set_fail(cpr->cpr_ins, cbk->cb_phase); if (cpr->cpr_ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT) { cbk->cb_time.ct_stop_time = time(NULL); cbk->cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_FAILED; @@ -1205,10 +1205,13 @@ chk_engine_cont_target_label_empty(struct chk_cont_rec *ccr) static inline bool chk_engine_cont_cs_label_empty(struct chk_cont_rec *ccr) { - if (daos_iov_empty(&ccr->ccr_label_cs)) + d_iov_t *label = &ccr->ccr_label_cs; + + if (daos_iov_empty(label)) return true; - if (strncmp(DAOS_PROP_NO_CO_LABEL, ccr->ccr_label_cs.iov_buf, DAOS_PROP_LABEL_MAX_LEN) == 0) + if (strlen(DAOS_PROP_NO_CO_LABEL) == label->iov_len && + strncmp(DAOS_PROP_NO_CO_LABEL, label->iov_buf, label->iov_len) == 0) return true; return false; @@ -1580,8 +1583,8 @@ chk_engine_cont_label_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg ccr = riov.iov_buf; if (ccr->ccr_label_prop == NULL || - strncmp(key->iov_buf, ccr->ccr_label_prop->dpp_entries[0].dpe_str, - DAOS_PROP_LABEL_MAX_LEN) != 0) + key->iov_len != strlen(ccr->ccr_label_prop->dpp_entries[0].dpe_str) || + strncmp(key->iov_buf, ccr->ccr_label_prop->dpp_entries[0].dpe_str, key->iov_len) 
!= 0) rc = daos_iov_copy(&ccr->ccr_label_cs, key); else ccr->ccr_label_checked = 1; @@ -1708,8 +1711,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) * to all engines. Otherwise, the engine out of the pool map cannot get * the notification. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER, - true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify pool shards for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -1721,8 +1723,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) iv.ci_from_psl = 0; iv.ci_to_leader = 1; /* Synchronously notify the check leader with the new check status/phase. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify check leader for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -1981,7 +1982,7 @@ chk_engine_sched(void *args) D_GOTO(out, rc); } - if (ins_phase > cbk->cb_phase) { + if (ins_phase != CHK_INVAL_PHASE && ins_phase > cbk->cb_phase) { D_INFO(DF_ENGINE" on rank %u moves from phase %u to phase %u\n", DP_ENGINE(ins), myrank, cbk->cb_phase, ins_phase); @@ -2045,9 +2046,8 @@ chk_engine_sched(void *args) static int chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint64_t gen, int phase, uint32_t api_flags, - d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint64_t gen, uint32_t api_flags, d_rank_t leader, uint32_t flags) { struct chk_traverse_pools_args ctpa = { 0 }; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2134,8 +2134,7 @@ chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, 
d_rank_t *rank init: if (!chk_is_on_leader(gen, leader, true)) { - rc = chk_prop_prepare(leader, api_flags, phase, policy_nr, policies, rank_list, - prop); + rc = chk_prop_prepare(leader, api_flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2263,16 +2262,15 @@ chk_engine_pool_filter(uuid_t uuid, void *arg, int *phase) int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], uint32_t api_flags, - int phase, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, + uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues) { - struct chk_instance *ins = chk_engine; - struct chk_bookmark *cbk = &ins->ci_bk; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - int rc; - int rc1; + struct chk_instance *ins = chk_engine; + struct chk_bookmark *cbk = &ins->ci_bk; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2294,12 +2292,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; @@ -2313,27 +2306,20 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (rc != 0) goto out_tree; - rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, - pool_nr, pools, gen, phase, api_flags, leader, flags); + rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, gen, + api_flags, leader, flags); if (rc != 0) goto out_tree; if (chk_is_on_leader(gen, leader, true)) { ins->ci_iv_ns = chk_leader_get_iv_ns(); - if 
(unlikely(ins->ci_iv_ns == NULL)) - goto out_tree; + D_ASSERT(ins->ci_iv_ns != NULL); + + ins->ci_ns_ver = ns_ver; } else { - uuid_unparse_lower(iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, iv_uuid, leader, ns_ver); if (rc != 0) goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); } uuid_copy(cbk->cb_iv_uuid, iv_uuid); @@ -2345,6 +2331,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (rc != 0) goto out_stop; + ins->ci_pause = 0; ins->ci_sched_running = 1; rc = dss_ult_create(chk_engine_sched, ins, DSS_XS_SYS, 0, DSS_DEEP_STACK_SZ, @@ -2366,12 +2353,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic D_WARN(DF_ENGINE" failed to update engine bookmark: "DF_RC"\n", DP_ENGINE(ins), DP_RC(rc1)); } - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); @@ -2379,17 +2361,18 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic ins->ci_starting = 0; out_log: if (rc >= 0) { - D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, phase %d, leader %u, " - "flags %x, iv "DF_UUIDF": rc %d\n", + D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, ns_ver %d, leader %u, " + "flags %x, iv " DF_UUIDF ": rc %d\n", DP_ENGINE(ins), chk_is_ins_reset(ins, api_flags) ? 
"start" : "resume", - myrank, api_flags, phase, leader, flags, DP_UUID(iv_uuid), rc); + myrank, api_flags, ns_ver, leader, flags, DP_UUID(iv_uuid), rc); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, pool_nr, pools); } else { - D_ERROR(DF_ENGINE" failed to start on rank %u with %d pools, api_flags %x, " - "phase %d, leader %u, flags %x, gen "DF_X64", iv "DF_UUIDF": "DF_RC"\n", - DP_ENGINE(ins), myrank, pool_nr, api_flags, phase, leader, flags, gen, + D_ERROR(DF_ENGINE " failed to start on rank %u with %d pools, api_flags %x, " + "ns_ver %d, leader %u, flags %x, gen " DF_X64 ", iv " DF_UUIDF + ": " DF_RC "\n", + DP_ENGINE(ins), myrank, pool_nr, api_flags, ns_ver, leader, flags, gen, DP_UUID(iv_uuid), DP_RC(rc)); } @@ -2407,13 +2390,15 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags) int i; int active = false; + CHK_IS_READY(ins); + if (gen != 0 && gen != cbk->cb_gen) D_GOTO(log, rc = -DER_NOTAPPLICABLE); if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) D_GOTO(log, rc = -DER_NOTAPPLICABLE); - if (ins->ci_starting) + if (ins->ci_starting || ins->ci_rejoining) D_GOTO(log, rc = -DER_BUSY); if (ins->ci_stopping || ins->ci_sched_exiting) @@ -2577,7 +2562,7 @@ chk_engine_query_pool(uuid_t uuid, void *args) coll_ops.co_func = chk_engine_query_one; coll_args.ca_func_args = shard; - rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_ULT_DEEP_STACK); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG, @@ -2596,6 +2581,8 @@ chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *ins_status int rc = 0; int i; + CHK_IS_READY(ins); + /* * We will support to check query from new check leader under the case of old leader * crashed, that may have different check generation. 
So do not check "cb_gen" here, @@ -2642,32 +2629,46 @@ chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *ins_status int chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - d_rank_list_t *rank_list = NULL; - int rc = 0; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + int rc = 0; + + CHK_IS_READY(ins); if (cbk->cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - rc = chk_prop_fetch(prop, &rank_list); - if (rc != 0) - goto out; + /* For check engine on the leader, reload rank list that has been refreshed by leader. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = NULL; + } - D_ASSERT(rank_list != NULL); + if (ins->ci_ranks == NULL) { + rc = chk_prop_fetch(prop, &ins->ci_ranks); + if (rc != 0) + goto out; - /* For check engine on the leader, related rank has already been marked as "dead". */ - if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) - goto group; + /* For check engine on the leader, it's done. 
*/ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + ins->ci_ns_ver = version; + goto out; + } + } + + if (unlikely(ins->ci_ranks == NULL)) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!chk_remove_rank_from_list(rank_list, rank)) + if (!chk_remove_rank_from_list(ins->ci_ranks, rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; - rc = chk_prop_update(prop, rank_list); + rc = chk_prop_update(prop, ins->ci_ranks); if (rc != 0) - goto out; + ins->ci_skip_oog = 1; + else + rc = chk_iv_ns_update(ins, version); /* * NOTE: If the rank dead before DAOS check start, then subsequent check start will @@ -2688,19 +2689,7 @@ chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) * sometime later as the DAOS check going. */ -group: - if (ins->ci_iv_group != NULL) - rc = crt_group_secondary_modify(ins->ci_iv_group, rank_list, rank_list, - CRT_GROUP_MOD_OP_REPLACE, version); - out: - if (rc == 0) { - d_rank_list_free(ins->ci_ranks); - ins->ci_ranks = rank_list; - rank_list = NULL; - } - - d_rank_list_free(rank_list); if (rc != -DER_NOTAPPLICABLE) D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u mark rank %u as dead with gen " @@ -2745,6 +2734,8 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t act) struct chk_instance *ins = chk_engine; int rc; + CHK_IS_READY(ins); + if (ins->ci_bk.cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); @@ -2874,6 +2865,8 @@ chk_engine_cont_list(uint64_t gen, uuid_t pool_uuid, uuid_t **conts, uint32_t *c int i = 0; int rc = 0; + CHK_IS_READY(ins); + if (cbk->cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); @@ -2929,6 +2922,8 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) d_iov_t kiov; int rc; + CHK_IS_READY(ins); + if (ins->ci_bk.cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING) D_GOTO(out, rc = -DER_SHUTDOWN); @@ -3047,6 +3042,8 @@ chk_engine_pool_mbs(uint64_t gen, uuid_t uuid, uint32_t phase, const char *label int rc; int i; + 
CHK_IS_READY(ins); + if (ins->ci_bk.cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING) D_GOTO(out, rc = -DER_SHUTDOWN); @@ -3158,6 +3155,8 @@ chk_engine_set_policy(uint64_t gen, uint32_t policy_nr, struct chk_policy *polic struct chk_property *prop = &ins->ci_prop; int rc = 0; + CHK_IS_READY(ins); + /* Do nothing if no (engine) check instance is running. */ if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE || cbk->cb_gen != gen || cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING) @@ -3182,13 +3181,12 @@ chk_engine_set_policy(uint64_t gen, uint32_t policy_nr, struct chk_policy *polic static int chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) { - struct chk_instance *ins = chk_engine; - struct chk_pending_rec *cpr = NULL; - struct chk_pending_rec *tmp = NULL; - struct chk_pool_rec *pool = NULL; - d_iov_t kiov; - d_iov_t riov; - int rc; + struct chk_instance *ins = chk_engine; + struct chk_pending_rec *cpr = NULL; + struct chk_pool_rec *pool = NULL; + d_iov_t kiov; + d_iov_t riov; + int rc; D_ASSERT(cru->cru_pool != NULL); @@ -3225,14 +3223,9 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) cru->cru_detail_nr, cru->cru_details, *seq); if (unlikely(rc == -DER_AGAIN)) { D_ASSERT(cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT); + D_ASSERT(cpr != NULL); - rc = chk_pending_del(ins, *seq, &tmp); - if (rc == 0) - D_ASSERT(tmp == NULL); - else if (rc != -DER_NONEXIST) - goto log; - - chk_pending_destroy(cpr); + chk_pending_destroy(ins, cpr); cpr = NULL; goto new_seq; @@ -3278,11 +3271,12 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto again; out: - if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING) - pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; - if (cpr != NULL) - chk_pending_destroy(cpr); + chk_pending_destroy(ins, cpr); + + if (pool != NULL && pool->cpr_bk.cb_pool_status == 
CHK__CHECK_POOL_STATUS__CPS_PENDING && + d_list_empty(&pool->cpr_pending_list)) + pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; return rc; } @@ -3295,6 +3289,8 @@ chk_engine_notify(struct chk_iv *iv) struct chk_pool_rec *cpr; int rc = 0; + CHK_IS_READY(ins); + if (cbk->cb_gen != iv->ci_gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); @@ -3364,19 +3360,19 @@ chk_engine_notify(struct chk_iv *iv) void chk_engine_rejoin(void *args) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *pools = NULL; - struct chk_iv iv = { 0 }; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - uint32_t pool_nr = 0; - uint32_t flags = 0; - int rc = 0; - int rc1; - bool need_join = false; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + d_rank_list_t *ranks = NULL; + uuid_t *pools = NULL; + struct chk_iv iv = {0}; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + uint32_t pool_nr = 0; + uint32_t flags = 0; + int rc = 0; + int rc1; + bool need_join = false; if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) goto out_log; @@ -3385,7 +3381,7 @@ chk_engine_rejoin(void *args) cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_PAUSED) goto out_log; - /* We do NOT support leader (and its associated engine ) to rejoin former check instance. */ + /* We do NOT support leader (and its associated engine) to rejoin former check instance. 
*/ if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) goto out_log; @@ -3420,22 +3416,10 @@ chk_engine_rejoin(void *args) if (rc != 0) goto out_tree; - uuid_unparse_lower(cbk->cb_iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); - if (rc != 0) - goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, cbk->cb_iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, prop->cp_leader, ins->ci_iv_ns->iv_master_term + 1); - again: /* Ask leader whether this engine can rejoin or not. */ rc = chk_rejoin_remote(prop->cp_leader, cbk->cb_gen, myrank, cbk->cb_iv_uuid, &flags, - &pool_nr, &pools); + &ins->ci_ns_ver, &pool_nr, &pools, &ranks); if (rc != 0) { if ((rc == -DER_OOG || rc == -DER_GRPVER) && !ins->ci_pause) { D_INFO(DF_ENGINE" Someone is not ready %d, let's rejoin after 1 sec\n", @@ -3445,14 +3429,22 @@ chk_engine_rejoin(void *args) goto again; } - goto out_iv; + goto out_tree; } - if (pool_nr == 0) { + if (ranks == NULL || pool_nr == 0) { need_join = false; - D_GOTO(out_iv, rc = 1); + D_GOTO(out_tree, rc = 1); } + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = ranks; + ranks = NULL; + + rc = chk_iv_ns_create(ins, cbk->cb_iv_uuid, prop->cp_leader, ins->ci_ns_ver); + if (rc != 0) + goto out_tree; + rc = chk_pools_load_list(ins, cbk->cb_gen, 0, pool_nr, pools, NULL); if (rc != 0) goto out_notify; @@ -3492,51 +3484,32 @@ chk_engine_rejoin(void *args) iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. 
*/ - rc1 = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE, true); + rc1 = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc1 != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc1 = %d\n", DP_ENGINE(ins), myrank, cbk->cb_ins_status, rc1); -out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); out_log: + d_rank_list_free(ranks); + D_FREE(pools); if (need_join) D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" rejoin on rank %u with iv "DF_UUIDF": "DF_RC"\n", DP_ENGINE(ins), myrank, DP_UUID(cbk->cb_iv_uuid), DP_RC(rc)); ins->ci_rejoining = 0; - ins->ci_starting = 0; - ins->ci_inited = 1; -} - -void -chk_engine_pause(void) -{ - struct chk_instance *ins = chk_engine; - - chk_stop_sched(ins); - D_ASSERT(d_list_empty(&ins->ci_pool_list)); + ins->ci_starting = 0; } int -chk_engine_init(void) +chk_engine_setup(void) { - struct chk_traverse_pools_args ctpa = { 0 }; - struct chk_bookmark *cbk; - int rc; - - rc = chk_ins_init(&chk_engine); - if (rc != 0) - goto fini; - - chk_report_seq_init(chk_engine); + struct chk_instance *ins = chk_engine; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_traverse_pools_args ctpa = {0}; + int rc; /* * DAOS global consistency check depends on all related engines' local @@ -3545,7 +3518,8 @@ chk_engine_init(void) * related local inconsistency firstly. 
*/ - cbk = &chk_engine->ci_bk; + chk_report_seq_init(ins); + rc = chk_bk_fetch_engine(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3569,37 +3543,45 @@ chk_engine_init(void) cbk->cb_time.ct_stop_time = time(NULL); rc = chk_bk_update_engine(cbk); if (rc != 0) { - D_ERROR(DF_ENGINE" failed to reset status as 'PAUSED': "DF_RC"\n", - DP_ENGINE(chk_engine), DP_RC(rc)); + D_ERROR(DF_ENGINE " failed to reset status as 'PAUSED': " DF_RC "\n", + DP_ENGINE(ins), DP_RC(rc)); goto fini; } ctpa.ctpa_gen = cbk->cb_gen; - ctpa.ctpa_ins = chk_engine; + ctpa.ctpa_ins = ins; rc = chk_traverse_pools(chk_pools_pause_cb, &ctpa); /* * Failed to reset pool status will not affect next check start, so it is not fatal, * but related check query result may be confused for user. */ if (rc != 0) - D_WARN(DF_ENGINE" failed to reset pools status as 'PAUSED': "DF_RC"\n", - DP_ENGINE(chk_engine), DP_RC(rc)); + D_WARN(DF_ENGINE " failed to reset pools status as 'PAUSED': " DF_RC "\n", + DP_ENGINE(ins), DP_RC(rc)); } prop: - rc = chk_prop_fetch(&chk_engine->ci_prop, &chk_engine->ci_ranks); + rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); if (rc == -DER_NONEXIST) rc = 0; + if (rc == 0) { + ins->ci_inited = 1; + ins->ci_pause = 0; + } + fini: if (rc != 0) - chk_ins_fini(&chk_engine); + chk_engine_cleanup(); return rc; } void -chk_engine_fini(void) +chk_engine_cleanup(void) { - chk_ins_fini(&chk_engine); + struct chk_instance *ins = chk_engine; + + chk_ins_cleanup(ins); + D_ASSERT(d_list_empty(&ins->ci_pool_list)); } int @@ -3609,6 +3591,8 @@ chk_engine_pool_stop(uuid_t pool_uuid, bool destroy) uint32_t phase; int rc = 0; + CHK_IS_READY(chk_engine); + if (destroy) { status = CHK__CHECK_POOL_STATUS__CPS_CHECKED; phase = CHK__CHECK_SCAN_PHASE__CSP_DONE; @@ -3624,3 +3608,15 @@ chk_engine_pool_stop(uuid_t pool_uuid, bool destroy) return rc; } + +int +chk_engine_init(void) +{ + return chk_ins_init(&chk_engine); +} + +void +chk_engine_fini(void) +{ + chk_ins_fini(&chk_engine); +} diff --git 
a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 798154b2b2a..276f7121c51 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -76,6 +76,7 @@ struct chk_pool_mbs { uint32_t *cpm_tgt_status; }; +/* clang-format off */ /* * CHK_START: * From check leader to check engine to start the check instance on specified pool(s) or all pools. @@ -83,7 +84,7 @@ struct chk_pool_mbs { #define DAOS_ISEQ_CHK_START \ ((uint64_t) (csi_gen) CRT_VAR) \ ((uint32_t) (csi_flags) CRT_VAR) \ - ((int32_t) (csi_phase) CRT_VAR) \ + ((int32_t) (csi_ns_ver) CRT_VAR) \ ((d_rank_t) (csi_leader_rank) CRT_VAR) \ ((uint32_t) (csi_api_flags) CRT_VAR) \ ((uuid_t) (csi_iv_uuid) CRT_VAR) \ @@ -272,11 +273,13 @@ CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); #define DAOS_OSEQ_CHK_REJOIN \ ((int32_t) (cro_status) CRT_VAR) \ ((uint32_t) (cro_flags) CRT_VAR) \ + ((uint32_t) (cro_ns_ver) CRT_VAR) \ + ((uint32_t) (cro_padding) CRT_VAR) \ + ((d_rank_t) (cro_ranks) CRT_ARRAY) \ ((uuid_t) (cro_pools) CRT_ARRAY) CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN); -/* clang-format off */ /* * CHK_SET_POLICY: * From check leader to check engine to set policy during check instance running. @@ -501,16 +504,12 @@ struct chk_bookmark { * 'reset' for all pools. */ struct chk_property { - d_rank_t cp_leader; - Chk__CheckFlag cp_flags; - Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; - /* - * NOTE: Preserve for supporting to continue the check until the specified phase in the - * future. -1 means to check all phases. 
- */ - int32_t cp_phase; + d_rank_t cp_leader; + Chk__CheckFlag cp_flags; + Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; + uint32_t cp_padding; /* How many ranks (ever or should) take part in the check instance. */ - uint32_t cp_rank_nr; + uint32_t cp_rank_nr; }; /* @@ -555,6 +554,7 @@ struct chk_instance { d_list_t ci_dead_ranks; ABT_thread ci_sched; + ABT_thread ci_dead_rank_ult; ABT_rwlock ci_abt_lock; ABT_mutex ci_abt_mutex; ABT_cond ci_abt_cond; @@ -562,20 +562,12 @@ struct chk_instance { /* Generator for report event, pending repair actions, and so on. */ uint64_t ci_seq; - uint32_t ci_is_leader:1, - ci_sched_running:1, - ci_sched_exiting:1, - ci_for_orphan:1, - ci_orphan_done:1, /* leader has processed orphan pools. */ - ci_pool_stopped:1, /* check on some pools have been stopped. */ - ci_starting:1, - ci_stopping:1, - ci_started:1, - ci_inited:1, - ci_pause:1, - ci_rejoining:1, - ci_implicated:1; - uint32_t ci_start_flags; + uint32_t ci_is_leader : 1, ci_sched_running : 1, ci_sched_exiting : 1, ci_for_orphan : 1, + ci_orphan_done : 1, ci_pool_stopped : 1, /* check on some pools have been stopped. 
*/ + ci_starting : 1, ci_stopping : 1, ci_started : 1, ci_inited : 1, ci_pause : 1, + ci_skip_oog : 1, ci_rejoining : 1, ci_implicated : 1; + uint32_t ci_start_flags; + uint32_t ci_ns_ver; }; struct chk_iv { @@ -750,6 +742,8 @@ int chk_pool_add_shard(daos_handle_t hdl, d_list_t *head, uuid_t uuid, d_rank_t void chk_pool_shard_cleanup(struct chk_instance *ins); +int chk_pending_lookup(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec **cpr); + int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, uint64_t seq, uint32_t rank, uint32_t cla, uint32_t option_nr, uint32_t *options, struct chk_pending_rec **cpr); @@ -758,19 +752,18 @@ int chk_pending_del(struct chk_instance *ins, uint64_t seq, struct chk_pending_r int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr); -void chk_pending_destroy(struct chk_pending_rec *cpr); - int chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_property *prop); -int chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, - d_rank_list_t *ranks, struct chk_property *prop); +int chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, + struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop); uint32_t chk_pool_merge_status(uint32_t status_a, uint32_t status_b); void chk_ins_merge_info(uint32_t *status_dst, uint32_t status_src, uint32_t *phase_dst, uint32_t phase_src, uint64_t *gen_dst, uint64_t gen_src); +void chk_ins_cleanup(struct chk_instance *ins); + int chk_ins_init(struct chk_instance **p_ins); void chk_ins_fini(struct chk_instance **p_ins); @@ -779,7 +772,7 @@ void chk_ins_fini(struct chk_instance **p_ins); int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, + uuid_t pools[], uint32_t 
api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues); int chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags); @@ -806,7 +799,9 @@ int chk_engine_notify(struct chk_iv *iv); void chk_engine_rejoin(void *args); -void chk_engine_pause(void); +int chk_engine_setup(void); + +void chk_engine_cleanup(void); int chk_engine_init(void); @@ -814,7 +809,14 @@ void chk_engine_fini(void); /* chk_iv.c */ -int chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry); +void chk_iv_ns_destroy(struct chk_instance *ins); + +int chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver); + +int chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver); + +int chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, + uint32_t sync_mode); int chk_iv_init(void); @@ -830,10 +832,12 @@ int chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) int chk_leader_notify(struct chk_iv *iv); -int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools); +int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, + uint32_t *ns_ver, int *pool_nr, uuid_t **pools, d_rank_list_t **ranks); + +int chk_leader_setup(void); -void chk_leader_pause(void); +void chk_leader_cleanup(void); int chk_leader_init(void); @@ -843,8 +847,8 @@ void chk_leader_fini(void); int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, - uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, + uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); int chk_stop_remote(d_rank_list_t *rank_list, 
uint64_t gen, int pool_nr, uuid_t pools[], chk_co_rpc_cb_t stop_cb, void *args); @@ -873,7 +877,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq); int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t *pool_nr, uuid_t **pools); + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks); int chk_set_policy_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t policy_nr, struct chk_policy *policies); @@ -912,9 +916,16 @@ int chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list); int chk_traverse_pools(sys_db_trav_cb_t cb, void *args); -void chk_vos_init(void); +void chk_vos_setup(void); + +void chk_vos_cleanup(void); + +#define CHK_IS_READY(ins) \ + do { \ + if (unlikely((ins)->ci_inited == 0)) \ + return -DER_UNINIT; \ + } while (0) -void chk_vos_fini(void); /* clang-format on */ static inline bool @@ -933,41 +944,24 @@ chk_ins_set_fail(struct chk_instance *ins, uint32_t phase) static inline bool chk_rank_in_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - break; - } - } - - return found; + return d_rank_list_bsearch(rlist, rank, NULL); } static inline bool chk_remove_rank_from_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - rlist->rl_nr--; - /* The leader rank will always be in the rank list. 
*/ - D_ASSERT(rlist->rl_nr > 0); - - if (i < rlist->rl_nr) - memmove(&rlist->rl_ranks[i], &rlist->rl_ranks[i + 1], - sizeof(rlist->rl_ranks[i]) * (rlist->rl_nr - i)); - break; - } + int idx = -1; + bool found = false; + + if (d_rank_list_bsearch(rlist, rank, &idx)) { + D_ASSERT(rlist->rl_nr > 0); + D_ASSERT(idx >= 0); + + rlist->rl_nr--; + if (idx < rlist->rl_nr) + memmove(&rlist->rl_ranks[idx], &rlist->rl_ranks[idx + 1], + sizeof(rlist->rl_ranks[idx]) * (rlist->rl_nr - idx)); + found = true; } return found; @@ -992,6 +986,26 @@ chk_destroy_tree(daos_handle_t *toh, struct btr_root *root) } } +static inline void +chk_pending_destroy(struct chk_instance *ins, struct chk_pending_rec *cpr) +{ + if (d_list_empty(&cpr->cpr_pool_link)) { + D_ASSERT(d_list_empty(&cpr->cpr_rank_link)); + D_ASSERT(d_list_empty(&cpr->cpr_ins_link)); + + if (cpr->cpr_cond != ABT_COND_NULL) + ABT_cond_free(&cpr->cpr_cond); + + if (cpr->cpr_mutex != ABT_MUTEX_NULL) + ABT_mutex_free(&cpr->cpr_mutex); + + D_FREE(cpr); + } else { + cpr->cpr_busy = 0; + chk_pending_del(ins, cpr->cpr_seq, NULL); + } +} + static inline void chk_destroy_pending_tree(struct chk_instance *ins) { @@ -1019,17 +1033,6 @@ chk_query_free(struct chk_query_pool_shard *shards, uint32_t shard_nr) } } -static inline void -chk_iv_ns_cleanup(struct ds_iv_ns **ns) -{ - if (*ns != NULL) { - if ((*ns)->iv_refcount == 1) - ds_iv_ns_cleanup(*ns); - ds_iv_ns_put(*ns); - *ns = NULL; - } -} - static inline void chk_pool_get(struct chk_pool_rec *cpr) { @@ -1173,6 +1176,14 @@ chk_pools_find_slowest(struct chk_instance *ins, int *done) phase = cpr->cpr_bk.cb_phase; } + /* All pools have been done, some check engines are still running, leader needs to wait. 
*/ + if (ins->ci_orphan_done && *done > 0 && !d_list_empty(&ins->ci_rank_list)) { + D_ASSERT(ins->ci_is_leader); + + phase = CHK_INVAL_PHASE; + *done = 0; + } + return phase; } @@ -1220,13 +1231,15 @@ chk_stop_sched(struct chk_instance *ins) static inline int chk_ins_can_start(struct chk_instance *ins) { - if (unlikely(!ins->ci_inited)) + CHK_IS_READY(ins); + + if (!ins->ci_is_leader && ins->ci_rejoining) return -DER_AGAIN; if (ins->ci_starting) return -DER_INPROGRESS; - if (ins->ci_stopping || ins->ci_sched_exiting) + if (ins->ci_stopping || ins->ci_sched_exiting || ins->ci_rejoining) return -DER_BUSY; if (ins->ci_sched_running) diff --git a/src/chk/chk_iv.c b/src/chk/chk_iv.c index 299c1554856..467648520bb 100644 --- a/src/chk/chk_iv.c +++ b/src/chk/chk_iv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -181,13 +182,76 @@ struct ds_iv_class_ops chk_iv_ops = { .ivc_value_alloc = chk_iv_value_alloc, }; +void +chk_iv_ns_destroy(struct chk_instance *ins) +{ + if (ins->ci_iv_ns != NULL) { + if (ins->ci_iv_ns->iv_refcount == 1) + ds_iv_ns_cleanup(ins->ci_iv_ns); + ds_iv_ns_put(ins->ci_iv_ns); + ins->ci_iv_ns = NULL; + } + + if (ins->ci_iv_group != NULL) { + crt_group_secondary_destroy(ins->ci_iv_group); + ins->ci_iv_group = NULL; + } +} + int -chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry) +chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver) { - d_sg_list_t sgl; - d_iov_t iov; - struct ds_iv_key key; - int rc; + char uuid_str[DAOS_UUID_STR_SIZE]; + int rc; + + uuid_unparse_lower(uuid, uuid_str); + rc = crt_group_secondary_create(uuid_str, NULL, NULL, &ins->ci_iv_group); + if (rc != 0) + goto out; + + rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, uuid, ins->ci_iv_group, &ins->ci_iv_id, + &ins->ci_iv_ns); + if (rc != 0) + goto out; + + rc 
= chk_iv_ns_update(ins, ns_ver); + if (rc == 0) { + ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); + ins->ci_skip_oog = 0; + } + +out: + if (rc != 0) + chk_iv_ns_destroy(ins); + return rc; +} + +int +chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver) +{ + int rc; + + /* Let secondary rank == primary rank. */ + rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, + CRT_GROUP_MOD_OP_REPLACE, ns_ver); + if (rc == 0) + ins->ci_ns_ver = ns_ver; + else + ins->ci_skip_oog = 1; + + return rc; +} + +int +chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode) +{ + d_sg_list_t sgl; + d_iov_t iov; + struct ds_iv_key key; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; + int rc; iv->ci_rank = dss_self_rank(); iv->ci_seq = d_hlc_get(); @@ -208,9 +272,41 @@ chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode memset(&key, 0, sizeof(key)); key.class_id = IV_CHK; - rc = ds_iv_update(ns, &key, &sgl, shortcut, sync_mode, 0, retry); + +again: + try_cnt++; + ver = ins->ci_ns_ver; + rc = ds_iv_update(ins->ci_iv_ns, &key, &sgl, shortcut, sync_mode, 0, true); + if (likely(rc != -DER_OOG)) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("CHK iv " DF_X64 "/" DF_X64 " retry because of -DER_OOG for more " + "than %d times.\n", + iv->ci_gen, iv->ci_seq, try_cnt); + + /* Wait chk_deak_rank_ult to sync the IV namespace. */ + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) { + D_WARN("CHK iv " DF_X64 "/" DF_X64 " is blocked because of DER_OOG " + "for %d seconds.\n", + iv->ci_gen, iv->ci_seq, wait_cnt / 2); + /* + * Let's retry IV in case of related dead rank recovered back before + * being handled by chk_dead_rank_ult, although it is rare. 
+ */ + break; + } + } + + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + goto again; } +out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, "CHK iv "DF_X64"/"DF_X64" on rank %u, phase %u, ins_status %u, " "pool_status %u, to_leader %s, from_psl %s: rc = %d\n", diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index 261924e2135..18be52d0ac0 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -226,9 +226,9 @@ static void chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_status, uint32_t pool_status, bool bcast) { - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_iv iv = { 0 }; - int rc = 0; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_iv iv = {0}; + int rc = 0; ins->ci_sched_exiting = 1; @@ -237,15 +237,13 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu chk_pool_stop_all(ins, pool_status, NULL); if ((bcast && ins_status == CHK__CHECK_INST_STATUS__CIS_FAILED) || - ins_status == CHK__CHECK_INST_STATUS__CIS_IMPLICATED || - unlikely(ins_status == CHK__CHECK_INST_STATUS__CIS_COMPLETED && !ins->ci_orphan_done)) { + ins_status == CHK__CHECK_INST_STATUS__CIS_IMPLICATED || !ins->ci_orphan_done) { iv.ci_gen = cbk->cb_gen; iv.ci_phase = ins_phase != CHK_INVAL_PHASE ? ins_phase : cbk->cb_phase; iv.ci_ins_status = ins_status; /* Synchronously notify the engines that the check leader exit. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify the engines its exit, status %u: rc = %d\n", DP_LEADER(ins), ins_status, rc); @@ -305,31 +303,23 @@ chk_leader_post_repair(struct chk_instance *ins, struct chk_pool_rec *cpr, DP_UUID(cpr->cpr_uuid), rc); } - /* - * If the operation failed and 'failout' is set, then do nothing here. - * chk_leader_exit will handle all the IV and bookmark related things. - */ - if (*result == 0 || !(ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT)) { - if (notify) { - iv.ci_gen = cbk->cb_gen; - uuid_copy(iv.ci_uuid, cpr->cpr_uuid); - iv.ci_ins_status = ins->ci_bk.cb_ins_status; - iv.ci_phase = cbk->cb_phase; - iv.ci_pool_status = cbk->cb_pool_status; - - /* Synchronously notify the engines that check on the pool got failure. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); - D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, - DF_LEADER" notify the engines that the check for pool " - DF_UUIDF" is done with status %u: rc = %d\n", - DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), iv.ci_pool_status, rc); - if (rc == 0) - cpr->cpr_notified_exit = 1; - } + if (notify) { + uuid_copy(iv.ci_uuid, cpr->cpr_uuid); + iv.ci_gen = cbk->cb_gen; + iv.ci_ins_status = ins->ci_bk.cb_ins_status; + iv.ci_phase = cbk->cb_phase; + iv.ci_pool_status = cbk->cb_pool_status; + + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); + DL_CDEBUG(rc != 0, DLOG_WARN, DLOG_INFO, rc, + DF_LEADER " notify engines that check pool " DF_UUIDF " done, status %u", + DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), iv.ci_pool_status); + if (rc == 0) + cpr->cpr_notified_exit = 1; + } + if (!(ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT)) *result = 0; - } if (update) { rc = chk_bk_update_leader(&ins->ci_bk); @@ -1401,8 +1391,7 @@ 
chk_leader_start_pool_svc(struct chk_pool_rec *cpr) } rc = ds_rsvc_dist_start(DS_RSVC_CLASS_POOL, &psid, cpr->cpr_uuid, ranks, RDB_NIL_TERM, - cpr->cpr_healthy ? DS_RSVC_START : DS_RSVC_DICTATE, - false /* bootstrap */, 0 /* size */, 0 /* vos_df_version */); + cpr->cpr_healthy ? DS_RSVC_START : DS_RSVC_DICTATE, NULL); out: d_rank_list_free(ranks); @@ -2106,8 +2095,7 @@ chk_leader_pool_ult(void *arg) uuid_copy(iv.ci_uuid, cpr->cpr_uuid); iv.ci_phase = cbk->cb_phase; - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines to exit check for pool "DF_UUIDF" failure: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2132,7 +2120,7 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) struct chk_pool_shard *tmp; struct chk_property *prop = &ins->ci_prop; struct chk_bookmark *cbk = &ins->ci_bk; - uint32_t version = cbk->cb_gen - prop->cp_rank_nr - 1; + uint32_t version = ins->ci_ns_ver + 1; int rc = 0; if (!chk_remove_rank_from_list(ins->ci_ranks, cdr->cdr_rank)) @@ -2140,11 +2128,12 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) prop->cp_rank_nr--; rc = chk_prop_update(prop, ins->ci_ranks); - if (rc != 0) + if (rc != 0) { + ins->ci_skip_oog = 1; goto out; + } - rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, - CRT_GROUP_MOD_OP_REPLACE, version); + rc = chk_iv_ns_update(ins, version); if (rc != 0) goto out; @@ -2202,7 +2191,6 @@ chk_leader_sched(void *args) { struct chk_instance *ins = args; struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_dead_rank *cdr; struct chk_pending_rec *pending; struct chk_iv iv = {0}; uint32_t ins_phase; @@ -2212,7 +2200,6 @@ chk_leader_sched(void *args) int done = 0; int rc = 0; bool bcast = false; - bool more_dead; D_INFO(DF_LEADER" scheduler enter at phase %u\n", 
DP_LEADER(ins), cbk->cb_phase); @@ -2243,29 +2230,9 @@ chk_leader_sched(void *args) while (1) { dss_sleep(300); -check_dead: - ABT_mutex_lock(ins->ci_abt_mutex); - if (!d_list_empty(&ins->ci_dead_ranks)) { - cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link); - if (!d_list_empty(&ins->ci_dead_ranks)) - more_dead = true; - else - more_dead = false; - } else { - cdr = NULL; - more_dead = false; - } - ABT_mutex_unlock(ins->ci_abt_mutex); - - if (cdr != NULL) - chk_leader_mark_rank_dead(ins, cdr); - if (chk_leader_need_stop(ins, &rc)) D_GOTO(out, bcast = (rc > 0 ? true : false)); - if (more_dead) - goto check_dead; - if (!d_list_empty(&ins->ci_interaction_filter_list)) { pending = d_list_pop_entry(&ins->ci_interaction_filter_list, struct chk_pending_rec, cpr_ins_link); @@ -2285,15 +2252,15 @@ chk_leader_sched(void *args) ins_phase = chk_pools_find_slowest(ins, &done); - if (ins_phase >= CHK__CHECK_SCAN_PHASE__CSP_POOL_MBS && !ins->ci_orphan_done && + if (ins_phase != CHK_INVAL_PHASE && + ins_phase >= CHK__CHECK_SCAN_PHASE__CSP_POOL_MBS && !ins->ci_orphan_done && !DAOS_FAIL_CHECK(DAOS_CHK_SYNC_ORPHAN_PROCESS)) { iv.ci_gen = cbk->cb_gen; iv.ci_phase = ins_phase; iv.ci_ins_status = CHK__CHECK_INST_STATUS__CIS_RUNNING; /* Synchronously notify engines that orphan pools have been processed. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines that orphan pools have been process: %d\n", DP_LEADER(ins), rc); @@ -2317,7 +2284,7 @@ chk_leader_sched(void *args) D_GOTO(out, rc); } - if (cbk->cb_phase == CHK_INVAL_PHASE || cbk->cb_phase < ins_phase) { + if (ins_phase != CHK_INVAL_PHASE && ins_phase > cbk->cb_phase) { D_INFO(DF_LEADER" moves from phase %u to phase %u\n", DP_LEADER(ins), cbk->cb_phase, ins_phase); @@ -2464,8 +2431,8 @@ chk_leader_ranks_prepare(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *r static int chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], int phase, d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + d_rank_t leader, uint32_t flags) { struct chk_property *prop = &ins->ci_prop; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2555,7 +2522,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank cbk->cb_version = chk_ver; init: - rc = chk_prop_prepare(leader, flags, phase, policy_nr, policies, rank_list, prop); + rc = chk_prop_prepare(leader, flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2701,8 +2668,7 @@ chk_leader_start_post(struct chk_instance *ins) * to notify the engine for the check done, that is not fatal. That * can be redo in next check instance. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines the pool "DF_UUIDF" is checked: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2882,20 +2848,20 @@ chk_leader_start_cb(struct chk_co_rpc_cb_args *cb_args) int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, - int pool_nr, uuid_t pools[], uint32_t api_flags, int phase) + int pool_nr, uuid_t pools[], uint32_t api_flags) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *c_pools = NULL; - struct umem_attr uma = { 0 }; - uuid_t dummy_pool = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - uint64_t old_gen = cbk->cb_gen; - d_rank_t myrank = dss_self_rank(); - uint32_t flags = api_flags; - int c_pool_nr = 0; - int rc; - int rc1; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + uuid_t *c_pools = NULL; + struct umem_attr uma = {0}; + uuid_t dummy_pool = {0}; + uint64_t old_gen = cbk->cb_gen; + d_rank_t myrank = dss_self_rank(); + uint32_t flags = api_flags; + uint32_t ns_ver = (uint32_t)daos_wallclock_secs(); + int c_pool_nr = 0; + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2920,13 +2886,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } - + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; rc = dbtree_create_inplace(DBTREE_CLASS_CHK_RANK, 0, CHK_BTREE_ORDER, &uma, @@ -2945,8 +2905,8 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_tree; reset: - rc = chk_leader_start_prep(ins, rank_nr, 
ranks, policy_nr, policies, pool_nr, pools, - phase, myrank, flags); + rc = chk_leader_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, myrank, + flags); if (rc == 1 && !(flags & CHK__CHECK_FLAG__CF_RESET)) { /* Former check instance has done, let's re-start from the beginning. */ flags |= CHK__CHECK_FLAG__CF_RESET; @@ -2960,18 +2920,10 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto remote; uuid_generate(dummy_pool); - uuid_unparse_lower(dummy_pool, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, dummy_pool, myrank, ns_ver); if (rc != 0) goto out_tree; - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, dummy_pool, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, myrank, ins->ci_iv_ns->iv_master_term + 1); - if (d_list_empty(&ins->ci_pool_list)) { c_pool_nr = pool_nr; c_pools = pools; @@ -2983,7 +2935,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c remote: rc = chk_start_remote(ins->ci_ranks, cbk->cb_gen, rank_nr, ranks, policy_nr, policies, - c_pool_nr, c_pools, flags, phase, myrank, ins->ci_start_flags, + c_pool_nr, c_pools, flags, ns_ver, myrank, ins->ci_start_flags, dummy_pool, chk_leader_start_cb, ins); if (rc != 0) { if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { @@ -3016,6 +2968,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c if (rc != 0) goto out_stop_remote; + ins->ci_pause = 0; ins->ci_sched_running = 1; rc = dss_ult_create(chk_leader_sched, ins, DSS_XS_SYS, 0, DSS_DEEP_STACK_SZ, @@ -3025,10 +2978,9 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_stop_pools; } - D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen " DF_X64 - " iv "DF_UUIDF": rc %d\n", - chk_is_ins_reset(ins, flags) ? 
"start" : "resume", api_flags, phase, myrank, - ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool), rc); + D_INFO("Leader %s with api_flags %x, leader %u, flags %x, gen " DF_X64 " iv " DF_UUIDF "\n", + chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, myrank, + ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool)); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, c_pool_nr > 0 ? c_pool_nr : pool_nr, @@ -3051,8 +3003,6 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to rollback failed check start: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING || cbk->cb_gen != old_gen) { cbk->cb_gen = old_gen; if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING) { @@ -3064,17 +3014,16 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to update leader bookmark: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); } - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; + chk_iv_ns_destroy(ins); out_tree: chk_leader_destroy_trees(ins); ins->ci_starting = 0; out_log: - D_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, - "Leader %s to start check on %u ranks for %d pools with " - "api_flags %x, phase %d, leader %u, gen "DF_X64": rc = %d\n", - rc < 0 ? "failed" : "try", rank_nr, pool_nr, api_flags, phase, - myrank, cbk->cb_gen, rc); + DL_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, rc, + "Leader %s to start check on %u ranks for %d pools with api_flags %x, ns_ver %d, " + "leader %u, gen " DF_X64, + rc < 0 ? 
"failed" : "try", rank_nr, pool_nr, api_flags, ns_ver, myrank, + cbk->cb_gen); if (unlikely(rc > 0)) rc = 0; @@ -3117,6 +3066,8 @@ chk_leader_stop(int pool_nr, uuid_t pools[]) int rc = 0; int i; + CHK_IS_READY(ins); + if (ins->ci_starting) D_GOTO(log, rc = -DER_BUSY); @@ -3328,10 +3279,15 @@ chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, uint32_t idx = 0; uint32_t status; uint32_t phase; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; int rc; int i; bool skip; + CHK_IS_READY(ins); + /* * NOTE: Similar as stop case, we need the ability to query check information from * new leader if the old one dead. But the information from new leader may be @@ -3357,22 +3313,45 @@ chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, D_GOTO(out, rc = -DER_NOMEM); again: - rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); + try_cnt++; + ver = ins->ci_ns_ver; + rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); if (rc != 0) { - if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { - D_INFO(DF_LEADER" Someone is not ready %d, let's retry query after 1 sec\n", - DP_LEADER(ins), rc); - if (!d_list_empty(&cqa->cqa_list)) { - chk_cqa_free(cqa); - cqa = chk_cqa_alloc(ins); - if (cqa == NULL) - D_GOTO(out, rc = -DER_NOMEM); + if (rc != -DER_OOG && rc != -DER_GRPVER && rc != -DER_AGAIN) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("Leader (" DF_X64 ") query retried because of %d for %d times.\n", + gen, rc, try_cnt); + + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) { + D_WARN("Leader (" DF_X64 ") query is blocked because of %d for " + "about %d seconds.\n", + gen, rc, wait_cnt / 2); + /* + * Let's retry query in case of related dead rank recovered back + * before being handled by chk_dead_rank_ult, although it is rare. 
+ */ + break; } - dss_sleep(1000); - goto again; + + if (rc != -DER_OOG) + break; } - goto out; + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + if (!d_list_empty(&cqa->cqa_list)) { + chk_cqa_free(cqa); + cqa = chk_cqa_alloc(ins); + if (cqa == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + goto again; } d_list_for_each_entry(cpr, &ins->ci_pool_list, cpr_link) { @@ -3476,6 +3455,8 @@ chk_leader_prop(chk_prop_cb_t prop_cb, void *buf) { struct chk_property *prop = &chk_leader->ci_prop; + CHK_IS_READY(chk_leader); + return prop_cb(buf, prop->cp_policies, CHK_POLICY_MAX - 1, prop->cp_flags); } @@ -3488,12 +3469,10 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act) d_iov_t riov; int rc; - rc = chk_pending_del(ins, seq, &pending); + rc = chk_pending_lookup(ins, seq, &pending); if (rc != 0) goto out; - D_ASSERT(pending->cpr_busy); - if (pending->cpr_on_leader) { ABT_mutex_lock(pending->cpr_mutex); /* @@ -3503,20 +3482,24 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act) pending->cpr_action = act; ABT_cond_broadcast(pending->cpr_cond); ABT_mutex_unlock(pending->cpr_mutex); + chk_pending_del(ins, seq, &pending); } else { d_iov_set(&riov, NULL, 0); d_iov_set(&kiov, pending->cpr_uuid, sizeof(uuid_t)); rc = dbtree_lookup(ins->ci_pool_hdl, &kiov, &riov); - if (rc == 0) { + if (rc == 0) pool = (struct chk_pool_rec *)riov.iov_buf; - if (pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING) - pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; - } rc = chk_act_remote(ins->ci_ranks, ins->ci_bk.cb_gen, seq, pending->cpr_class, act, pending->cpr_rank); + if (rc == 0) { + chk_pending_destroy(ins, pending); - chk_pending_destroy(pending); + if (pool != NULL && + pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING && + d_list_empty(&pool->cpr_pending_list)) + pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; + } } out: @@ -3534,6 +3517,8 @@ 
chk_leader_act(uint64_t seq, uint32_t act) struct chk_bookmark *cbk = &ins->ci_bk; int rc; + CHK_IS_READY(ins); + if (cbk->cb_magic != CHK_BK_MAGIC_LEADER) D_GOTO(out, rc = -DER_NOTLEADER); @@ -3567,6 +3552,8 @@ chk_leader_set_policy(uint32_t policy_nr, struct chk_policy *policies) struct chk_pending_rec *tmp; int rc; + CHK_IS_READY(ins); + /* Do nothing if no (leader) check instance is running. */ if (cbk->cb_magic != CHK_BK_MAGIC_LEADER || cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING) @@ -3615,6 +3602,8 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) d_iov_t riov; int rc; + CHK_IS_READY(ins); + if (cbk->cb_magic != CHK_BK_MAGIC_LEADER) D_GOTO(out, rc = -DER_NOTLEADER); @@ -3720,14 +3709,13 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto again; out: + if ((rc != 0 || decision != NULL) && cpr != NULL) + chk_pending_destroy(ins, cpr); + if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING && - (rc != 0 || (cpr != NULL && - cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT))) + d_list_empty(&pool->cpr_pending_list)) pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; - if ((rc != 0 || decision != NULL) && cpr != NULL) - chk_pending_destroy(cpr); - return rc; } @@ -3742,6 +3730,8 @@ chk_leader_notify(struct chk_iv *iv) d_iov_t riov; int rc = 0; + CHK_IS_READY(ins); + /* Ignore the notification that is not applicable to current rank. 
*/ if (cbk->cb_magic != CHK_BK_MAGIC_LEADER) @@ -3814,13 +3804,15 @@ chk_leader_notify(struct chk_iv *iv) } int -chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools) +chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *ns_ver, + int *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { struct chk_instance *ins = chk_leader; struct chk_bookmark *cbk = &ins->ci_bk; int rc = 0; + CHK_IS_READY(ins); + if (cbk->cb_magic != CHK_BK_MAGIC_LEADER) D_GOTO(out, rc = -DER_NOTLEADER); @@ -3840,7 +3832,9 @@ chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, if (ins->ci_orphan_done) *flags = CRF_ORPHAN_DONE; - rc = chk_leader_pools2list(ins, pool_nr, pools); + *ns_ver = ins->ci_ns_ver; + *ranks = ins->ci_ranks; + rc = chk_leader_pools2list(ins, pool_nr, pools); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, @@ -3850,15 +3844,6 @@ chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, return rc; } -void -chk_leader_pause(void) -{ - struct chk_instance *ins = chk_leader; - - chk_stop_sched(ins); - D_ASSERT(d_list_empty(&ins->ci_rank_list)); -} - static void chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src, enum crt_event_type type, void *arg) @@ -3867,31 +3852,54 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src struct chk_dead_rank *cdr = NULL; int rc = 0; + if (ins->ci_ranks == NULL) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); + /* Ignore the event that is not applicable to current rank. 
*/ - if (src != CRT_EVS_SWIM) + if (src != CRT_EVS_SWIM && src != CRT_EVS_GRPMOD) D_GOTO(out, rc = -DER_NOTAPPLICABLE); if (type != CRT_EVT_DEAD && type != CRT_EVT_ALIVE) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!ins->ci_sched_running) - D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (type == CRT_EVT_DEAD) { D_ALLOC_PTR(cdr); if (cdr == NULL) D_GOTO(out, rc = -DER_NOMEM); cdr->cdr_rank = rank; + } else if (d_list_empty(&ins->ci_dead_ranks)) { + D_GOTO(out, rc = -DER_NOTAPPLICABLE); } ABT_mutex_lock(ins->ci_abt_mutex); if (cdr != NULL) { + struct chk_dead_rank *tmp; + /* - * The event may be triggered on non-system SX. Let's notify the leader scheduler + * The event may be triggered on non-system SX (SWIM). Let's ask chk_dead_rank_ult * to handle that on system XS. + * + * The callback for one rank dead event maybe triggered twice from multiple source: + * SWIM and PG memberskip changes. Let's only add once into the ins->ci_dead_ranks. + * + * Generally, ins->ci_dead_ranks is very short. Then it is very fast to go through + * the whole list. */ + d_list_for_each_entry(tmp, &ins->ci_dead_ranks, cdr_link) { + if (tmp->cdr_rank == rank) { + /* Repeated one, ignore it. */ + D_FREE(cdr); + D_GOTO(unlock, rc = -DER_NOTAPPLICABLE); + } + + if (tmp->cdr_rank > rank) { + d_list_add(&cdr->cdr_link, &tmp->cdr_link); + D_GOTO(unlock, rc = 0); + } + } + d_list_add_tail(&cdr->cdr_link, &ins->ci_dead_ranks); } else { /* Remove former non-handled dead rank from the list. */ @@ -3901,8 +3909,13 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src D_FREE(cdr); break; } + + if (cdr->cdr_rank > rank) + D_GOTO(unlock, rc = -DER_NOTAPPLICABLE); } } + +unlock: ABT_mutex_unlock(ins->ci_abt_mutex); out: @@ -3912,19 +3925,41 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src DP_LEADER(ins), rank, type == CRT_EVT_DEAD ? 
"dead" : "alive", DP_RC(rc)); } -int -chk_leader_init(void) +static void +chk_dead_rank_ult(void *args) { - struct chk_traverse_pools_args ctpa = { 0 }; - struct chk_bookmark *cbk; - int rc; + struct chk_instance *ins = args; + struct chk_dead_rank *cdr; - rc = chk_ins_init(&chk_leader); - if (rc != 0) - goto fini; + while (ins->ci_inited) { + cdr = NULL; + if (!d_list_empty(&ins->ci_dead_ranks)) { + ABT_mutex_lock(ins->ci_abt_mutex); + if (likely(!d_list_empty(&ins->ci_dead_ranks))) + cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, + cdr_link); + ABT_mutex_unlock(ins->ci_abt_mutex); + } - chk_leader->ci_is_leader = 1; - chk_report_seq_init(chk_leader); + if (cdr != NULL) + chk_leader_mark_rank_dead(ins, cdr); + + if (d_list_empty(&ins->ci_dead_ranks)) + dss_sleep(500); + } + + while ((cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link)) != + NULL) + D_FREE(cdr); +} + +int +chk_leader_setup(void) +{ + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_traverse_pools_args ctpa = {0}; + int rc; /* * DAOS global consistency check depends on all related engines' local @@ -3933,7 +3968,8 @@ chk_leader_init(void) * related local inconsistency firstly. 
*/ - cbk = &chk_leader->ci_bk; + chk_report_seq_init(ins); + rc = chk_bk_fetch_leader(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3968,38 +4004,69 @@ chk_leader_init(void) cbk->cb_time.ct_stop_time = time(NULL); rc = chk_bk_update_leader(cbk); if (rc != 0) { - D_ERROR(DF_LEADER" failed to reset ins status as 'PAUSED': "DF_RC"\n", - DP_LEADER(chk_leader), DP_RC(rc)); + D_ERROR(DF_LEADER " failed to reset ins status as 'PAUSED': " DF_RC "\n", + DP_LEADER(ins), DP_RC(rc)); goto fini; } ctpa.ctpa_gen = cbk->cb_gen; - ctpa.ctpa_ins = chk_leader; + ctpa.ctpa_ins = ins; rc = chk_traverse_pools(chk_pools_pause_cb, &ctpa); /* * Failed to reset pool status will not affect next check start, so it is not fatal, * but related check query result may be confused for user. */ if (rc != 0) - D_WARN(DF_LEADER" failed to reset pools status as 'PAUSED': "DF_RC"\n", - DP_LEADER(chk_leader), DP_RC(rc)); + D_WARN(DF_LEADER " failed to reset pools status as 'PAUSED': " DF_RC "\n", + DP_LEADER(ins), DP_RC(rc)); } prop: - rc = chk_prop_fetch(&chk_leader->ci_prop, &chk_leader->ci_ranks); - if (rc == 0 || rc == -DER_NONEXIST) + rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); + if (rc != 0 && rc != -DER_NONEXIST) + goto fini; + + ins->ci_inited = 1; + ins->ci_pause = 0; + + rc = dss_ult_create(chk_dead_rank_ult, ins, DSS_XS_SYS, 0, 0, &ins->ci_dead_rank_ult); + if (rc == 0) rc = crt_register_event_cb(chk_rank_event_cb, NULL); + fini: if (rc != 0) - chk_ins_fini(&chk_leader); - else - chk_leader->ci_inited = 1; + chk_leader_cleanup(); return rc; } void -chk_leader_fini(void) +chk_leader_cleanup(void) { + struct chk_instance *ins = chk_leader; + crt_unregister_event_cb(chk_rank_event_cb, NULL); + + chk_ins_cleanup(ins); + D_ASSERT(d_list_empty(&ins->ci_rank_list)); + + if (ins->ci_dead_rank_ult != ABT_THREAD_NULL) + ABT_thread_free(&ins->ci_dead_rank_ult); +} + +int +chk_leader_init(void) +{ + int rc; + + rc = chk_ins_init(&chk_leader); + if (rc == 0) + chk_leader->ci_is_leader = 1; + + 
return rc; +} + +void +chk_leader_fini(void) +{ chk_ins_fini(&chk_leader); } diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index d81506e5c35..e250936dfc2 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -527,8 +527,8 @@ chk_sg_rpc_prepare(d_rank_t rank, crt_opcode_t opc, crt_rpc_t **req) int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args) { struct chk_co_rpc_cb_args cb_args = { 0 }; @@ -544,12 +544,12 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran if (rc != 0) goto out; - csi = crt_req_get(req); - csi->csi_gen = gen; - csi->csi_flags = flags; - csi->csi_phase = phase; + csi = crt_req_get(req); + csi->csi_gen = gen; + csi->csi_flags = flags; + csi->csi_ns_ver = ns_ver; csi->csi_leader_rank = leader; - csi->csi_api_flags = api_flags; + csi->csi_api_flags = api_flags; uuid_copy(csi->csi_iv_uuid, iv_uuid); csi->csi_ranks.ca_count = rank_nr; csi->csi_ranks.ca_arrays = ranks; @@ -605,9 +605,9 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran crt_req_decref(req); } - D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, - "Rank %u start checker, gen "DF_X64", flags %x, phase %d, iv "DF_UUIDF":"DF_RC"\n", - leader, gen, flags, phase, DP_UUID(iv_uuid), DP_RC(rc)); + DL_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, rc, + "Rank %u start checker, gen " DF_X64 ", flags %x, ns_ver %d, 
iv " DF_UUIDF, + leader, gen, flags, ns_ver, DP_UUID(iv_uuid)); return rc; } @@ -1019,7 +1019,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t *pool_nr, uuid_t **pools) + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; @@ -1042,8 +1042,22 @@ chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, cro = crt_reply_get(req); rc = cro->cro_status; - if (rc == 0 && cro->cro_pools.ca_count > 0) { - *flags = cro->cro_flags; + if (rc != 0) + goto out; + + *flags = cro->cro_flags; + *ns_ver = cro->cro_ns_ver; + + if (cro->cro_ranks.ca_count > 0) { + *ranks = d_rank_list_alloc(cro->cro_ranks.ca_count); + if (*ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy((*ranks)->rl_ranks, cro->cro_ranks.ca_arrays, + sizeof(d_rank_t) * cro->cro_ranks.ca_count); + } + + if (cro->cro_pools.ca_count > 0) { D_ALLOC(tmp, cro->cro_pools.ca_count); if (tmp == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/chk/chk_srv.c b/src/chk/chk_srv.c index 48543de0f96..d50e3b59657 100644 --- a/src/chk/chk_srv.c +++ b/src/chk/chk_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2023 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -27,7 +27,7 @@ ds_chk_start_hdlr(crt_rpc_t *rpc) rc = chk_engine_start(csi->csi_gen, csi->csi_ranks.ca_count, csi->csi_ranks.ca_arrays, csi->csi_policies.ca_count, csi->csi_policies.ca_arrays, csi->csi_uuids.ca_count, csi->csi_uuids.ca_arrays, csi->csi_api_flags, - csi->csi_phase, csi->csi_leader_rank, csi->csi_flags, + csi->csi_ns_ver, csi->csi_leader_rank, csi->csi_flags, csi->csi_iv_uuid, &clues); if (rc > 0) { D_ALLOC_PTR(rank); @@ -249,18 +249,21 @@ ds_chk_report_hdlr(crt_rpc_t *rpc) static void ds_chk_rejoin_hdlr(crt_rpc_t *rpc) { - struct chk_rejoin_in *cri = crt_req_get(rpc); - struct chk_rejoin_out *cro = crt_reply_get(rpc); - uuid_t *pools = NULL; - int pool_nr = 0; - int rc; + struct chk_rejoin_in *cri = crt_req_get(rpc); + struct chk_rejoin_out *cro = crt_reply_get(rpc); + uuid_t *pools = NULL; + d_rank_list_t *ranks = NULL; + int pool_nr = 0; + int rc; rc = chk_leader_rejoin(cri->cri_gen, cri->cri_rank, cri->cri_iv_uuid, &cro->cro_flags, - &pool_nr, &pools); + &cro->cro_ns_ver, &pool_nr, &pools, &ranks); cro->cro_status = rc; if (rc == 0) { - cro->cro_pools.ca_count = pool_nr; + cro->cro_ranks.ca_count = ranks->rl_nr; + cro->cro_ranks.ca_arrays = ranks->rl_ranks; + cro->cro_pools.ca_count = pool_nr; cro->cro_pools.ca_arrays = pools; } @@ -307,6 +310,14 @@ ds_chk_init(void) goto out; rc = chk_iv_init(); + if (rc != 0) + goto out; + + rc = chk_leader_init(); + if (rc != 0) + goto out; + + rc = chk_engine_init(); out: return rc; @@ -315,6 +326,9 @@ ds_chk_init(void) static int ds_chk_fini(void) { + chk_engine_fini(); + chk_leader_fini(); + return chk_iv_fini(); } @@ -323,14 +337,14 @@ ds_chk_setup(void) { int rc; - /* Do NOT move chk_vos_init into ds_chk_init, because sys_db is not ready at that time. 
*/ - chk_vos_init(); + /* Do NOT move chk_vos_setup into ds_chk_init, because sys_db is not ready at that time. */ + chk_vos_setup(); - rc = chk_leader_init(); + rc = chk_leader_setup(); if (rc != 0) goto out_vos; - rc = chk_engine_init(); + rc = chk_engine_setup(); if (rc != 0) goto out_leader; @@ -347,9 +361,9 @@ ds_chk_setup(void) goto out_done; out_leader: - chk_leader_fini(); + chk_leader_cleanup(); out_vos: - chk_vos_fini(); + chk_vos_cleanup(); out_done: return rc; } @@ -357,11 +371,9 @@ ds_chk_setup(void) static int ds_chk_cleanup(void) { - chk_engine_pause(); - chk_leader_pause(); - chk_engine_fini(); - chk_leader_fini(); - chk_vos_fini(); + chk_engine_cleanup(); + chk_leader_cleanup(); + chk_vos_cleanup(); return 0; } diff --git a/src/chk/chk_upcall.c b/src/chk/chk_upcall.c index bbc05db5f75..8d699195ac9 100644 --- a/src/chk/chk_upcall.c +++ b/src/chk/chk_upcall.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2022 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -83,6 +84,9 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re time_t tm = time(NULL); int rc; + if (DAOS_FAIL_CHECK(DAOS_CHK_REPORT_FAILURE)) + return -DER_IO; + report.seq = seq; report.class_ = cla; report.action = act; diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c index 4cd7356e7ae..5970f3207db 100644 --- a/src/chk/chk_vos.c +++ b/src/chk/chk_vos.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -303,13 +303,13 @@ chk_traverse_pools(sys_db_trav_cb_t cb, void *args) } void -chk_vos_init(void) +chk_vos_setup(void) { chk_db = vos_db_get(); } void -chk_vos_fini(void) +chk_vos_cleanup(void) { chk_db = NULL; } diff --git a/src/client/api/rpc.c b/src/client/api/rpc.c index 189d34197a9..13daf4ee8d2 100644 --- a/src/client/api/rpc.c +++ b/src/client/api/rpc.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -98,6 +98,7 @@ daos_rpc_send_wait(crt_rpc_t *rpc) struct rpc_proto { int rank_idx; + int num_retries_left; crt_endpoint_t ep; int version; int rc; @@ -122,6 +123,17 @@ query_cb(struct crt_proto_query_cb_info *cb_info) nr_ranks = dc_mgmt_net_get_num_srv_ranks(); D_ASSERT(nr_ranks > 0); rproto->rank_idx = (rproto->rank_idx + 1) % nr_ranks; + rproto->num_retries_left--; + + /** We tried all engines and found none alive */ + if (rproto->num_retries_left <= 0) { + D_ERROR("crt_proto_query_with_ctx() failed -- All %d targets tried\n", + nr_ranks); + rproto->rc = cb_info->pq_rc; + rproto->completed = true; + return; + } + rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx); D_ASSERT(rank != CRT_NO_RANK); rproto->ep.ep_rank = rank; @@ -170,6 +182,7 @@ daos_rpc_proto_query(crt_opcode_t base_opc, uint32_t *ver_array, int count, int D_GOTO(out_free, -DER_NONEXIST); } rproto->rank_idx = d_rand() % nr_ranks; + rproto->num_retries_left = nr_ranks; rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx); D_ASSERT(rank != CRT_NO_RANK); rproto->ep.ep_rank = rank; @@ -179,7 +192,7 @@ daos_rpc_proto_query(crt_opcode_t base_opc, uint32_t *ver_array, int count, int rproto->array_size = count; 
rproto->ep.ep_grp = sys->sy_group; rproto->base_opc = base_opc; - rproto->timeout = 3; + rproto->timeout = 10; rc = crt_proto_query_with_ctx(&rproto->ep, base_opc, ver_array, count, rproto->timeout, query_cb, rproto, ctx); diff --git a/src/client/dfs/README.md b/src/client/dfs/README.md index 485846b660a..8b0fe59e6c7 100644 --- a/src/client/dfs/README.md +++ b/src/client/dfs/README.md @@ -134,7 +134,14 @@ Object testdir By default, all directories are created with an object class with 1 shard. This means, that if the container redundancy factor (RF) is 0, OC_S1 oclass will be used; if RF=1 OC_RP_2G1 is used, and so on. The user can of course change that when creating the directory and set the desired object class -manually, or set the default object class when creating the container. +manually, or set the default object class when creating the container. Using an EC object class +class for directories is not recommended since directory entries are small and EC overhead will be +large anyway. Thus when setting the directory object class on container creation to an EC object +class, DAOS will ignore the user setting and use the default replication object class depending on +the redundancy factory of the container as explained earlier. If one uses the DAOS tool to change +the object class of new files and directories to be created under an existing directory (daos fs +set-attr), and that object class is EC, that setting will apply only to files. New directories will +use the container default in that case. Note that with this mapping, the inode information is stored with the entry that it corresponds to in the parent directory object. Thus, hard links won't be supported, since it won't be possible to diff --git a/src/client/dfs/common.c b/src/client/dfs/common.c index acbc7eb11f7..72cae4dceb5 100644 --- a/src/client/dfs/common.c +++ b/src/client/dfs/common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -643,8 +644,15 @@ create_dir(dfs_t *dfs, dfs_obj_t *parent, daos_oclass_id_t cid, dfs_obj_t *dir) if (cid == 0) { if (parent->d.oclass == 0) cid = dfs->attr.da_dir_oclass_id; - else + else { cid = parent->d.oclass; + /* + * If the parent oclass is EC, do not use that for a directory and use the + * container default instead. + */ + if (daos_cid_is_ec(cid)) + cid = dfs->attr.da_dir_oclass_id; + } } /** Allocate an OID for the dir - local operation */ diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c index b3c133a8580..b01d527c112 100644 --- a/src/client/dfs/cont.c +++ b/src/client/dfs/cont.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -189,8 +189,14 @@ dfs_cont_create(daos_handle_t poh, uuid_t *cuuid, dfs_attr_t *attr, daos_handle_ } if (attr->da_file_oclass_id) dattr.da_file_oclass_id = attr->da_file_oclass_id; - if (attr->da_dir_oclass_id) + if (attr->da_dir_oclass_id) { dattr.da_dir_oclass_id = attr->da_dir_oclass_id; + if (daos_cid_is_ec(dattr.da_dir_oclass_id)) { + D_WARN("EC object class for directories is not supported," + " reverting to use default"); + dattr.da_dir_oclass_id = 0; + } + } /** check non default mode */ if ((attr->da_mode & MODE_MASK) == DFS_RELAXED || diff --git a/src/client/dfs/obj.c b/src/client/dfs/obj.c index b4a81ba4855..c85a8f84f9c 100644 --- a/src/client/dfs/obj.c +++ b/src/client/dfs/obj.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -81,7 +81,16 @@ dfs_obj_get_info(dfs_t *dfs, dfs_obj_t *obj, dfs_obj_info_t *info) /** what is the default oclass files and dirs will be created with in this dir */ if (obj->d.oclass) { - info->doi_dir_oclass_id = obj->d.oclass; + /** if parent oclass is EC, dir would be chosen as container default */ + if (!daos_cid_is_ec(obj->d.oclass)) { + info->doi_dir_oclass_id = obj->d.oclass; + } else { + if (dfs->attr.da_dir_oclass_id) + info->doi_dir_oclass_id = dfs->attr.da_dir_oclass_id; + else + rc = daos_obj_get_oclass(dfs->coh, DAOS_OT_MULTI_HASHED, 0, + 0, &info->doi_dir_oclass_id); + } info->doi_file_oclass_id = obj->d.oclass; } else { if (dfs->attr.da_dir_oclass_id) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 03eeaaef6a7..fad6ccb5ede 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -851,7 +851,7 @@ retrieve_handles_from_fuse(int idx) fclose(tmp_file); unlink(fname); if (read_size != hs_reply.fsr_pool_size) { - errno_saved = EAGAIN; + errno_saved = EIO; D_DEBUG(DB_ANY, "fread expected %zu bytes, read %d bytes : %d (%s)\n", hs_reply.fsr_pool_size, read_size, errno_saved, strerror(errno_saved)); @@ -1474,16 +1474,16 @@ init_fd_list(void) rc = D_MUTEX_INIT(&lock_fd, NULL); if (rc) - return 1; + return rc; rc = D_MUTEX_INIT(&lock_dirfd, NULL); if (rc) - return 1; + return rc; rc = D_MUTEX_INIT(&lock_mmap, NULL); if (rc) - return 1; + return rc; rc = D_RWLOCK_INIT(&lock_fd_dup2ed, NULL); if (rc) - return 1; + return rc; /* fatal error above: failure to create mutexes. */ @@ -1655,7 +1655,6 @@ find_next_available_map(int *idx) return 0; } -/* May need to support duplicated fd as duplicated dirfd too. 
*/ static void free_fd(int idx, bool closing_dup_fd) { @@ -1676,7 +1675,7 @@ free_fd(int idx, bool closing_dup_fd) d_file_list[idx]->ref_count--; if (d_file_list[idx]->ref_count == 0) saved_obj = d_file_list[idx]; - if (dup_ref_count[idx] > 0 || ((d_file_list[idx]->ref_count > 0) && !d_compatible_mode)) { + if ((dup_ref_count[idx] > 0) || (closing_dup_fd && (d_file_list[idx]->ref_count > 0))) { D_MUTEX_UNLOCK(&lock_fd); return; } @@ -6092,8 +6091,10 @@ utimens_timespec(const char *path, const struct timespec times[2], int flags) int utimensat(int dirfd, const char *path, const struct timespec times[2], int flags) { - int idx_dfs, error = 0, rc; - char *full_path = NULL; + int idx_dfs, error = 0, rc; + char *full_path = NULL; + struct timespec times_loc[2]; + struct timespec *times_ptr; if (next_utimensat == NULL) { next_utimensat = dlsym(RTLD_NEXT, "utimensat"); @@ -6111,18 +6112,30 @@ utimensat(int dirfd, const char *path, const struct timespec times[2], int flags } _Pragma("GCC diagnostic pop") + /* clang-format off */ + + if (times == NULL) { + clock_gettime(CLOCK_REALTIME, ×_loc[0]); + times_loc[1].tv_sec = times_loc[0].tv_sec; + times_loc[1].tv_nsec = times_loc[0].tv_nsec; + times_ptr = times_loc; + } else { + times_ptr = (struct timespec *)times; + } + /* clang-format on */ + /* absolute path, dirfd is ignored */ if (path[0] == '/') - return utimens_timespec(path, times, flags); + return utimens_timespec(path, times_ptr, flags); idx_dfs = check_path_with_dirfd(dirfd, &full_path, path, &error); if (error) goto out_err; if (idx_dfs >= 0) - rc = utimens_timespec(full_path, times, flags); + rc = utimens_timespec(full_path, times_ptr, flags); else - rc = next_utimensat(dirfd, path, times, flags); + rc = next_utimensat(dirfd, path, times_ptr, flags); error = errno; if (full_path) { diff --git a/src/client/java/hadoop-daos/pom.xml b/src/client/java/hadoop-daos/pom.xml index 28576e3ec63..98fa7f00fc9 100644 --- a/src/client/java/hadoop-daos/pom.xml +++ 
b/src/client/java/hadoop-daos/pom.xml @@ -123,7 +123,7 @@ org.assertj assertj-core - 3.19.0 + 3.27.7 test diff --git a/src/client/java/pom.xml b/src/client/java/pom.xml index d01d4293704..bef30ccc4fc 100644 --- a/src/client/java/pom.xml +++ b/src/client/java/pom.xml @@ -16,7 +16,7 @@ 1.4.0 5.4.0 1.7.25 - 2.17.1 + 2.25.3 true 1.8 1.8 diff --git a/src/client/pydaos/pydaos_core.py b/src/client/pydaos/pydaos_core.py index 384639c1d31..22d296c415a 100644 --- a/src/client/pydaos/pydaos_core.py +++ b/src/client/pydaos/pydaos_core.py @@ -1,4 +1,5 @@ # (C) Copyright 2019-2024 Intel Corporation. +# (C) Copyright 2025 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -73,6 +74,10 @@ class DCont(): array(name, kwargs): Create new DArray object. + + destroy() + Destroy the DDict or DArray object. This does not invalidate open objects and using those + objects after destroying it will result in undefined behavior. """ def __init__(self, pool=None, cont=None, path=None, open_mode='RW'): @@ -164,6 +169,14 @@ def array(self, name, v: list = None, cid="0"): return da + def destroy(self, name): + """ Destroy an existing Dict or Array object """ + + # Remove the entry for the container root object and destroy the kv + ret = pydaos_shim.cont_destroyobj(DAOS_MAGIC, self._hdl, name) + if ret != pydaos_shim.DER_SUCCESS: + raise PyDError("failed to destroy DAOS dict", ret) + def __str__(self): return '{}/{}'.format(self.pool, self.cont) diff --git a/src/client/pydaos/pydaos_shim.c b/src/client/pydaos/pydaos_shim.c index 4436e056f9d..370deedf1d6 100644 --- a/src/client/pydaos/pydaos_shim.c +++ b/src/client/pydaos/pydaos_shim.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -286,8 +287,10 @@ __shim_handle__cont_get(PyObject *self, PyObject *args) struct open_handle *hdl; char *name; struct pydaos_df entry; - size_t size = sizeof(entry); - daos_obj_id_t oid = {0, }; + size_t size = sizeof(entry); + daos_obj_id_t oid = { + 0, + }; unsigned int otype = 0; int rc; @@ -388,6 +391,68 @@ __shim_handle__cont_newobj(PyObject *self, PyObject *args) return return_list; } +static PyObject * +__shim_handle__cont_destroyobj(PyObject *self, PyObject *args) +{ + struct open_handle *hdl; + char *name; + struct pydaos_df entry; + size_t size = sizeof(entry); + daos_obj_id_t oid = { + 0, + }; + daos_handle_t oh; + unsigned int otype = 0; + int rc; + + /* Parse arguments */ + RETURN_NULL_IF_FAILED_TO_PARSE(args, "Ks", &hdl, &name); + + /** Lookup name in root kv */ + rc = daos_kv_get(hdl->oh, DAOS_TX_NONE, 0, name, &size, &entry, NULL); + if (rc != -DER_SUCCESS) + goto out; + + /** Check if entry actually exists */ + if (size == 0) { + rc = -DER_NONEXIST; + goto out; + } + + /** If we fetched a value which isn't an entry ... 
we have a problem */ + if (size != sizeof(entry)) { + rc = -DER_INVAL; + goto out; + } + + oid = entry.oid; + otype = entry.otype; + + /** we do not support arrays anyway, so we would not be here */ + if (otype == PYDAOS_ARRAY) { + rc = -DER_INVAL; + goto out; + } + + /* Remove name from root kv, use conditional to fail if not exist */ + rc = daos_kv_remove(hdl->oh, DAOS_TX_NONE, DAOS_COND_PUNCH, name, NULL); + if (rc != -DER_SUCCESS) + goto out; + + rc = daos_kv_open(hdl->coh, oid, DAOS_OO_RW, &oh, NULL); + if (rc != -DER_SUCCESS) + goto out; + rc = daos_kv_destroy(oh, DAOS_TX_NONE, NULL); + if (rc != -DER_SUCCESS) { + daos_kv_close(oh, NULL); + goto out; + } + rc = daos_kv_close(oh, NULL); + +out: + return PyLong_FromLong(rc); +} + static PyObject * __shim_handle__cont_close(PyObject *self, PyObject *args) { @@ -740,10 +805,13 @@ do { \ DEFINE_OC_EXPL(EC_2P2G); /** OC_EC_2P2G1, OC_EC_2P2G2, ... */ DEFINE_OC_EXPL(EC_4P1G); /** OC_EC_4P1G1, OC_EC_4P1G2, ... */ DEFINE_OC_EXPL(EC_4P2G); /** OC_EC_4P2G1, OC_EC_4P2G2, ... */ + DEFINE_OC_EXPL(EC_4P3G); /** OC_EC_4P3G1, OC_EC_4P3G2, ... */ DEFINE_OC_EXPL(EC_8P1G); /** OC_EC_8P1G1, OC_EC_8P1G2, ... */ DEFINE_OC_EXPL(EC_8P2G); /** OC_EC_8P2G1, OC_EC_8P2G2, ... */ + DEFINE_OC_EXPL(EC_8P3G); /** OC_EC_8P3G1, OC_EC_8P3G2, ... */ DEFINE_OC_EXPL(EC_16P1G); /** OC_EC_16P1G1, OC_EC_16P1G2, ... */ DEFINE_OC_EXPL(EC_16P2G); /** OC_EC_16P2G1, OC_EC_16P2G2, ... */ + DEFINE_OC_EXPL(EC_16P3G); /** OC_EC_16P3G1, OC_EC_16P3G2, ... 
*/ #define DEFINE_OC_INTERNAL(name)\ do { \ @@ -1393,6 +1461,7 @@ static PyMethodDef daosMethods[] = { EXPORT_PYTHON_METHOD(cont_open_by_path), EXPORT_PYTHON_METHOD(cont_get), EXPORT_PYTHON_METHOD(cont_newobj), + EXPORT_PYTHON_METHOD(cont_destroyobj), EXPORT_PYTHON_METHOD(cont_close), EXPORT_PYTHON_METHOD(cont_check), EXPORT_PYTHON_METHOD(cont_check_by_path), diff --git a/src/client/pydaos/raw/daos_api.py b/src/client/pydaos/raw/daos_api.py index e0207374828..76fb026b71d 100644 --- a/src/client/pydaos/raw/daos_api.py +++ b/src/client/pydaos/raw/daos_api.py @@ -403,23 +403,24 @@ class DaosObjClassOld(enum.IntEnum): DAOS_OC_R3S_SPEC_RANK = 21 -# pylint: disable=no-member +# pylint: disable=no-member,invalid-name + ConvertObjClass = { - DaosObjClassOld.DAOS_OC_TINY_RW: DaosObjClass.OC_S1, - DaosObjClassOld.DAOS_OC_SMALL_RW: DaosObjClass.OC_S4, - DaosObjClassOld.DAOS_OC_LARGE_RW: DaosObjClass.OC_SX, - DaosObjClassOld.DAOS_OC_R2S_RW: DaosObjClass.OC_RP_2G1, - DaosObjClassOld.DAOS_OC_R2_RW: DaosObjClass.OC_RP_2G2, - DaosObjClassOld.DAOS_OC_R2_MAX_RW: DaosObjClass.OC_RP_2GX, - DaosObjClassOld.DAOS_OC_R3S_RW: DaosObjClass.OC_RP_3G1, - DaosObjClassOld.DAOS_OC_R3_RW: DaosObjClass.OC_RP_3G2, - DaosObjClassOld.DAOS_OC_R3_MAX_RW: DaosObjClass.OC_RP_3GX, - DaosObjClassOld.DAOS_OC_R4S_RW: DaosObjClass.OC_RP_4G1, - DaosObjClassOld.DAOS_OC_R4_RW: DaosObjClass.OC_RP_4G2, - DaosObjClassOld.DAOS_OC_R4_MAX_RW: DaosObjClass.OC_RP_4GX, - DaosObjClassOld.DAOS_OC_REPL_MAX_RW: DaosObjClass.OC_RP_XSF + DaosObjClassOld.DAOS_OC_TINY_RW: DaosObjClass.OC_S1, # noqa: E241 + DaosObjClassOld.DAOS_OC_SMALL_RW: DaosObjClass.OC_S4, # noqa: E241 + DaosObjClassOld.DAOS_OC_LARGE_RW: DaosObjClass.OC_SX, # noqa: E241 + DaosObjClassOld.DAOS_OC_R2S_RW: DaosObjClass.OC_RP_2G1, # noqa: E241 + DaosObjClassOld.DAOS_OC_R2_RW: DaosObjClass.OC_RP_2G2, # noqa: E241 + DaosObjClassOld.DAOS_OC_R2_MAX_RW: DaosObjClass.OC_RP_2GX, # noqa: E241 + DaosObjClassOld.DAOS_OC_R3S_RW: DaosObjClass.OC_RP_3G1, # noqa: E241 + 
DaosObjClassOld.DAOS_OC_R3_RW: DaosObjClass.OC_RP_3G2, # noqa: E241 + DaosObjClassOld.DAOS_OC_R3_MAX_RW: DaosObjClass.OC_RP_3GX, # noqa: E241 + DaosObjClassOld.DAOS_OC_R4S_RW: DaosObjClass.OC_RP_4G1, # noqa: E241 + DaosObjClassOld.DAOS_OC_R4_RW: DaosObjClass.OC_RP_4G2, # noqa: E241 + DaosObjClassOld.DAOS_OC_R4_MAX_RW: DaosObjClass.OC_RP_4GX, # noqa: E241 + DaosObjClassOld.DAOS_OC_REPL_MAX_RW: DaosObjClass.OC_RP_XSF # noqa: E241 } -# pylint: enable=no-member +# pylint: enable=no-member,invalid-name def get_object_class(item): @@ -2307,55 +2308,55 @@ def __init__(self, path): # Note: action-subject format self.ftable = { - 'close-cont': self.libdaos.daos_cont_close, - 'close-obj': self.libdaos.daos_obj_close, - 'close-tx': self.libdaos.daos_tx_close, - 'commit-tx': self.libdaos.daos_tx_commit, - 'connect-pool': self.libdaos.daos_pool_connect, - 'convert-cglobal': self.libdaos.daos_cont_global2local, - 'convert-clocal': self.libdaos.daos_cont_local2global, - 'convert-pglobal': self.libdaos.daos_pool_global2local, - 'convert-plocal': self.libdaos.daos_pool_local2global, - 'create-cont': self.libdaos.daos_cont_create, - 'create-eq': self.libdaos.daos_eq_create, - 'create-snap': self.libdaos.daos_cont_create_snap, - 'destroy-cont': self.libdaos.daos_cont_destroy, - 'destroy-eq': self.libdaos.daos_eq_destroy, - 'destroy-snap': self.libdaos.daos_cont_destroy_snap, - 'destroy-tx': self.libdaos.daos_tx_abort, - 'disconnect-pool': self.libdaos.daos_pool_disconnect, - 'fetch-obj': self.libdaos.daos_obj_fetch, - 'generate-oid': self.libdaos.daos_obj_generate_oid, - 'get-cont-attr': self.libdaos.daos_cont_get_attr, - 'get-pool-attr': self.libdaos.daos_pool_get_attr, - 'get-layout': self.libdaos.daos_obj_layout_get, - 'init-event': self.libdaos.daos_event_init, - 'list-akey': self.libdaos.daos_obj_list_akey, - 'list-attr': self.libdaos.daos_cont_list_attr, - 'list-cont-attr': self.libdaos.daos_cont_list_attr, - 'list-dkey': self.libdaos.daos_obj_list_dkey, - 'list-pool-attr': 
self.libdaos.daos_pool_list_attr, - 'cont-aggregate': self.libdaos.daos_cont_aggregate, - 'list-snap': self.libdaos.daos_cont_list_snap, - 'open-cont': self.libdaos.daos_cont_open, - 'open-obj': self.libdaos.daos_obj_open, - 'open-snap': self.libdaos.daos_tx_open_snap, - 'open-tx': self.libdaos.daos_tx_open, - 'poll-eq': self.libdaos.daos_eq_poll, - 'punch-akeys': self.libdaos.daos_obj_punch_akeys, - 'punch-dkeys': self.libdaos.daos_obj_punch_dkeys, - 'punch-obj': self.libdaos.daos_obj_punch, - 'query-cont': self.libdaos.daos_cont_query, - 'query-obj': self.libdaos.daos_obj_query, - 'query-pool': self.libdaos.daos_pool_query, - 'query-target': self.libdaos.daos_pool_query_target, - 'restart-tx': self.libdaos.daos_tx_restart, - 'set-cont-attr': self.libdaos.daos_cont_set_attr, - 'set-pool-attr': self.libdaos.daos_pool_set_attr, - 'stop-service': self.libdaos.daos_pool_stop_svc, - 'test-event': self.libdaos.daos_event_test, - 'update-obj': self.libdaos.daos_obj_update, - 'oid_gen': self.libtest.dts_oid_gen if self.libtest else None} + 'close-cont': self.libdaos.daos_cont_close, # noqa: E241 + 'close-obj': self.libdaos.daos_obj_close, # noqa: E241 + 'close-tx': self.libdaos.daos_tx_close, # noqa: E241 + 'commit-tx': self.libdaos.daos_tx_commit, # noqa: E241 + 'connect-pool': self.libdaos.daos_pool_connect, # noqa: E241 + 'convert-cglobal': self.libdaos.daos_cont_global2local, # noqa: E241 + 'convert-clocal': self.libdaos.daos_cont_local2global, # noqa: E241 + 'convert-pglobal': self.libdaos.daos_pool_global2local, # noqa: E241 + 'convert-plocal': self.libdaos.daos_pool_local2global, # noqa: E241 + 'create-cont': self.libdaos.daos_cont_create, # noqa: E241 + 'create-eq': self.libdaos.daos_eq_create, # noqa: E241 + 'create-snap': self.libdaos.daos_cont_create_snap, # noqa: E241 + 'destroy-cont': self.libdaos.daos_cont_destroy, # noqa: E241 + 'destroy-eq': self.libdaos.daos_eq_destroy, # noqa: E241 + 'destroy-snap': self.libdaos.daos_cont_destroy_snap, # noqa: E241 + 
'destroy-tx': self.libdaos.daos_tx_abort, # noqa: E241 + 'disconnect-pool': self.libdaos.daos_pool_disconnect, # noqa: E241 + 'fetch-obj': self.libdaos.daos_obj_fetch, # noqa: E241 + 'generate-oid': self.libdaos.daos_obj_generate_oid, # noqa: E241 + 'get-cont-attr': self.libdaos.daos_cont_get_attr, # noqa: E241 + 'get-pool-attr': self.libdaos.daos_pool_get_attr, # noqa: E241 + 'get-layout': self.libdaos.daos_obj_layout_get, # noqa: E241 + 'init-event': self.libdaos.daos_event_init, # noqa: E241 + 'list-akey': self.libdaos.daos_obj_list_akey, # noqa: E241 + 'list-attr': self.libdaos.daos_cont_list_attr, # noqa: E241 + 'list-cont-attr': self.libdaos.daos_cont_list_attr, # noqa: E241 + 'list-dkey': self.libdaos.daos_obj_list_dkey, # noqa: E241 + 'list-pool-attr': self.libdaos.daos_pool_list_attr, # noqa: E241 + 'cont-aggregate': self.libdaos.daos_cont_aggregate, # noqa: E241 + 'list-snap': self.libdaos.daos_cont_list_snap, # noqa: E241 + 'open-cont': self.libdaos.daos_cont_open, # noqa: E241 + 'open-obj': self.libdaos.daos_obj_open, # noqa: E241 + 'open-snap': self.libdaos.daos_tx_open_snap, # noqa: E241 + 'open-tx': self.libdaos.daos_tx_open, # noqa: E241 + 'poll-eq': self.libdaos.daos_eq_poll, # noqa: E241 + 'punch-akeys': self.libdaos.daos_obj_punch_akeys, # noqa: E241 + 'punch-dkeys': self.libdaos.daos_obj_punch_dkeys, # noqa: E241 + 'punch-obj': self.libdaos.daos_obj_punch, # noqa: E241 + 'query-cont': self.libdaos.daos_cont_query, # noqa: E241 + 'query-obj': self.libdaos.daos_obj_query, # noqa: E241 + 'query-pool': self.libdaos.daos_pool_query, # noqa: E241 + 'query-target': self.libdaos.daos_pool_query_target, # noqa: E241 + 'restart-tx': self.libdaos.daos_tx_restart, # noqa: E241 + 'set-cont-attr': self.libdaos.daos_cont_set_attr, # noqa: E241 + 'set-pool-attr': self.libdaos.daos_pool_set_attr, # noqa: E241 + 'stop-service': self.libdaos.daos_pool_stop_svc, # noqa: E241 + 'test-event': self.libdaos.daos_event_test, # noqa: E241 + 'update-obj': 
self.libdaos.daos_obj_update, # noqa: E241 + 'oid_gen': self.libtest.dts_oid_gen if self.libtest else None} # noqa: E241 def get_function(self, function): """Get a function handle by name. diff --git a/src/client/pydaos/raw/daos_cref.py b/src/client/pydaos/raw/daos_cref.py index 3878912b698..f86faaa937d 100644 --- a/src/client/pydaos/raw/daos_cref.py +++ b/src/client/pydaos/raw/daos_cref.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2023 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -53,7 +53,8 @@ class RebuildStatus(ctypes.Structure): ("rs_errno", ctypes.c_uint32), ("rs_state", ctypes.c_uint32), ("rs_max_supported_layout_ver", ctypes.c_uint16), - ("rs_padding16", ctypes.c_uint16), + ("rs_flags", ctypes.c_uint8), + ("rs_reserved_", ctypes.c_uint8), ("rs_fail_rank", ctypes.c_uint32), ("rs_toberb_obj_nr", ctypes.c_uint64), ("rs_obj_nr", ctypes.c_uint64), diff --git a/src/client/pydaos/torch/torch_api.py b/src/client/pydaos/torch/torch_api.py index 9225c97e8d1..73ed2f22dbb 100644 --- a/src/client/pydaos/torch/torch_api.py +++ b/src/client/pydaos/torch/torch_api.py @@ -1,6 +1,6 @@ # # (C) Copyright 2024-2025 Google LLC -# (C) Copyright 2024-2025 Enakta Labs Ltd +# (C) Copyright 2024-2026 Enakta Labs Ltd # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -11,11 +11,14 @@ In addition, it provides Checkpoint class to save and load PyTorch model checkpoints. 
""" +import errno import io import math import os import stat +import sys from multiprocessing import Process, Queue +from pathlib import Path from torch.utils.data import Dataset as TorchDataset from torch.utils.data import IterableDataset as TorchIterableDataset @@ -372,15 +375,19 @@ def __init__(self, dfs, path, mode, open_flags, class_name, self._workers.append(worker) def _worker_fn(self, queue): - self._dfs.worker_init() - while True: - work = queue.get() - if work is None: - break - - (offset, chunk) = work - self._dfs.write(self._path, self._mode, self._oflags, - self._class_name, self._file_chunk_size, offset, chunk) + try: + self._dfs.worker_init() + while True: + work = queue.get() + if work is None: + break + + (offset, chunk) = work + self._dfs.write(self._path, self._mode, self._oflags, + self._class_name, self._file_chunk_size, offset, chunk) + # pylint: disable=broad-exception-caught + except Exception as e: + sys.exit(getattr(e, 'errno', errno.EIO)) def write(self, data): """ Writes data to the buffer.""" @@ -431,6 +438,11 @@ def close(self): for worker in self._workers: worker.join() + # lets see if any worker exited abnormally and if so, raise an error + for worker in self._workers: + if worker.exitcode != 0: + raise OSError(worker.exitcode, os.strerror(worker.exitcode)) + super().close() def _flush(self): @@ -619,13 +631,16 @@ def reader(self, file, stream=None): stream.seek(0) return stream - def writer(self, file): + def writer(self, file, ensure_path=True): """ Returns write buffer to save the checkpoint file """ if file is None: raise ValueError("file is required") path = os.path.join(self._prefix, file) + if ensure_path: + self._dfs.mkdirall(os.path.dirname(path)) + return WriteBuffer(self._dfs, path, self._mode, self._oflags, self._class_name, self._file_chunk_size, self._transfer_chunk_size, self._chunks_limit, self._workers) @@ -810,3 +825,18 @@ def get_file_size(self, path): if ret != 0: raise OSError(ret, os.strerror(ret), path) return 
size + + def mkdirall(self, path, mode=0o755): + """ Creates directory, making parent directories if needed """ + + path = os.path.normpath(path) + dirs = list(Path(path).parts) + if not dirs: + raise ValueError(f"invalid path: {path}") + + parent = dirs.pop(0) + for name in dirs: + parent = os.path.join(parent, name) + ret = torch_shim.torch_mkdir(DAOS_MAGIC, self._dfs, parent, mode) + if ret not in (0, errno.EEXIST): + raise OSError(ret, os.strerror(ret), parent) diff --git a/src/client/pydaos/torch/torch_shim.c b/src/client/pydaos/torch/torch_shim.c index 73d93df45e8..5027ab690f8 100644 --- a/src/client/pydaos/torch/torch_shim.c +++ b/src/client/pydaos/torch/torch_shim.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. * (C) Copyright 2024-2025 Google LLC - * (C) Copyright 2024-2025 Enakta Labs Ltd + * (C) Copyright 2024-2026 Enakta Labs Ltd * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1061,6 +1061,39 @@ __shim_handle__torch_get_fsize(PyObject *self, PyObject *args) return Py_BuildValue("iK", rc, st.st_size); } +static PyObject * +__shim_handle__torch_mkdir(PyObject *self, PyObject *args) +{ + struct dfs_handle *hdl = NULL; + char *path = NULL; + char *dir = NULL; + char *name = NULL; + mode_t mode; + dfs_obj_t *parent = NULL; + + RETURN_NULL_IF_FAILED_TO_PARSE(args, "LsI", &hdl, &path, &mode); + + assert(hdl->dfs != NULL); + + int rc = split_path(path, &dir, &name); + if (rc) { + return PyLong_FromLong(rc); + } + + rc = lookup_or_insert_dir_obj(hdl, dir, &parent); + if (rc) { + D_ERROR("Could not lookup '%s': %s (rc=%d)", dir, strerror(rc), rc); + goto out; + } + + rc = dfs_mkdir(hdl->dfs, parent, name, mode, 0); + +out: + D_FREE(dir); + D_FREE(name); + return PyLong_FromLong(rc); +} + /** * Python shim module */ @@ -1080,6 +1113,7 @@ static PyMethodDef torchMethods[] = { EXPORT_PYTHON_METHOD(torch_recommended_dir_split), EXPORT_PYTHON_METHOD(torch_list_with_anchor), EXPORT_PYTHON_METHOD(torch_get_fsize), + 
EXPORT_PYTHON_METHOD(torch_mkdir), EXPORT_PYTHON_METHOD(module_init), EXPORT_PYTHON_METHOD(module_fini), diff --git a/src/common/ad_mem.c b/src/common/ad_mem.c index 03b3120344a..fd069273fd0 100644 --- a/src/common/ad_mem.c +++ b/src/common/ad_mem.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2023 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -608,7 +609,7 @@ blob_file_open(struct ad_blob *blob, const char *path, size_t *size, bool create int flags = O_RDWR; while (1) { - fd = open(path, flags, 0600); + fd = open(path, flags, UMEM_FILE_MODE_DEFAULT); if (fd >= 0) break; diff --git a/src/common/btree.c b/src/common/btree.c index 128fd0e2092..82afe158e60 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -167,11 +167,10 @@ struct btr_context { /** size of print buffer */ #define BTR_PRINT_BUF 128 -static int btr_class_init(umem_off_t root_off, - struct btr_root *root, unsigned int tree_class, - uint64_t *tree_feats, struct umem_attr *uma, - daos_handle_t coh, void *priv, - struct btr_instance *tins); +static int +btr_class_init(umem_off_t root_off, struct btr_root *root, unsigned int tree_class, + uint64_t *tree_feats, struct umem_attr *uma, daos_handle_t coh, void *priv, + btr_report_fn_t report_fn, void *report_arg, struct btr_instance *tins); static struct btr_record *btr_node_rec_at(struct btr_context *tcx, umem_off_t nd_off, unsigned int at); @@ -319,6 +318,11 @@ btr_ops(struct btr_context *tcx) return tcx->tc_tins.ti_ops; } +static inline void +report_fn_nop(void *arg, enum btr_report_type type, const char *fmt, ...) +{ +} + /** * Create a btree context (in volatile memory). 
* @@ -347,8 +351,8 @@ btr_context_create(umem_off_t root_off, struct btr_root *root, return -DER_NOMEM; tcx->tc_ref = 1; /* for the caller */ - rc = btr_class_init(root_off, root, tree_class, &tree_feats, uma, - coh, priv, &tcx->tc_tins); + rc = btr_class_init(root_off, root, tree_class, &tree_feats, uma, coh, priv, report_fn_nop, + NULL, &tcx->tc_tins); if (rc != 0) { D_ERROR("Failed to setup mem class %d: "DF_RC"\n", uma->uma_id, DP_RC(rc)); @@ -1698,11 +1702,12 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, memset(&tcx->tc_traces[0], 0, sizeof(tcx->tc_traces[0]) * BTR_TRACE_MAX); - /* depth could be changed by dbtree_delete/dbtree_iter_delete from - * a different btr_context, so we always reinitialize both depth - * and start point of trace for the context. + /* depth & feats could be changed by dbtree_delete/dbtree_iter_delete + * from a different btr_context, so we always reinitialize both depth, + * feats and start point of trace for the context. */ btr_context_set_depth(tcx, tcx->tc_tins.ti_root->tr_depth); + tcx->tc_feats = tcx->tc_tins.ti_root->tr_feats; if (btr_root_empty(tcx)) { /* empty tree */ D_DEBUG(DB_TRACE, "Empty tree\n"); @@ -4446,49 +4451,22 @@ dbtree_iterate(daos_handle_t toh, uint32_t intent, bool backward, static struct btr_class btr_class_registered[BTR_TYPE_MAX]; /** - * Initialize a tree instance from a registered tree class. + * Calculate tree's features. + * + * \param[in] tree_class Tree's class identified. + * \param[in,out] tree_feats Tree's features. + * \param[in] tc Tree's class. 
+ * + * \retval -DER_PROTO Unsupported features + * \retval DER_SUCCESS Success */ static int -btr_class_init(umem_off_t root_off, struct btr_root *root, - unsigned int tree_class, uint64_t *tree_feats, - struct umem_attr *uma, daos_handle_t coh, void *priv, - struct btr_instance *tins) +btr_class_feats_init(unsigned int tree_class, uint64_t *tree_feats, struct btr_class *tc) { - struct btr_class *tc; - uint64_t special_feat; - int rc; + uint64_t special_feat; - memset(tins, 0, sizeof(*tins)); - rc = umem_class_init(uma, &tins->ti_umm); - if (rc != 0) - return rc; - - tins->ti_priv = priv; - tins->ti_coh = coh; - tins->ti_root_off = UMOFF_NULL; - - if (!UMOFF_IS_NULL(root_off)) { - tins->ti_root_off = root_off; - if (root == NULL) - root = umem_off2ptr(&tins->ti_umm, root_off); - } - tins->ti_root = root; - - if (root != NULL && root->tr_class != 0) { - tree_class = root->tr_class; - *tree_feats = root->tr_feats; - } - - /* XXX should be multi-thread safe */ - if (tree_class >= BTR_TYPE_MAX) { - D_DEBUG(DB_TRACE, "Invalid class id: %d\n", tree_class); - return -DER_INVAL; - } - - tc = &btr_class_registered[tree_class]; - if (tc->tc_ops == NULL) { - D_DEBUG(DB_TRACE, "Unregistered class id %d\n", tree_class); - return -DER_NONEXIST; + if (DAOS_FAIL_CHECK(DAOS_FAULT_BTREE_FEATURES)) { /** fault injection */ + return -DER_PROTO; } /* If no hkey callbacks are supplied, only special key types are @@ -4522,11 +4500,77 @@ btr_class_init(umem_off_t root_off, struct btr_root *root, /** Only check btree managed bits that can be set in tr_class */ if ((*tree_feats & tc->tc_feats) != (*tree_feats & BTR_EXT_FEAT_MASK)) { - D_ERROR("Unsupported features "DF_X64"/"DF_X64"\n", - *tree_feats, tc->tc_feats); return -DER_PROTO; } + return DER_SUCCESS; +} + +#define TREE_CLASS_STR "Tree class... " +#define TREE_FEATURES_STR "Tree features... 
" +#define INVALID_CLASS_FMT "Invalid class id: %d\n" +#define UNREGISTERED_CLASS_FMT "Unregistered class id %d\n" +#define UNSUPPORTED_FEATURES_FMT "Unsupported features " DF_X64 "/" DF_X64 "\n" +#define OK_STR "ok.\n" + +/** + * Initialize a tree instance from a registered tree class. + */ +static int +btr_class_init(umem_off_t root_off, struct btr_root *root, unsigned int tree_class, + uint64_t *tree_feats, struct umem_attr *uma, daos_handle_t coh, void *priv, + btr_report_fn_t report_fn, void *report_arg, struct btr_instance *tins) +{ + struct btr_class *tc; + int rc; + + memset(tins, 0, sizeof(*tins)); + rc = umem_class_init(uma, &tins->ti_umm); + if (rc != 0) + return rc; + + tins->ti_priv = priv; + tins->ti_coh = coh; + tins->ti_root_off = UMOFF_NULL; + + if (!UMOFF_IS_NULL(root_off)) { + tins->ti_root_off = root_off; + if (root == NULL) + root = umem_off2ptr(&tins->ti_umm, root_off); + } + tins->ti_root = root; + + if (root != NULL && root->tr_class != 0) { + tree_class = root->tr_class; + *tree_feats = root->tr_feats; + } + + /* XXX should be multi-thread safe */ + if (tree_class >= BTR_TYPE_MAX || DAOS_FAIL_CHECK(DAOS_FAULT_BTREE_OPEN_INV_CLASS)) { + report_fn(report_arg, BTR_REPORT_ERROR, TREE_CLASS_STR INVALID_CLASS_FMT, + tree_class); + D_DEBUG(DB_TRACE, INVALID_CLASS_FMT, tree_class); + return -DER_INVAL; + } + + tc = &btr_class_registered[tree_class]; + if (tc->tc_ops == NULL || DAOS_FAIL_CHECK(DAOS_FAULT_BTREE_OPEN_UNREG_CLASS)) { + report_fn(report_arg, BTR_REPORT_ERROR, TREE_CLASS_STR UNREGISTERED_CLASS_FMT, + tree_class); + D_DEBUG(DB_TRACE, UNREGISTERED_CLASS_FMT, tree_class); + return -DER_NONEXIST; + } + report_fn(report_arg, BTR_REPORT_MSG, TREE_CLASS_STR OK_STR); + + rc = btr_class_feats_init(tree_class, tree_feats, tc); + if (rc != DER_SUCCESS) { + report_fn(report_arg, BTR_REPORT_ERROR, TREE_FEATURES_STR UNSUPPORTED_FEATURES_FMT, + *tree_feats, tc->tc_feats); + D_ERROR(UNSUPPORTED_FEATURES_FMT, *tree_feats, tc->tc_feats); + return rc; + } 
+ report_fn(report_arg, BTR_REPORT_MSG, TREE_FEATURES_STR OK_STR); + tins->ti_ops = tc->tc_ops; return rc; } @@ -4640,3 +4684,183 @@ dbtree_overhead_get(int alloc_overhead, unsigned int tclass, uint64_t otype, return 0; } +#define CK_BTREE_NODE_FMT "Node (off=%#lx)... " +#define CK_BTREE_NODE_MALFORMED_STR "malformed - " +#define CK_BTREE_NON_ZERO_PADDING_FMT CK_BTREE_NODE_MALFORMED_STR "tn_pad_32 != 0 (%#" PRIx32 ")" +#define CK_BTREE_NON_ZERO_GEN_FMT CK_BTREE_NODE_MALFORMED_STR "tn_gen != 0 (%#" PRIx32 ")" + +/** + * Validate the integrity of the btree node. + * + * \param[in] nd Node to check. + * \param[in] nd_off Node's offset. + * \param[in] ck Checker. + * + * \retval DER_SUCCESS The node is correct. + * \retval -DER_NOTYPE The node is malformed. + */ +static int +btr_node_check(struct btr_node *nd, umem_off_t nd_off, btr_report_fn_t report_fn, void *report_arg, + bool error_on_non_zero_padding) +{ + uint16_t unknown_flags; + + D_ASSERT(report_fn != NULL); + + unknown_flags = nd->tn_flags & ~(BTR_NODE_LEAF | BTR_NODE_ROOT); + if (unknown_flags != 0) { + report_fn(report_arg, BTR_REPORT_ERROR, + CK_BTREE_NODE_MALFORMED_STR "unknown flags (%#" PRIx16 ")", + unknown_flags); + return -DER_NOTYPE; + } + + if (nd->tn_pad_32 != 0) { + if (error_on_non_zero_padding) { + report_fn(report_arg, BTR_REPORT_ERROR, + CK_BTREE_NODE_FMT CK_BTREE_NON_ZERO_PADDING_FMT, nd_off, + nd->tn_pad_32); + return -DER_NOTYPE; + } else { + report_fn(report_arg, BTR_REPORT_WARNING, + CK_BTREE_NODE_FMT CK_BTREE_NON_ZERO_PADDING_FMT, nd_off, + nd->tn_pad_32); + } + } + + if (nd->tn_gen != 0) { + if (error_on_non_zero_padding) { + report_fn(report_arg, BTR_REPORT_ERROR, + CK_BTREE_NODE_FMT CK_BTREE_NON_ZERO_GEN_FMT, nd_off, nd->tn_gen); + return -DER_NOTYPE; + } else { + report_fn(report_arg, BTR_REPORT_WARNING, + CK_BTREE_NODE_FMT CK_BTREE_NON_ZERO_GEN_FMT, nd_off, nd->tn_gen); + } + } + + report_fn(report_arg, BTR_REPORT_MSG, CK_BTREE_NODE_FMT OK_STR, nd_off); + + return DER_SUCCESS; +} + 
+/** + * \struct node_info + * + * List of node offsets. + */ +struct node_info { + d_list_t link; + umem_off_t nd_off; +}; + +/** + * Validate the integrity of a btree. + * + * \param[in] tcx Btree context. + * \param[in] ck Checker. + * + * \retval DER_SUCCESS The tree is correct. + * \retval -DER_NOTYPE The tree is malformed. + * \retval -DER_NONEXIST The tree is malformed. + * \retval -DER_* Possibly other errors. + */ +static int +btr_nodes_check(struct btr_context *tcx, btr_report_fn_t report_fn, void *report_arg, + bool error_on_non_zero_padding) +{ + D_LIST_HEAD(node_list); + struct node_info *ni; + struct node_info *ni_tmp; + umem_off_t nd_off; + struct btr_node *nd; + int rc = DER_SUCCESS; + + D_ASSERT(report_fn != NULL); + + if (btr_root_empty(tcx)) { + report_fn(report_arg, BTR_REPORT_MSG, "Empty tree\n"); + return DER_SUCCESS; + } + + D_ASSERT(!btr_has_embedded_value(tcx)); + + /** add the root node to the node list */ + D_ALLOC_PTR(ni); + ni->nd_off = tcx->tc_tins.ti_root->tr_node; + d_list_add_tail(&ni->link, &node_list); + + /** process the node list */ + while (!d_list_empty(&node_list)) { + ni = d_list_pop_entry(&node_list, struct node_info, link); + nd_off = ni->nd_off; + nd = btr_off2ptr(tcx, nd_off); + + /** check the node */ + rc = btr_node_check(nd, nd_off, report_fn, report_arg, error_on_non_zero_padding); + if (rc != DER_SUCCESS) { + break; + } + + /** a leaf has no child nodes */ + if (btr_node_is_leaf(tcx, nd_off)) { + continue; + } + + /** + * append the node's children to the front of the nodes' list + * + * Note: This makes the traversal depth-first. Given the limited depth of a typical + * DAOS tree, this approach should help reduce resource usage. 
+ */ + for (int at = 0; at < nd->tn_keyn; ++at) { + D_ALLOC_PTR(ni); + ni->nd_off = btr_node_child_at(tcx, nd_off, at); + d_list_add(&ni->link, &node_list); + } + } + + /** free the list - in case we exit with an error and the list of nodes is not empty */ + d_list_for_each_entry_safe(ni, ni_tmp, &node_list, link) { + /** remove the node from the list */ + d_list_del(&ni->link); + D_FREE(ni); + } + + return rc; +} + +/** + * Check a btree. + * + * \param[in] root Address of the tree root. + * \param[in] uma Memory class attributes. + * \param[in] ck Checker. + */ +int +dbtree_check_inplace(struct btr_root *root, struct umem_attr *uma, btr_report_fn_t report_fn, + void *report_arg, bool error_on_non_zero_padding) +{ + struct btr_context tcx = {0}; + uint64_t tree_feats = -1; + int rc; + + D_ASSERT(root != NULL); + D_ASSERT(uma != NULL); + D_ASSERT(report_fn != NULL); + + rc = btr_class_init(UMOFF_NULL, root, -1, &tree_feats, uma, DAOS_HDL_INVAL, NULL, report_fn, + report_arg, &tcx.tc_tins); + if (rc != DER_SUCCESS) { + return rc; + } + + tcx.tc_feats = root->tr_feats; + tcx.tc_order = root->tr_order; + + rc = btr_nodes_check(&tcx, report_fn, report_arg, error_on_non_zero_padding); + + /** no need to free tcx */ + + return rc; +} diff --git a/src/common/checksum.c b/src/common/checksum.c index c36f14e3c6d..d0f80a3e92b 100644 --- a/src/common/checksum.c +++ b/src/common/checksum.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1051,6 +1052,38 @@ daos_csummer_verify_key(struct daos_csummer *obj, daos_key_t *key, return 0; } +int +daos_csummer_verify_value(struct daos_csummer *csummer, daos_recx_t *recx, daos_size_t rsize, + d_iov_t *val, struct dcs_csum_info *csum_info) +{ + struct dcs_iod_csums iod_csum = {0}; + daos_iod_t iod = {0}; + d_sg_list_t sgl = {0}; + bool skip_key_c = csummer->dcs_skip_key_calc; + bool skip_key_v = csummer->dcs_skip_key_verify; + int rc; + + iod.iod_nr = 1; + iod.iod_size = rsize; + iod.iod_recxs = recx; + iod.iod_type = recx ? DAOS_IOD_ARRAY : DAOS_IOD_SINGLE; + + iod_csum.ic_nr = 1; + iod_csum.ic_data = csum_info; + + sgl.sg_iovs = val; + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + + csummer->dcs_skip_key_calc = true; + csummer->dcs_skip_key_verify = true; + rc = daos_csummer_verify_iod(csummer, &iod, &sgl, &iod_csum, NULL, 0, NULL); + csummer->dcs_skip_key_calc = skip_key_c; + csummer->dcs_skip_key_verify = skip_key_v; + + return rc; +} + int daos_csummer_alloc_iods_csums_with_packed(struct daos_csummer *csummer, daos_iod_t *iods, int iod_cnt, diff --git a/src/common/control.c b/src/common/control.c index 8977667bc9c..75f5ee7db0d 100644 --- a/src/common/control.c +++ b/src/common/control.c @@ -1,50 +1,21 @@ /** * (C) Copyright 2020-2021 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ /** * This file implements functions shared with the control-plane. */ + #include #include #include +#include +#include -/* - * Disable DPDK telemetry to avoid socket file clashes and quiet DPDK - * logging by setting specific facility masks. 
- */ -const char * -dpdk_cli_override_opts = "--log-level=lib.eal:4 " - "--log-level=lib.malloc:4 " - "--log-level=lib.ring:4 " - "--log-level=lib.mempool:4 " - "--log-level=lib.timer:4 " - "--log-level=pmd:4 " - "--log-level=lib.hash:4 " - "--log-level=lib.lpm:4 " - "--log-level=lib.kni:4 " - "--log-level=lib.acl:4 " - "--log-level=lib.power:4 " - "--log-level=lib.meter:4 " - "--log-level=lib.sched:4 " - "--log-level=lib.port:4 " - "--log-level=lib.table:4 " - "--log-level=lib.pipeline:4 " - "--log-level=lib.mbuf:4 " - "--log-level=lib.cryptodev:4 " - "--log-level=lib.efd:4 " - "--log-level=lib.eventdev:4 " - "--log-level=lib.gso:4 " - "--log-level=user1:4 " - "--log-level=user2:4 " - "--log-level=user3:4 " - "--log-level=user4:4 " - "--log-level=user5:4 " - "--log-level=user6:4 " - "--log-level=user7:4 " - "--log-level=user8:4 " - "--no-telemetry"; +/* Buffer to hold dynamically generated DPDK CLI options */ +static char dpdk_cli_opts_buffer[2048]; int copy_ascii(char *dst, size_t dst_sz, const void *src, size_t src_sz) @@ -72,3 +43,66 @@ copy_ascii(char *dst, size_t dst_sz, const void *src, size_t src_sz) return 0; } + +/** + * Build DPDK CLI options string with per-facility log levels. Function is not thread safe. + * + * \param eal_level Log level for Environment Abstraction Layer facility (1-8) + * \param default_level Default log level for other facilities (1-8) + * + * \return Pointer to static buffer containing DPDK CLI options string, + * or NULL on error. 
+ */ +const char * +dpdk_cli_build_opts(int eal_level, int default_level) +{ + int ret; + + /* Validate log levels */ + if (eal_level < 1 || eal_level > 8 || default_level < 1 || default_level > 8) + return NULL; + + /* Build with custom EAL level, others at default */ + ret = snprintf(dpdk_cli_opts_buffer, sizeof(dpdk_cli_opts_buffer), + "--log-level=lib.eal:%d " + "--log-level=lib.malloc:%d " + "--log-level=lib.ring:%d " + "--log-level=lib.mempool:%d " + "--log-level=lib.timer:%d " + "--log-level=pmd:%d " + "--log-level=lib.hash:%d " + "--log-level=lib.lpm:%d " + "--log-level=lib.kni:%d " + "--log-level=lib.acl:%d " + "--log-level=lib.power:%d " + "--log-level=lib.meter:%d " + "--log-level=lib.sched:%d " + "--log-level=lib.port:%d " + "--log-level=lib.table:%d " + "--log-level=lib.pipeline:%d " + "--log-level=lib.mbuf:%d " + "--log-level=lib.cryptodev:%d " + "--log-level=lib.efd:%d " + "--log-level=lib.eventdev:%d " + "--log-level=lib.gso:%d " + "--log-level=user1:%d " + "--log-level=user2:%d " + "--log-level=user3:%d " + "--log-level=user4:%d " + "--log-level=user5:%d " + "--log-level=user6:%d " + "--log-level=user7:%d " + "--log-level=user8:%d " + "--no-telemetry", + eal_level, default_level, default_level, default_level, default_level, + default_level, default_level, default_level, default_level, default_level, + default_level, default_level, default_level, default_level, default_level, + default_level, default_level, default_level, default_level, default_level, + default_level, default_level, default_level, default_level, default_level, + default_level, default_level, default_level, default_level); + + if (ret < 0 || ret >= sizeof(dpdk_cli_opts_buffer)) + return NULL; + + return dpdk_cli_opts_buffer; +} diff --git a/src/common/dav/dav_internal.h b/src/common/dav/dav_internal.h index ae6150c2748..2d31b4480f9 100644 --- a/src/common/dav/dav_internal.h +++ b/src/common/dav/dav_internal.h @@ -69,6 +69,7 @@ typedef struct dav_obj { static inline struct dav_tx 
*utx2wtx(struct umem_wal_tx *utx) { + D_ASSERT(utx != NULL); return (struct dav_tx *)&utx->utx_private; } diff --git a/src/common/dav/heap.c b/src/common/dav/heap.c index ee2feca85a1..89f0ecf15c6 100644 --- a/src/common/dav/heap.c +++ b/src/common/dav/heap.c @@ -816,7 +816,10 @@ heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, case CHUNK_TYPE_USED: break; default: - ASSERT(0); + D_ASSERTF(0, + "Encountered invalid chunk (%" PRIu32 ") of type %" PRIu16 + " val = 0x%" PRIx64, + i, hdr->type, *(uint64_t *)hdr); } i = m.chunk_id + m.size_idx; /* hdr might have changed */ diff --git a/src/common/dav/memblock.c b/src/common/dav/memblock.c index 9600e49c46c..4fa51f5e502 100644 --- a/src/common/dav/memblock.c +++ b/src/common/dav/memblock.c @@ -1233,8 +1233,8 @@ huge_reinit_chunk(const struct memory_block *m) { struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); - if (hdr->type == CHUNK_TYPE_USED) - huge_write_footer(hdr, hdr->size_idx); + D_ASSERT((hdr->type == CHUNK_TYPE_USED) || (hdr->type == CHUNK_TYPE_FREE)); + huge_write_footer(hdr, hdr->size_idx); } /* diff --git a/src/common/dav/tx.c b/src/common/dav/tx.c index c7516b479a7..b98e076bd7b 100644 --- a/src/common/dav/tx.c +++ b/src/common/dav/tx.c @@ -1505,6 +1505,8 @@ dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_n if (palloc_reserve(pop->do_heap, size, NULL, NULL, type_num, 0, 0, 0, act) != 0) { + if (!tx_inprogress) + lw_tx_end(pop, NULL); DAV_API_END(); return 0; } @@ -1558,9 +1560,26 @@ dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) void dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) { + int rc, tx_inprogress = 0; + DAV_DBG("actvcnt=%zu", actvcnt); + if (get_tx()->stage != DAV_TX_STAGE_NONE) + tx_inprogress = 1; + DAV_API_START(); + if (!tx_inprogress) { + rc = lw_tx_begin(pop); + if (rc) { + D_ERROR("Failed to start local tx. 
%d\n", rc); + return; + } + } + palloc_cancel(pop->do_heap, actv, actvcnt); + + if (!tx_inprogress) + lw_tx_end(pop, NULL); + DAV_API_END(); } diff --git a/src/common/dav_v2/dav_internal.h b/src/common/dav_v2/dav_internal.h index bc13e2eabc3..974da2a1fb0 100644 --- a/src/common/dav_v2/dav_internal.h +++ b/src/common/dav_v2/dav_internal.h @@ -58,6 +58,7 @@ typedef struct dav_obj { static inline struct dav_tx *utx2wtx(struct umem_wal_tx *utx) { + D_ASSERT(utx != NULL); return (struct dav_tx *)&utx->utx_private; } diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c index 29a7ce17c9c..c40e708c1e9 100644 --- a/src/common/dav_v2/heap.c +++ b/src/common/dav_v2/heap.c @@ -703,6 +703,26 @@ heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id) } } +/* + * heap_touch_umem_cache -- touch the cache page for a memory address + * if the memory bucket is evictable + */ +void +heap_touch_umem_cache(struct palloc_heap *heap, void *addr, size_t size) +{ + uint64_t offset = HEAP_PTR_TO_OFF(heap, addr); + uint32_t zone_id = OFFSET_TO_ZID(offset); + struct mbrt *mb = heap_mbrt_get_mb(heap, zone_id); + dav_obj_t *dav_hdl = (dav_obj_t *)heap->p_ops.base; + + D_ASSERT((mb != NULL) && (dav_hdl != NULL) && (dav_hdl->do_utx != NULL)); + + if (!mb->is_evictable) + return; + + umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, offset, size); +} + void heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage) { @@ -1312,7 +1332,10 @@ heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, case CHUNK_TYPE_USED: break; default: - ASSERT(0); + D_ASSERTF(0, + "Encountered invalid chunk (%" PRIu32 ") of type %" PRIu16 + " val = 0x%" PRIx64, + i, hdr->type, *(uint64_t *)hdr); } i = m.chunk_id + m.size_idx; /* hdr might have changed */ diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h index 8a49934b5c4..c01aba37dcb 100644 --- a/src/common/dav_v2/heap.h +++ b/src/common/dav_v2/heap.h @@ -139,6 +139,9 @@ 
heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id); void heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id); +void +heap_touch_umem_cache(struct palloc_heap *heap, void *addr, size_t size); + int heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *zone_id); diff --git a/src/common/dav_v2/memblock.c b/src/common/dav_v2/memblock.c index 402b79a4df9..6c544100ea2 100644 --- a/src/common/dav_v2/memblock.c +++ b/src/common/dav_v2/memblock.c @@ -680,14 +680,20 @@ huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, * The footer entry change is updated as transient because it will * be recreated at heap boot regardless - it's just needed for runtime * operations. + * Note: + * If a footer is added as part of a tx, creating a transient entry + * and marking the page as dirty at commit time does not justify the + * added complexity and occurs less frequently. Therefore, for now, + * we commit footer in the same way as header when called under a tx. 
*/ + if (ctx == NULL) { util_atomic_store_explicit64((uint64_t *)footer, val, memory_order_relaxed); + heap_touch_umem_cache(m->heap, footer, sizeof(*footer)); VALGRIND_SET_CLEAN(footer, sizeof(*footer)); } else { - operation_add_typed_entry(ctx, - footer, val, ULOG_OPERATION_SET, LOG_TRANSIENT); + operation_add_entry(ctx, footer, val, ULOG_OPERATION_SET); } } @@ -1234,8 +1240,8 @@ huge_reinit_chunk(const struct memory_block *m) { struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); - if (hdr->type == CHUNK_TYPE_USED) - huge_write_footer(hdr, hdr->size_idx); + D_ASSERT((hdr->type == CHUNK_TYPE_USED) || (hdr->type == CHUNK_TYPE_FREE)); + huge_write_footer(hdr, hdr->size_idx); } /* diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c index ae7edde31d1..c6103eb20cb 100644 --- a/src/common/dav_v2/tx.c +++ b/src/common/dav_v2/tx.c @@ -1733,6 +1733,8 @@ dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t typ if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0, CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) { + if (!tx_inprogress) + lw_tx_end(pop, NULL); DAV_API_END(); return 0; } @@ -1786,9 +1788,26 @@ dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) DAV_FUNC_EXPORT void dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) { + int rc, tx_inprogress = 0; + DAV_DBG("actvcnt=%zu", actvcnt); + if (get_tx()->stage != DAV_TX_STAGE_NONE) + tx_inprogress = 1; + DAV_API_START(); + if (!tx_inprogress) { + rc = lw_tx_begin(pop); + if (rc) { + D_ERROR("Failed to start local tx. %d\n", rc); + return; + } + } + palloc_cancel(pop->do_heap, actv, actvcnt); + + if (!tx_inprogress) + lw_tx_end(pop, NULL); + DAV_API_END(); } diff --git a/src/common/mem.c b/src/common/mem.c index 73b0399baea..e473cb8bfe5 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -415,6 +415,11 @@ umempobj_open(const char *path, const char *layout_name, int flags, struct umem_ int enabled = 1; int rc; + if (DAOS_FAIL_CHECK(DAOS_FAULT_POOL_OPEN_UMEM)) { /** fault injection */ + errno = daos_fail_value_get(); + return NULL; + } + D_ALLOC(umm_pool, sizeof(*umm_pool) + sizeof(umm_pool->up_slabs[0]) * UMM_SLABS_CNT); if (umm_pool == NULL) return NULL; @@ -2228,6 +2233,7 @@ umem_cache_free(struct umem_store *store) D_ASSERT(d_list_empty(&cache->ca_pgs_pinned)); D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] == 0); D_ASSERT(cache->ca_reserve_waiters == 0); + D_ASSERT(cache->ca_unpin_waiters == 0); pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; for (i = 0; i < cache->ca_mem_pages; i++) { @@ -2243,6 +2249,11 @@ umem_cache_free(struct umem_store *store) } + if (cache->ca_unpin_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(cache->ca_unpin_wq); + cache->ca_unpin_wq = NULL; + } + if (store->cache->off2ptr) D_FREE(store->cache->off2ptr); if (store->cache->ptr2off) @@ -2327,8 +2338,8 @@ umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, ui if (cache == NULL) return -DER_NOMEM; - D_DEBUG(DB_IO, "Allocated page cache, md-pages(%u), mem-pages(%u), max-ne-pages(%u) %p\n", - md_pgs, mem_pgs, max_ne_pgs, cache); + D_INFO("Page cache: md-pgs(%u), mem-pages(%u), max-ne-pgs(%u), mode(%d)\n", md_pgs, mem_pgs, + max_ne_pgs, cmode); cache->ca_store = store; cache->ca_base = base; @@ -2378,6 +2389,11 @@ umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, ui if (rc) goto error; + D_ASSERT(store->stor_ops->so_waitqueue_create != NULL); + rc = store->stor_ops->so_waitqueue_create(&cache->ca_unpin_wq); + if (rc) + goto error; + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; 
for (idx = 0; idx < cache->ca_mem_pages; idx++) { rc = page_waitqueue_create(cache, pinfo); @@ -2438,6 +2454,7 @@ cache_unmap_page(struct umem_cache *cache, struct umem_page_info *pinfo) verify_clean_page(pinfo, 1); D_ASSERT(pinfo->pi_pg_id < cache->ca_md_pages); D_ASSERT(cache->ca_pages[pinfo->pi_pg_id].pg_info == pinfo); + D_ASSERT(pinfo->pi_ref == 0); cache->off2ptr[pinfo->pi_pg_id] = 0; cache_idx = (pinfo - (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]); @@ -2481,9 +2498,13 @@ cache_add2lru(struct umem_cache *cache, struct umem_page_info *pinfo) D_ASSERT(d_list_empty(&pinfo->pi_lru_link)); D_ASSERT(pinfo->pi_ref == 0); - if (pinfo->pi_evictable) + if (pinfo->pi_evictable) { d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[1]); - else + if (cache->ca_unpin_waiters) { + cache->ca_unpin_waiters--; + cache->ca_store->stor_ops->so_waitqueue_wakeup(cache->ca_unpin_wq, false); + } + } else d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[0]); } @@ -3185,7 +3206,8 @@ cache_evict_page(struct umem_cache *cache, bool for_sys) D_ERROR("No evictable page.\n"); return -DER_INVAL; } else if (d_list_empty(pg_list)) { - D_ERROR("All evictable pages are pinned.\n"); + cache->ca_unpin_waiters++; + cache->ca_store->stor_ops->so_waitqueue_wait(cache->ca_unpin_wq, false); return -DER_BUSY; } @@ -3284,14 +3306,22 @@ cache_get_free_page(struct umem_cache *cache, struct umem_page_info **ret_pinfo, } /* All pinned pages are from current caller */ - if (rc == -DER_BUSY && pinned_nr == cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]) { - D_ERROR("Not enough evictable pages.\n"); + if (rc == -DER_BUSY && pinned_nr && + pinned_nr == cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]) { + D_ERROR("Not enough evictable pages. 
pinned [%u/%u]\n", pinned_nr, + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]); return -DER_INVAL; } - D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, - "Retry get free page, %d times\n", retry_cnt); + if (rc == -DER_BUSY) + return rc; + retry_cnt++; + D_CDEBUG(retry_cnt % 20 == 0, DLOG_ERR, DB_TRACE, + "%u retries of get free page with %u pinned. [ne:%u,pinned:%u,free:%u]\n", + retry_cnt, pinned_nr, cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); } pinfo = cache_pop_free_page(cache); @@ -3312,6 +3342,8 @@ cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) struct umem_page_info *pinfo, *free_pinfo = NULL; uint32_t pg_id; int i, rc = 0; + int retry_cnt; + bool pages_evicted = false; for (i = 0; i < page_nr; i++) { pg_id = pages[i]; @@ -3320,6 +3352,7 @@ cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) D_ERROR("Can only map single evictable page.\n"); return -DER_INVAL; } + retry_cnt = 0; retry: pinfo = cache->ca_pages[pg_id].pg_info; /* The page is already mapped */ @@ -3330,6 +3363,7 @@ cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) if (free_pinfo != NULL) { cache_push_free_page(cache, free_pinfo); free_pinfo = NULL; + pages_evicted = true; } if (is_id_evictable(cache, pg_id) != pinfo->pi_evictable) { pinfo->pi_evictable = is_id_evictable(cache, pg_id); @@ -3346,14 +3380,21 @@ cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) if (is_id_evictable(cache, pg_id)) { if (free_pinfo == NULL) { rc = cache_get_free_page(cache, &free_pinfo, 0, false); - if (rc) { + if (rc && rc != -DER_BUSY) { DL_ERROR(rc, "Failed to get free page."); break; } + retry_cnt++; + D_CDEBUG(retry_cnt % 100 == 0, DLOG_ERR, DB_TRACE, + "%u retries of get free page. 
[ne:%u,pinned:%u,free:%u]\n", + retry_cnt, cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); goto retry; } else { pinfo = free_pinfo; free_pinfo = NULL; + pages_evicted = true; } } else { pinfo = cache_pop_free_page(cache); @@ -3369,6 +3410,10 @@ cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) /* Map an empty page, doesn't need to load page */ pinfo->pi_loaded = 1; } + if (rc || (pages_evicted && cache->ca_unpin_waiters)) { + cache->ca_unpin_waiters = 0; + cache->ca_store->stor_ops->so_waitqueue_wakeup(cache->ca_unpin_wq, true); + } return rc; } @@ -3378,10 +3423,13 @@ cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for { struct umem_page_info *pinfo, *free_pinfo = NULL; uint32_t pg_id; - int i, processed = 0, pinned = 0, rc = 0; + int i, processed = 0, pinned = 0, rc = 0; + int retry_cnt; + bool pages_evicted = false; for (i = 0; i < page_nr; i++) { pg_id = pages[i]; + retry_cnt = 0; retry: pinfo = cache->ca_pages[pg_id].pg_info; /* The page is already mapped */ @@ -3392,19 +3440,28 @@ cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for if (free_pinfo != NULL) { cache_push_free_page(cache, free_pinfo); free_pinfo = NULL; + pages_evicted = true; } goto next; } if (free_pinfo == NULL) { rc = cache_get_free_page(cache, &free_pinfo, pinned, for_sys); - if (rc) + if (rc && rc != -DER_BUSY) goto error; + retry_cnt++; + D_CDEBUG(retry_cnt % 20 == 0, DLOG_ERR, DB_TRACE, + "%u retries of get free page with %u pinned. 
" + "[ne:%u,pinned:%u,free:%u]\n", + retry_cnt, pinned, cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); /* Above cache_get_free_page() could yield, need re-check mapped status */ goto retry; } else { pinfo = free_pinfo; free_pinfo = NULL; + pages_evicted = true; } inc_cache_stats(cache, UMEM_CACHE_STATS_MISS); @@ -3429,6 +3486,10 @@ cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for pinfo->pi_sys = for_sys; } + if (pages_evicted && cache->ca_unpin_waiters) { + cache->ca_unpin_waiters = 0; + cache->ca_store->stor_ops->so_waitqueue_wakeup(cache->ca_unpin_wq, true); + } return 0; error: for (i = 0; i < processed; i++) { @@ -3439,6 +3500,10 @@ cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for cache_unpin_page(cache, pinfo); } + if (cache->ca_unpin_waiters) { + cache->ca_unpin_waiters = 0; + cache->ca_store->stor_ops->so_waitqueue_wakeup(cache->ca_unpin_wq, true); + } return rc; } @@ -3673,9 +3738,12 @@ umem_cache_reserve(struct umem_store *store) } rc = 0; - D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, - "Retry reserve free page, %d times\n", retry_cnt); retry_cnt++; + D_CDEBUG(retry_cnt % 20 == 0, DLOG_ERR, DB_TRACE, + "%u retries of reserve page. [ne:%u,pinned:%u,free:%u]\n", retry_cnt, + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); } D_ASSERT(cache->ca_reserve_waiters > 0); diff --git a/src/common/multihash_isal.c b/src/common/multihash_isal.c index 6b858e23c9c..cf293507bce 100644 --- a/src/common/multihash_isal.c +++ b/src/common/multihash_isal.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2020-2021 Intel Corporation. 
+ * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -237,6 +238,13 @@ crc64_finish(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) return 0; } +#define HANDLE_ERROR(statement) \ + ({ \ + int __rc__ = statement; \ + D_ASSERTF(__rc__ == 0, #statement " = %d\n", __rc__); \ + __rc__ == 0 ? 0 : -DER_INVAL; \ + }) + struct hash_ft crc64_algo = { .cf_update = crc64_update, .cf_init = crc64_init, @@ -250,7 +258,7 @@ struct hash_ft crc64_algo = { /** SHA1 */ struct sha1_ctx { - struct mh_sha1_ctx s1_ctx; + struct isal_mh_sha1_ctx s1_ctx; bool s1_updated; }; @@ -264,10 +272,14 @@ sha1_init(void **daos_mhash_ctx) if (ctx == NULL) return -DER_NOMEM; - rc = mh_sha1_init(&ctx->s1_ctx); - if (rc == 0) - *daos_mhash_ctx = ctx; - return rc; + rc = isal_mh_sha1_init(&ctx->s1_ctx); + if (rc != 0) { + D_FREE(ctx); + return HANDLE_ERROR(rc); + } + *daos_mhash_ctx = ctx; + + return 0; } static int @@ -276,7 +288,7 @@ sha1_reset(void *daos_mhash_ctx) struct sha1_ctx *ctx = daos_mhash_ctx; ctx->s1_updated = false; - return mh_sha1_init(&ctx->s1_ctx); + return HANDLE_ERROR(isal_mh_sha1_init(&ctx->s1_ctx)); } static void @@ -291,7 +303,7 @@ sha1_update(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) struct sha1_ctx *ctx = daos_mhash_ctx; ctx->s1_updated = true; - return mh_sha1_update(&ctx->s1_ctx, buf, buf_len); + return HANDLE_ERROR(isal_mh_sha1_update(&ctx->s1_ctx, buf, buf_len)); } static int @@ -300,7 +312,7 @@ sha1_finish(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) struct sha1_ctx *ctx = daos_mhash_ctx; if (ctx->s1_updated) - return mh_sha1_finalize(&ctx->s1_ctx, buf); + return HANDLE_ERROR(isal_mh_sha1_finalize(&ctx->s1_ctx, buf)); return 0; } @@ -317,8 +329,8 @@ struct hash_ft sha1_algo = { /** SHA256 */ struct sha256_ctx { - struct mh_sha256_ctx s2_ctx; - bool s2_updated; + struct isal_mh_sha256_ctx s2_ctx; + bool s2_updated; }; static int @@ -331,10 +343,16 @@ sha256_init(void **daos_mhash_ctx) if (ctx == NULL) return 
-DER_NOMEM; - rc = mh_sha256_init(&ctx->s2_ctx); - if (rc == 0) - *daos_mhash_ctx = ctx; - return rc; + rc = isal_mh_sha256_init(&ctx->s2_ctx); + if (rc != 0) { + D_FREE(ctx); + return HANDLE_ERROR(rc); + } + + *daos_mhash_ctx = ctx; + ctx->s2_updated = false; + + return 0; } static int @@ -343,7 +361,7 @@ sha256_reset(void *daos_mhash_ctx) struct sha256_ctx *ctx = daos_mhash_ctx; ctx->s2_updated = false; - return mh_sha256_init(&ctx->s2_ctx); + return HANDLE_ERROR(isal_mh_sha256_init(&ctx->s2_ctx)); } static void @@ -358,7 +376,7 @@ sha256_update(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) struct sha256_ctx *ctx = daos_mhash_ctx; ctx->s2_updated = true; - return mh_sha256_update(&ctx->s2_ctx, buf, buf_len); + return HANDLE_ERROR(isal_mh_sha256_update(&ctx->s2_ctx, buf, buf_len)); } static int @@ -367,7 +385,7 @@ sha256_finish(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) struct sha256_ctx *ctx = daos_mhash_ctx; if (ctx->s2_updated) - return mh_sha256_finalize(&ctx->s2_ctx, buf); + return HANDLE_ERROR(isal_mh_sha256_finalize(&ctx->s2_ctx, buf)); return 0; } @@ -384,22 +402,28 @@ struct hash_ft sha256_algo = { /** SHA512 */ struct sha512_ctx { - SHA512_HASH_CTX_MGR s5_mgr; - SHA512_HASH_CTX s5_ctx; - bool s5_updated; + ISAL_SHA512_HASH_CTX_MGR s5_mgr; + ISAL_SHA512_HASH_CTX s5_ctx; + bool s5_updated; }; static int sha512_init(void **daos_mhash_ctx) { struct sha512_ctx *ctx; + int rc; D_ALLOC_PTR(ctx); if (ctx == NULL) return -DER_NOMEM; - sha512_ctx_mgr_init(&ctx->s5_mgr); - hash_ctx_init(&ctx->s5_ctx); + rc = isal_sha512_ctx_mgr_init(&ctx->s5_mgr); + if (rc != 0) { + D_FREE(ctx); + return HANDLE_ERROR(rc); + } + isal_hash_ctx_init(&ctx->s5_ctx); + ctx->s5_updated = false; *daos_mhash_ctx = ctx; return 0; @@ -417,6 +441,8 @@ sha512_reset(void *daos_mhash_ctx) struct sha512_ctx *ctx = daos_mhash_ctx; ctx->s5_updated = false; + isal_hash_ctx_init(&ctx->s5_ctx); + return 0; } @@ -424,48 +450,55 @@ static int sha512_update(void *daos_mhash_ctx, uint8_t 
*buf, size_t buf_len) { struct sha512_ctx *ctx = daos_mhash_ctx; - SHA512_HASH_CTX *tmp; + ISAL_SHA512_HASH_CTX *tmp = NULL; + int rc; if (!ctx->s5_updated) - tmp = sha512_ctx_mgr_submit(&ctx->s5_mgr, - &ctx->s5_ctx, buf, - buf_len, - HASH_FIRST); + rc = isal_sha512_ctx_mgr_submit(&ctx->s5_mgr, &ctx->s5_ctx, &tmp, buf, buf_len, + ISAL_HASH_FIRST); else - tmp = sha512_ctx_mgr_submit(&ctx->s5_mgr, - &ctx->s5_ctx, buf, - buf_len, - HASH_UPDATE); + rc = isal_sha512_ctx_mgr_submit(&ctx->s5_mgr, &ctx->s5_ctx, &tmp, buf, buf_len, + ISAL_HASH_UPDATE); + + if (rc != 0) + return HANDLE_ERROR(rc); - if (tmp == NULL) - sha512_ctx_mgr_flush(&ctx->s5_mgr); + if (tmp == NULL) { + rc = isal_sha512_ctx_mgr_flush(&ctx->s5_mgr, &tmp); + if (rc != 0) + return HANDLE_ERROR(rc); + } ctx->s5_updated = true; - return ctx->s5_ctx.error; + return HANDLE_ERROR(ctx->s5_ctx.error); } static int sha512_finish(void *daos_mhash_ctx, uint8_t *buf, size_t buf_len) { struct sha512_ctx *ctx = daos_mhash_ctx; + int rc = 0; if (ctx->s5_updated) { - SHA512_HASH_CTX *tmp; + ISAL_SHA512_HASH_CTX *tmp = NULL; - tmp = sha512_ctx_mgr_submit(&ctx->s5_mgr, - &ctx->s5_ctx, NULL, - 0, - HASH_LAST); + rc = isal_sha512_ctx_mgr_submit(&ctx->s5_mgr, &ctx->s5_ctx, &tmp, NULL, 0, + ISAL_HASH_LAST); + if (rc != 0) + return HANDLE_ERROR(rc); - if (tmp == NULL) - sha512_ctx_mgr_flush(&ctx->s5_mgr); + if (tmp == NULL) { + rc = isal_sha512_ctx_mgr_flush(&ctx->s5_mgr, &tmp); + if (rc != 0) + return HANDLE_ERROR(rc); + } memcpy(buf, ctx->s5_ctx.job.result_digest, buf_len); - return ctx->s5_ctx.error; + rc = ctx->s5_ctx.error; } - return 0; + return HANDLE_ERROR(rc); } struct hash_ft sha512_algo = { diff --git a/src/common/pool_map.c b/src/common/pool_map.c index 594a0599079..3711ac98310 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1877,7 +1877,7 @@ child_status_check(struct pool_domain *domain, uint32_t status) /* Domain status update state machine */ static int update_dom_status(struct pool_domain *domain, uint32_t id, uint32_t status, uint32_t version, - bool *updated) + bool *updated, bool for_revert) { int i; @@ -1893,7 +1893,7 @@ update_dom_status(struct pool_domain *domain, uint32_t id, uint32_t status, uint struct pool_domain *child = &domain->do_children[i]; int found; - found = update_dom_status(child, id, status, version, updated); + found = update_dom_status(child, id, status, version, updated, for_revert); if (!found) continue; @@ -1947,14 +1947,14 @@ update_dom_status(struct pool_domain *domain, uint32_t id, uint32_t status, uint /* Only change to DOWNOUT/DOWN if all of children are DOWNOUT/DOWN */ if (child_status_check(child, PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT) && (child->do_comp.co_status != status)) { - D_DEBUG(DB_MD, "rank %u id %u status %u --> %u\n", + D_DEBUG(DB_MD, "rank %u id %u status %u --> %u, for_revert %d", child->do_comp.co_rank, child->do_comp.co_id, - child->do_comp.co_status, status); + child->do_comp.co_status, status, for_revert); if (child->do_comp.co_status == PO_COMP_ST_DOWN) child->do_comp.co_flags = PO_COMPF_DOWN2OUT; child->do_comp.co_status = status; - if (status == PO_COMP_ST_DOWN) + if (status == PO_COMP_ST_DOWN && !for_revert) child->do_comp.co_fseq = version; *updated = true; } @@ -1975,12 +1975,12 @@ update_dom_status(struct pool_domain *domain, uint32_t id, uint32_t status, uint int update_dom_status_by_tgt_id(struct pool_map *map, uint32_t tgt_id, uint32_t status, - uint32_t version, bool *updated) + uint32_t version, bool *updated, bool for_revert) { int rc; D_ASSERT(map->po_tree != NULL); - rc = update_dom_status(map->po_tree, tgt_id, status, version, 
updated); + rc = update_dom_status(map->po_tree, tgt_id, status, version, updated, for_revert); if (rc < 0) return rc; return 0; @@ -2040,6 +2040,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, struct pool_domain *tmp; int i; + D_ASSERT(map != NULL); if (pool_map_empty(map)) { D_ERROR("Uninitialized pool map\n"); return 0; @@ -2092,6 +2093,7 @@ int pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp) { + D_ASSERT(map != NULL); return pool_map_find_domain(map, PO_COMP_TP_RANK, id, domain_pp); } @@ -2150,6 +2152,7 @@ pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank) int doms_cnt; int i; + D_ASSERT(map != NULL); doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (doms_cnt <= 0) return NULL; @@ -2232,6 +2235,7 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank, { struct pool_domain *dom; + D_ASSERT(map != NULL); dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; diff --git a/src/common/tests/SConscript b/src/common/tests/SConscript index fedea1b9915..49a15f9deac 100644 --- a/src/common/tests/SConscript +++ b/src/common/tests/SConscript @@ -77,6 +77,11 @@ def scons(): ['drpc_tests.c', '../drpc.c', '../drpc.pb-c.c', mock_test_utils], LIBS=['protobuf-c', 'daos_common', 'gurt', 'cmocka']) + Depends('control_tests', common_mock_ld_script) + unit_env.d_test_program('control_tests', + ['dpdk_cli_opts_tests.c', '../control.c', mock_test_utils], + LIBS=['daos_common', 'gurt', 'cmocka']) + if __name__ == "SCons.Script": scons() diff --git a/src/common/tests/dpdk_cli_opts_tests.c b/src/common/tests/dpdk_cli_opts_tests.c new file mode 100644 index 00000000000..503623ea729 --- /dev/null +++ b/src/common/tests/dpdk_cli_opts_tests.c @@ -0,0 +1,126 @@ +/** + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include +#include +#include +#include + +#include + +/* Test 
dpdk_cli_build_opts with valid log levels */ +static void +test_dpdk_cli_build_opts_valid(void **state) +{ + const char *opts; + int log_level; + + /* Test each valid log level */ + for (log_level = 1; log_level <= 8; log_level++) { + opts = dpdk_cli_build_opts(log_level, log_level); + assert_non_null(opts); + + /* Verify the string contains the correct log level */ + char expected[64]; + snprintf(expected, sizeof(expected), "--log-level=lib.eal:%d ", log_level); + assert_non_null(strstr(opts, expected)); + + /* Verify it contains --no-telemetry */ + assert_non_null(strstr(opts, "--no-telemetry")); + } +} + +/* Test dpdk_cli_build_opts with invalid log levels */ +static void +test_dpdk_cli_build_opts_invalid(void **state) +{ + const char *opts; + + /* Test below minimum */ + opts = dpdk_cli_build_opts(0, 1); + assert_null(opts); + + /* Test above maximum */ + opts = dpdk_cli_build_opts(9, 1); + assert_null(opts); + + /* Test negative */ + opts = dpdk_cli_build_opts(-1, 1); + + /* Test the same for the second input */ + + opts = dpdk_cli_build_opts(1, 0); + assert_null(opts); + + opts = dpdk_cli_build_opts(1, 9); + assert_null(opts); + + opts = dpdk_cli_build_opts(1, -1); + assert_null(opts); +} + +/* Test dpdk_cli_build_opts_selective */ +static void +test_dpdk_cli_build_opts_selective(void **state) +{ + const char *opts; + + /* Test EAL at DEBUG, others at ERROR */ + opts = dpdk_cli_build_opts(8, 4); + assert_non_null(opts); + + /* Verify EAL is at level 8 */ + assert_non_null(strstr(opts, "--log-level=lib.eal:8 ")); + + /* Verify malloc is at level 4 */ + assert_non_null(strstr(opts, "--log-level=lib.malloc:4 ")); +} + +/* Test that different log levels produce different strings */ +static void +test_dpdk_cli_build_opts_different_levels(void **state) +{ + const char *tmp; + char opts4[2048]; + const char *opts8; + + /** + * Returned will be the single string buffer and it will be overridden on each call to the + * function so copy to a local buffer before 
comparison. + */ + tmp = dpdk_cli_build_opts(4, 4); + strcpy(opts4, tmp); + opts8 = dpdk_cli_build_opts(8, 8); + + assert_non_null(opts4); + assert_non_null(opts8); + assert_non_null(tmp); + + /* Should be different strings */ + assert_string_not_equal(opts4, opts8); + + /* opts4 should have ":4 " */ + assert_non_null(strstr(opts4, ":4 ")); + assert_null(strstr(opts4, ":8 ")); + + /* opts8 should have ":8 " */ + assert_non_null(strstr(opts8, ":8 ")); + assert_null(strstr(opts8, ":4 ")); +} + +int +main(void) +{ + const struct CMUnitTest tests[] = { + cmocka_unit_test(test_dpdk_cli_build_opts_valid), + cmocka_unit_test(test_dpdk_cli_build_opts_invalid), + cmocka_unit_test(test_dpdk_cli_build_opts_selective), + cmocka_unit_test(test_dpdk_cli_build_opts_different_levels), + }; + + return cmocka_run_group_tests(tests, NULL, NULL); +} diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index 0b555e67c08..a62fe3814a4 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -1495,7 +1495,8 @@ test_tx_reserve_publish_cancel(void **state) assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); assert_int_equal(memcmp(rsrv_ptr2, local_buf, 128), 0); umem_cancel(umm, rsrvd_act); - validate_persist_activity(1, 0); + /* umem_cacnel() internally started tx, which increased one additional resrv_cnt */ + validate_persist_activity(2, 0); utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); assert_true(cur_mem_used >= initial_mem_used); umoff = umem_atomic_alloc(umm, 980, UMEM_TYPE_ANY); @@ -1626,7 +1627,8 @@ test_tx_bucket_reserve_publish_cancel(void **state) assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); assert_int_equal(memcmp(rsrv_ptr2, local_buf, 128), 0); umem_cancel(umm, rsrvd_act); - validate_persist_activity(1, 0); + /* umem_cacnel() internally started tx, which increased one additional resrv_cnt */ + validate_persist_activity(2, 0); utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); 
assert_true(cur_mem_used >= initial_mem_used); umoff = umem_atomic_alloc_from_bucket(umm, 980, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); diff --git a/src/common/tests_dmg_helpers.c b/src/common/tests_dmg_helpers.c index 08825366133..c77bc880820 100644 --- a/src/common/tests_dmg_helpers.c +++ b/src/common/tests_dmg_helpers.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2020-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -827,6 +827,34 @@ dmg_pool_create(const char *dmg_config_file, } } + /* Temporarily use old pool property defaults due to DAOS-17946 */ + /* Set default rd_fac:0 if --properties=rd_fac is not already defined in args */ + bool has_rd_fac = false; + for (int i = 0; i < argcount; i++) { + if (args[i] && strstr(args[i], "--properties=rd_fac") != NULL) { + has_rd_fac = true; + break; + } + } + if (!has_rd_fac) { + args = cmd_push_arg(args, &argcount, "--properties=rd_fac:0 "); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + /* Set default space_rb:0 if --properties=space_rb is not already defined in args */ + bool has_space_rb = false; + for (int i = 0; i < argcount; i++) { + if (args[i] && strstr(args[i], "--properties=space_rb") != NULL) { + has_space_rb = true; + break; + } + } + if (!has_space_rb) { + args = cmd_push_arg(args, &argcount, "--properties=space_rb:0 "); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + if (!has_label) { char path[] = "/tmp/test_XXXXXX"; int tmp_fd; @@ -1946,21 +1974,6 @@ dmg_check_stop(const char *dmg_config_file, uint32_t pool_nr, uuid_t uuids[]) return rc; } -static int -check_query_reports_cmp(const void *p1, const void *p2) -{ - const struct daos_check_report_info *dcri1 = p1; - const struct daos_check_report_info *dcri2 = p2; - - if (dcri1->dcri_class > dcri2->dcri_class) - return 1; - - if (dcri1->dcri_class < dcri2->dcri_class) - return -1; - - 
return 0; -} - static int parse_check_query_pool(struct json_object *obj, uuid_t uuid, struct daos_check_info *dci) { @@ -2119,11 +2132,6 @@ parse_check_query_info(struct json_object *query_output, uint32_t pool_nr, uuid_ return rc; } - /* Sort the inconsistency reports for easy verification. */ - if (dci->dci_report_nr > 1) - qsort(dci->dci_reports, dci->dci_report_nr, sizeof(dci->dci_reports[0]), - check_query_reports_cmp); - return 0; } diff --git a/src/common/tse.c b/src/common/tse.c index c936459e4be..e8472eef5fa 100644 --- a/src/common/tse.c +++ b/src/common/tse.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -907,7 +908,12 @@ tse_task_add_dependent(tse_task_t *task, tse_task_t *dep) D_DEBUG(DB_TRACE, "Add dependent %p ---> %p\n", dep, task); D_MUTEX_LOCK(&dtp->dtp_sched->dsp_lock); - D_ASSERT(dtp->dtp_dep_cnt < UINT16_MAX); + if (dtp->dtp_dep_cnt >= UINT16_MAX || dtp->dtp_refcnt >= UINT16_MAX) { + D_ERROR("Max dependent tasks reached: %" PRIu16 "\n", dtp->dtp_dep_cnt); + D_MUTEX_UNLOCK(&dtp->dtp_sched->dsp_lock); + D_FREE(tlink); + return -DER_NOMEM; + } tse_task_addref_locked(dtp); tlink->tl_task = task; dtp->dtp_dep_cnt++; diff --git a/src/container/container_iv.c b/src/container/container_iv.c index 0ac8ae9737f..05a15fb751d 100644 --- a/src/container/container_iv.c +++ b/src/container/container_iv.c @@ -742,6 +742,12 @@ cont_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, d_sg_list_t *src, int ref_rc, void **priv) { D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + if (ref_rc != 0) { + DL_WARN(ref_rc, DF_UUID "bypass refresh, IV class id %d.", + DP_UUID(entry->ns->iv_pool_uuid), key->class_id); + return ref_rc; + } + return cont_iv_ent_update(entry, key, src, priv); } @@ -1115,6 +1121,12 @@ cont_iv_hdl_fetch(uuid_t cont_hdl_uuid, uuid_t pool_uuid, return rc; } +static inline bool 
+cont_iv_retryable_error(int rc) +{ + return daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY; +} + static int cont_iv_track_eph_update_internal(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, daos_epoch_t stable_eph, unsigned int shortcut, @@ -1135,30 +1147,58 @@ cont_iv_track_eph_update_internal(void *ns, uuid_t cont_uuid, daos_epoch_t ec_ag return rc; } - rc = cont_iv_update(ns, op, cont_uuid, &iv_entry, sizeof(iv_entry), - shortcut, sync_mode, true /* retry */); - if (rc) + rc = cont_iv_update(ns, op, cont_uuid, &iv_entry, sizeof(iv_entry), shortcut, sync_mode, + false); + if (rc && !cont_iv_retryable_error(rc)) D_ERROR(DF_UUID" op %d, cont_iv_update failed "DF_RC"\n", DP_UUID(cont_uuid), op, DP_RC(rc)); return rc; } +static int +cont_iv_track_eph_retry(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph, unsigned int shortcut, unsigned int sync_mode, + uint32_t op, struct sched_request *req) +{ + int sleep_ms = 1000; /* 1 second retry interval */ + int rc = 0; + + while (1) { + rc = cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph, + shortcut, sync_mode, op); + if (rc == 0) + break; + + /* Only retry on specific errors */ + if (!cont_iv_retryable_error(rc)) + break; + + if (req && dss_ult_exiting(req)) { + rc = -DER_SHUTDOWN; + break; + } + + dss_sleep(sleep_ms); + } + + return rc; +} + int cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, - daos_epoch_t stable_eph) + daos_epoch_t stable_eph, struct sched_request *req) { - return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph, - CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, - IV_CONT_TRACK_EPOCH_REPORT); + return cont_iv_track_eph_retry(ns, cont_uuid, ec_agg_eph, stable_eph, + CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE, + IV_CONT_TRACK_EPOCH_REPORT, req); } int cont_iv_track_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, - daos_epoch_t stable_eph) + daos_epoch_t 
stable_eph, struct sched_request *req) { - return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph, 0, - CRT_IV_SYNC_EAGER, IV_CONT_TRACK_EPOCH); + return cont_iv_track_eph_retry(ns, cont_uuid, ec_agg_eph, stable_eph, 0, CRT_IV_SYNC_EAGER, + IV_CONT_TRACK_EPOCH, req); } int @@ -1547,19 +1587,22 @@ cont_iv_prop_fetch_ult(void *data) iv_entry, iv_entry_size, iv_entry_size, false /* retry */); if (rc) { - DL_CDEBUG(rc == -DER_NOTLEADER, DB_ANY, DLOG_ERR, rc, "cont_iv_fetch failed"); + DL_CDEBUG(rc == -DER_NOTLEADER, DB_ANY, DLOG_ERR, rc, + DF_CONT ": cont_iv_fetch failed", DP_CONT(pool->sp_uuid, arg->cont_uuid)); D_GOTO(out, rc); } rc = cont_iv_prop_g2l(&iv_entry->iv_prop, &prop_fetch); if (rc) { - D_ERROR("cont_iv_prop_g2l failed "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_CONT ": cont_iv_prop_g2l failed", + DP_CONT(pool->sp_uuid, arg->cont_uuid)); D_GOTO(out, rc); } rc = daos_prop_copy(prop, prop_fetch); if (rc) { - D_ERROR("daos_prop_copy failed "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_CONT ": daos_prop_copy failed", + DP_CONT(pool->sp_uuid, arg->cont_uuid)); D_GOTO(out, rc); } @@ -1734,7 +1777,7 @@ ds_cont_find_hdl(uuid_t po_uuid, uuid_t coh_uuid, struct ds_cont_hdl **coh_p) /* Return a retry-able error when the srv handle not propagated */ if (d_list_empty(&pool_child->spc_srv_cont_hdl)) { struct copy_hdl_arg arg; - int rc; + int rc, ret; /* * Sometimes the srv container handle failed to be propagated to the pool @@ -1750,8 +1793,10 @@ ds_cont_find_hdl(uuid_t po_uuid, uuid_t coh_uuid, struct ds_cont_hdl **coh_p) } } ds_pool_child_put(pool_child); - D_INFO(DF_UUID ": Server handle isn't propagated yet.\n", DP_UUID(po_uuid)); - return -DER_STALE; + ret = -DER_STALE; + DL_INFO(ret, DF_UUID ": Server handle isn't propagated yet %d.", DP_UUID(po_uuid), + rc); + return ret; } srv_hdl_ready: diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 5ad305231b4..3efa3211f89 100644 --- a/src/container/srv_container.c +++ 
b/src/container/srv_container.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1734,6 +1734,7 @@ cont_track_eph_leader_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, eph_ldr->cte_server_ephs[i].re_rank = doms[i].do_comp.co_rank; eph_ldr->cte_server_ephs[i].re_ec_agg_eph = 0; eph_ldr->cte_server_ephs[i].re_stable_eph = 0; + eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts = daos_gettime_coarse(); } d_list_add(&eph_ldr->cte_list, &cont_svc->cs_cont_ephs_leader_list); *leader_p = eph_ldr; @@ -1790,8 +1791,11 @@ ds_cont_leader_update_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, d_rank_t ran for (i = 0; i < eph_ldr->cte_servers_num; i++) { if (eph_ldr->cte_server_ephs[i].re_rank == rank) { - if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < ec_agg_eph) + if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < ec_agg_eph) { eph_ldr->cte_server_ephs[i].re_ec_agg_eph = ec_agg_eph; + eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts = + daos_gettime_coarse(); + } if (eph_ldr->cte_server_ephs[i].re_stable_eph < stable_eph) eph_ldr->cte_server_ephs[i].re_stable_eph = stable_eph; break; @@ -1896,7 +1900,8 @@ ds_cont_tgt_refresh_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, rc = ds_pool_lookup(pool_uuid, &pool); if (rc != 0) { - D_ERROR(DF_UUID" lookup pool failed: %d\n", DP_UUID(pool_uuid), rc); + DL_CDEBUG(rc != 0 && rc != -DER_SHUTDOWN, DLOG_ERR, DB_MD, rc, + DF_UUID " lookup pool failed", DP_UUID(pool_uuid)); goto out; } rank = dss_self_rank(); @@ -1909,7 +1914,7 @@ ds_cont_tgt_refresh_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, rc = ds_pool_thread_collective( pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, cont_refresh_track_eph_one, &arg, DSS_ULT_DEEP_STACK | DSS_ULT_FL_PERIODIC); - DL_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, rc, + DL_CDEBUG(rc != 0, 
DLOG_ERR, DLOG_DBG, rc, DF_CONT ": refresh ec_agg_eph " DF_X64 ", " "stable_eph " DF_X64, DP_CONT(pool_uuid, cont_uuid), ec_agg_eph, stable_eph); @@ -1941,8 +1946,9 @@ cont_agg_eph_load(struct cont_svc *svc, uuid_t cont_uuid, uint64_t *ec_agg_eph) ABT_rwlock_rdlock(svc->cs_lock); rc = cont_lookup(&tx, svc, cont_uuid, &cont); if (rc != 0) { - D_ERROR(DF_CONT ": Failed to look container: %d\n", - DP_CONT(svc->cs_pool_uuid, cont_uuid), rc); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_MD, rc, + DF_CONT ": Failed to look container", + DP_CONT(svc->cs_pool_uuid, cont_uuid)); D_GOTO(out_lock, rc); } @@ -2005,8 +2011,9 @@ cont_agg_eph_store(struct cont_svc *svc, uuid_t cont_uuid, uint64_t ec_agg_eph, ABT_rwlock_wrlock(svc->cs_lock); rc = cont_lookup(&tx, svc, cont_uuid, &cont); if (rc != 0) { - D_ERROR(DF_CONT ": Failed to look container: %d\n", - DP_CONT(svc->cs_pool_uuid, cont_uuid), rc); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_MD, rc, + DF_CONT ": Failed to look container", + DP_CONT(svc->cs_pool_uuid, cont_uuid)); D_GOTO(out_lock, rc); } @@ -2053,10 +2060,11 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) uint64_t cur_eph, new_eph; daos_epoch_t min_ec_agg_eph; daos_epoch_t min_stable_eph; + uint64_t cur_ts; int i; int rc = 0; - rc = map_ranks_init(pool->sp_map, PO_COMP_ST_DOWNOUT | PO_COMP_ST_DOWN, &fail_ranks); + rc = map_ranks_failed(pool->sp_map, &fail_ranks); if (rc) { D_ERROR(DF_UUID ": ranks init failed: %d\n", DP_UUID(pool->sp_uuid), rc); return; @@ -2074,13 +2082,20 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) if (eph_ldr->cte_rdb_ec_agg_eph == 0) { rc = cont_agg_eph_load(svc, eph_ldr->cte_cont_uuid, &eph_ldr->cte_rdb_ec_agg_eph); - if (rc) + if (rc) { + if (rc == -DER_NONEXIST) { + DL_INFO(rc, DF_CONT " container skipped", + DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid)); + continue; + } DL_ERROR(rc, DF_CONT ": cont_agg_eph_load failed.", DP_CONT(svc->cs_pool_uuid, 
eph_ldr->cte_cont_uuid)); + } } min_ec_agg_eph = DAOS_EPOCH_MAX; min_stable_eph = DAOS_EPOCH_MAX; + cur_ts = daos_gettime_coarse(); for (i = 0; i < eph_ldr->cte_servers_num; i++) { d_rank_t rank = eph_ldr->cte_server_ephs[i].re_rank; @@ -2090,6 +2105,14 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) continue; } + if (pool->sp_reclaim != DAOS_RECLAIM_DISABLED && + cur_ts > eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts + 600) + D_WARN(DF_CONT ": Sluggish EC boundary report from rank %d, " DF_U64 + " Seconds.", + DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid), rank, + cur_ts - + eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts); + if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < min_ec_agg_eph) min_ec_agg_eph = eph_ldr->cte_server_ephs[i].re_ec_agg_eph; if (eph_ldr->cte_server_ephs[i].re_stable_eph < min_stable_eph) @@ -2101,7 +2124,8 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) min_ec_agg_eph = eph_ldr->cte_rdb_ec_agg_eph; if (min_ec_agg_eph == eph_ldr->cte_current_ec_agg_eph && - min_stable_eph == eph_ldr->cte_current_stable_eph) + min_stable_eph == eph_ldr->cte_current_stable_eph && + eph_ldr->cte_current_ec_agg_eph != 0) continue; /** @@ -2135,20 +2159,31 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) if (min_ec_agg_eph > eph_ldr->cte_rdb_ec_agg_eph) { rc = cont_agg_eph_store(svc, eph_ldr->cte_cont_uuid, min_ec_agg_eph, &eph_ldr->cte_rdb_ec_agg_eph); - if (rc) + if (rc) { + if (rc == -DER_NONEXIST) { + DL_INFO(rc, DF_CONT " container skipped", + DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid)); + continue; + } DL_ERROR(rc, DF_CONT ": rdb_tx_update ec_agg_eph " DF_X64 " failed.", DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid), min_ec_agg_eph); + } } rc = cont_iv_track_eph_refresh(pool->sp_iv_ns, eph_ldr->cte_cont_uuid, - min_ec_agg_eph, min_stable_eph); + min_ec_agg_eph, min_stable_eph, + svc->cs_cont_ephs_leader_req); if (rc) { DL_CDEBUG(rc == -DER_NONEXIST, DLOG_INFO, DLOG_ERR, rc, DF_CONT 
": refresh failed", DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid)); + /* If ULT is exiting, break out */ + if (rc == -DER_SHUTDOWN) + break; + /* If there are network error or pool map inconsistency, * let's skip the following eph sync, which will fail * anyway. @@ -2163,7 +2198,7 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc) } eph_ldr->cte_current_ec_agg_eph = min_ec_agg_eph; eph_ldr->cte_current_stable_eph = min_stable_eph; - if (pool->sp_rebuilding) + if (atomic_load(&pool->sp_rebuilding)) break; } ABT_mutex_unlock(svc->cs_cont_ephs_mutex); @@ -2231,7 +2266,8 @@ cont_svc_eph_track_leader_start(struct cont_svc *svc) D_ASSERT(svc->cs_cont_ephs_leader_req == NULL); uuid_clear(anonym_uuid); sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &anonym_uuid); - svc->cs_cont_ephs_leader_req = sched_create_ult(&attr, cont_track_eph_leader_ult, svc, 0); + svc->cs_cont_ephs_leader_req = + sched_create_ult(&attr, cont_track_eph_leader_ult, svc, DSS_DEEP_STACK_SZ); if (svc->cs_cont_ephs_leader_req == NULL) { D_ERROR(DF_UUID" Failed to create EC leader eph ULT.\n", DP_UUID(svc->cs_pool_uuid)); diff --git a/src/container/srv_internal.h b/src/container/srv_internal.h index c8a676c5b80..7e4a6c8a626 100644 --- a/src/container/srv_internal.h +++ b/src/container/srv_internal.h @@ -62,6 +62,7 @@ struct rank_eph { d_rank_t re_rank; daos_epoch_t re_ec_agg_eph; daos_epoch_t re_stable_eph; + uint64_t re_ec_agg_eph_update_ts; /* re_ec_agg_eph update timestamp */ }; /* container EC aggregation epoch and stable epoch control descriptor, which is only on leader */ @@ -301,10 +302,12 @@ int cont_iv_prop_update(void *ns, uuid_t cont_uuid, daos_prop_t *prop, bool sync int cont_iv_snapshots_refresh(void *ns, uuid_t cont_uuid); int cont_iv_snapshots_update(void *ns, uuid_t cont_uuid, uint64_t *snapshots, int snap_count); -int cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, - daos_epoch_t stable_eph); -int cont_iv_track_eph_refresh(void *ns, uuid_t 
cont_uuid, daos_epoch_t ec_agg_eph, - daos_epoch_t stable_eph); +int +cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph, struct sched_request *req); +int + cont_iv_track_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph, struct sched_request *req); int cont_iv_entry_delete(void *ns, uuid_t pool_uuid, uuid_t cont_uuid); /* srv_metrics.c*/ diff --git a/src/container/srv_layout.c b/src/container/srv_layout.c index ae1b16b3ae2..56929f6a6d0 100644 --- a/src/container/srv_layout.c +++ b/src/container/srv_layout.c @@ -73,16 +73,16 @@ struct daos_prop_entry cont_prop_entries_default_v0[CONT_PROP_NUM_V0] = { .dpe_val = 1, }, { .dpe_type = DAOS_PROP_CO_CSUM, - .dpe_val = DAOS_PROP_CO_CSUM_OFF, + .dpe_val = DAOS_PROP_CO_CSUM_CRC32, }, { .dpe_type = DAOS_PROP_CO_CSUM_CHUNK_SIZE, .dpe_val = 32 * 1024, /** 32K */ }, { .dpe_type = DAOS_PROP_CO_CSUM_SERVER_VERIFY, - .dpe_val = DAOS_PROP_CO_CSUM_SV_OFF, + .dpe_val = DAOS_PROP_CO_CSUM_SV_ON, }, { .dpe_type = DAOS_PROP_CO_REDUN_FAC, - .dpe_val = DAOS_PROP_CO_REDUN_RF0, + .dpe_val = DAOS_PROP_CO_REDUN_RF3, }, { .dpe_type = DAOS_PROP_CO_REDUN_LVL, .dpe_val = DAOS_PROP_CO_REDUN_RANK, diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b7dd3ce9b4f..6bfce2f6397 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -132,24 +132,25 @@ ds_cont_csummer_init(struct ds_cont_child *cont) bool dedup_only = false; D_ASSERT(cont != NULL); - cont_props = &cont->sc_props; + while (cont->sc_csummer_initing) { + ABT_mutex_lock(cont->sc_mutex); + ABT_cond_wait(cont->sc_init_cond, cont->sc_mutex); + ABT_mutex_unlock(cont->sc_mutex); + } - if (cont->sc_props_fetched) + if (cont->sc_csummer_inited) return 0; + D_ASSERT(cont->sc_csummer == NULL); + cont->sc_csummer_initing = 1; /** Get the container csum related properties * Need the pool for the IV namespace */ - D_ASSERT(cont->sc_csummer == NULL); + cont_props = &cont->sc_props; rc = ds_cont_get_props(cont_props, cont->sc_pool_uuid, cont->sc_uuid); if (rc != 0) goto done; - /* Check again since IV fetch yield */ - if (cont->sc_props_fetched) - goto done; - cont->sc_props_fetched = 1; - csum_val = cont_props->dcp_csum_type; if (!daos_cont_csum_prop_is_enabled(csum_val)) { dedup_only = true; @@ -162,10 +163,30 @@ ds_cont_csummer_init(struct ds_cont_child *cont) daos_contprop2hashtype(csum_val), cont_props->dcp_chunksize, cont_props->dcp_srv_verify); + if (rc != 0) + goto done; + if (dedup_only) dedup_configure_csummer(cont->sc_csummer, cont_props); } + + rc = vos_cont_save_props(cont->sc_hdl, cont_props); + if (rc != 0) { + /* + * The failure of saving checksum property copy only potentially affect ddb, but + * it is not fatal for current caller. Let's go ahead with some warning message. 
+ */ + D_WARN("Cannot locally save container property for " DF_UUID ": " DF_RC "\n", + DP_UUID(cont->sc_uuid), DP_RC(rc)); + rc = 0; + } + D_ASSERT(!cont->sc_csummer_inited); /* nobody else can do this except me */ + cont->sc_csummer_inited = 1; done: + if (cont->sc_csummer_initing) { + cont->sc_csummer_initing = 0; + ABT_cond_broadcast(cont->sc_init_cond); + } return rc; } @@ -188,8 +209,8 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", - DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), pool->sp_rebuilding, - pool->sp_rebuild_scan); + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + atomic_load(&pool->sp_rebuilding), pool->sp_rebuild_scan); return false; } @@ -203,7 +224,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid)); } - if (!cont->sc_props_fetched) + if (!cont->sc_csummer_inited) ds_cont_csummer_init(cont); if (cont->sc_props.dcp_dedup_enabled || @@ -260,6 +281,7 @@ get_hae(struct ds_cont_child *cont, bool vos_agg) /* EC aggregation */ if (!vos_agg) return cont->sc_ec_agg_eph; + /* * Query the 'Highest Aggregated Epoch', the HAE will be bumped * in vos_aggregate() @@ -499,8 +521,11 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), param->ap_vos_agg ? "VOS" : "EC"); } else if (sched_req_space_check(req) != SCHED_SPACE_PRESS_NONE) { - /* Don't sleep when there is space pressure */ - msecs = 0; + /* + * Introduce a small sleep interval between each round to yield CPU time + * for the flush & GC ULTs, irrespective of space pressure. DAOS-18012. + */ + msecs = 200; } if (param->ap_vos_agg) @@ -516,7 +541,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, * if no space pressure. 
*/ if (ds_pool_is_rebuilding(cont->sc_pool->spc_pool) && !param->ap_vos_agg && - msecs != 0) + msecs != 200) msecs = 18000; if (msecs != 0) @@ -636,6 +661,25 @@ cont_child_obj(struct daos_llink *llink) return container_of(llink, struct ds_cont_child, sc_list); } +static void +cont_child_fini_abt(struct ds_cont_child *cont) +{ + if (cont->sc_dtx_resync_cond) + ABT_cond_free(&cont->sc_dtx_resync_cond); + if (cont->sc_scrub_cond) + ABT_cond_free(&cont->sc_scrub_cond); + if (cont->sc_rebuild_cond) + ABT_cond_free(&cont->sc_rebuild_cond); + if (cont->sc_init_cond) + ABT_cond_free(&cont->sc_init_cond); + if (cont->sc_fini_cond) + ABT_cond_free(&cont->sc_fini_cond); + if (cont->sc_mutex) + ABT_mutex_free(&cont->sc_mutex); + if (cont->sc_open_mutex) + ABT_mutex_free(&cont->sc_open_mutex); +} + static int cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, struct daos_llink **link) @@ -659,34 +703,39 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, rc = ABT_mutex_create(&cont->sc_mutex); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_open_mutex; + goto out_abt; } rc = ABT_cond_create(&cont->sc_dtx_resync_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_mutex; + goto out_abt; } rc = ABT_cond_create(&cont->sc_scrub_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_resync_cond; + goto out_abt; } rc = ABT_cond_create(&cont->sc_rebuild_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_scrub_cond; + goto out_abt; + } + rc = ABT_cond_create(&cont->sc_init_cond); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto out_abt; } rc = ABT_cond_create(&cont->sc_fini_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_rebuild_cond; + goto out_abt; } cont->sc_pool = ds_pool_child_lookup(po_uuid); if (cont->sc_pool == NULL) { rc = -DER_NO_HDL; - goto out_finish_cond; + goto out_abt; } rc = vos_cont_open(cont->sc_pool->spc_hdl, co_uuid, &cont->sc_hdl); @@ 
-726,18 +775,8 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, vos_cont_close(cont->sc_hdl); out_pool: ds_pool_child_put(cont->sc_pool); -out_finish_cond: - ABT_cond_free(&cont->sc_fini_cond); -out_rebuild_cond: - ABT_cond_free(&cont->sc_rebuild_cond); -out_scrub_cond: - ABT_cond_free(&cont->sc_scrub_cond); -out_resync_cond: - ABT_cond_free(&cont->sc_dtx_resync_cond); -out_mutex: - ABT_mutex_free(&cont->sc_mutex); -out_open_mutex: - ABT_mutex_free(&cont->sc_open_mutex); +out_abt: + cont_child_fini_abt(cont); out: D_FREE(cont); return rc; @@ -758,14 +797,10 @@ cont_child_free_ref(struct daos_llink *llink) cont_tgt_track_eph_fini(cont); vos_cont_close(cont->sc_hdl); ds_pool_child_put(cont->sc_pool); - daos_csummer_destroy(&cont->sc_csummer); + if (cont->sc_csummer) + daos_csummer_destroy(&cont->sc_csummer); D_FREE(cont->sc_snapshots); - ABT_cond_free(&cont->sc_dtx_resync_cond); - ABT_cond_free(&cont->sc_scrub_cond); - ABT_cond_free(&cont->sc_rebuild_cond); - ABT_cond_free(&cont->sc_fini_cond); - ABT_mutex_free(&cont->sc_mutex); - ABT_mutex_free(&cont->sc_open_mutex); + cont_child_fini_abt(cont); D_FREE(cont); } @@ -2323,10 +2358,9 @@ ds_cont_tgt_snapshot_notify_handler(crt_rpc_t *rpc) args.snap_opts = in->tsi_opts; args.oit_oid = in->tsi_oit_oid; - out->tso_rc = ds_pool_thread_collective(in->tsi_pool_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT, cont_snap_notify_one, - &args, 0); + out->tso_rc = ds_pool_thread_collective( + in->tsi_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + cont_snap_notify_one, &args, DSS_ULT_DEEP_STACK); if (out->tso_rc != 0) D_ERROR(DF_CONT": Snapshot notify failed: "DF_RC"\n", DP_CONT(in->tsi_pool_uuid, in->tsi_cont_uuid), @@ -2795,9 +2829,17 @@ ds_cont_eph_report(struct ds_pool *pool) } } - if (min_ec_agg_eph == 0 || min_ec_agg_eph == DAOS_EPOCH_MAX || - min_stable_eph == 0 || min_stable_eph == DAOS_EPOCH_MAX || - (min_ec_agg_eph <= ec_eph->cte_last_ec_agg_epoch && + if 
(min_ec_agg_eph <= ec_eph->cte_last_ec_agg_epoch && + min_stable_eph <= ec_eph->cte_last_stable_epoch && + pool->sp_reclaim == DAOS_RECLAIM_DISABLED) + continue; + + /* if aggregation enabled, make sure to report ec_agg_eph at the start phase + * when min_ec_agg_eph and cte_last_ec_agg_epoch are both zero. + */ + if (min_ec_agg_eph == DAOS_EPOCH_MAX || min_stable_eph == DAOS_EPOCH_MAX || + (ec_eph->cte_last_ec_agg_epoch != 0 && + min_ec_agg_eph <= ec_eph->cte_last_ec_agg_epoch && min_stable_eph <= ec_eph->cte_last_stable_epoch)) { if (min_ec_agg_eph > 0 && min_stable_eph > 0 && (min_ec_agg_eph < ec_eph->cte_last_ec_agg_epoch || @@ -2820,8 +2862,9 @@ ds_cont_eph_report(struct ds_pool *pool) D_DEBUG(DB_MD, "Update ec_agg_eph " DF_X64 ", stable_eph " DF_X64 ", " DF_UUID "\n", min_ec_agg_eph, min_stable_eph, DP_UUID(ec_eph->cte_cont_uuid)); - ret = cont_iv_track_eph_update(pool->sp_iv_ns, ec_eph->cte_cont_uuid, - min_ec_agg_eph, min_stable_eph); + ret = + cont_iv_track_eph_update(pool->sp_iv_ns, ec_eph->cte_cont_uuid, min_ec_agg_eph, + min_stable_eph, pool->sp_ec_ephs_req); if (ret == 0) { ec_eph->cte_last_ec_agg_epoch = min_ec_agg_eph; ec_eph->cte_last_stable_epoch = min_stable_eph; diff --git a/src/control/SConscript b/src/control/SConscript index b64ed23bc0d..aa40e619d30 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -24,9 +24,23 @@ def get_build_tags(benv): tags.append("pprof") else: tags.append("release") + if is_server_build(benv): + print("Building server go binary: adding 'server' build tag") + tags.append("server") return f"-tags {','.join(tags)}" +def is_server_build(benv): + """Check if the go-lang binary being built is a server binary.""" + env = benv.get("ENV") + if env is None: + return False + ld_flags = env.get("CGO_LDFLAGS") + if ld_flags is None: + return False + return "-ldaos_common_pmem" in ld_flags + + def is_release_build(benv): "Check whether this build is for release." 
return benv.get("BUILD_TYPE") == "release" @@ -109,7 +123,7 @@ def install_go_bin(env, name, libs=None, install_man=False): menv = env.Clone() # This runs code from the build area so needs LD_LIBRARY_PATH set. menv.d_enable_ld_path(["cart", "gurt", "client/api", "common", "client/dfs", "utils", - "utils/self_test"]) + "utils/self_test", "vos", "common/dav_v2", "bio", "utils/ddb"]) menv.Command(build_path, target, f'{gen_bin} manpage -o {build_path}') menv.Install('$PREFIX/share/man/man8', build_path) @@ -192,10 +206,12 @@ def scons(): "-L$BUILD_DIR/src/common " "-L$BUILD_DIR/src/utils/ddb " "-L$SPDK_PREFIX/lib " - "-L$OFI_PREFIX/lib $_RPATH") + "-L$OFI_PREFIX/lib " + "-L$ISAL_PREFIX/lib64 " + "-L$ISAL_CRYPTO_PREFIX/lib64 $_RPATH") # Explicitly link RTE & SPDK libs for CGO access ldopts = cgolibdirs + " -lspdk_env_dpdk -lspdk_nvme -lspdk_vmd -lrte_mempool" + \ - " -lrte_mempool_ring -lrte_bus_pci -lnvme_control -lnuma -ldl" + " -lrte_mempool_ring -lrte_bus_pci -lnvme_control -lisal -lssl -lnuma -ldl" aenv.AppendENVPath("CGO_LDFLAGS", ldopts, sep=" ") aenv.AppendENVPath("CGO_CFLAGS", aenv.subst("$_CPPINCFLAGS"), sep=" ") @@ -214,8 +230,8 @@ def scons(): # Add vos and dependent libs for ddb ddb_env.AppendENVPath("CGO_LDFLAGS", " -lvos -ldav_v2 -ldaos_common_pmem -lpmem " - "-labt -lgurt -luuid -lbio -lcart", sep=" ") - install_go_bin(ddb_env, "ddb", ['ddb']) + "-labt -lgurt -luuid -lbio -lssl -lcart", sep=" ") + install_go_bin(ddb_env, "ddb", ['ddb'], True) if __name__ == "SCons.Script": diff --git a/src/control/cmd/daos/fi.go b/src/control/cmd/daos/fi.go index b47f463d0ce..adacf7575f7 100644 --- a/src/control/cmd/daos/fi.go +++ b/src/control/cmd/daos/fi.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,6 +20,8 @@ import ( "strings" "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos/api" ) type faultsCmdRoot struct { @@ -58,23 +61,53 @@ func (ff faultFrequency) HasSome() (uint64, bool) { type faultLocation uint64 func (fl *faultLocation) UnmarshalFlag(fv string) error { - // Ugh. Seems like there should be a more clever way to do this... - switch strings.TrimSpace(fv) { - case "DAOS_CHK_CONT_ORPHAN": - *fl = faultLocation(C.DAOS_CHK_CONT_ORPHAN) - case "DAOS_CHK_CONT_BAD_LABEL": - *fl = faultLocation(C.DAOS_CHK_CONT_BAD_LABEL) - default: - return errors.Errorf("unhandled fault location %q", fv) + if fv == "none" { + *fl = 0 + return nil + } + + loc, err := api.FaultLocationFromString(fv) + if err != nil { + return err } + *fl = faultLocation(loc) return nil } +// IsSet indicates whether a fault location has been set. +func (fl faultLocation) IsSet() bool { + return fl != 0 +} + +type faultValue uint64 + +const faultValueUnset = faultValue(^uint64(0)) + +func (fv *faultValue) UnmarshalFlag(fvStr string) error { + if fvStr == "none" { + *fv = faultValueUnset + return nil + } + + // Allow hexadecimal and binary values, as well as decimal. + v, err := strconv.ParseUint(fvStr, 0, 64) + if err != nil { + return errors.Errorf("invalid fault value %q", fvStr) + } + *fv = faultValue(v) + return nil +} + +// IsSet indicates whether a fault value has been set. 
+func (fv faultValue) IsSet() bool { + return fv != faultValueUnset +} + type faultRank uint32 func (fr *faultRank) UnmarshalFlag(fv string) error { - if fv == strconv.FormatUint(uint64(C.CRT_NO_RANK), 10) || fv == "-1" { + if fv == "all" || fv == strconv.FormatUint(uint64(C.CRT_NO_RANK), 10) || fv == "-1" { *fr = faultRank(C.CRT_NO_RANK) return nil } @@ -90,9 +123,10 @@ func (fr *faultRank) UnmarshalFlag(fv string) error { type faultInjectionCmd struct { daosCmd - Rank faultRank `short:"r" long:"rank" description:"Rank to inject fault on" default:"4294967295"` + Rank faultRank `short:"r" long:"rank" description:"Rank to inject fault on" default:"all"` Frequency faultFrequency `short:"f" long:"frequency" description:"Fault injection frequency" choices:"always,once" default:"once"` - Location faultLocation `short:"l" long:"location" description:"Fault injection location" required:"1"` + Location faultLocation `short:"l" long:"location" description:"Fault injection location" default:"none"` + Value faultValue `short:"v" long:"value" description:"Fault injection value" default:"none"` } func (cmd *faultInjectionCmd) setParams() error { @@ -112,19 +146,41 @@ func (cmd *faultInjectionCmd) setParams() error { if cmd.Rank != C.CRT_NO_RANK { rankMsg = fmt.Sprintf("rank %d", cmd.Rank) } - cmd.Debugf("injecting fault %d on %s", faultMask, rankMsg) + cmd.Debugf("injecting fault location 0x%x on %s", faultMask, rankMsg) rc := C.daos_debug_set_params(nil, C.d_rank_t(cmd.Rank), C.DMG_KEY_FAIL_LOC, faultMask, 0, nil) if err := daosError(rc); err != nil { return errors.Wrap(err, "failed to set fault injection") } + + if cmd.Value.IsSet() { + cmd.Debugf("injecting fault value %d on %s", cmd.Value, rankMsg) + rc = C.daos_debug_set_params(nil, C.d_rank_t(cmd.Rank), C.DMG_KEY_FAIL_VALUE, C.uint64_t(cmd.Value), 0, nil) + if err := daosError(rc); err != nil { + return errors.Wrap(err, "failed to set fault injection value") + } + } return nil } type debugFaultCmd struct { 
faultInjectionCmd + + Reset bool `long:"reset" description:"Reset all fault injection parameters"` } func (cmd *debugFaultCmd) Execute(_ []string) error { + if cmd.Reset { + if cmd.Location.IsSet() || cmd.Value.IsSet() { + return errors.New("cannot set location or value when resetting fault injection parameters") + } + + cmd.Debugf("resetting all fault injection parameters") + cmd.Frequency = 0 + cmd.Location = 0 + cmd.Value = 0 + } else if !cmd.Location.IsSet() { + return errors.New("--location must be specified unless --reset is used") + } return cmd.setParams() } diff --git a/src/control/cmd/daos/health.go b/src/control/cmd/daos/health.go index a91916ffe72..90f7c9adcad 100644 --- a/src/control/cmd/daos/health.go +++ b/src/control/cmd/daos/health.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -11,10 +12,10 @@ import ( "strings" "github.com/google/uuid" + "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/cmd/daos/pretty" - "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/daos/api" "github.com/daos-stack/daos/src/control/lib/ranklist" @@ -82,7 +83,7 @@ func (cmd *healthCheckCmd) Execute([]string) error { sysInfo, err := cmd.apiProvider.GetSystemInfo(ctx) if err != nil { - cmd.Errorf("failed to query system information: %v", err) + return errors.Wrapf(err, "failed to query system information") } systemHealth.SystemInfo = sysInfo @@ -154,9 +155,7 @@ func (cmd *healthCheckCmd) Execute([]string) error { } type netTestCmd struct { - cmdutil.JSONOutputCmd - cmdutil.LogCmd - sysCmd + daosCmd Ranks ui.RankSetFlag `short:"r" long:"ranks" description:"Use the specified ranks as test endpoints (default: all)"` Tags ui.RankSetFlag `short:"t" long:"tags" 
description:"Use the specified tags on ranks" default:"0"` XferSize ui.ByteSizeFlag `short:"s" long:"size" description:"Per-RPC transfer size (send/reply)"` @@ -167,8 +166,22 @@ type netTestCmd struct { } func (cmd *netTestCmd) Execute(_ []string) error { + ctx := cmd.MustLogCtx() + + sys := cmd.SysName + if sys == "" { + sysInfo, err := cmd.apiProvider.GetSystemInfo(ctx) + if err != nil { + return errors.Wrapf(err, "failed to query system information") + } + sys = sysInfo.Name + } + // Cart self-test requires the ability to initialize as server, so we have to clean up our + // client initialization. + cmd.apiProvider.Cleanup() + cfg := &daos.SelfTestConfig{ - GroupName: cmd.SysName, + GroupName: sys, EndpointRanks: cmd.Ranks.Ranks(), EndpointTags: ranklist.RanksToUint32(cmd.Tags.Ranks()), MaxInflightRPCs: cmd.MaxInflight, @@ -192,7 +205,7 @@ func (cmd *netTestCmd) Execute(_ []string) error { cmd.Info("Starting non-destructive network test (duration depends on performance)...\n\n") } - res, err := RunSelfTest(cmd.MustLogCtx(), cfg) + res, err := RunSelfTest(ctx, cfg) if err != nil { return err } diff --git a/src/control/cmd/daos/pretty/pool.go b/src/control/cmd/daos/pretty/pool.go index af5b99f9c76..c2650137f44 100644 --- a/src/control/cmd/daos/pretty/pool.go +++ b/src/control/cmd/daos/pretty/pool.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -136,8 +136,14 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error { fmt.Fprintf(w, "- Rebuild %s, %d objs, %d recs\n", pi.Rebuild.State, pi.Rebuild.Objects, pi.Rebuild.Records) } else { - fmt.Fprintf(w, "- Rebuild failed, status=%d\n", pi.Rebuild.Status) + fmt.Fprintf(w, "- Rebuild %s (state=%s, status=%d)\n", + pi.Rebuild.DerivedState, pi.Rebuild.State, pi.Rebuild.Status) } + s := "normal" + if pi.Rebuild.Degraded { + s = "degraded" + } + fmt.Fprintf(w, "- Data redundancy: %s\n", s) } else { fmt.Fprintln(w, "- No rebuild status available.") } @@ -164,7 +170,7 @@ func PrintPoolQueryTargetInfo(pqti *daos.PoolQueryTargetInfo, out io.Writer) err w := txtfmt.NewErrWriter(out) // Maintain output compatibility with the `daos pool query-targets` output. - fmt.Fprintf(w, "Target: type %s, state %s\n", pqti.Type, pqti.State) + fmt.Fprintf(w, "Target: state %s\n", pqti.State) if pqti.Space != nil { if pqti.MdOnSsdActive { printPoolTiersMdOnSsd(pqti.MemFileBytes, pqti.Space, w, false) diff --git a/src/control/cmd/daos/pretty/pool_test.go b/src/control/cmd/daos/pretty/pool_test.go index ccc2f4e9537..0c20b247ae8 100644 --- a/src/control/cmd/daos/pretty/pool_test.go +++ b/src/control/cmd/daos/pretty/pool_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -50,9 +50,10 @@ Pool health info: PoolLayoutVer: 1, UpgradeLayoutVer: 2, Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -72,6 +73,7 @@ Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=TargetsExcluded Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. Pool health info: - Rebuild busy, 42 objs, 21 recs +- Data redundancy: degraded Pool space info: - Target count:1 - Storage tier 0 (SCM): @@ -96,9 +98,10 @@ Pool space info: UpgradeLayoutVer: 2, EnabledRanks: ranklist.MustCreateRankSet("[0,1,2]"), Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -119,6 +122,7 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Enabled ranks: 0-2 - Rebuild busy, 42 objs, 21 recs +- Data redundancy: degraded Pool space info: - Target count:1 - Storage tier 0 (SCM): @@ -144,9 +148,10 @@ Pool space info: DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"), DeadRanks: ranklist.MustCreateRankSet("[2]"), Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -166,6 +171,7 @@ Pool health info: - Disabled ranks: 0-1,3 - Dead ranks: 2 - Rebuild busy, 42 objs, 21 recs +- Data redundancy: degraded `, poolUUID.String()), }, "normal response; disabled ranks": { @@ -182,9 +188,10 @@ Pool health info: UpgradeLayoutVer: 2, DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"), Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -205,6 +212,7 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. Pool health info: - Disabled ranks: 0-1,3 - Rebuild busy, 42 objs, 21 recs +- Data redundancy: degraded Pool space info: - Target count:1 - Storage tier 0 (SCM): @@ -229,9 +237,10 @@ Pool space info: UpgradeLayoutVer: 2, DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"), Rebuild: &daos.PoolRebuildStatus{ - State: 42, - Objects: 42, - Records: 21, + State: 42, + Objects: 42, + Records: 21, + Degraded: false, }, TierStats: []*daos.StorageUsageStats{ { @@ -252,6 +261,7 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Disabled ranks: 0-1,3 - Rebuild unknown, 42 objs, 21 recs +- Data redundancy: normal Pool space info: - Target count:1 - Storage tier 0 (SCM): @@ -262,7 +272,7 @@ Pool space info: Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), }, - "rebuild failed": { + "rebuild failing": { pi: &daos.PoolInfo{ QueryMask: daos.DefaultPoolQueryMask, State: daos.PoolServiceStateTargetsExcluded, @@ -275,10 +285,12 @@ Pool space info: PoolLayoutVer: 1, UpgradeLayoutVer: 2, Rebuild: &daos.PoolRebuildStatus{ - Status: 2, - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + Status: -2, + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateFailing, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -298,7 +310,8 @@ Pool space info: Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=TargetsExcluded Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. Pool health info: -- Rebuild failed, status=2 +- Rebuild failing (state=busy, status=-2) +- Data redundancy: degraded Pool space info: - Target count:1 - Storage tier 0 (SCM): @@ -322,9 +335,10 @@ Pool space info: PoolLayoutVer: 1, UpgradeLayoutVer: 2, Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateBusy, - Objects: 42, - Records: 21, + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + Degraded: true, }, TierStats: []*daos.StorageUsageStats{ { @@ -346,6 +360,7 @@ Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=TargetsExcluded Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Rebuild busy, 42 objs, 21 recs +- Data redundancy: degraded Pool space info: - Target count:1 - Total memory-file size: 1.1 GB @@ -355,6 +370,158 @@ Pool space info: - Data storage: Total size: 4 B Free: 2 B, min:0 B, max:0 B, mean:0 B +`, poolUUID.String()), + }, + "rebuild state idle": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateIdle, + Status: 0, + Objects: 0, + Records: 0, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +- Data redundancy: degraded +`, poolUUID.String()), + }, + "rebuild state stopped": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateDone, + DerivedState: daos.PoolRebuildStateStopped, + Status: int32(daos.OpCanceled), + Objects: 0, + Records: 0, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild stopped (state=done, status=-2027) +- Data redundancy: degraded +`, poolUUID.String()), + }, + "rebuild state done": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateDone, + DerivedState: daos.PoolRebuildStateDone, + Status: 0, + Objects: 200, + Records: 1000, + Degraded: false, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild done, 200 objs, 1000 recs +- Data redundancy: normal +`, poolUUID.String()), + }, + "rebuild state failed": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + 
TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateDone, + DerivedState: daos.PoolRebuildStateFailed, + Status: -1, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild failed (state=done, status=-1) +- Data redundancy: degraded +`, poolUUID.String()), + }, + "rebuild state busy": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateBusy, + Status: 0, + Objects: 150, + Records: 750, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild busy, 150 objs, 750 recs +- Data redundancy: degraded +`, poolUUID.String()), + }, + "rebuild state stopping": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateStopping, + Status: int32(daos.OpCanceled), + Objects: 100, + Records: 500, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool health info: +- Rebuild stopping (state=busy, status=-2027) +- Data redundancy: degraded +`, poolUUID.String()), + }, + "rebuild state failing": { + pi: &daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 8, + ActiveTargets: 8, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateFailing, + Status: -1, + Objects: 75, + Records: 300, + Degraded: true, + }, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=8, disabled=0, leader=0, version=0, state=Ready +Pool 
health info: +- Rebuild failing (state=busy, status=-1) +- Data redundancy: degraded `, poolUUID.String()), }, } { @@ -371,12 +538,42 @@ Pool space info: } } -// TODO DAOS-18128: Add more test cases -//func TestPretty_PrintPoolSelfHealDisable(t *testing.T) { -// for name, tc := range map[string]struct { -// sysSelfHeal string -// poolSelfHeal string -//} +func TestPretty_PrintPoolSelfHealDisable(t *testing.T) { + for name, tc := range map[string]struct { + poolSelfHeal string + sysSelfHeal string + expPrintStr string + }{ + "defaults": { + poolSelfHeal: "exclude;rebuild", + sysSelfHeal: "exclude;pool_exclude;pool_rebuild", + }, + "no pool flags": { + poolSelfHeal: "none", + sysSelfHeal: "exclude;pool_exclude;pool_rebuild", + expPrintStr: "exclude disabled on pool due to [pool] policy\nrebuild disabled on pool due to [pool] policy\n", + }, + "no system flags": { + poolSelfHeal: "exclude;rebuild", + sysSelfHeal: "none", + expPrintStr: "exclude disabled on pool due to [system] policy\nrebuild disabled on pool due to [system] policy\n", + }, + "no flags": { + poolSelfHeal: "none", + sysSelfHeal: "none", + expPrintStr: "exclude disabled on pool due to [pool system] policies\nrebuild disabled on pool due to [pool system] policies\n", + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + PrintPoolSelfHealDisable(tc.poolSelfHeal, tc.sysSelfHeal, &bld) + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected print string (-want, +got):\n%s\n", diff) + } + }) + } +} func TestPretty_PrintPoolQueryTarget(t *testing.T) { for name, tc := range map[string]struct { @@ -389,7 +586,6 @@ func TestPretty_PrintPoolQueryTarget(t *testing.T) { }, "valid: single target (unknown, down_out)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ { @@ -405,7 +601,7 @@ func TestPretty_PrintPoolQueryTarget(t *testing.T) { }, }, expPrintStr: ` -Target: type 
unknown, state down_out +Target: state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -416,7 +612,6 @@ Target: type unknown, state down_out }, "valid: single target (unknown, down)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ { @@ -432,7 +627,7 @@ Target: type unknown, state down_out }, }, expPrintStr: ` -Target: type unknown, state down +Target: state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -443,7 +638,6 @@ Target: type unknown, state down }, "valid: single target (unknown, up)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateUp, Space: []*daos.StorageUsageStats{ { @@ -459,7 +653,7 @@ Target: type unknown, state down }, }, expPrintStr: ` -Target: type unknown, state up +Target: state up - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -470,7 +664,6 @@ Target: type unknown, state up }, "valid: single target (unknown, up_in)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ { @@ -486,7 +679,7 @@ Target: type unknown, state up }, }, expPrintStr: ` -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -497,7 +690,6 @@ Target: type unknown, state up_in }, "valid: single target (unknown, new)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateNew, Space: []*daos.StorageUsageStats{ { @@ -513,7 +705,7 @@ Target: type unknown, state up_in }, }, expPrintStr: ` -Target: type unknown, state new +Target: state new - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -524,7 +716,6 @@ Target: type unknown, state new }, "valid: single target (unknown, drain)": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateDrain, Space: []*daos.StorageUsageStats{ { @@ -541,7 +732,7 @@ Target: type unknown, state new MemFileBytes: 3000000000, }, expPrintStr: ` -Target: 
type unknown, state drain +Target: state drain - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -552,7 +743,6 @@ Target: type unknown, state drain }, "valid: single target (unknown, down_out): MD-on-SSD": { pqti: &daos.PoolQueryTargetInfo{ - Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ { @@ -570,7 +760,7 @@ Target: type unknown, state drain MdOnSsdActive: true, }, expPrintStr: ` -Target: type unknown, state down_out +Target: state down_out - Metadata storage: Total size: 6.0 GB Free: 5.0 GB diff --git a/src/control/cmd/daos/pretty/selftest.go b/src/control/cmd/daos/pretty/selftest.go index bf880dcd0b5..79ec47a8471 100644 --- a/src/control/cmd/daos/pretty/selftest.go +++ b/src/control/cmd/daos/pretty/selftest.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -148,7 +149,11 @@ func PrintSelfTestResult(out io.Writer, result *daos.SelfTestResult, verbose, sh for _, pct := range pcts { pctTitles[pct] = fmt.Sprintf("%d%%", pct) - row[pctTitles[pct]] = printLatencyVal(buckets[pct].UpperBound, dispUnit) + val := "N/A" + if b, found := buckets[pct]; found { + val = printLatencyVal(b.UpperBound, dispUnit) + } + row[pctTitles[pct]] = val } table = append(table, row) diff --git a/src/control/cmd/daos/pretty/selftest_test.go b/src/control/cmd/daos/pretty/selftest_test.go index 3bfaacf0dd0..d8fb8b79c51 100644 --- a/src/control/cmd/daos/pretty/selftest_test.go +++ b/src/control/cmd/daos/pretty/selftest_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -96,6 +97,7 @@ Client/Server Network Test Parameters }, "custom - verbose": { cfg: genCfg(func(cfg *daos.SelfTestConfig) { + cfg.GroupName = "daos_server_test" cfg.EndpointRanks = []ranklist.Rank{0, 1, 2} cfg.EndpointTags = []uint32{0, 1, 2} cfg.SendSizes = []uint64{1024, 1024 * 1024} @@ -109,7 +111,7 @@ Client/Server Network Test Parameters Send RPC Sizes : [1.00 KiB 1.00 MiB] Reply RPC Sizes : [2.00 MiB 2.00 GiB] RPCs Per Server : 10000 - System Name : daos_server + System Name : daos_server_test Tags : [0-2] Max In-Flight RPCs: 16 @@ -292,6 +294,55 @@ Per-Target Latency Results 0:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms 20.0% 1:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms 20.0% 2:0 0.00ms 15.00ms 22.50ms 27.00ms 28.50ms 29.70ms 30.00ms 15.00ms 8.66ms 20.0% +`, + }, + "verbose with all failures": { + result: func() *daos.SelfTestResult { + cfg := &daos.SelfTestConfig{} + cfg.SetDefaults() + r := &daos.SelfTestResult{ + MasterEndpoint: daos.SelfTestEndpoint{Rank: 3, Tag: 0}, + TargetEndpoints: []daos.SelfTestEndpoint{ + {Rank: 0, Tag: 0}, + {Rank: 1, Tag: 0}, + {Rank: 2, Tag: 0}, + }, + Repetitions: cfg.Repetitions * 3, + SendSize: cfg.SendSizes[0], + ReplySize: cfg.ReplySizes[0], + BufferAlignment: cfg.BufferAlignment, + Duration: 8500 * time.Millisecond, + MasterLatency: &daos.EndpointLatency{ + TotalRPCs: uint64(cfg.Repetitions), + }, + } + for i := int64(1); i <= int64(r.Repetitions); i++ { + r.MasterLatency.AddValue(-1) + r.AddTargetLatency(ranklist.Rank(i%3), 0, -1) + } + return r + }(), + verbose: true, + expStr: ` +Client/Server Network Test Summary +---------------------------------- + Server Endpoints: [0-2]:0 + RPC Throughput : 1176.47 RPC/s + RPC Bandwidth : 19.28 Mbps + Average Latency : 0.00ms + Client Endpoint : 3:0 + Duration : 8.5s + Repetitions : 30000 + Send Size : 
1.00 KiB + Reply Size : 1.00 KiB + Failed RPCs : 30000 (100.0%) + +Per-Target Latency Results + Target Min 50% 75% 90% 95% 99% Max Average StdDev Failed + ------ --- --- --- --- --- --- --- ------- ------ ------ + 0:0 0.00ms N/A N/A N/A N/A N/A 0.00ms 0.00ms 0.00ms 100.0% + 1:0 0.00ms N/A N/A N/A N/A N/A 0.00ms 0.00ms 0.00ms 100.0% + 2:0 0.00ms N/A N/A N/A N/A N/A 0.00ms 0.00ms 0.00ms 100.0% `, }, } { diff --git a/src/control/cmd/daos/util.go b/src/control/cmd/daos/util.go index ce57e4e3923..f3961f288a5 100644 --- a/src/control/cmd/daos/util.go +++ b/src/control/cmd/daos/util.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -29,7 +30,9 @@ import ( // NB: There should only be one set of CFLAGS/LDFLAGS definitions // for the whole package! #cgo CFLAGS: -I${SRCDIR}/../../../utils -#cgo LDFLAGS: -lgurt -lcart -ldaos -ldaos_common -lduns -ldfs -luuid -ldaos_cmd_hdlrs +#cgo LDFLAGS: -lgurt -lcart -ldaos -lduns -ldfs -luuid -ldaos_cmd_hdlrs +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem #include "util.h" diff --git a/src/control/cmd/ddb/command_completers.go b/src/control/cmd/ddb/command_completers.go index 3c5a95a97b0..ae55222f911 100644 --- a/src/control/cmd/ddb/command_completers.go +++ b/src/control/cmd/ddb/command_completers.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,74 +8,92 @@ package main import ( - "io/fs" + "os" "path/filepath" + "regexp" "strings" ) -const ( - defMntPrefix = "/mnt" +var ( + vosRegexp = regexp.MustCompile(`^.+/((vos-([0-9]|([1-9][0-9]+)))|(rdb-pool))$`) ) -func listDirVos(match string) (result []string) { - if strings.HasSuffix(match, "vos-") { - match = filepath.Dir(match) +func listVosFiles(match string) (result []string) { + result = []string{} + + matches, err := filepath.Glob(match + "*") + if err != nil { + return } - filepath.Walk(match, func(path string, info fs.FileInfo, err error) error { + for _, match := range matches { + path := filepath.Clean(match) + fi, err := os.Stat(path) if err != nil { - /* ignore error */ - return nil + continue } - if strings.Contains(path, "vos-") { + + switch mode := fi.Mode(); { + case mode.IsDir(): + result = append(result, path+string(os.PathSeparator)) + case mode.IsRegular(): + if !vosRegexp.MatchString(path) { + continue + } result = append(result, path) } - return nil - }) + } + return } -func filterSuggestions(prefix string, initialSuggestions, additionalSuggestions []string) []string { - suggestions := append([]string{}, initialSuggestions...) - suggestions = append(suggestions, additionalSuggestions...) 
+func appendSuggestion(suggestions []string, suggestion string, prefix string) []string { + if len(prefix) == 0 { + return append(suggestions, suggestion) + } - if len(prefix) > 0 { - var newSuggestions []string - for _, s := range suggestions { - if strings.HasPrefix(s, prefix) { - newSuggestions = append(newSuggestions, strings.Trim(s, prefix)) - } - } - suggestions = newSuggestions + if !strings.HasPrefix(suggestion, prefix) { + return suggestions + } + + if len(suggestion) > 2 && suggestion[1] == prefix[0] { + // Workaround to properly handle invalid prefix management + return append(suggestions, suggestion) + } + return append(suggestions, strings.TrimPrefix(suggestion, prefix)) +} + +func filterSuggestions(prefix string, initialSuggestions, additionalSuggestions []string) (suggestions []string) { + suggestions = []string{} + + for _, suggestion := range initialSuggestions { + suggestions = appendSuggestion(suggestions, suggestion, prefix) + } + for _, suggestion := range additionalSuggestions { + suggestions = appendSuggestion(suggestions, suggestion, prefix) } - return suggestions + return } func openCompleter(prefix string, args []string) []string { - return filterSuggestions(prefix, []string{"-h", "-w", "--write_mode"}, listDirVos(defMntPrefix)) + return filterSuggestions( + prefix, + []string{"-w", "--write_mode", "-p", "--db_path=", "-h", "--help"}, + listVosFiles(prefix), + ) } func featureCompleter(prefix string, args []string) []string { - return filterSuggestions(prefix, []string{"-h", "-e", "--enable", "-d", "--disable", "-s", "--show"}, - listDirVos(defMntPrefix)) -} - -func listDirPool(match string) (result []string) { - if strings.HasSuffix(match, "vos-") { - match = filepath.Dir(match) - } - filepath.Walk(match, func(path string, info fs.FileInfo, err error) error { - if err != nil { - /* ignore error */ - return nil - } - result = append(result, path) - return nil - }) - return + return filterSuggestions( + prefix, + []string{"-e", "--enable", 
"-d", "--disable", "-s", "--show", "-h", "--help"}, + listVosFiles(prefix)) } func rmPoolCompleter(prefix string, args []string) []string { - return filterSuggestions(prefix, []string{"-h"}, listDirPool(defMntPrefix)) + return filterSuggestions( + prefix, + []string{"-h", "--help"}, + listVosFiles(prefix)) } diff --git a/src/control/cmd/ddb/command_completers_test.go b/src/control/cmd/ddb/command_completers_test.go new file mode 100644 index 00000000000..834fbfb048a --- /dev/null +++ b/src/control/cmd/ddb/command_completers_test.go @@ -0,0 +1,175 @@ +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/daos-stack/daos/src/control/common/test" +) + +var ( + testPoolDirs = [...]string{"a", "ab", "aac", "aaad"} + testVosFiles = [...]string{"vos-0", "vos-1", "vos-2", "vos-10", "vos-201", "vos-000", "vos-a", "rdb-pool", "rdb-666"} +) + +func createFile(t *testing.T, filePath string) { + t.Helper() + + fd, err := os.Create(filePath) + if err != nil { + t.Fatalf("Failed to create test vos file %s: %v", filePath, err) + } + fd.Close() +} + +func createDirAll(t *testing.T, dirPath string) { + t.Helper() + + if err := os.MkdirAll(dirPath, 0755); err != nil { + t.Fatalf("Failed to create test pool directory %s: %v", dirPath, err) + } +} + +func testSetup(t *testing.T) (tmpDir string, teardown func()) { + t.Helper() + + tmpDir, teardown = test.CreateTestDir(t) + + for _, dir := range testPoolDirs { + createDirAll(t, filepath.Join(tmpDir, dir)) + for _, file := range testVosFiles { + createFile(t, filepath.Join(tmpDir, dir, file)) + } + } + + createDirAll(t, filepath.Join(tmpDir, "foo")) + createFile(t, filepath.Join(tmpDir, "foo", "bar")) + + createDirAll(t, filepath.Join(tmpDir, "bar")) + createDirAll(t, filepath.Join(tmpDir, "bar", "foo")) + createDirAll(t, filepath.Join(tmpDir, "bar", "baz")) + createFile(t, filepath.Join(tmpDir, "bar", "baz", "no_vos")) + + return +} + +func TestListVosFiles(t *testing.T) { + tmpDir, teardown := testSetup(t) + 
t.Cleanup(teardown) + + for name, tc := range map[string]struct { + args string + expRes []string + }{ + "unaccessible": { + args: "/root/", + expRes: []string{}, + }, + "No match": { + args: filepath.Join(tmpDir, "z"), + expRes: []string{}, + }, + "void director prefix": { + args: tmpDir + string(os.PathSeparator), + expRes: []string{ + filepath.Join(tmpDir, "a") + string(os.PathSeparator), + filepath.Join(tmpDir, "ab") + string(os.PathSeparator), + filepath.Join(tmpDir, "aac") + string(os.PathSeparator), + filepath.Join(tmpDir, "aaad") + string(os.PathSeparator), + filepath.Join(tmpDir, "foo") + string(os.PathSeparator), + filepath.Join(tmpDir, "bar") + string(os.PathSeparator), + }, + }, + "a pool directory prefix": { + args: filepath.Join(tmpDir, "a"), + expRes: []string{ + filepath.Join(tmpDir, "a") + string(os.PathSeparator), + filepath.Join(tmpDir, "ab") + string(os.PathSeparator), + filepath.Join(tmpDir, "aac") + string(os.PathSeparator), + filepath.Join(tmpDir, "aaad") + string(os.PathSeparator), + }, + }, + "aa pool directory prefix": { + args: filepath.Join(tmpDir, "aa"), + expRes: []string{ + filepath.Join(tmpDir, "aac") + string(os.PathSeparator), + filepath.Join(tmpDir, "aaad") + string(os.PathSeparator), + }, + }, + "all vos files": { + args: filepath.Join(tmpDir, "a") + string(os.PathSeparator), + expRes: []string{ + filepath.Join(tmpDir, "a", "vos-0"), + filepath.Join(tmpDir, "a", "vos-1"), + filepath.Join(tmpDir, "a", "vos-2"), + filepath.Join(tmpDir, "a", "vos-10"), + filepath.Join(tmpDir, "a", "vos-201"), + filepath.Join(tmpDir, "a", "rdb-pool"), + }, + }, + "vos-1 prefix files": { + args: filepath.Join(tmpDir, "a", "vos-1"), + expRes: []string{ + filepath.Join(tmpDir, "a", "vos-1"), + filepath.Join(tmpDir, "a", "vos-10"), + }, + }, + } { + t.Run(name, func(t *testing.T) { + results := listVosFiles(tc.args) + test.AssertStringsEqual(t, tc.expRes, results, "listDirVos results do not match expected") + }) + } +} + +func TestFilterSuggestions(t 
*testing.T) { + // The test cases are designed to cover various prefix scenarios. + // It should notably cover the case where the prefix is a single character that matches the + // second character of a suggestion, which is a special case in the appendSuggestion + // function: Workaround to properly handle invalid prefix management done by the grumble + // completion engine. + var ( + initialSuggestions = []string{"-a", "--all", "-b", "--bar="} + additionalSuggestions = []string{"foo", "a", "ab", "aac", "aaad"} + ) + + for name, tc := range map[string]struct { + prefix string + expRes []string + }{ + "no prefix": { + prefix: "", + expRes: []string{"-a", "--all", "-b", "--bar=", "foo", "a", "ab", "aac", "aaad"}, + }, + "no match prefix": { + prefix: "z", + expRes: []string{}, + }, + "with '-' prefix": { + prefix: "-", + expRes: []string{"a", "--all", "b", "--bar="}, + }, + "with '--' prefix": { + prefix: "--", + expRes: []string{"--all", "--bar="}, + }, + "with 'a' prefix": { + prefix: "a", + expRes: []string{"", "b", "aac", "aaad"}, + }, + "with 'aa' prefix": { + prefix: "aa", + expRes: []string{"aac", "aaad"}, + }, + "with 'aaa' prefix": { + prefix: "aaa", + expRes: []string{"aaad"}, + }, + } { + t.Run(name, func(t *testing.T) { + results := filterSuggestions(tc.prefix, initialSuggestions, additionalSuggestions) + test.AssertStringsEqual(t, tc.expRes, results, "filterSuggestions results do not match expected") + }) + } +} diff --git a/src/control/cmd/ddb/commands_wrapper.go b/src/control/cmd/ddb/commands_wrapper.go index e393b7b7b47..d9751dbd8e5 100644 --- a/src/control/cmd/ddb/commands_wrapper.go +++ b/src/control/cmd/ddb/commands_wrapper.go @@ -1,6 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. // (C) Copyright 2025 Vdura Inc. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -37,6 +37,15 @@ func freeString(s *C.char) { C.free(unsafe.Pointer(s)) } +func SetCString(out **C.char, s string) func() { + cstr := C.CString(s) + *out = cstr + + return func() { + C.free(unsafe.Pointer(cstr)) + } +} + // InitDdb initializes the ddb context and returns a closure to finalize it. func InitDdb(log *logging.LeveledLogger) (*DdbContext, func(), error) { // Must lock to OS thread because vos init/fini uses ABT init and finalize which must be called on the same thread @@ -78,13 +87,12 @@ func ddbLs(ctx *DdbContext, path string, recursive bool, details bool) error { return daosError(C.ddb_run_ls(&ctx.ctx, &options)) } -func ddbOpen(ctx *DdbContext, path string, db_path string, write_mode bool) error { +func ddbOpen(ctx *DdbContext, path string, write_mode bool) error { /* Set up the options */ options := C.struct_open_options{} options.path = C.CString(path) defer freeString(options.path) - options.db_path = C.CString(db_path) - defer freeString(options.db_path) + options.db_path = ctx.ctx.dc_db_path options.write_mode = C.bool(write_mode) /* Run the c code command */ return daosError(C.ddb_run_open(&ctx.ctx, &options)) @@ -232,13 +240,12 @@ func ddbDtxActAbort(ctx *DdbContext, path string, dtx_id string) error { return daosError(C.ddb_run_dtx_act_abort(&ctx.ctx, &options)) } -func ddbFeature(ctx *DdbContext, path, db_path, enable, disable string, show bool) error { +func ddbFeature(ctx *DdbContext, path, enable, disable string, show bool) error { /* Set up the options */ options := C.struct_feature_options{} options.path = C.CString(path) defer freeString(options.path) - options.db_path = C.CString(db_path) - defer freeString(options.db_path) + options.db_path = ctx.ctx.dc_db_path if enable != "" { err := daosError(C.ddb_feature_string2flags(&ctx.ctx, C.CString(enable), &options.set_compat_flags, &options.set_incompat_flags)) @@ -263,6 +270,7 @@ func ddbRmPool(ctx *DdbContext, path string) error { 
options := C.struct_rm_pool_options{} options.path = C.CString(path) defer freeString(options.path) + options.db_path = ctx.ctx.dc_db_path /* Run the c code command */ return daosError(C.ddb_run_rm_pool(&ctx.ctx, &options)) } diff --git a/src/control/cmd/ddb/ddb_commands.go b/src/control/cmd/ddb/ddb_commands.go index bee8aeddc4d..d515499264c 100644 --- a/src/control/cmd/ddb/ddb_commands.go +++ b/src/control/cmd/ddb/ddb_commands.go @@ -1,7 +1,7 @@ // // (C) Copyright 2022-2024 Intel Corporation. // (C) Copyright 2025 Vdura Inc. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -51,7 +51,11 @@ pool shard. Part of the path is used to determine what the pool uuid is.`, a.String("path", "Path to the vos file to open.") }, Run: func(c *grumble.Context) error { - return ddbOpen(ctx, c.Args.String("path"), c.Flags.String("db_path"), c.Flags.Bool("write_mode")) + if c.Flags.String("db_path") != "" { + cleanup := SetCString(&ctx.ctx.dc_db_path, c.Flags.String("db_path")) + defer cleanup() + } + return ddbOpen(ctx, c.Args.String("path"), c.Flags.Bool("write_mode")) }, Completer: openCompleter, }) @@ -319,7 +323,7 @@ the path must include the extent, otherwise, it must not.`, a.String("path", "Optional, Path to the vos file", grumble.Default("")) }, Run: func(c *grumble.Context) error { - return ddbFeature(ctx, c.Args.String("path"), c.Flags.String("db_path"), c.Flags.String("enable"), c.Flags.String("disable"), c.Flags.Bool("show")) + return ddbFeature(ctx, c.Args.String("path"), c.Flags.String("enable"), c.Flags.String("disable"), c.Flags.Bool("show")) }, Completer: featureCompleter, }) @@ -330,6 +334,9 @@ the path must include the extent, otherwise, it must not.`, Help: "Remove a vos pool.", LongHelp: "", HelpGroup: "vos", + Flags: func(f *grumble.Flags) { + f.String("p", "db_path", "", "Path to the sys db.") + }, Args: func(a 
*grumble.Args) { a.String("path", "Optional, Path to the vos file", grumble.Default("")) }, @@ -434,7 +441,7 @@ the path must include the extent, otherwise, it must not.`, a.String("path", "Optional, VOS tree path of a container to aggregate.", grumble.Default("")) }, Flags: func(f *grumble.Flags) { - f.Uint64("t", "cmt_time", math.MaxUint64, "Max aggregfation committed time in seconds") + f.Uint64("t", "cmt_time", math.MaxUint64, "Max aggregation committed time in seconds") f.String("d", "cmt_date", "", "Max aggregation committed date (format '1970-01-01 00:00:00')") }, Run: func(c *grumble.Context) error { diff --git a/src/control/cmd/ddb/main.go b/src/control/cmd/ddb/main.go index 41fa0b6021f..9cf5bb2089d 100644 --- a/src/control/cmd/ddb/main.go +++ b/src/control/cmd/ddb/main.go @@ -1,6 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -9,211 +9,332 @@ package main import ( "bufio" + "fmt" + "io" "os" "path" "path/filepath" "runtime/debug" - "sort" "strings" - "unsafe" + "github.com/desertbit/columnize" + "github.com/desertbit/go-shlex" "github.com/desertbit/grumble" "github.com/jessevdk/go-flags" "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/logging" + "github.com/daos-stack/daos/src/control/server/engine" ) -/* - #include -*/ -import "C" - -func exitWithError(log logging.Logger, err error) { +func exitWithError(err error) { cmdName := path.Base(os.Args[0]) - log.Errorf("%s: %v", cmdName, err) + fmt.Fprintf(os.Stderr, "ERROR: %s: %v\n", cmdName, err) if fault.HasResolution(err) { - log.Errorf("%s: %s", cmdName, fault.ShowResolutionFor(err)) + fmt.Fprintf(os.Stderr, "ERROR: %s: %s", cmdName, 
fault.ShowResolutionFor(err)) } os.Exit(1) } type cliOptions struct { - Debug bool `long:"debug" description:"enable debug output"` WriteMode bool `long:"write_mode" short:"w" description:"Open the vos file in write mode."` CmdFile string `long:"cmd_file" short:"f" description:"Path to a file containing a sequence of ddb commands to execute."` SysdbPath string `long:"db_path" short:"p" description:"Path to the sys db."` + VosPath string `long:"vos_path" short:"s" description:"Path to the VOS file to open."` Version bool `short:"v" long:"version" description:"Show version"` + Debug string `long:"debug" description:"Logging log level (default to ERROR). More details can be found in the ddb man page."` + LogDir string `long:"log_dir" description:"Directory to write log files to. If not provided, logs will only be written to the console."` Args struct { - VosPath vosPathStr `positional-arg-name:"vos_file_path"` - RunCmd ddbCmdStr `positional-arg-name:"ddb_command"` - RunCmdArgs []string `positional-arg-name:"ddb_command_args"` + RunCmd string `positional-arg-name:"ddb_command" description:"Optional ddb command to run. If not provided, the tool will run in interactive mode."` + RunCmdArgs []string `positional-arg-name:"ddb_command_args" description:"Arguments for the ddb command to run. If not provided, the command will be run without any arguments."` } `positional-args:"yes"` } -type vosPathStr string +const helpCommandsHeader = ` +Available commands: -func (pathStr vosPathStr) Complete(match string) (comps []flags.Completion) { - if match == "" || match == "/" { - match = defMntPrefix - } - for _, comp := range listDirVos(match) { - comps = append(comps, flags.Completion{Item: comp}) - } - sort.Slice(comps, func(i, j int) bool { return comps[i].Item < comps[j].Item }) +` - return -} +const helpTreePath = ` +Path -type ddbCmdStr string +Many of the commands take a VOS tree path. The format for this path is +[cont]/[obj]/[dkey]/[akey]/[extent]. 
To make it easier to navigate the tree, indexes can be used +instead of the path part. The index is in the format [i]. Indexes and actual path values can be used +together. -func (cmdStr ddbCmdStr) Complete(match string) (comps []flags.Completion) { - // hack to get at command names - ctx, cleanup, err := InitDdb(nil) - if err != nil { - return - } - defer cleanup() +More details on the path format can be found in the ddb man page. - app := createGrumbleApp(ctx) - for _, cmd := range app.Commands().All() { - if match == "" || strings.HasPrefix(cmd.Name, match) { - comps = append(comps, flags.Completion{Item: cmd.Name}) - } - } - sort.Slice(comps, func(i, j int) bool { return comps[i].Item < comps[j].Item }) +` - return -} +const ddbLongDescription = `The DAOS Debug Tool (ddb) allows a user to navigate through and modify +a file in the VOS format. It offers both a command line and interactive +shell mode. If neither a single command or '-f' option is provided, then +the tool will run in interactive mode. In order to modify the VOS file, +the '-w' option must be included. -func (cmdStr *ddbCmdStr) UnmarshalFlag(fv string) error { - *cmdStr = ddbCmdStr(fv) - return nil -} +If the command requires it, the VOS file must be provided with the parameter +--vos-path. The VOS file will be opened before any commands are executed. 
See +the command‑specific help for details.` + +const grumbleUnknownCmdErr = "unknown command, try 'help'" func runFileCmds(log logging.Logger, app *grumble.App, fileName string) error { file, err := os.Open(fileName) if err != nil { - return errors.Wrapf(err, "Error opening file: %s", fileName) + return errors.Wrapf(err, "Error opening file %q", fileName) } defer func() { err = file.Close() if err != nil { - log.Errorf("Error closing %s: %s\n", fileName, err) + log.Errorf("Error closing %q: %s\n", fileName, err) } }() - log.Debugf("Running commands in: %s\n", fileName) + log.Debugf("Running commands in %q\n", fileName) scanner := bufio.NewScanner(file) for scanner.Scan() { - fileCmd := scanner.Text() - log.Debugf("Running Command: %s\n", fileCmd) - err := runCmdStr(app, fileCmd) + lineStr := scanner.Text() + lineCmd, err := shlex.Split(lineStr, true) if err != nil { - return errors.Wrapf(err, "Failed running command %q", fileCmd) + return errors.Wrapf(err, "Failed running command %q", lineStr) + } + if len(lineCmd) == 0 || strings.HasPrefix(lineCmd[0], "#") { + continue + } + log.Debugf("Running Command %q\n", lineStr) + err = runCmdStr(app, nil, lineCmd[0], lineCmd[1:]...) + if err != nil { + return errors.Wrapf(err, "Failed running command %q", lineStr) } } return nil } -func parseOpts(args []string, opts *cliOptions, log *logging.LeveledLogger) error { +// One cannot relay on grumble to print the list of commands since app does not allow executing +// the help command from the outside of the interactive mode. +// This method extracts commands and their respective help (short) messages in the simplest possible way, +// put them in columns and print them using the provided log. 
+func printCommands(fd io.Writer, app *grumble.App) { + var output []string + for _, c := range app.Commands().All() { + if c.Name == "quit" { + continue + } + row := c.Name + columnize.DefaultConfig().Delim + c.Help + output = append(output, row) + } + fmt.Fprintf(fd, helpCommandsHeader+columnize.SimpleFormat(output)+"\n\n") +} + +func printGeneralHelp(app *grumble.App, generalMsg string) { + fmt.Println(generalMsg) // standard help from go-flags + printCommands(os.Stdout, app) // list of commands + fmt.Printf(helpTreePath) // extra info on VOS Tree Path syntax +} + +// Ask grumble to generate a help message for the requested command. +// Caveat: There is no known easy way of forcing grumble to use log to print the generated message +// so the output goes directly to stdout. +// Returns false in case the opts.Args.RunCmd is unknown. +func printCmdHelp(app *grumble.App, opts *cliOptions) bool { + err := runCmdStr(app, nil, string(opts.Args.RunCmd), "--help") + if err != nil { + if err.Error() == grumbleUnknownCmdErr { + fmt.Fprintf(os.Stderr, "ERROR: Unknown command '%s'", string(opts.Args.RunCmd)) + printCommands(os.Stderr, app) + } else { + fmt.Fprintf(os.Stderr, "ERROR: %s", err.Error()) + } + return false + } + return true +} + +// Prints either general or command-specific help message. +// Returns a reasonable return code in case the caller chooses to terminate the process. +func printHelp(generalMsg string, opts *cliOptions) int { + // ctx is not necessary since this instance of the app is not intended to run any of the commands + app := createGrumbleApp(nil) + + if string(opts.Args.RunCmd) == "" { + printGeneralHelp(app, generalMsg) + return 0 + } + + if printCmdHelp(app, opts) { + return 0 + } else { + return 1 + } +} + +func setenvIfNotSet(key, value string) { + if os.Getenv(key) == "" { + os.Setenv(key, value) + } +} + +// The golang cli and the C engine use separate logging systems with different log levels. 
+// This function maps a string log level to the closest matching levels for both systems. +// More details on the log levels can be found in the LOGGING section of the ddb man page. +func strToLogLevels(level string) (logging.LogLevel, engine.LogLevel, error) { + switch strings.ToUpper(level) { + case "TRACE": + return logging.LogLevelTrace, engine.LogLevelDbug, nil + case "DEBUG", "DBUG": + return logging.LogLevelDebug, engine.LogLevelDbug, nil + case "INFO": + return logging.LogLevelInfo, engine.LogLevelInfo, nil + case "NOTE", "NOTICE": + return logging.LogLevelNotice, engine.LogLevelNote, nil + case "WARN": + return logging.LogLevelNotice, engine.LogLevelWarn, nil + case "ERROR", "ERR": + return logging.LogLevelError, engine.LogLevelErr, nil + case "CRIT": + return logging.LogLevelError, engine.LogLevelCrit, nil + case "ALRT": + return logging.LogLevelError, engine.LogLevelAlrt, nil + case "FATAL", "EMRG": + return logging.LogLevelError, engine.LogLevelEmrg, nil + case "EMIT": + return logging.LogLevelError, engine.LogLevelEmit, nil + default: + return logging.LogLevelDisabled, engine.LogLevelUndefined, errors.Errorf("invalid log level %q", level) + } +} + +func newLogger(opts *cliOptions) (*logging.LeveledLogger, error) { + level := "ERR" + if opts.Debug != "" { + level = opts.Debug + } + cliLogLevel, engineLogLevel, err := strToLogLevels(level) + if err != nil { + return nil, errors.Wrap(err, "Error parsing log level") + } + + consoleLog := logging.NewCommandLineLogger() + consoleLog.WithLogLevel(cliLogLevel) + + setenvIfNotSet("D_LOG_MASK", engineLogLevel.String()) + setenvIfNotSet("DD_STDERR", "ERR") + + if opts.LogDir == "" { + return consoleLog, nil + } + + path := filepath.Clean(opts.LogDir) + fi, err := os.Stat(path) + if err != nil { + return nil, errors.Wrapf(err, "Error accessing debug directory %q", path) + } + if !fi.IsDir() { + return nil, errors.Errorf("Debug path %q is not a directory", path) + } + + setenvIfNotSet("D_LOG_FILE", 
filepath.Join(path, "ddb-engine.log")) + + var fd *os.File + fd, err = common.AppendFile(filepath.Join(path, "ddb-cli.log")) + if err != nil { + return nil, errors.Wrapf(err, "Error opening debug log file 'ddb-cli.log' in %q", path) + } + + consoleLog.WithLogLevel(logging.LogLevelError) + fileLog := logging.NewCombinedLogger("DDB", fd) + fileLog.WithLogLevel(cliLogLevel) + fileLog.WithErrorLogger(consoleLog) + + return fileLog, nil +} + +func parseOpts(args []string, opts *cliOptions) error { p := flags.NewParser(opts, flags.HelpFlag|flags.IgnoreUnknown) p.Name = "ddb" p.Usage = "[OPTIONS]" p.ShortDescription = "daos debug tool" - p.LongDescription = `The DAOS Debug Tool (ddb) allows a user to navigate through and modify -a file in the VOS format. It offers both a command line and interactive -shell mode. If neither a single command or '-f' option is provided, then -the tool will run in interactive mode. In order to modify the VOS file, -the '-w' option must be included. If supplied, the VOS file supplied in -the first positional parameter will be opened before commands are executed. - -Many of the commands take a vos tree path. The format for this path -is [cont]/[obj]/[dkey]/[akey]/[extent]. -- cont - the full container uuid. -- obj - the object id. -- keys (akey, dkey) - there are multiple types of keys - -- string keys are simply the string value. If the size of the - key is greater than strlen(key), then the size is included at - the end of the string value. Example: 'akey{5}' is the key: akey - with a null terminator at the end. - -- number keys are formatted as '{[type]: NNN}' where type is - 'uint8, uint16, uint32, or uint64'. NNN can be a decimal or - hex number. Example: '{uint32: 123456}' - -- binary keys are formatted as '{bin: 0xHHH}' where HHH is the hex - representation of the binary key. Example: '{bin: 0x1a2b}' -- extent for array values - in the format {lo-hi}. 
- -To make it easier to navigate the tree, indexes can be -used instead of the path part. The index is in the format [i]. Indexes -and actual path values can be used together - -Example Paths: -/3550f5df-e6b1-4415-947e-82e15cf769af/939000573846355970.0.13.1/dkey/akey/[0-1023] -[0]/[1]/[2]/[1]/[9] -/[0]/939000573846355970.0.13.1/[2]/akey{5}/[0-1023] -` + p.LongDescription = ddbLongDescription // Set the traceback level such that a crash results in // a coredump (when ulimit -c is set appropriately). debug.SetTraceback("crash") if _, err := p.ParseArgs(args); err != nil { + if fe, ok := errors.Cause(err).(*flags.Error); ok && fe.Type == flags.ErrHelp { + os.Exit(printHelp(fe.Error(), opts)) + } + return err } if opts.Version { - log.Infof("ddb version %s", build.DaosVersion) - return nil + opts.Args.RunCmd = "version" + opts.Args.RunCmdArgs = []string{} + opts.CmdFile = "" } - if opts.Debug { - log.WithLogLevel(logging.LogLevelDebug) - log.Debug("debug output enabled") + if opts.Args.RunCmd != "" && opts.CmdFile != "" { + return errors.New("Cannot use both command file and a command string") } - ctx, cleanup, err := InitDdb(log) + log, err := newLogger(opts) if err != nil { + return errors.Wrap(err, "Error configuring logging") + } + log.Debug("Logging facilities initialized") + + var ( + ctx *DdbContext + cleanup func() + ) + if ctx, cleanup, err = InitDdb(log); err != nil { return errors.Wrap(err, "Error initializing the DDB Context") } defer cleanup() app := createGrumbleApp(ctx) - if opts.Args.VosPath != "" { + if opts.SysdbPath != "" { + cleanup := SetCString(&ctx.ctx.dc_db_path, string(opts.SysdbPath)) + defer cleanup() + } + + if opts.VosPath != "" { + cleanup := SetCString(&ctx.ctx.dc_pool_path, string(opts.VosPath)) + defer cleanup() + if !strings.HasPrefix(string(opts.Args.RunCmd), "feature") && + !strings.HasPrefix(string(opts.Args.RunCmd), "open") && + !strings.HasPrefix(string(opts.Args.RunCmd), "close") && + !strings.HasPrefix(string(opts.Args.RunCmd), 
"prov_mem") && + !strings.HasPrefix(string(opts.Args.RunCmd), "smd_sync") && !strings.HasPrefix(string(opts.Args.RunCmd), "rm_pool") && !strings.HasPrefix(string(opts.Args.RunCmd), "dev_list") && !strings.HasPrefix(string(opts.Args.RunCmd), "dev_replace") { - log.Debugf("Connect to path: %s\n", opts.Args.VosPath) - if err := ddbOpen(ctx, string(opts.Args.VosPath), string(opts.SysdbPath), opts.WriteMode); err != nil { - return errors.Wrapf(err, "Error opening path: %s", opts.Args.VosPath) + log.Debugf("Connect to path: %s\n", opts.VosPath) + if err := ddbOpen(ctx, string(opts.VosPath), bool(opts.WriteMode)); err != nil { + return errors.Wrapf(err, "Error opening path: %s", opts.VosPath) } } } - if opts.Args.RunCmd != "" && opts.CmdFile != "" { - return errors.New("Cannot use both command file and a command string") - } - - if opts.Args.VosPath != "" { - ctx.ctx.dc_pool_path = C.CString(string(opts.Args.VosPath)) - defer C.free(unsafe.Pointer(ctx.ctx.dc_pool_path)) - } if opts.Args.RunCmd != "" || opts.CmdFile != "" { // Non-interactive mode if opts.Args.RunCmd != "" { - err := runCmdStr(app, string(opts.Args.RunCmd), opts.Args.RunCmdArgs...) + err := runCmdStr(app, p, string(opts.Args.RunCmd), opts.Args.RunCmdArgs...) 
if err != nil { log.Errorf("Error running command %q %s\n", string(opts.Args.RunCmd), err) } } else { err := runFileCmds(log, app, opts.CmdFile) if err != nil { - log.Error("Error running command file\n") + log.Errorf("Error running command file: %s\n", err) } } @@ -244,14 +365,9 @@ Example Paths: func main() { var opts cliOptions - log := logging.NewCommandLineLogger() - if err := parseOpts(os.Args[1:], &opts, log); err != nil { - if fe, ok := errors.Cause(err).(*flags.Error); ok && fe.Type == flags.ErrHelp { - log.Info(fe.Error()) - os.Exit(0) - } - exitWithError(log, err) + if err := parseOpts(os.Args[1:], &opts); err != nil { + exitWithError(err) } } @@ -286,7 +402,158 @@ func createGrumbleApp(ctx *DdbContext) *grumble.App { return app } +const manMacroSection = `.\" Miscellaneous Helper macros +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" ======================================================================== +.\"` + +const manArgsHeader = `.SH ARGUMENTS +.SS Application Arguments` + +const manCmdsHeader = `.SH COMMANDS +.SS Available Commands` + +const manPathSection = `.SH PATH +.SS VOS Tree Path +Many of the commands take a VOS tree path. The format for this path is [cont]/[obj]/[dkey]/[akey]/[extent]. +.TP +.B cont +The full container uuid. +.TP +.B obj +The object id. +.TP +.B keys (akey, dkey) +There are multiple types of keys: +.RS +.IP "*" 4 +.B string keys +are simply the string value. If the size of the key is greater than strlen(key), then +the size is included at the end of the string value. Example: 'akey{5}' is the key: akey with a null +terminator at the end. +.IP "*" 4 +.B number keys +are formatted as '{[type]: NNN}' where type is 'uint8, uint16, uint32, or uint64'. NNN +can be a decimal or hex number. 
Example: '{uint32: 123456}' +.IP "*" 4 +.B binary keys +are formatted as '{bin: 0xHHH}' where HHH is the hex representation of the binary key. +Example: '{bin: 0x1a2b}' +.RE +.TP +.B extent +For array values in the format {lo-hi}. +.SS Index Tree Path +.RE +To make it easier to navigate the tree, indexes can be used instead of the path part. The index is +in the format [i]. Indexes and actual path values can be used together. +.SS Path Examples +VOS tree path examples: +.Sp +.Vb 1 +\& /3550f5df-e6b1-4415-947e-82e15cf769af/939000573846355970.0.13.1/dkey/akey/[0-1023] +.Ve +.Sp +Index tree path examples: +.Sp +.Vb 1 +\& [0]/[1]/[2]/[1]/[9] +.Ve +.Sp +Mixed tree path examples: +.Sp +.Vb 1 +\& /[0]/939000573846355970.0.13.1/[2]/akey{5}/[0-1023] +.Ve +.Sp` + +const manLoggingSection = `.SH LOGGING +The golang cli and the C engine use separate logging systems with different log levels. +The \fI--debug=\fR option sets the log level for both systems to the closest matching +levels. The available log levels supported by this option are: \fBTRACE\fR, \fBDEBUG\fR (or +\fBDBG\fR), \fBINFO\fR, \fBNOTICE\fR (or \fBNOTE\fR), \fBWARN\fR, \fBERROR\fR (or \fBERR\fR), +\fBCRIT\fR, \fBALRT\fR, \fBFATAL\fR (or \fBEMRG\fr), and \fBEMIT\fR. The default log level is +\fBERROR\fR. + +To not pollute the console output, the logs can be redirected to a file using the +\fI--log_dir=\fR option. 
However, \fBERROR\fR log messages or above will still be printed to +the console regardless if the \fI--log_dir=\fR option is used or not.` + +func fprintManPage(dest io.Writer, app *grumble.App, parser *flags.Parser) { + fmt.Fprintln(dest, manMacroSection) + + parser.WriteManPage(dest) + + fmt.Fprintln(dest, manArgsHeader) + for _, arg := range parser.Args() { + fmt.Fprintf(dest, ".TP\n.B %s\n%s\n", arg.Name, arg.Description) + } + + fmt.Fprintln(dest, manCmdsHeader) + for _, cmd := range app.Commands().All() { + if cmd.Name == "manpage" { + continue + } + + var cmdHelp string + if cmd.LongHelp != "" { + cmdHelp = cmd.LongHelp + } else { + cmdHelp = cmd.Help + } + fmt.Fprintf(dest, ".TP\n.B %s\n%s\n", cmd.Name, cmdHelp) + } + + fmt.Fprintln(dest, manPathSection) + + fmt.Fprint(dest, manLoggingSection) +} + // Run the command in 'run' using the grumble app. shlex is used to parse the string into an argv/c format -func runCmdStr(app *grumble.App, cmd string, args ...string) error { +func runCmdStr(app *grumble.App, p *flags.Parser, cmd string, args ...string) error { + if p != nil { + app.AddCommand(&grumble.Command{ + Name: "manpage", + Help: "Generate an application man page in groff format.", + LongHelp: "Generate an application man page in groff format. This command is used internally to generate the man page for the application and is not intended for general use.", + HelpGroup: "", + Flags: func(a *grumble.Flags) { + a.String("o", "output", "", "Output file for the man page. 
If not provided, the man page will be printed to stdout.") + }, + Run: func(c *grumble.Context) error { + dest := os.Stdout + if c.Flags.String("output") != "" { + fd, err := os.Create(c.Flags.String("output")) + if err != nil { + return errors.Wrapf(err, "Error creating file %q", c.Flags.String("output")) + } + defer func() { + err = fd.Close() + if err != nil { + fmt.Fprintf(os.Stderr, "Error closing file %q: %s\n", c.Flags.String("output"), err) + } + }() + dest = fd + } + + fprintManPage(dest, app, p) + return nil + }, + Completer: nil, + }) + } + return app.RunCommand(append([]string{cmd}, args...)) } diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index 578de0d75a6..bec414701d2 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -592,9 +592,10 @@ disable_vfio: false disable_vmd: false disable_hotplug: false nr_hugepages: 0 -system_ram_reserved: 26 +system_ram_reserved: 64 disable_hugepages: false allow_numa_imbalance: false +allow_thp: false control_log_mask: INFO control_log_file: /var/log/daos/daos_server.log core_dump_filter: 19 diff --git a/src/control/cmd/dmg/pool_test.go b/src/control/cmd/dmg/pool_test.go index 61711bbe700..5eeae387584 100644 --- a/src/control/cmd/dmg/pool_test.go +++ b/src/control/cmd/dmg/pool_test.go @@ -138,7 +138,7 @@ func createACLFile(t *testing.T, dir string, acl *control.AccessControlList) str return test.CreateTestFile(t, dir, control.FormatACLDefault(acl)) } -func TestPoolCommands(t *testing.T) { +func TestDmg_PoolCommands(t *testing.T) { testSizeStr := "512GiB" testSize := 549755813888 eUsr, err := user.Current() diff --git a/src/control/cmd/dmg/pretty/check.go b/src/control/cmd/dmg/pretty/check.go index 
334d4e848ea..7860e694d26 100644 --- a/src/control/cmd/dmg/pretty/check.go +++ b/src/control/cmd/dmg/pretty/check.go @@ -50,26 +50,40 @@ func PrintCheckerPolicies(out io.Writer, flags control.SystemCheckFlags, policie tf.Format(table) } -func countResultPools(resp *control.SystemCheckQueryResp) int { +func countResultPools(resp *control.SystemCheckQueryResp) (int, int) { if resp == nil { - return 0 + return 0, 0 } - poolMap := make(map[string]struct{}) + checkedMap := make(map[string]struct{}) + uncheckedMap := make(map[string]struct{}) for _, pool := range resp.Pools { - // Don't include pools that were not checked. if pool.Unchecked() { - continue + uncheckedMap[pool.UUID] = struct{}{} + } else { + checkedMap[pool.UUID] = struct{}{} } - poolMap[pool.UUID] = struct{}{} } for _, report := range resp.Reports { if report.IsRemovedPool() && report.PoolUuid != "" { - poolMap[report.PoolUuid] = struct{}{} + checkedMap[report.PoolUuid] = struct{}{} } } - return len(poolMap) + return len(checkedMap), len(uncheckedMap) +} + +func printSystemCheckPoolInfo(out io.Writer, pools []*control.SystemCheckPoolInfo, verbose bool) { + if verbose { + fmt.Fprintln(out, "\nPer-Pool Checker Info:") + } else { + fmt.Fprintln(out, "\nUnchecked Pools:") + } + for _, pool := range pools { + if verbose || pool.Unchecked() { + fmt.Fprintf(out, " %+v\n", pool) + } + } } // PrintCheckQueryResp prints the checker results to the console. @@ -91,15 +105,15 @@ func PrintCheckQueryResp(out io.Writer, resp *control.SystemCheckQueryResp, verb // should show the number of pools being checked. If the checker has completed, // we should show the number of unique pools found in the reports. 
action := "Checking" - poolCount := countResultPools(resp) + checkedCount, uncheckedCount := countResultPools(resp) if resp.Status == control.SystemCheckStatusCompleted { action = "Checked" } - if poolCount > 0 { - fmt.Fprintf(out, " %s %s\n", action, english.Plural(poolCount, "pool", "")) + if checkedCount > 0 { + fmt.Fprintf(out, " %s %s\n", action, english.Plural(checkedCount, "pool", "")) } - if len(resp.Pools) > 0 && verbose { + if len(resp.Pools) > 0 && (verbose || uncheckedCount > 0) { pools := make([]*control.SystemCheckPoolInfo, 0, len(resp.Pools)) for _, pool := range resp.Pools { pools = append(pools, pool) @@ -107,10 +121,7 @@ func PrintCheckQueryResp(out io.Writer, resp *control.SystemCheckQueryResp, verb sort.Slice(pools, func(i, j int) bool { return pools[i].UUID < pools[j].UUID }) - fmt.Fprintln(out, "\nPer-Pool Checker Info:") - for _, pool := range pools { - fmt.Fprintf(out, " %+v\n", pool) - } + printSystemCheckPoolInfo(out, pools, verbose) } fmt.Fprintln(out) diff --git a/src/control/cmd/dmg/pretty/check_test.go b/src/control/cmd/dmg/pretty/check_test.go index ca96f645fa7..3394206d020 100644 --- a/src/control/cmd/dmg/pretty/check_test.go +++ b/src/control/cmd/dmg/pretty/check_test.go @@ -203,11 +203,6 @@ Inconsistency Reports: Phase: chkpb.CheckScanPhase_CSP_DONE.String(), StartTime: checkTime, }, - "pool-5": { - UUID: "pool-5", - Status: chkpb.CheckPoolStatus_CPS_UNCHECKED.String(), - Phase: chkpb.CheckScanPhase_CSP_PREPARE.String(), - }, }, Reports: []*control.SystemCheckReport{ { @@ -264,6 +259,85 @@ ID Class Pool Resolution 0x3 POOL_LESS_SVC_WITHOUT_QUORUM pool-3 TRUST_PS 0x4 POOL_NONEXIST_ON_ENGINE pool-4 DISCARD (dry run) +`, + }, + "non-verbose with unchecked pools": { + resp: &control.SystemCheckQueryResp{ + Status: control.SystemCheckStatusCompleted, + ScanPhase: control.SystemCheckScanPhaseDone, + Pools: map[string]*control.SystemCheckPoolInfo{ + "pool-1": { + UUID: "pool-1", + Status: chkpb.CheckPoolStatus_CPS_CHECKED.String(), + 
Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + StartTime: checkTime, + }, + "pool-2": { + UUID: "pool-2", + Status: chkpb.CheckPoolStatus_CPS_UNCHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + }, + "pool-3": { + UUID: "pool-3", + Status: chkpb.CheckPoolStatus_CPS_CHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + StartTime: checkTime, + }, + "pool-5": { + UUID: "pool-5", + Status: chkpb.CheckPoolStatus_CPS_UNCHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_PREPARE.String(), + }, + }, + Reports: []*control.SystemCheckReport{ + { + CheckReport: chkpb.CheckReport{ + Seq: 1, + Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_SVCL, + Action: chkpb.CheckInconsistAction_CIA_IGNORE, + Msg: "message 1", + PoolUuid: "pool-1", + }, + }, + { + CheckReport: chkpb.CheckReport{ + Seq: 3, + Class: chkpb.CheckInconsistClass_CIC_POOL_LESS_SVC_WITHOUT_QUORUM, + Action: chkpb.CheckInconsistAction_CIA_TRUST_PS, + Msg: "message 3", + PoolUuid: "pool-3", + }, + }, + { + CheckReport: chkpb.CheckReport{ + Seq: 4, + Result: int32(chkpb.CheckResult_DRY_RUN), + Class: chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_ENGINE, + Action: chkpb.CheckInconsistAction_CIA_DISCARD, + Msg: "message 4", + PoolUuid: "pool-4", + }, + }, + }, + }, + expOut: ` +DAOS System Checker Info + Current status: COMPLETED + Current phase: DONE (Check completed) + Checked 3 pools + +Unchecked Pools: + Pool pool-2: 0 ranks, status: CPS_UNCHECKED, phase: CSP_DONE + Pool pool-5: 0 ranks, status: CPS_UNCHECKED, phase: CSP_PREPARE + +Inconsistency Reports: +- Resolved: +ID Class Pool Resolution +-- ----- ---- ---------- +0x1 POOL_BAD_SVCL pool-1 IGNORE +0x3 POOL_LESS_SVC_WITHOUT_QUORUM pool-3 TRUST_PS +0x4 POOL_NONEXIST_ON_ENGINE pool-4 DISCARD (dry run) + `, }, "non-verbose with container": { diff --git a/src/control/cmd/dmg/pretty/pool_test.go b/src/control/cmd/dmg/pretty/pool_test.go index 1ba539bfeeb..ddbd3fd3aed 100644 --- a/src/control/cmd/dmg/pretty/pool_test.go +++ 
b/src/control/cmd/dmg/pretty/pool_test.go @@ -54,50 +54,46 @@ func TestPretty_PrintPoolQueryTargetResponse(t *testing.T) { Status: 0, Infos: []*daos.PoolQueryTargetInfo{ { - Type: 0, State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, expPrintStr: ` -Target: type unknown, state down +Target: state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state down_out +Target: state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -111,107 +107,46 @@ Target: type unknown, state up_in Status: 0, Infos: []*daos.PoolQueryTargetInfo{ { - Type: 0, State: 42, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{tier0, tier1}, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, expPrintStr: ` -Target: type unknown, state invalid +Target: state invalid - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 
(NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state down_out +Target: state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB - Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -Target: type unknown, state up_in -- Storage tier 0 (SCM): - Total size: 6.0 GB - Free: 5.0 GB -- Storage tier 1 (NVME): - Total size: 100 GB - Free: 90 GB -`, - }, - "invalid target type": { - pqtr: &control.PoolQueryTargetResp{ - Status: 0, - Infos: []*daos.PoolQueryTargetInfo{ - { - Type: 42, - State: daos.PoolTargetStateDown, - Space: []*daos.StorageUsageStats{tier0, tier1}, - }, - { - Type: 0, - State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{tier0, tier1}, - }, - { - Type: 0, - State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{tier0, tier1}, - }, - { - Type: 0, - State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{tier0, tier1}, - }, - }, - }, - expPrintStr: ` -Target: type invalid, state down -- Storage tier 0 (SCM): - Total size: 6.0 GB - Free: 5.0 GB -- Storage tier 1 (NVME): - Total size: 100 GB - Free: 90 GB -Target: type unknown, state up_in -- Storage tier 0 (SCM): - Total size: 6.0 GB - Free: 5.0 GB -- Storage tier 1 (NVME): - Total size: 100 GB - Free: 90 GB -Target: type unknown, state down_out -- Storage tier 0 (SCM): - Total size: 6.0 GB - Free: 5.0 GB -- Storage tier 1 (NVME): - Total size: 100 GB - Free: 90 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -225,7 +160,6 @@ Target: type unknown, state up_in Status: 0, Infos: []*daos.PoolQueryTargetInfo{ { - Type: 0, State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ tier0, tier1, @@ -236,7 +170,6 @@ Target: type unknown, state up_in }, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ tier0, tier1, @@ -247,7 +180,6 @@ Target: type unknown, state up_in }, }, { - Type: 0, State: daos.PoolTargetStateDownOut, Space: 
[]*daos.StorageUsageStats{ tier0, tier1, @@ -259,7 +191,6 @@ Target: type unknown, state up_in }, }, { - Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ tier0, tier1, @@ -272,7 +203,7 @@ Target: type unknown, state up_in }, }, expPrintStr: ` -Target: type unknown, state down +Target: state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -282,7 +213,7 @@ Target: type unknown, state down - Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -292,7 +223,7 @@ Target: type unknown, state up_in - Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB -Target: type unknown, state down_out +Target: state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB @@ -302,7 +233,7 @@ Target: type unknown, state down_out - Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB -Target: type unknown, state up_in +Target: state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB diff --git a/src/control/cmd/dmg/system_test.go b/src/control/cmd/dmg/system_test.go index 9332ec38432..dc61b6c7b91 100644 --- a/src/control/cmd/dmg/system_test.go +++ b/src/control/cmd/dmg/system_test.go @@ -635,7 +635,7 @@ func TestDmg_systemRebuildOpCmd_execute(t *testing.T) { resp: &mgmtpb.SystemRebuildManageResp{}, expInfo: "System-rebuild start request succeeded on 0 pools []", }, - "pool stop failed": { + "rebuild stop failed": { ctlCfg: &control.Config{}, opCode: control.PoolRebuildOpCodeStop, resp: &mgmtpb.SystemRebuildManageResp{ @@ -661,7 +661,7 @@ func TestDmg_systemRebuildOpCmd_execute(t *testing.T) { expErr: errors.New("failed on pool foo: failed, pool-rebuild stop failed on pool bar"), expInfo: "System-rebuild stop request succeeded on 1 pool", }, - "pool start succeeded; verbose": { + "rebuild start succeeded; verbose": { ctlCfg: &control.Config{}, opCode: control.PoolRebuildOpCodeStart, verbose: true, 
@@ -702,6 +702,9 @@ func TestDmg_systemRebuildOpCmd_execute(t *testing.T) { gotErr := rbldCmd.execute(tc.opCode, tc.force) test.CmpErr(t, tc.expErr, gotErr) + // Note this doesn't verify that the text is on an INFO or DEBUG line + // specifically, just that it appears in log output. + if !strings.Contains(buf.String(), tc.expInfo) { t.Fatalf("expected info log output to contain %s, got %s\n", tc.expInfo, buf.String()) @@ -713,3 +716,52 @@ func TestDmg_systemRebuildOpCmd_execute(t *testing.T) { }) } } + +func TestDmg_systemSelfHealEvalCmd_execute(t *testing.T) { + for name, tc := range map[string]struct { + ctlCfg *control.Config + resp *mgmtpb.DaosResp + msErr error + expErr error + expInfo string + }{ + "no config": { + expErr: errors.New("system self-heal eval failed: no configuration loaded"), + }, + "ms failures": { + ctlCfg: &control.Config{}, + msErr: errors.New("failed"), + expErr: errors.New("failed"), + }, + "success": { + ctlCfg: &control.Config{}, + resp: &mgmtpb.DaosResp{}, + expInfo: "System self-heal eval request succeeded", + }, + "daos error": { + ctlCfg: &control.Config{}, + resp: &mgmtpb.DaosResp{ + Status: -1, + }, + expErr: errors.New("DER_UNKNOWN"), + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + mi := control.NewMockInvoker(log, &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("10.0.0.1:10001", + tc.msErr, tc.resp), + }) + + cmd := new(systemSelfHealEvalCmd) + cmd.setInvoker(mi) + cmd.SetLog(log) + cmd.setConfig(tc.ctlCfg) + + gotErr := cmd.Execute(nil) + test.CmpErr(t, tc.expErr, gotErr) + }) + } +} diff --git a/src/control/common/proto/chk/chk.pb.go b/src/control/common/proto/chk/chk.pb.go index c53a2c1788e..fbfae461dde 100644 --- a/src/control/common/proto/chk/chk.pb.go +++ b/src/control/common/proto/chk/chk.pb.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. 
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -212,24 +213,27 @@ const ( CheckInconsistAction_CIA_TRUST_EC_PARITY CheckInconsistAction = 11 // Trust EC data shard. CheckInconsistAction_CIA_TRUST_EC_DATA CheckInconsistAction = 12 + // Stale unresolved interaction. The checker can no longer address this report without re-running on affected pool. + CheckInconsistAction_CIA_STALE CheckInconsistAction = 65535 ) // Enum value maps for CheckInconsistAction. var ( CheckInconsistAction_name = map[int32]string{ - 0: "CIA_DEFAULT", - 1: "CIA_INTERACT", - 2: "CIA_IGNORE", - 3: "CIA_DISCARD", - 4: "CIA_READD", - 5: "CIA_TRUST_MS", - 6: "CIA_TRUST_PS", - 7: "CIA_TRUST_TARGET", - 8: "CIA_TRUST_MAJORITY", - 9: "CIA_TRUST_LATEST", - 10: "CIA_TRUST_OLDEST", - 11: "CIA_TRUST_EC_PARITY", - 12: "CIA_TRUST_EC_DATA", + 0: "CIA_DEFAULT", + 1: "CIA_INTERACT", + 2: "CIA_IGNORE", + 3: "CIA_DISCARD", + 4: "CIA_READD", + 5: "CIA_TRUST_MS", + 6: "CIA_TRUST_PS", + 7: "CIA_TRUST_TARGET", + 8: "CIA_TRUST_MAJORITY", + 9: "CIA_TRUST_LATEST", + 10: "CIA_TRUST_OLDEST", + 11: "CIA_TRUST_EC_PARITY", + 12: "CIA_TRUST_EC_DATA", + 65535: "CIA_STALE", } CheckInconsistAction_value = map[string]int32{ "CIA_DEFAULT": 0, @@ -245,6 +249,7 @@ var ( "CIA_TRUST_OLDEST": 10, "CIA_TRUST_EC_PARITY": 11, "CIA_TRUST_EC_DATA": 12, + "CIA_STALE": 65535, } ) @@ -870,7 +875,7 @@ var file_chk_chk_proto_rawDesc = []byte{ 0x4f, 0x53, 0x54, 0x5f, 0x45, 0x43, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x10, 0x13, 0x12, 0x1a, 0x0a, 0x16, 0x43, 0x49, 0x43, 0x5f, 0x4f, 0x42, 0x4a, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x5f, 0x49, 0x4e, 0x43, 0x4f, 0x4e, 0x53, 0x49, 0x53, 0x54, 0x10, 0x14, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x43, - 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x64, 0x2a, 0x97, 0x02, 0x0a, 0x14, 0x43, + 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x64, 0x2a, 0xa8, 0x02, 0x0a, 0x14, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x49, 0x6e, 0x63, 0x6f, 0x6e, 
0x73, 0x69, 0x73, 0x74, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x41, 0x5f, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, 0x54, 0x10, 0x00, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x49, 0x41, 0x5f, 0x49, 0x4e, 0x54, 0x45, @@ -888,55 +893,56 @@ var file_chk_chk_proto_rawDesc = []byte{ 0x53, 0x54, 0x10, 0x0a, 0x12, 0x17, 0x0a, 0x13, 0x43, 0x49, 0x41, 0x5f, 0x54, 0x52, 0x55, 0x53, 0x54, 0x5f, 0x45, 0x43, 0x5f, 0x50, 0x41, 0x52, 0x49, 0x54, 0x59, 0x10, 0x0b, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x49, 0x41, 0x5f, 0x54, 0x52, 0x55, 0x53, 0x54, 0x5f, 0x45, 0x43, 0x5f, 0x44, 0x41, - 0x54, 0x41, 0x10, 0x0c, 0x2a, 0x89, 0x01, 0x0a, 0x09, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x46, 0x6c, - 0x61, 0x67, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, - 0x0d, 0x0a, 0x09, 0x43, 0x46, 0x5f, 0x44, 0x52, 0x59, 0x52, 0x55, 0x4e, 0x10, 0x01, 0x12, 0x0c, - 0x0a, 0x08, 0x43, 0x46, 0x5f, 0x52, 0x45, 0x53, 0x45, 0x54, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, - 0x43, 0x46, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x4f, 0x55, 0x54, 0x10, 0x04, 0x12, 0x0b, 0x0a, 0x07, - 0x43, 0x46, 0x5f, 0x41, 0x55, 0x54, 0x4f, 0x10, 0x08, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x46, 0x5f, - 0x4f, 0x52, 0x50, 0x48, 0x41, 0x4e, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x10, 0x10, 0x12, 0x11, 0x0a, - 0x0d, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x4f, 0x55, 0x54, 0x10, 0x20, - 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x5f, 0x41, 0x55, 0x54, 0x4f, 0x10, 0x40, - 0x2a, 0x88, 0x01, 0x0a, 0x0f, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x49, 0x6e, 0x73, 0x74, 0x53, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x49, 0x53, 0x5f, 0x49, 0x4e, 0x49, 0x54, - 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x53, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, 0x4e, - 0x47, 0x10, 0x01, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x49, 0x53, 0x5f, 0x43, 0x4f, 0x4d, 0x50, 0x4c, - 0x45, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x53, 0x5f, 0x53, 0x54, - 0x4f, 0x50, 0x50, 0x45, 0x44, 0x10, 
0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x49, 0x53, 0x5f, 0x46, - 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x04, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x49, 0x53, 0x5f, 0x50, - 0x41, 0x55, 0x53, 0x45, 0x44, 0x10, 0x05, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x49, 0x53, 0x5f, 0x49, - 0x4d, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x54, 0x45, 0x44, 0x10, 0x06, 0x2a, 0x9d, 0x01, 0x0a, 0x0f, - 0x43, 0x68, 0x65, 0x63, 0x6b, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, - 0x11, 0x0a, 0x0d, 0x43, 0x50, 0x53, 0x5f, 0x55, 0x4e, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x45, 0x44, - 0x10, 0x00, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x50, 0x53, 0x5f, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x49, - 0x4e, 0x47, 0x10, 0x01, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 0x5f, 0x43, 0x48, 0x45, 0x43, - 0x4b, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x50, 0x53, 0x5f, 0x46, 0x41, 0x49, - 0x4c, 0x45, 0x44, 0x10, 0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x50, 0x53, 0x5f, 0x50, 0x41, 0x55, - 0x53, 0x45, 0x44, 0x10, 0x04, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 0x5f, 0x50, 0x45, 0x4e, - 0x44, 0x49, 0x4e, 0x47, 0x10, 0x05, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 0x5f, 0x53, 0x54, - 0x4f, 0x50, 0x50, 0x45, 0x44, 0x10, 0x06, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x50, 0x53, 0x5f, 0x49, - 0x4d, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x54, 0x45, 0x44, 0x10, 0x07, 0x2a, 0xe0, 0x01, 0x0a, 0x0e, - 0x43, 0x68, 0x65, 0x63, 0x6b, 0x53, 0x63, 0x61, 0x6e, 0x50, 0x68, 0x61, 0x73, 0x65, 0x12, 0x0f, - 0x0a, 0x0b, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x52, 0x45, 0x50, 0x41, 0x52, 0x45, 0x10, 0x00, 0x12, - 0x11, 0x0a, 0x0d, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x5f, 0x4c, 0x49, 0x53, 0x54, - 0x10, 0x01, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x5f, 0x4d, - 0x42, 0x53, 0x10, 0x02, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, - 0x5f, 0x43, 0x4c, 0x45, 0x41, 0x4e, 0x55, 0x50, 0x10, 0x03, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x53, - 0x50, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x5f, 0x4c, 0x49, 0x53, 0x54, 0x10, 
0x04, 0x12, 0x14, 0x0a, - 0x10, 0x43, 0x53, 0x50, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x5f, 0x43, 0x4c, 0x45, 0x41, 0x4e, 0x55, - 0x50, 0x10, 0x05, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x53, 0x50, 0x5f, 0x44, 0x54, 0x58, 0x5f, 0x52, - 0x45, 0x53, 0x59, 0x4e, 0x43, 0x10, 0x06, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x53, 0x50, 0x5f, 0x4f, - 0x42, 0x4a, 0x5f, 0x53, 0x43, 0x52, 0x55, 0x42, 0x10, 0x07, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x53, - 0x50, 0x5f, 0x52, 0x45, 0x42, 0x55, 0x49, 0x4c, 0x44, 0x10, 0x08, 0x12, 0x13, 0x0a, 0x0f, 0x43, - 0x53, 0x50, 0x5f, 0x41, 0x47, 0x47, 0x52, 0x45, 0x47, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x09, - 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x53, 0x50, 0x5f, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x0a, 0x2a, 0x27, - 0x0a, 0x0b, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x0b, 0x0a, - 0x07, 0x53, 0x55, 0x43, 0x43, 0x45, 0x53, 0x53, 0x10, 0x00, 0x12, 0x0b, 0x0a, 0x07, 0x44, 0x52, - 0x59, 0x5f, 0x52, 0x55, 0x4e, 0x10, 0x01, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, - 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, - 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, - 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, - 0x68, 0x6b, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x54, 0x41, 0x10, 0x0c, 0x12, 0x0f, 0x0a, 0x09, 0x43, 0x49, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x4c, + 0x45, 0x10, 0xff, 0xff, 0x03, 0x2a, 0x89, 0x01, 0x0a, 0x09, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x46, + 0x6c, 0x61, 0x67, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, + 0x12, 0x0d, 0x0a, 0x09, 0x43, 0x46, 0x5f, 0x44, 0x52, 0x59, 0x52, 0x55, 0x4e, 0x10, 0x01, 0x12, + 0x0c, 0x0a, 0x08, 0x43, 0x46, 0x5f, 0x52, 0x45, 0x53, 0x45, 0x54, 0x10, 0x02, 0x12, 0x0e, 0x0a, + 0x0a, 0x43, 0x46, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x4f, 0x55, 0x54, 0x10, 0x04, 0x12, 0x0b, 0x0a, + 0x07, 0x43, 0x46, 0x5f, 0x41, 0x55, 0x54, 0x4f, 
0x10, 0x08, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x46, + 0x5f, 0x4f, 0x52, 0x50, 0x48, 0x41, 0x4e, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x10, 0x10, 0x12, 0x11, + 0x0a, 0x0d, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x4f, 0x55, 0x54, 0x10, + 0x20, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x46, 0x5f, 0x4e, 0x4f, 0x5f, 0x41, 0x55, 0x54, 0x4f, 0x10, + 0x40, 0x2a, 0x88, 0x01, 0x0a, 0x0f, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x49, 0x6e, 0x73, 0x74, 0x53, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x49, 0x53, 0x5f, 0x49, 0x4e, 0x49, + 0x54, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x53, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, + 0x4e, 0x47, 0x10, 0x01, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x49, 0x53, 0x5f, 0x43, 0x4f, 0x4d, 0x50, + 0x4c, 0x45, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x49, 0x53, 0x5f, 0x53, + 0x54, 0x4f, 0x50, 0x50, 0x45, 0x44, 0x10, 0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x49, 0x53, 0x5f, + 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x04, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x49, 0x53, 0x5f, + 0x50, 0x41, 0x55, 0x53, 0x45, 0x44, 0x10, 0x05, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x49, 0x53, 0x5f, + 0x49, 0x4d, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x54, 0x45, 0x44, 0x10, 0x06, 0x2a, 0x9d, 0x01, 0x0a, + 0x0f, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x50, 0x53, 0x5f, 0x55, 0x4e, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x45, + 0x44, 0x10, 0x00, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x50, 0x53, 0x5f, 0x43, 0x48, 0x45, 0x43, 0x4b, + 0x49, 0x4e, 0x47, 0x10, 0x01, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 0x5f, 0x43, 0x48, 0x45, + 0x43, 0x4b, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x50, 0x53, 0x5f, 0x46, 0x41, + 0x49, 0x4c, 0x45, 0x44, 0x10, 0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x43, 0x50, 0x53, 0x5f, 0x50, 0x41, + 0x55, 0x53, 0x45, 0x44, 0x10, 0x04, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 0x5f, 0x50, 0x45, + 0x4e, 0x44, 0x49, 0x4e, 0x47, 0x10, 0x05, 0x12, 0x0f, 0x0a, 0x0b, 0x43, 0x50, 0x53, 
0x5f, 0x53, + 0x54, 0x4f, 0x50, 0x50, 0x45, 0x44, 0x10, 0x06, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x50, 0x53, 0x5f, + 0x49, 0x4d, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x54, 0x45, 0x44, 0x10, 0x07, 0x2a, 0xe0, 0x01, 0x0a, + 0x0e, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x53, 0x63, 0x61, 0x6e, 0x50, 0x68, 0x61, 0x73, 0x65, 0x12, + 0x0f, 0x0a, 0x0b, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x52, 0x45, 0x50, 0x41, 0x52, 0x45, 0x10, 0x00, + 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x5f, 0x4c, 0x49, 0x53, + 0x54, 0x10, 0x01, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, 0x4c, 0x5f, + 0x4d, 0x42, 0x53, 0x10, 0x02, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x53, 0x50, 0x5f, 0x50, 0x4f, 0x4f, + 0x4c, 0x5f, 0x43, 0x4c, 0x45, 0x41, 0x4e, 0x55, 0x50, 0x10, 0x03, 0x12, 0x11, 0x0a, 0x0d, 0x43, + 0x53, 0x50, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x5f, 0x4c, 0x49, 0x53, 0x54, 0x10, 0x04, 0x12, 0x14, + 0x0a, 0x10, 0x43, 0x53, 0x50, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x5f, 0x43, 0x4c, 0x45, 0x41, 0x4e, + 0x55, 0x50, 0x10, 0x05, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x53, 0x50, 0x5f, 0x44, 0x54, 0x58, 0x5f, + 0x52, 0x45, 0x53, 0x59, 0x4e, 0x43, 0x10, 0x06, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x53, 0x50, 0x5f, + 0x4f, 0x42, 0x4a, 0x5f, 0x53, 0x43, 0x52, 0x55, 0x42, 0x10, 0x07, 0x12, 0x0f, 0x0a, 0x0b, 0x43, + 0x53, 0x50, 0x5f, 0x52, 0x45, 0x42, 0x55, 0x49, 0x4c, 0x44, 0x10, 0x08, 0x12, 0x13, 0x0a, 0x0f, + 0x43, 0x53, 0x50, 0x5f, 0x41, 0x47, 0x47, 0x52, 0x45, 0x47, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x10, + 0x09, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x53, 0x50, 0x5f, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x0a, 0x2a, + 0x27, 0x0a, 0x0b, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x0b, + 0x0a, 0x07, 0x53, 0x55, 0x43, 0x43, 0x45, 0x53, 0x53, 0x10, 0x00, 0x12, 0x0b, 0x0a, 0x07, 0x44, + 0x52, 0x59, 0x5f, 0x52, 0x55, 0x4e, 0x10, 0x01, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, + 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, + 0x6b, 0x2f, 0x64, 0x61, 
0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, + 0x63, 0x68, 0x6b, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/common/proto/mgmt/pool.pb.go b/src/control/common/proto/mgmt/pool.pb.go index 18cf729a41d..12286b528fb 100644 --- a/src/control/common/proto/mgmt/pool.pb.go +++ b/src/control/common/proto/mgmt/pool.pb.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -131,22 +131,34 @@ func (PoolServiceState) EnumDescriptor() ([]byte, []int) { type PoolRebuildStatus_State int32 const ( - PoolRebuildStatus_IDLE PoolRebuildStatus_State = 0 - PoolRebuildStatus_DONE PoolRebuildStatus_State = 1 - PoolRebuildStatus_BUSY PoolRebuildStatus_State = 2 + PoolRebuildStatus_BUSY PoolRebuildStatus_State = 0 + PoolRebuildStatus_IDLE PoolRebuildStatus_State = 1 + PoolRebuildStatus_DONE PoolRebuildStatus_State = 2 + PoolRebuildStatus_STOPPING PoolRebuildStatus_State = 3 + PoolRebuildStatus_STOPPED PoolRebuildStatus_State = 4 + PoolRebuildStatus_FAILING PoolRebuildStatus_State = 5 + PoolRebuildStatus_FAILED PoolRebuildStatus_State = 6 ) // Enum value maps for PoolRebuildStatus_State. 
var ( PoolRebuildStatus_State_name = map[int32]string{ - 0: "IDLE", - 1: "DONE", - 2: "BUSY", + 0: "BUSY", + 1: "IDLE", + 2: "DONE", + 3: "STOPPING", + 4: "STOPPED", + 5: "FAILING", + 6: "FAILED", } PoolRebuildStatus_State_value = map[string]int32{ - "IDLE": 0, - "DONE": 1, - "BUSY": 2, + "BUSY": 0, + "IDLE": 1, + "DONE": 2, + "STOPPING": 3, + "STOPPED": 4, + "FAILING": 5, + "FAILED": 6, } ) @@ -177,61 +189,6 @@ func (PoolRebuildStatus_State) EnumDescriptor() ([]byte, []int) { return file_mgmt_pool_proto_rawDescGZIP(), []int{20, 0} } -type PoolQueryTargetInfo_TargetType int32 - -const ( - PoolQueryTargetInfo_UNKNOWN PoolQueryTargetInfo_TargetType = 0 - PoolQueryTargetInfo_HDD PoolQueryTargetInfo_TargetType = 1 // Rotating disk - PoolQueryTargetInfo_SSD PoolQueryTargetInfo_TargetType = 2 // Flash-based - PoolQueryTargetInfo_PM PoolQueryTargetInfo_TargetType = 3 // Persistent memory - PoolQueryTargetInfo_VM PoolQueryTargetInfo_TargetType = 4 // Volatile memory -) - -// Enum value maps for PoolQueryTargetInfo_TargetType. 
-var ( - PoolQueryTargetInfo_TargetType_name = map[int32]string{ - 0: "UNKNOWN", - 1: "HDD", - 2: "SSD", - 3: "PM", - 4: "VM", - } - PoolQueryTargetInfo_TargetType_value = map[string]int32{ - "UNKNOWN": 0, - "HDD": 1, - "SSD": 2, - "PM": 3, - "VM": 4, - } -) - -func (x PoolQueryTargetInfo_TargetType) Enum() *PoolQueryTargetInfo_TargetType { - p := new(PoolQueryTargetInfo_TargetType) - *p = x - return p -} - -func (x PoolQueryTargetInfo_TargetType) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (PoolQueryTargetInfo_TargetType) Descriptor() protoreflect.EnumDescriptor { - return file_mgmt_pool_proto_enumTypes[3].Descriptor() -} - -func (PoolQueryTargetInfo_TargetType) Type() protoreflect.EnumType { - return &file_mgmt_pool_proto_enumTypes[3] -} - -func (x PoolQueryTargetInfo_TargetType) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use PoolQueryTargetInfo_TargetType.Descriptor instead. -func (PoolQueryTargetInfo_TargetType) EnumDescriptor() ([]byte, []int) { - return file_mgmt_pool_proto_rawDescGZIP(), []int{30, 0} -} - type PoolQueryTargetInfo_TargetState int32 const ( @@ -277,11 +234,11 @@ func (x PoolQueryTargetInfo_TargetState) String() string { } func (PoolQueryTargetInfo_TargetState) Descriptor() protoreflect.EnumDescriptor { - return file_mgmt_pool_proto_enumTypes[4].Descriptor() + return file_mgmt_pool_proto_enumTypes[3].Descriptor() } func (PoolQueryTargetInfo_TargetState) Type() protoreflect.EnumType { - return &file_mgmt_pool_proto_enumTypes[4] + return &file_mgmt_pool_proto_enumTypes[3] } func (x PoolQueryTargetInfo_TargetState) Number() protoreflect.EnumNumber { @@ -290,7 +247,7 @@ func (x PoolQueryTargetInfo_TargetState) Number() protoreflect.EnumNumber { // Deprecated: Use PoolQueryTargetInfo_TargetState.Descriptor instead. 
func (PoolQueryTargetInfo_TargetState) EnumDescriptor() ([]byte, []int) { - return file_mgmt_pool_proto_rawDescGZIP(), []int{30, 1} + return file_mgmt_pool_proto_rawDescGZIP(), []int{30, 0} } // PoolCreateReq supplies new pool parameters. @@ -1778,16 +1735,19 @@ func (x *StorageUsageStats) GetMediaType() StorageMediaType { return StorageMediaType_SCM } -// PoolRebuildStatus represents a pool's rebuild status. +// PoolRebuildStatus represents a pool's rebuild status, translates to enum daos_rebuild_state_t +// IN_PROGRESS/NOT_STARTED/COMPLETED states. type PoolRebuildStatus struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code - State PoolRebuildStatus_State `protobuf:"varint,2,opt,name=state,proto3,enum=mgmt.PoolRebuildStatus_State" json:"state,omitempty"` - Objects uint64 `protobuf:"varint,3,opt,name=objects,proto3" json:"objects,omitempty"` - Records uint64 `protobuf:"varint,4,opt,name=records,proto3" json:"records,omitempty"` + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code + State PoolRebuildStatus_State `protobuf:"varint,2,opt,name=state,proto3,enum=mgmt.PoolRebuildStatus_State" json:"state,omitempty"` + Objects uint64 `protobuf:"varint,3,opt,name=objects,proto3" json:"objects,omitempty"` + Records uint64 `protobuf:"varint,4,opt,name=records,proto3" json:"records,omitempty"` + DerivedState PoolRebuildStatus_State `protobuf:"varint,5,opt,name=derived_state,json=derivedState,proto3,enum=mgmt.PoolRebuildStatus_State" json:"derived_state,omitempty"` + Degraded bool `protobuf:"varint,6,opt,name=degraded,proto3" json:"degraded,omitempty"` // data redundancy degraded } func (x *PoolRebuildStatus) Reset() { @@ -1833,7 +1793,7 @@ func (x *PoolRebuildStatus) GetState() PoolRebuildStatus_State { if x != nil { return x.State } - return PoolRebuildStatus_IDLE + return 
PoolRebuildStatus_BUSY } func (x *PoolRebuildStatus) GetObjects() uint64 { @@ -1850,6 +1810,20 @@ func (x *PoolRebuildStatus) GetRecords() uint64 { return 0 } +func (x *PoolRebuildStatus) GetDerivedState() PoolRebuildStatus_State { + if x != nil { + return x.DerivedState + } + return PoolRebuildStatus_BUSY +} + +func (x *PoolRebuildStatus) GetDegraded() bool { + if x != nil { + return x.Degraded + } + return false +} + // PoolQueryResp represents a pool query response. type PoolQueryResp struct { state protoimpl.MessageState @@ -2635,7 +2609,6 @@ type PoolQueryTargetInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Type PoolQueryTargetInfo_TargetType `protobuf:"varint,1,opt,name=type,proto3,enum=mgmt.PoolQueryTargetInfo_TargetType" json:"type,omitempty"` // Target type jsee enum daos_target_type_t State PoolQueryTargetInfo_TargetState `protobuf:"varint,2,opt,name=state,proto3,enum=mgmt.PoolQueryTargetInfo_TargetState" json:"state,omitempty"` // target state see enum daos_target_state_t // TODO: target performance data Space []*StorageTargetUsage `protobuf:"bytes,3,rep,name=space,proto3" json:"space,omitempty"` // this target's usage per storage tier @@ -2675,13 +2648,6 @@ func (*PoolQueryTargetInfo) Descriptor() ([]byte, []int) { return file_mgmt_pool_proto_rawDescGZIP(), []int{30} } -func (x *PoolQueryTargetInfo) GetType() PoolQueryTargetInfo_TargetType { - if x != nil { - return x.Type - } - return PoolQueryTargetInfo_UNKNOWN -} - func (x *PoolQueryTargetInfo) GetState() PoolQueryTargetInfo_TargetState { if x != nil { return x.State @@ -3279,7 +3245,7 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, - 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xbb, 
0x01, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, + 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xcf, 0x02, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x33, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, @@ -3288,184 +3254,186 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x18, 0x04, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, - 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, - 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, - 0x53, 0x59, 0x10, 0x02, 0x22, 0x89, 0x07, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, - 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, - 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, - 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x25, 0x0a, - 0x0e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, - 0x05, 0x20, 0x01, 0x28, 0x0d, 
0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x73, 0x12, 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, - 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, - 0x31, 0x0a, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, - 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, - 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, - 0x6c, 0x64, 0x12, 0x36, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x73, - 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, - 0x09, 0x74, 0x69, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, - 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x0b, - 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, - 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, - 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, - 0x6e, 0x6b, 0x73, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x65, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 
0x65, 0x73, 0x12, 0x26, 0x0a, - 0x0f, 0x70, 0x6f, 0x6f, 0x6c, 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, - 0x18, 0x0f, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, 0x4c, 0x61, 0x79, 0x6f, - 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, - 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x10, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x10, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, - 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x11, 0x20, 0x01, - 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, - 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x12, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, - 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, - 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, - 0x61, 0x73, 0x6b, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, - 0x4d, 0x61, 0x73, 0x6b, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, - 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, - 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x64, 0x65, - 0x61, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x16, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, - 0x64, 0x65, 0x61, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x27, 0x0a, 0x10, 0x6d, 0x64, 0x5f, - 0x6f, 0x6e, 0x5f, 0x73, 0x73, 0x64, 0x5f, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x18, 0x17, 0x20, - 0x01, 0x28, 0x08, 0x52, 0x0d, 0x6d, 0x64, 0x4f, 0x6e, 0x53, 0x73, 0x64, 0x41, 0x63, 0x74, 0x69, - 0x76, 
0x65, 0x12, 0x28, 0x0a, 0x10, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x68, 0x65, 0x61, 0x6c, 0x5f, - 0x70, 0x6f, 0x6c, 0x69, 0x63, 0x79, 0x18, 0x18, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x73, 0x65, - 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x50, 0x6f, 0x6c, 0x69, 0x63, 0x79, 0x12, 0x2f, 0x0a, 0x14, - 0x73, 0x79, 0x73, 0x5f, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x68, 0x65, 0x61, 0x6c, 0x5f, 0x70, 0x6f, - 0x6c, 0x69, 0x63, 0x79, 0x18, 0x19, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, 0x73, 0x79, 0x73, 0x53, - 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x50, 0x6f, 0x6c, 0x69, 0x63, 0x79, 0x4a, 0x04, 0x08, - 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, - 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, - 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, - 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, - 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x48, 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, - 0x61, 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, - 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, - 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, - 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, - 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 
0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, - 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, - 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, - 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, - 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, - 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, - 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x12, 0x42, 0x0a, + 0x0d, 0x64, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, + 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, + 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x74, + 0x61, 0x74, 0x65, 0x52, 0x0c, 0x64, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, + 0x65, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x65, 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x08, 0x64, 0x65, 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x22, 0x59, 0x0a, + 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, 0x53, 0x59, 0x10, 0x00, + 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, + 0x4e, 0x45, 0x10, 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x53, 0x54, 0x4f, 0x50, 0x50, 0x49, 0x4e, 0x47, + 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x53, 0x54, 0x4f, 0x50, 0x50, 0x45, 0x44, 0x10, 0x04, 0x12, + 0x0b, 0x0a, 0x07, 0x46, 0x41, 0x49, 0x4c, 0x49, 0x4e, 0x47, 0x10, 0x05, 0x12, 
0x0a, 0x0a, 0x06, + 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x06, 0x22, 0x89, 0x07, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, + 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, + 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, + 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, + 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, + 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x0d, 0x52, 0x0f, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x73, 0x12, 0x31, 0x0a, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, + 0x20, 0x01, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, + 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, + 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x12, 0x36, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, + 0x74, 0x61, 0x74, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, + 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, + 0x61, 0x74, 0x73, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, + 0x0a, 0x07, 0x76, 
0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, + 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, + 0x65, 0x72, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, + 0x12, 0x23, 0x0a, 0x0d, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, + 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, + 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, + 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x65, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, + 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 0x65, + 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x70, 0x6f, 0x6f, 0x6c, 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, + 0x5f, 0x76, 0x65, 0x72, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, + 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, + 0x72, 0x61, 0x64, 0x65, 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, + 0x10, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, + 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x18, 0x11, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, + 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, + 0x18, 0x12, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, + 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, 
0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, + 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, + 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, + 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, + 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, + 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x1d, + 0x0a, 0x0a, 0x64, 0x65, 0x61, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x16, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x09, 0x64, 0x65, 0x61, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x27, 0x0a, + 0x10, 0x6d, 0x64, 0x5f, 0x6f, 0x6e, 0x5f, 0x73, 0x73, 0x64, 0x5f, 0x61, 0x63, 0x74, 0x69, 0x76, + 0x65, 0x18, 0x17, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0d, 0x6d, 0x64, 0x4f, 0x6e, 0x53, 0x73, 0x64, + 0x41, 0x63, 0x74, 0x69, 0x76, 0x65, 0x12, 0x28, 0x0a, 0x10, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x68, + 0x65, 0x61, 0x6c, 0x5f, 0x70, 0x6f, 0x6c, 0x69, 0x63, 0x79, 0x18, 0x18, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x0e, 0x73, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x50, 0x6f, 0x6c, 0x69, 0x63, 0x79, + 0x12, 0x2f, 0x0a, 0x14, 0x73, 0x79, 0x73, 0x5f, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x68, 0x65, 0x61, + 0x6c, 0x5f, 0x70, 0x6f, 0x6c, 0x69, 0x63, 0x79, 0x18, 0x19, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, + 0x73, 0x79, 0x73, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x50, 0x6f, 0x6c, 0x69, 0x63, + 0x79, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x6e, + 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, + 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x18, 0x0a, 0x06, + 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x48, 0x00, 0x52, 
0x06, + 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, + 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, + 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, + 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, + 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, + 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, + 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, + 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, + 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, + 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, + 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, + 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, + 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, + 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, + 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, + 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, + 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, + 0x69, 0x65, 0x73, 0x12, 0x1b, 
0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, + 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, + 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, + 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x32, 0x0a, 0x0a, 0x70, + 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, - 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, - 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, - 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, - 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, - 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, - 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, - 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, - 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, + 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x22, + 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 
0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, + 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, + 0x22, 0x81, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, + 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, + 0x07, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, + 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, + 0x12, 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, + 0x66, 0x72, 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, + 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, + 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, + 0x52, 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xb8, 0x02, 0x0a, 0x13, + 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, + 0x6e, 0x66, 0x6f, 0x12, 0x3b, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x0e, 
0x32, 0x25, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, + 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, + 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, + 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, + 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x27, 0x0a, 0x10, 0x6d, 0x64, 0x5f, 0x6f, 0x6e, 0x5f, + 0x73, 0x73, 0x64, 0x5f, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, + 0x52, 0x0d, 0x6d, 0x64, 0x4f, 0x6e, 0x53, 0x73, 0x64, 0x41, 0x63, 0x74, 0x69, 0x76, 0x65, 0x22, + 0x5f, 0x0a, 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, + 0x0a, 0x0d, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, + 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, + 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, + 0x03, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, + 0x4e, 0x45, 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, + 0x4a, 0x04, 0x08, 0x01, 0x10, 0x02, 0x22, 0x5e, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, + 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, + 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 
0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, + 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, + 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, + 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x22, 0x54, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, + 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x72, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, - 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x81, 0x01, 0x0a, - 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, - 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, - 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, - 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, 0x0a, 0x04, - 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, 0x65, 
0x65, - 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, - 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xa9, 0x03, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, - 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, - 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x24, 0x2e, - 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, - 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, 0x73, 0x74, 0x61, - 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, - 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, - 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, - 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, - 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x52, - 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, - 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, - 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x27, 0x0a, 0x10, - 0x6d, 0x64, 0x5f, 0x6f, 0x6e, 0x5f, 0x73, 0x73, 0x64, 0x5f, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, - 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0d, 0x6d, 0x64, 0x4f, 0x6e, 0x53, 0x73, 0x64, 0x41, - 0x63, 0x74, 0x69, 0x76, 0x65, 
0x22, 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, - 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, - 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x53, 0x44, - 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x56, 0x4d, - 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, - 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, 0x54, - 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, - 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, 0x12, - 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, 0x49, - 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, - 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, - 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, 0x05, 0x69, 0x6e, - 0x66, 0x6f, 0x73, 0x22, 0x54, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, - 0x6c, 0x64, 0x53, 0x74, 0x61, 0x72, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, - 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, - 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, - 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 
0x20, 0x03, 0x28, 0x0d, 0x52, - 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x69, 0x0a, 0x12, 0x50, 0x6f, 0x6f, - 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, - 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, - 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, - 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, - 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, - 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x76, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x6c, 0x66, - 0x48, 0x65, 0x61, 0x6c, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, - 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, - 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x20, 0x0a, - 0x0c, 0x73, 0x79, 0x73, 0x5f, 0x70, 0x72, 0x6f, 0x70, 0x5f, 0x76, 0x61, 0x6c, 0x18, 0x04, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x0a, 0x73, 0x79, 0x73, 0x50, 0x72, 0x6f, 0x70, 0x56, 0x61, 0x6c, 0x12, - 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, - 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x2a, 0x25, 0x0a, 0x10, - 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, - 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, 0x4d, - 0x45, 0x10, 0x01, 0x2a, 0x5d, 0x0a, 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, - 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, 0x74, - 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, 0x01, - 0x12, 
0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, 0x02, - 0x12, 0x13, 0x0a, 0x0f, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x45, 0x78, 0x63, 0x6c, 0x75, - 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, - 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, - 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, - 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, - 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, 0x06, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x69, 0x0a, 0x12, + 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x6f, 0x70, 0x52, + 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x02, 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x76, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x53, + 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x71, 0x12, 0x10, + 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, + 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, + 0x12, 0x20, 0x0a, 0x0c, 0x73, 0x79, 0x73, 0x5f, 0x70, 0x72, 0x6f, 0x70, 0x5f, 0x76, 0x61, 0x6c, + 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x73, 0x79, 0x73, 0x50, 0x72, 0x6f, 0x70, 0x56, + 0x61, 
0x6c, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, + 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x2a, + 0x25, 0x0a, 0x10, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, + 0x79, 0x70, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, + 0x4e, 0x56, 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x5d, 0x0a, 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, + 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, + 0x65, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, + 0x79, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, + 0x67, 0x10, 0x02, 0x12, 0x13, 0x0a, 0x0f, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x45, 0x78, + 0x63, 0x6c, 0x75, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, + 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, + 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, + 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, + 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, + 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -3480,69 +3448,68 @@ func file_mgmt_pool_proto_rawDescGZIP() []byte { return file_mgmt_pool_proto_rawDescData } -var file_mgmt_pool_proto_enumTypes = make([]protoimpl.EnumInfo, 5) +var file_mgmt_pool_proto_enumTypes = make([]protoimpl.EnumInfo, 4) var file_mgmt_pool_proto_msgTypes = make([]protoimpl.MessageInfo, 37) var file_mgmt_pool_proto_goTypes = []interface{}{ (StorageMediaType)(0), // 0: mgmt.StorageMediaType (PoolServiceState)(0), // 1: mgmt.PoolServiceState (PoolRebuildStatus_State)(0), // 2: mgmt.PoolRebuildStatus.State - 
(PoolQueryTargetInfo_TargetType)(0), // 3: mgmt.PoolQueryTargetInfo.TargetType - (PoolQueryTargetInfo_TargetState)(0), // 4: mgmt.PoolQueryTargetInfo.TargetState - (*PoolCreateReq)(nil), // 5: mgmt.PoolCreateReq - (*PoolCreateResp)(nil), // 6: mgmt.PoolCreateResp - (*PoolDestroyReq)(nil), // 7: mgmt.PoolDestroyReq - (*PoolDestroyResp)(nil), // 8: mgmt.PoolDestroyResp - (*PoolEvictReq)(nil), // 9: mgmt.PoolEvictReq - (*PoolEvictResp)(nil), // 10: mgmt.PoolEvictResp - (*PoolExcludeReq)(nil), // 11: mgmt.PoolExcludeReq - (*PoolExcludeResp)(nil), // 12: mgmt.PoolExcludeResp - (*PoolDrainReq)(nil), // 13: mgmt.PoolDrainReq - (*PoolDrainResp)(nil), // 14: mgmt.PoolDrainResp - (*PoolExtendReq)(nil), // 15: mgmt.PoolExtendReq - (*PoolExtendResp)(nil), // 16: mgmt.PoolExtendResp - (*PoolReintReq)(nil), // 17: mgmt.PoolReintReq - (*PoolReintResp)(nil), // 18: mgmt.PoolReintResp - (*ListPoolsReq)(nil), // 19: mgmt.ListPoolsReq - (*ListPoolsResp)(nil), // 20: mgmt.ListPoolsResp - (*ListContReq)(nil), // 21: mgmt.ListContReq - (*ListContResp)(nil), // 22: mgmt.ListContResp - (*PoolQueryReq)(nil), // 23: mgmt.PoolQueryReq - (*StorageUsageStats)(nil), // 24: mgmt.StorageUsageStats - (*PoolRebuildStatus)(nil), // 25: mgmt.PoolRebuildStatus - (*PoolQueryResp)(nil), // 26: mgmt.PoolQueryResp - (*PoolProperty)(nil), // 27: mgmt.PoolProperty - (*PoolSetPropReq)(nil), // 28: mgmt.PoolSetPropReq - (*PoolSetPropResp)(nil), // 29: mgmt.PoolSetPropResp - (*PoolGetPropReq)(nil), // 30: mgmt.PoolGetPropReq - (*PoolGetPropResp)(nil), // 31: mgmt.PoolGetPropResp - (*PoolUpgradeReq)(nil), // 32: mgmt.PoolUpgradeReq - (*PoolQueryTargetReq)(nil), // 33: mgmt.PoolQueryTargetReq - (*StorageTargetUsage)(nil), // 34: mgmt.StorageTargetUsage - (*PoolQueryTargetInfo)(nil), // 35: mgmt.PoolQueryTargetInfo - (*PoolQueryTargetResp)(nil), // 36: mgmt.PoolQueryTargetResp - (*PoolRebuildStartReq)(nil), // 37: mgmt.PoolRebuildStartReq - (*PoolRebuildStopReq)(nil), // 38: mgmt.PoolRebuildStopReq - 
(*PoolSelfHealEvalReq)(nil), // 39: mgmt.PoolSelfHealEvalReq - (*ListPoolsResp_Pool)(nil), // 40: mgmt.ListPoolsResp.Pool - (*ListContResp_Cont)(nil), // 41: mgmt.ListContResp.Cont + (PoolQueryTargetInfo_TargetState)(0), // 3: mgmt.PoolQueryTargetInfo.TargetState + (*PoolCreateReq)(nil), // 4: mgmt.PoolCreateReq + (*PoolCreateResp)(nil), // 5: mgmt.PoolCreateResp + (*PoolDestroyReq)(nil), // 6: mgmt.PoolDestroyReq + (*PoolDestroyResp)(nil), // 7: mgmt.PoolDestroyResp + (*PoolEvictReq)(nil), // 8: mgmt.PoolEvictReq + (*PoolEvictResp)(nil), // 9: mgmt.PoolEvictResp + (*PoolExcludeReq)(nil), // 10: mgmt.PoolExcludeReq + (*PoolExcludeResp)(nil), // 11: mgmt.PoolExcludeResp + (*PoolDrainReq)(nil), // 12: mgmt.PoolDrainReq + (*PoolDrainResp)(nil), // 13: mgmt.PoolDrainResp + (*PoolExtendReq)(nil), // 14: mgmt.PoolExtendReq + (*PoolExtendResp)(nil), // 15: mgmt.PoolExtendResp + (*PoolReintReq)(nil), // 16: mgmt.PoolReintReq + (*PoolReintResp)(nil), // 17: mgmt.PoolReintResp + (*ListPoolsReq)(nil), // 18: mgmt.ListPoolsReq + (*ListPoolsResp)(nil), // 19: mgmt.ListPoolsResp + (*ListContReq)(nil), // 20: mgmt.ListContReq + (*ListContResp)(nil), // 21: mgmt.ListContResp + (*PoolQueryReq)(nil), // 22: mgmt.PoolQueryReq + (*StorageUsageStats)(nil), // 23: mgmt.StorageUsageStats + (*PoolRebuildStatus)(nil), // 24: mgmt.PoolRebuildStatus + (*PoolQueryResp)(nil), // 25: mgmt.PoolQueryResp + (*PoolProperty)(nil), // 26: mgmt.PoolProperty + (*PoolSetPropReq)(nil), // 27: mgmt.PoolSetPropReq + (*PoolSetPropResp)(nil), // 28: mgmt.PoolSetPropResp + (*PoolGetPropReq)(nil), // 29: mgmt.PoolGetPropReq + (*PoolGetPropResp)(nil), // 30: mgmt.PoolGetPropResp + (*PoolUpgradeReq)(nil), // 31: mgmt.PoolUpgradeReq + (*PoolQueryTargetReq)(nil), // 32: mgmt.PoolQueryTargetReq + (*StorageTargetUsage)(nil), // 33: mgmt.StorageTargetUsage + (*PoolQueryTargetInfo)(nil), // 34: mgmt.PoolQueryTargetInfo + (*PoolQueryTargetResp)(nil), // 35: mgmt.PoolQueryTargetResp + (*PoolRebuildStartReq)(nil), // 36: 
mgmt.PoolRebuildStartReq + (*PoolRebuildStopReq)(nil), // 37: mgmt.PoolRebuildStopReq + (*PoolSelfHealEvalReq)(nil), // 38: mgmt.PoolSelfHealEvalReq + (*ListPoolsResp_Pool)(nil), // 39: mgmt.ListPoolsResp.Pool + (*ListContResp_Cont)(nil), // 40: mgmt.ListContResp.Cont } var file_mgmt_pool_proto_depIdxs = []int32{ - 27, // 0: mgmt.PoolCreateReq.properties:type_name -> mgmt.PoolProperty - 40, // 1: mgmt.ListPoolsResp.pools:type_name -> mgmt.ListPoolsResp.Pool - 41, // 2: mgmt.ListContResp.containers:type_name -> mgmt.ListContResp.Cont + 26, // 0: mgmt.PoolCreateReq.properties:type_name -> mgmt.PoolProperty + 39, // 1: mgmt.ListPoolsResp.pools:type_name -> mgmt.ListPoolsResp.Pool + 40, // 2: mgmt.ListContResp.containers:type_name -> mgmt.ListContResp.Cont 0, // 3: mgmt.StorageUsageStats.media_type:type_name -> mgmt.StorageMediaType 2, // 4: mgmt.PoolRebuildStatus.state:type_name -> mgmt.PoolRebuildStatus.State - 25, // 5: mgmt.PoolQueryResp.rebuild:type_name -> mgmt.PoolRebuildStatus - 24, // 6: mgmt.PoolQueryResp.tier_stats:type_name -> mgmt.StorageUsageStats - 1, // 7: mgmt.PoolQueryResp.state:type_name -> mgmt.PoolServiceState - 27, // 8: mgmt.PoolSetPropReq.properties:type_name -> mgmt.PoolProperty - 27, // 9: mgmt.PoolGetPropReq.properties:type_name -> mgmt.PoolProperty - 27, // 10: mgmt.PoolGetPropResp.properties:type_name -> mgmt.PoolProperty - 0, // 11: mgmt.StorageTargetUsage.media_type:type_name -> mgmt.StorageMediaType - 3, // 12: mgmt.PoolQueryTargetInfo.type:type_name -> mgmt.PoolQueryTargetInfo.TargetType - 4, // 13: mgmt.PoolQueryTargetInfo.state:type_name -> mgmt.PoolQueryTargetInfo.TargetState - 34, // 14: mgmt.PoolQueryTargetInfo.space:type_name -> mgmt.StorageTargetUsage - 35, // 15: mgmt.PoolQueryTargetResp.infos:type_name -> mgmt.PoolQueryTargetInfo + 2, // 5: mgmt.PoolRebuildStatus.derived_state:type_name -> mgmt.PoolRebuildStatus.State + 24, // 6: mgmt.PoolQueryResp.rebuild:type_name -> mgmt.PoolRebuildStatus + 23, // 7: 
mgmt.PoolQueryResp.tier_stats:type_name -> mgmt.StorageUsageStats + 1, // 8: mgmt.PoolQueryResp.state:type_name -> mgmt.PoolServiceState + 26, // 9: mgmt.PoolSetPropReq.properties:type_name -> mgmt.PoolProperty + 26, // 10: mgmt.PoolGetPropReq.properties:type_name -> mgmt.PoolProperty + 26, // 11: mgmt.PoolGetPropResp.properties:type_name -> mgmt.PoolProperty + 0, // 12: mgmt.StorageTargetUsage.media_type:type_name -> mgmt.StorageMediaType + 3, // 13: mgmt.PoolQueryTargetInfo.state:type_name -> mgmt.PoolQueryTargetInfo.TargetState + 33, // 14: mgmt.PoolQueryTargetInfo.space:type_name -> mgmt.StorageTargetUsage + 34, // 15: mgmt.PoolQueryTargetResp.infos:type_name -> mgmt.PoolQueryTargetInfo 16, // [16:16] is the sub-list for method output_type 16, // [16:16] is the sub-list for method input_type 16, // [16:16] is the sub-list for extension type_name @@ -4010,7 +3977,7 @@ func file_mgmt_pool_proto_init() { File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_mgmt_pool_proto_rawDesc, - NumEnums: 5, + NumEnums: 4, NumMessages: 37, NumExtensions: 0, NumServices: 0, diff --git a/src/control/common/test/mocks.go b/src/control/common/test/mocks.go index 10a733dfaa4..ab3bf3e6094 100644 --- a/src/control/common/test/mocks.go +++ b/src/control/common/test/mocks.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -61,7 +62,7 @@ func MockHostAddr(varIdx ...int32) *net.TCPAddr { return hostAddrs[idx] } -// MockPCIAddr returns mock PCIAddr values for use in tests. +// MockPCIAddr returns mock PCIAddr value for use in tests. func MockPCIAddr(varIdx ...int32) string { idx := GetIndex(varIdx...) @@ -94,6 +95,17 @@ func MockVMDPCIAddrs(dom int, idxs ...int) (addrs []string) { return } +// MockTCPAddr returns mock TCPAddr value for use in tests. 
Create a mock IPv4 address +// (e.g., 127.0.0.1 on port 8080) +func MockTCPAddr(port int, varIdx ...int32) *net.TCPAddr { + idx := GetIndex(varIdx...) + + return &net.TCPAddr{ + IP: net.ParseIP(fmt.Sprintf("127.0.0.%d", idx)), + Port: port, + } +} + // MockWriter is a mock io.Writer that can be used to inject errors and check // values written. type MockWriter struct { @@ -105,6 +117,7 @@ func (w *MockWriter) Write(p []byte) (int, error) { if w.WriteErr != nil { return 0, w.WriteErr } + return w.builder.Write(p) } diff --git a/src/control/events/ras.go b/src/control/events/ras.go index 95dbc218858..902a8559e58 100644 --- a/src/control/events/ras.go +++ b/src/control/events/ras.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -58,6 +58,7 @@ const ( RASSystemFabricProvChanged RASID = C.RAS_SYSTEM_FABRIC_PROV_CHANGED // info RASNVMeLinkSpeedChanged RASID = C.RAS_DEVICE_LINK_SPEED_CHANGED // warning|notice RASNVMeLinkWidthChanged RASID = C.RAS_DEVICE_LINK_WIDTH_CHANGED // warning|notice + RASDeviceLEDSet RASID = C.RAS_DEVICE_LED_SET // info ) func (id RASID) String() string { diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index 3ce9299e1e7..1f11b8637a5 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -1,6 +1,6 @@ // // (C) Copyright 2018-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -161,6 +161,7 @@ const ( ServerBadFaultDomainLabels ServerJoinReplaceEnabledPoolRank ServerRankAdminExcluded + ServerTransparentHugepageEnabled ) // server config fault codes @@ -203,6 +204,9 @@ const ( ServerConfigEnableHotplugDeprecated ServerConfigBdevExcludeClash ServerConfigHugepagesDisabledWithNrSet + ServerConfigScmHugeEnabled + ServerConfigBadControlInterface + ServerConfigControlInterfaceMismatch ) // SPDK library bindings codes diff --git a/src/control/lib/control/check.go b/src/control/lib/control/check.go index a41aea37dcd..dec901099a4 100644 --- a/src/control/lib/control/check.go +++ b/src/control/lib/control/check.go @@ -1,6 +1,6 @@ // // (C) Copyright 2022-2023 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -11,6 +11,7 @@ import ( "context" "encoding/json" "fmt" + "sort" "strings" "time" @@ -429,6 +430,13 @@ func (r *SystemCheckReport) IsInteractive() bool { return r.Action == chkpb.CheckInconsistAction_CIA_INTERACT } +// IsStale indicates whether this report was awaiting user interaction when it became stale. Stale +// reports are still valid but can't be repaired without re-running the checker on the affected +// pool. +func (r *SystemCheckReport) IsStale() bool { + return r.Action == chkpb.CheckInconsistAction_CIA_STALE +} + // IsRemovedPool indicates whether the error detected in this report indicates a missing pool. 
func (r *SystemCheckReport) IsRemovedPool() bool { return r.Action == chkpb.CheckInconsistAction_CIA_DISCARD && @@ -595,6 +603,15 @@ func SystemCheckQuery(ctx context.Context, rpcClient UnaryInvoker, req *SystemCh proto.Merge(rpt, pbReport) resp.Reports = append(resp.Reports, rpt) } + + // Sort reports by class, then sequence for consistent ordering. + sort.Slice(resp.Reports, func(i, j int) bool { + if resp.Reports[i].Class != resp.Reports[j].Class { + return resp.Reports[i].Class < resp.Reports[j].Class + } + return resp.Reports[i].Seq < resp.Reports[j].Seq + }) + return resp, nil } diff --git a/src/control/lib/control/check_test.go b/src/control/lib/control/check_test.go index eab2c0eb5ee..b4f65b4978a 100644 --- a/src/control/lib/control/check_test.go +++ b/src/control/lib/control/check_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2023 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -11,8 +11,10 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/testing/protocmp" chkpb "github.com/daos-stack/daos/src/control/common/proto/chk" + mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/daos" ) @@ -146,3 +148,72 @@ func TestControl_SystemCheckReport_IsDryRun(t *testing.T) { }) } } + +func TestControl_SystemCheckReport_IsInteractive(t *testing.T) { + expInteractive := chkpb.CheckInconsistAction_CIA_INTERACT + + for name, actVal := range chkpb.CheckInconsistAction_value { + t.Run(name, func(t *testing.T) { + action := chkpb.CheckInconsistAction(actVal) + report := &SystemCheckReport{ + chkpb.CheckReport{ + Action: action, + }, + } + + test.AssertEqual(t, action == expInteractive, report.IsInteractive(), "") + }) + } +} + +func TestControl_SystemCheckReport_IsStale(t 
*testing.T) { + expStaleAction := chkpb.CheckInconsistAction_CIA_STALE + + for name, actVal := range chkpb.CheckInconsistAction_value { + t.Run(name, func(t *testing.T) { + action := chkpb.CheckInconsistAction(actVal) + report := &SystemCheckReport{ + chkpb.CheckReport{ + Action: action, + }, + } + + test.AssertEqual(t, action == expStaleAction, report.IsStale(), "") + }) + } +} + +func TestControl_SystemCheckQuery_ReportsSorted(t *testing.T) { + // Reports are returned in scrambled order to verify that + // SystemCheckQuery sorts them by class, then by sequence. + mockResp := &mgmtpb.CheckQueryResp{ + Reports: []*chkpb.CheckReport{ + {Seq: 3, Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL}, + {Seq: 1, Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL}, + {Seq: 5, Class: chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_MS}, + {Seq: 4, Class: chkpb.CheckInconsistClass_CIC_CONT_NONEXIST_ON_PS}, + {Seq: 2, Class: chkpb.CheckInconsistClass_CIC_POOL_MORE_SVC}, + }, + } + + mi := NewMockInvoker(nil, &MockInvokerConfig{ + UnaryResponse: MockMSResponse("", nil, mockResp), + }) + + resp, err := SystemCheckQuery(test.Context(t), mi, &SystemCheckQueryReq{}) + if err != nil { + t.Fatal(err) + } + + expReports := []*SystemCheckReport{ + {chkpb.CheckReport{Seq: 2, Class: chkpb.CheckInconsistClass_CIC_POOL_MORE_SVC}}, + {chkpb.CheckReport{Seq: 5, Class: chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_MS}}, + {chkpb.CheckReport{Seq: 1, Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL}}, + {chkpb.CheckReport{Seq: 3, Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL}}, + {chkpb.CheckReport{Seq: 4, Class: chkpb.CheckInconsistClass_CIC_CONT_NONEXIST_ON_PS}}, + } + + if diff := cmp.Diff(expReports, resp.Reports, protocmp.Transform()); diff != "" { + t.Fatalf("reports not sorted (-want +got):\n%s", diff) + } +} diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index a0fb5b33a07..6fd79c04af5 100644 --- a/src/control/lib/control/pool.go +++ 
b/src/control/lib/control/pool.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -529,20 +529,25 @@ func (pqr *PoolQueryResp) UpdateSelfHealPolicy(ctx context.Context, rpcClient Un props, err := PoolGetProp(ctx, rpcClient, req) if err != nil { - return err + return errors.Wrapf(err, "PoolGetProp") } switch len(props) { case 0: - rpcClient.Debug("self_heal pool property not found, assuming default value 'exclude;rebuild'") - pqr.SelfHealPolicy = "exclude;rebuild" + rpcClient.Debug("self_heal pool property not found, assuming default 'exclude;rebuild'") + pqr.SelfHealPolicy = daos.DefaultPoolSelfHealStr case 1: pqr.SelfHealPolicy = props[0].StringValue() + if pqr.SelfHealPolicy == "not set" { + pqr.SelfHealPolicy = daos.DefaultPoolSelfHealStr + } default: - return errors.Errorf("unexpected number of pool props returned, want 1 got %d", len(props)) + return errors.Errorf("unexpected number of pool props returned, want 1 got %d", + len(props)) } - rpcClient.Debugf("pool-query: fetched pool self_heal propval: %s", pqr.SelfHealPolicy) + rpcClient.Debugf("pool-query: fetched pool self_heal propval: %s (from props %+v)", + pqr.SelfHealPolicy, props) return nil } @@ -577,6 +582,10 @@ func poolQueryInt(ctx context.Context, rpcClient UnaryInvoker, req *PoolQueryReq return nil, err } + if err := resp.UpdateRebuildStatus(); err != nil { + return nil, err + } + if req.QueryMask.HasOption(daos.PoolQueryOptionSelfHealPolicy) { if err := resp.UpdateSelfHealPolicy(ctx, rpcClient); err != nil { return nil, errors.Wrap(err, "pool get-prop self_heal failed") @@ -637,7 +646,6 @@ func PoolQueryTargets(ctx context.Context, rpcClient UnaryInvoker, req *PoolQuer // For using the pretty printer that dmg uses for this target info. 
func convertPoolTargetInfo(pbInfo *mgmtpb.PoolQueryTargetInfo) (*daos.PoolQueryTargetInfo, error) { pqti := new(daos.PoolQueryTargetInfo) - pqti.Type = daos.PoolQueryTargetType(pbInfo.Type) pqti.State = daos.PoolQueryTargetState(pbInfo.State) pqti.Space = []*daos.StorageUsageStats{ { @@ -763,7 +771,8 @@ func PoolGetProp(ctx context.Context, rpcClient UnaryInvoker, req *PoolGetPropRe pbMap := make(map[uint32]*mgmtpb.PoolProperty) for _, prop := range pbResp.GetProperties() { if _, found := pbMap[prop.GetNumber()]; found { - return nil, errors.Errorf("got > 1 %d in response", prop.GetNumber()) + return nil, errors.Errorf("got > 1 occurrences of prop %d in resp", + prop.GetNumber()) } pbMap[prop.GetNumber()] = prop } @@ -858,6 +867,9 @@ func getPoolRanksResp(ctx context.Context, rpcClient UnaryInvoker, req *PoolRank return nil, errors.New("no ranks in request") } + // Set timeout to 5 minutes per rank to allow sufficient time for operation + req.SetTimeout(time.Duration(len(req.Ranks)) * DefaultPoolTimeout) + results := []*PoolRankResult{} for _, rank := range req.Ranks { result, err := poolRankOp(ctx, rpcClient, req, rank) @@ -1176,16 +1188,8 @@ func ListPools(ctx context.Context, rpcClient UnaryInvoker, req *ListPoolsReq) ( type rankFreeSpaceMap map[ranklist.Rank]uint64 -type filterRankFn func(rank ranklist.Rank) bool - -func newFilterRankFunc(ranks ranklist.RankList) filterRankFn { - return func(rank ranklist.Rank) bool { - return len(ranks) == 0 || rank.InList(ranks) - } -} - // Add namespace ranks to rankNVMeFreeSpace map and return minimum free available SCM namespace bytes. 
-func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespaces storage.ScmNamespaces, rankNVMeFreeSpace rankFreeSpaceMap) (uint64, error) { +func processSCMSpaceStats(log debugLogger, ranks ranklist.RankList, scmNamespaces storage.ScmNamespaces, rankNVMeFreeSpace rankFreeSpaceMap) (uint64, error) { scmBytes := uint64(math.MaxUint64) // Realistically there should only be one-per-rank but handle the case for multiple anyway. @@ -1195,7 +1199,7 @@ func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespace scmNamespace.UUID, scmNamespace.BlockDevice, scmNamespace.Name) } - if !filterRank(scmNamespace.Mount.Rank) { + if !scmNamespace.Mount.Rank.InList(ranks) { log.Debugf("Skipping SCM device %s (bdev %s, name %s, rank %d) not in ranklist", scmNamespace.UUID, scmNamespace.BlockDevice, scmNamespace.Name, scmNamespace.Mount.Rank) @@ -1221,7 +1225,7 @@ func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespace } // Add NVMe free bytes to rankNVMeFreeSpace map. 
-func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControllers storage.NvmeControllers, rankNVMeFreeSpace rankFreeSpaceMap) error { +func processNVMeSpaceStats(log debugLogger, ranks ranklist.RankList, nvmeControllers storage.NvmeControllers, rankNVMeFreeSpace rankFreeSpaceMap) error { for _, controller := range nvmeControllers { for _, smdDevice := range controller.SmdDevices { msgDev := fmt.Sprintf("SMD device %s (rank %d, ctrlr %s", smdDevice.UUID, @@ -1246,7 +1250,7 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl controller.NvmeState.String()) } - if !filterRank(smdDevice.Rank) { + if !smdDevice.Rank.InList(ranks) { log.Debugf("Skipping %s, not in ranklist", msgDev) continue } @@ -1276,10 +1280,34 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *Pool return 0, 0, errors.New("invalid mem-ratio, should not be greater than one") } - // Verify that the DAOS system is ready before attempting to query storage. - if _, err := SystemQuery(ctx, rpcClient, &SystemQueryReq{}); err != nil { - return 0, 0, err + // Verify that the DAOS system is ready before attempting to query storage and record joined. + queryResp, err := SystemQuery(ctx, rpcClient, &SystemQueryReq{}) + if err != nil { + return 0, 0, errors.Wrap(err, "getMaxPoolSize: SystemQuery") + } + joinedRanks := ranklist.RankList{} + for _, member := range queryResp.Members { + if member.State == system.MemberStateJoined { + joinedRanks = append(joinedRanks, member.Rank) + } + } + + // Refuse if any requested ranks are not joined, update ranklist to contain only joined ranks. 
+ filterRanks := ranklist.RankList{} + if len(createReq.Ranks) == 0 { + filterRanks = joinedRanks + } else { + for _, rank := range createReq.Ranks { + if !rank.InList(joinedRanks) { + return 0, 0, errors.Errorf("specified rank %d is not joined", rank) + } + filterRanks = append(filterRanks, rank) + } } + slices.Sort(filterRanks) + rpcClient.Debugf("requested/joined/filter ranks: %v/%v/%v", createReq.Ranks, joinedRanks, + filterRanks) + createReq.Ranks = filterRanks scanReq := &StorageScanReq{ Usage: true, @@ -1295,8 +1323,6 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *Pool return 0, 0, errors.New("Empty host storage response from StorageScan") } - // Generate function to verify a rank is in the provided rank slice. - filterRank := newFilterRankFunc(ranklist.RankList(createReq.Ranks)) rankNVMeFreeSpace := make(rankFreeSpaceMap) scmBytes := uint64(math.MaxUint64) for _, key := range scanResp.HostStorage.Keys() { @@ -1307,7 +1333,7 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *Pool scanResp.HostStorage[key].HostSet.String()) } - sb, err := processSCMSpaceStats(rpcClient, filterRank, hostStorage.ScmNamespaces, rankNVMeFreeSpace) + sb, err := processSCMSpaceStats(rpcClient, filterRanks, hostStorage.ScmNamespaces, rankNVMeFreeSpace) if err != nil { return 0, 0, err } @@ -1316,7 +1342,7 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *Pool scmBytes = sb } - if err := processNVMeSpaceStats(rpcClient, filterRank, hostStorage.NvmeDevices, rankNVMeFreeSpace); err != nil { + if err := processNVMeSpaceStats(rpcClient, filterRanks, hostStorage.NvmeDevices, rankNVMeFreeSpace); err != nil { return 0, 0, err } } diff --git a/src/control/lib/control/pool_test.go b/src/control/lib/control/pool_test.go index bb2dce0f181..14f098b6a62 100644 --- a/src/control/lib/control/pool_test.go +++ b/src/control/lib/control/pool_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1461,6 +1461,92 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { } } +func TestControl_PoolQueryResp_UpdateSelfHealPolicy(t *testing.T) { + type prop struct { + number uint32 + value interface{} + } + makePropResp := func(props ...prop) *mgmtpb.PoolGetPropResp { + pbProps := make([]*mgmtpb.PoolProperty, 0, len(props)) + for _, p := range props { + switch v := p.value.(type) { + case string: + pbProps = append(pbProps, &mgmtpb.PoolProperty{ + Number: p.number, + Value: &mgmtpb.PoolProperty_Strval{Strval: v}, + }) + case int: + pbProps = append(pbProps, &mgmtpb.PoolProperty{ + Number: p.number, + Value: &mgmtpb.PoolProperty_Numval{Numval: uint64(v)}, + }) + } + } + return &mgmtpb.PoolGetPropResp{ + Properties: pbProps, + } + } + selfHealPropNum := propWithVal("self_heal", "").Number + + for name, tc := range map[string]struct { + getPropResp *mgmtpb.PoolGetPropResp + getPropErr error + expValue string + expErr string + }{ + "no properties returned": { + getPropResp: makePropResp(), // no properties + expValue: "exclude;rebuild", + }, + "single string value; not set value ignored": { + getPropResp: makePropResp(prop{selfHealPropNum, "rebuild"}), + expValue: "exclude;rebuild", + }, + "single num value": { + getPropResp: makePropResp(prop{selfHealPropNum, daos.PoolSelfHealingAutoRebuild}), + expValue: "rebuild", + }, + "multiple properties returned": { + getPropResp: makePropResp( + prop{selfHealPropNum, daos.PoolSelfHealingAutoRebuild}, + prop{selfHealPropNum, daos.PoolSelfHealingAutoExclude}, + ), + expErr: "> 1 occurrences of prop 4", + }, + "get-prop returns error": { + getPropErr: errors.New("something bad"), + expErr: "something bad", + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, 
buf) + + mic := &MockInvokerConfig{ + UnaryResponseSet: []*UnaryResponse{ + MockMSResponse("host1", tc.getPropErr, tc.getPropResp), + }, + } + resp := &PoolQueryResp{} + gotErr := resp.UpdateSelfHealPolicy(context.Background(), + NewMockInvoker(log, mic)) + + var expErr error + if tc.expErr != "" { + expErr = errors.New(tc.expErr) + } + test.CmpErr(t, expErr, gotErr) + if expErr != nil { + return + } + + if resp.SelfHealPolicy != tc.expValue { + t.Errorf("expected SelfHealPolicy %q, got %q", tc.expValue, resp.SelfHealPolicy) + } + }) + } +} + func TestControl_PoolQuery(t *testing.T) { poolUUID := test.MockPoolUUID() @@ -1776,10 +1862,6 @@ func TestControl_PoolQuery(t *testing.T) { }, }, }, - // TODO DAOS-18128: Add more test cases - // sys-prop but no pool-prop - // pool-prop but no sys-prop - // neither pool or sys props "query succeeds self_heal policies provided; missing pool self_heal property": { req: &PoolQueryReq{ ID: poolUUID.String(), @@ -1800,9 +1882,10 @@ func TestControl_PoolQuery(t *testing.T) { ActiveTargets: 42, State: daos.PoolServiceStateReady, Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateIdle, - Objects: 1, - Records: 2, + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateIdle, + Objects: 1, + Records: 2, }, TierStats: []*daos.StorageUsageStats{ { @@ -1860,9 +1943,10 @@ func TestControl_PoolQuery(t *testing.T) { ActiveTargets: 42, State: daos.PoolServiceStateReady, Rebuild: &daos.PoolRebuildStatus{ - State: daos.PoolRebuildStateIdle, - Objects: 1, - Records: 2, + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateIdle, + Objects: 1, + Records: 2, }, TierStats: []*daos.StorageUsageStats{ { @@ -1889,6 +1973,229 @@ func TestControl_PoolQuery(t *testing.T) { SysSelfHealPolicy: "exclude;pool_exclude;pool_rebuild", }, }, + "pool get-prop returns error": { + req: &PoolQueryReq{ + ID: poolUUID.String(), + QueryMask: daos.MustNewPoolQueryMask(daos.PoolQueryOptionSelfHealPolicy), + }, + mic: 
&MockInvokerConfig{ + UnaryResponseSet: []*UnaryResponse{ + MockMSResponse("host1", nil, queryResp(1)), + MockMSResponse("host1", errors.New("get-prop failure"), nil), + }, + }, + expErr: errors.New("pool get-prop self_heal failed"), + }, + "pool get-prop returns multiple properties": { + req: &PoolQueryReq{ + ID: poolUUID.String(), + QueryMask: daos.MustNewPoolQueryMask(daos.PoolQueryOptionSelfHealPolicy), + }, + mic: &MockInvokerConfig{ + UnaryResponseSet: []*UnaryResponse{ + MockMSResponse("host1", nil, queryResp(1)), + MockMSResponse("host1", nil, &mgmtpb.PoolGetPropResp{ + Properties: []*mgmtpb.PoolProperty{ + { + Number: propWithVal("self_heal", "").Number, + Value: &mgmtpb.PoolProperty_Strval{Strval: "exclude"}, + }, + { + Number: propWithVal("self_heal", "").Number, + Value: &mgmtpb.PoolProperty_Strval{Strval: "rebuild"}, + }, + }, + }), + }, + }, + expErr: errors.New("> 1 occurrences of prop 4 in resp"), + }, + "query with rebuild state busy with DER_OP_CANCELED (stopping)": { + mic: &MockInvokerConfig{ + UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, + Rebuild: &mgmtpb.PoolRebuildStatus{ + Status: int32(daos.OpCanceled), + State: mgmtpb.PoolRebuildStatus_BUSY, + Objects: 100, + Records: 500, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + Status: int32(daos.OpCanceled), + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateStopping, + Objects: 100, + Records: 500, + }, + }, + }, + }, + "query with rebuild state idle with DER_OP_CANCELED (stopped)": { + mic: &MockInvokerConfig{ + UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, 
+ Rebuild: &mgmtpb.PoolRebuildStatus{ + Status: int32(daos.OpCanceled), + State: mgmtpb.PoolRebuildStatus_IDLE, + Objects: 0, + Records: 0, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + Status: int32(daos.OpCanceled), + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateStopped, + Objects: 0, + Records: 0, + }, + }, + }, + }, + "query with rebuild state busy with error (failing)": { + mic: &MockInvokerConfig{ + UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, + Rebuild: &mgmtpb.PoolRebuildStatus{ + State: mgmtpb.PoolRebuildStatus_BUSY, + Status: -1, + Objects: 75, + Records: 300, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + DerivedState: daos.PoolRebuildStateFailing, + Status: -1, + Objects: 75, + Records: 300, + }, + }, + }, + }, + "query with rebuild state idle with error (failed)": { + mic: &MockInvokerConfig{ + UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, + Rebuild: &mgmtpb.PoolRebuildStatus{ + State: mgmtpb.PoolRebuildStatus_IDLE, + Status: -5, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateFailed, + Status: -5, + }, + }, + }, + }, + "query with rebuild state done": { + mic: &MockInvokerConfig{ + 
UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, + Rebuild: &mgmtpb.PoolRebuildStatus{ + State: mgmtpb.PoolRebuildStatus_DONE, + Objects: 200, + Records: 1000, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateDone, + DerivedState: daos.PoolRebuildStateDone, + Objects: 200, + Records: 1000, + }, + }, + }, + }, + "query with rebuild state idle": { + mic: &MockInvokerConfig{ + UnaryResponse: MockMSResponse("host1", nil, + &mgmtpb.PoolQueryResp{ + Uuid: poolUUID.String(), + TotalTargets: 42, + ActiveTargets: 42, + State: mgmtpb.PoolServiceState_Ready, + Rebuild: &mgmtpb.PoolRebuildStatus{ + State: mgmtpb.PoolRebuildStatus_IDLE, + }, + }, + ), + }, + expResp: &PoolQueryResp{ + PoolInfo: daos.PoolInfo{ + UUID: poolUUID, + TotalTargets: 42, + ActiveTargets: 42, + State: daos.PoolServiceStateReady, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + DerivedState: daos.PoolRebuildStateIdle, + }, + }, + }, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -2453,9 +2760,10 @@ func TestControl_ListPools(t *testing.T) { rebuildState = daos.PoolRebuildStateBusy } return &daos.PoolRebuildStatus{ - State: rebuildState, - Objects: 1, - Records: 2, + State: rebuildState, + DerivedState: rebuildState, + Objects: 1, + Records: 2, } } expTierStats := []*daos.StorageUsageStats{ @@ -2806,19 +3114,39 @@ func newNvmeCfg(rank int, roles storage.OptionBits, size ...uint64) MockNvmeConf } } +// Helper to add joined members in SystemQueryResp for all ranks in hostsConfigArray. 
+func getSysQueryRespMembers(cfg []MockHostStorageConfig, resp *mgmtpb.SystemQueryResp) { + rankSet := make(map[ranklist.Rank]bool) + for _, hostCfg := range cfg { + for _, scmCfg := range hostCfg.ScmConfig { + rankSet[scmCfg.Rank] = true + } + } + for rank := range rankSet { + resp.Members = append(resp.Members, &mgmtpb.SystemMember{ + Rank: uint32(rank), + Uuid: test.MockUUID(int32(rank)), + State: system.MemberStateJoined.String(), + Addr: fmt.Sprintf("10.0.0.%d:10001", rank), + }) + } +} + func TestControl_getMaxPoolSize(t *testing.T) { devStateFaulty := storage.NvmeStateFaulty devStateNew := storage.NvmeStateNew for name, tc := range map[string]struct { - hostsConfigArray []MockHostStorageConfig - tgtRanks []ranklist.Rank - memRatio float32 - queryError error - expScmBytes uint64 - expNvmeBytes uint64 - expError error - expDebug string + hostsConfigArray []MockHostStorageConfig + tgtRanks []ranklist.Rank + memberStates map[ranklist.Rank]system.MemberState + memRatio float32 + queryError error + expCreateReqRanks []ranklist.Rank + expScmBytes uint64 + expNvmeBytes uint64 + expError error + expDebug string }{ "single server": { hostsConfigArray: []MockHostStorageConfig{ @@ -2828,8 +3156,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: humanize.TByte, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "single MD-on-SSD server; no mem-ratio specified; defaults to 1.0": { hostsConfigArray: []MockHostStorageConfig{ @@ -2844,8 +3173,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: humanize.TByte, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "single MD-on-SSD server; invalid mem-ratio; high": { hostsConfigArray: []MockHostStorageConfig{ @@ -2892,9 +3222,10 @@ 
func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - memRatio: 1, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: humanize.TByte, + memRatio: 1, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "single MD-on-SSD server; phase-2 mode (mem-file-sz < meta-blob-sz)": { hostsConfigArray: []MockHostStorageConfig{ @@ -2909,9 +3240,10 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - memRatio: 0.5, - expScmBytes: 200 * humanize.GByte, // Double meta-blob-sz due to mem-ratio. - expNvmeBytes: humanize.TByte, + memRatio: 0.5, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 200 * humanize.GByte, // Double meta-blob-sz due to mem-ratio. + expNvmeBytes: humanize.TByte, }, "single ephemeral server": { hostsConfigArray: []MockHostStorageConfig{ @@ -2921,8 +3253,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: humanize.TByte, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "double server": { hostsConfigArray: []MockHostStorageConfig{ @@ -2978,8 +3311,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - expScmBytes: 50 * humanize.GByte, - expNvmeBytes: 700 * humanize.GByte, + expCreateReqRanks: ranklist.RankList{0, 1, 2, 3}, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: 700 * humanize.GByte, }, "double server; rank filter": { hostsConfigArray: []MockHostStorageConfig{ @@ -3059,8 +3393,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { NvmeConfig: []MockNvmeConfig{}, }, }, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: uint64(0), + expCreateReqRanks: []ranklist.Rank{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), }, "No NVMe; double server": { hostsConfigArray: []MockHostStorageConfig{ @@ -3141,8 +3476,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - 
expScmBytes: 100 * humanize.GByte, - expNvmeBytes: 100 * humanize.TByte, + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: 100 * humanize.TByte, }, "invalid response message": { hostsConfigArray: []MockHostStorageConfig{{}}, @@ -3220,8 +3556,9 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - expScmBytes: 100 * humanize.GByte, - expNvmeBytes: uint64(0), + expCreateReqRanks: ranklist.RankList{0}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), }, "unmounted SCM device": { hostsConfigArray: []MockHostStorageConfig{ @@ -3291,6 +3628,11 @@ func TestControl_getMaxPoolSize(t *testing.T) { NvmeConfig: []MockNvmeConfig{newNvmeCfg(1, 0)}, }, }, + tgtRanks: []ranklist.Rank{0, 1}, + memberStates: map[ranklist.Rank]system.MemberState{ + 0: system.MemberStateJoined, + 1: system.MemberStateJoined, + }, expError: errors.New("without SCM device and at least one SMD device"), }, "no SCM": { @@ -3302,20 +3644,127 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, tgtRanks: []ranklist.Rank{1}, + memberStates: map[ranklist.Rank]system.MemberState{ + 1: system.MemberStateJoined, + }, expError: errors.New("No SCM storage space available"), }, + "requested rank not joined": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + }, + tgtRanks: []ranklist.Rank{0}, + memberStates: map[ranklist.Rank]system.MemberState{ + 0: system.MemberStateStopped, + }, + expError: errors.New("specified rank 0 is not joined"), + }, + "multiple requested ranks not joined": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + { + HostName: "bar", + ScmConfig: []MockScmConfig{newScmCfg(1), newScmCfg(2)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(1, 0), newNvmeCfg(2, 0)}, + }, + }, + tgtRanks: 
[]ranklist.Rank{0, 1, 2}, + memberStates: map[ranklist.Rank]system.MemberState{ + 0: system.MemberStateJoined, + 1: system.MemberStateStopped, + 2: system.MemberStateExcluded, + }, + expError: errors.New("specified rank 1 is not joined"), + }, + "all requested ranks joined": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + { + HostName: "bar", + ScmConfig: []MockScmConfig{newScmCfg(1), newScmCfg(2)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(1, 0), newNvmeCfg(2, 0)}, + }, + }, + tgtRanks: []ranklist.Rank{0, 1}, + memberStates: map[ranklist.Rank]system.MemberState{ + 0: system.MemberStateJoined, + 1: system.MemberStateJoined, + 2: system.MemberStateStopped, + }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, + "no requested ranks; filters to joined ranks only": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + { + HostName: "bar", + ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2), + newScmCfg(3, 50*humanize.GByte), + }, + NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), + newNvmeCfg(2, 0), + newNvmeCfg(3, 0, 500*humanize.GByte), + }, + }, + }, + memberStates: map[ranklist.Rank]system.MemberState{ + 0: system.MemberStateJoined, + 1: system.MemberStateJoined, + 2: system.MemberStateStopped, + 3: system.MemberStateExcluded, + }, + expCreateReqRanks: ranklist.RankList{0, 1}, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + // Build SystemQueryResp with members based on memberStates + systemQueryResp := &mgmtpb.SystemQueryResp{} + if tc.memberStates != nil { + for rank, state := range tc.memberStates { + systemQueryResp.Members = 
append(systemQueryResp.Members, &mgmtpb.SystemMember{ + Rank: uint32(rank), + Uuid: test.MockUUID(int32(rank)), + State: state.String(), + Addr: fmt.Sprintf("10.0.0.%d:10001", rank), + }) + } + } else { + // If memberStates not specified, create joined members for all ranks in hostsConfigArray + getSysQueryRespMembers(tc.hostsConfigArray, systemQueryResp) + } + mockInvokerConfig := &MockInvokerConfig{ UnaryResponseSet: []*UnaryResponse{ { Responses: []*HostResponse{ { Addr: "foo", - Message: &mgmtpb.SystemQueryResp{}, + Message: systemQueryResp, Error: tc.queryError, }, }, @@ -3353,6 +3802,13 @@ func TestControl_getMaxPoolSize(t *testing.T) { return } + if tc.expCreateReqRanks == nil { + tc.expCreateReqRanks = tc.tgtRanks + } + if diff := cmp.Diff(tc.expCreateReqRanks, createReq.Ranks); diff != "" { + t.Fatalf("Unexpected ranks in create request (-want, +got):\n%s\n", diff) + } + test.AssertEqual(t, tc.expScmBytes, scmBytes, fmt.Sprintf("Invalid SCM pool size, want %s got %s", humanize.Bytes(tc.expScmBytes), humanize.Bytes(scmBytes))) @@ -3381,12 +3837,13 @@ func (invoker *MockRequestsRecorderInvoker) InvokeUnaryRPC(context context.Conte func TestControl_PoolCreateAllCmd(t *testing.T) { for name, tc := range map[string]struct { - hostsConfigArray []MockHostStorageConfig - storageRatio float64 - tgtRanks string - expPoolConfig MockPoolRespConfig - expError error - expWarning string + hostsConfigArray []MockHostStorageConfig + storageRatio float64 + tgtRanks string + expPoolConfig MockPoolRespConfig + expCreateReqRanks []ranklist.Rank + expError error + expWarning string }{ "single server": { storageRatio: 1, @@ -3621,13 +4078,17 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + // Add joined members for ranks referenced in MockHostStorageConfig. 
+ systemQueryResp := new(mgmtpb.SystemQueryResp) + getSysQueryRespMembers(tc.hostsConfigArray, systemQueryResp) + mockInvokerConfig := &MockInvokerConfig{ UnaryResponseSet: []*UnaryResponse{ { Responses: []*HostResponse{ { Addr: "foo", - Message: &mgmtpb.SystemQueryResp{}, + Message: systemQueryResp, }, }, }, @@ -3704,20 +4165,14 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { poolCreateRequest.TotalBytes, uint64(0), "Invalid size of TotalBytes attribute: disabled with manual allocation") - if tc.tgtRanks != "" { - test.AssertEqual(t, - ranklist.RankList(poolCreateRequest.Ranks).String(), - tc.expPoolConfig.Ranks, - "Invalid list of Ranks") - } else { - test.AssertEqual(t, - ranklist.RankList(poolCreateRequest.Ranks).String(), - "", - "Invalid list of Ranks") - } test.AssertTrue(t, poolCreateRequest.TierRatio == nil, "Invalid size of TierRatio attribute: disabled with manual allocation") + + test.AssertEqual(t, + poolCreateRequest.Ranks, + ranklist.MustCreateRankSet(tc.expPoolConfig.Ranks).Ranks(), + "Invalid list of Ranks") }) } } diff --git a/src/control/lib/control/system.go b/src/control/lib/control/system.go index 5c5fd8e4eb0..3c77374d932 100644 --- a/src/control/lib/control/system.go +++ b/src/control/lib/control/system.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste type SystemSelfHealEvalReq struct { unaryRequest msRequest + retryableRequest } // SystemSelfHealEvalResp contains the response. 
@@ -1341,6 +1342,10 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) { return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq) }) + req.retryTestFn = func(err error, _ uint) bool { + return (system.IsUnavailable(err) || IsRetryableConnErr(err) || + system.IsNotLeader(err) || system.IsNotReplica(err)) + } rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq)) ur, err := rpcClient.InvokeUnaryRPC(ctx, req) diff --git a/src/control/lib/control/system_test.go b/src/control/lib/control/system_test.go index 7d12cc04529..d43db611704 100644 --- a/src/control/lib/control/system_test.go +++ b/src/control/lib/control/system_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,6 +19,8 @@ import ( mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" sharedpb "github.com/daos-stack/daos/src/control/common/proto/shared" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/lib/hostlist" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" @@ -2075,3 +2077,80 @@ func TestControl_SystemSelfHealEval(t *testing.T) { }) } } + +func TestControl_SystemSelfHealEval_RetryableErrors(t *testing.T) { + for name, testErr := range map[string]error{ + "system unavailable": system.ErrRaftUnavail, + "leader step-up": system.ErrLeaderStepUpInProgress, + "connection closed": FaultConnectionClosed(""), + "connection refused": FaultConnectionRefused(""), + "not leader": &system.ErrNotLeader{LeaderHint: "host1", Replicas: 
[]string{"host2"}}, + "not replica": &system.ErrNotReplica{Replicas: []string{"host1", "host2"}}, + "data plane not started": &fault.Fault{Code: code.ServerDataPlaneNotStarted}, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(name) + defer test.ShowBufferOnFailure(t, buf) + + client := NewMockInvoker(log, &MockInvokerConfig{ + UnaryResponseSet: []*UnaryResponse{ + MockMSResponse("", testErr, nil), + MockMSResponse("", nil, &mgmtpb.DaosResp{}), + }, + }) + + gotResp, gotErr := SystemSelfHealEval(test.Context(t), client, &SystemSelfHealEvalReq{}) + if gotErr != nil { + t.Fatalf("unexpected error: %v", gotErr) + } + + expResp := new(SystemSelfHealEvalResp) + if diff := cmp.Diff(expResp, gotResp, cmpopts.IgnoreUnexported(SystemSelfHealEvalResp{})); diff != "" { + t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + } + }) + } +} + +func TestControl_SystemSelfHealEval_NonRetryableErrors(t *testing.T) { + for name, tc := range map[string]struct { + testErr error + expErr error + }{ + "system uninitialized": { + testErr: system.ErrUninitialized, + expErr: system.ErrUninitialized, + }, + "generic error": { + testErr: errors.New("something went wrong"), + expErr: errors.New("something went wrong"), + }, + "connection bad host": { + testErr: FaultConnectionBadHost("badhost"), + expErr: FaultConnectionBadHost("badhost"), + }, + "connection no route": { + testErr: FaultConnectionNoRoute("10.0.0.1"), + expErr: FaultConnectionNoRoute("10.0.0.1"), + }, + "member exists": { + testErr: system.ErrRankExists(1), + expErr: system.ErrRankExists(1), + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(name) + defer test.ShowBufferOnFailure(t, buf) + + client := NewMockInvoker(log, &MockInvokerConfig{ + UnaryResponseSet: []*UnaryResponse{ + MockMSResponse("", tc.testErr, nil), + MockMSResponse("", nil, &mgmtpb.DaosResp{}), + }, + }) + + _, gotErr := SystemSelfHealEval(test.Context(t), client, &SystemSelfHealEvalReq{}) + 
test.CmpErr(t, tc.expErr, gotErr) + }) + } +} diff --git a/src/control/lib/daos/api/api.go b/src/control/lib/daos/api/api.go index add21b6da57..16664243522 100644 --- a/src/control/lib/daos/api/api.go +++ b/src/control/lib/daos/api/api.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -16,7 +17,9 @@ import ( /* #include -#cgo LDFLAGS: -lcart -lgurt -ldaos -ldaos_common +#cgo LDFLAGS: -lcart -lgurt -ldaos +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" diff --git a/src/control/lib/daos/api/container.go b/src/control/lib/daos/api/container.go index d43c1a40a5b..abb3e3b2291 100644 --- a/src/control/lib/daos/api/container.go +++ b/src/control/lib/daos/api/container.go @@ -1,5 +1,6 @@ // // (C) Copyright 2025 Google LLC +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -27,7 +28,8 @@ import ( #include "util.h" -#cgo LDFLAGS: -ldaos_common +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" diff --git a/src/control/lib/daos/api/fi.go b/src/control/lib/daos/api/fi.go new file mode 100644 index 00000000000..40d4e2ef79a --- /dev/null +++ b/src/control/lib/daos/api/fi.go @@ -0,0 +1,44 @@ +// +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +/* +#include +#include +*/ +import "C" + +import "fmt" + +var ( + // failLocMap maps from strings to DAOS fault injection location constants. + // The definitions come from daos_common.h. + // TODO: Add the rest of existing fault locs. Maybe auto-generate this mapping? 
+ failLocMap = map[string]C.uint64_t{ + "DAOS_CHK_CONT_ORPHAN": C.DAOS_CHK_CONT_ORPHAN, + "DAOS_CHK_CONT_BAD_LABEL": C.DAOS_CHK_CONT_BAD_LABEL, + "DAOS_CHK_LEADER_BLOCK": C.DAOS_CHK_LEADER_BLOCK, + "DAOS_CHK_LEADER_FAIL_REGPOOL": C.DAOS_CHK_LEADER_FAIL_REGPOOL, + "DAOS_CHK_PS_NOTIFY_LEADER": C.DAOS_CHK_PS_NOTIFY_LEADER, + "DAOS_CHK_PS_NOTIFY_ENGINE": C.DAOS_CHK_PS_NOTIFY_ENGINE, + "DAOS_CHK_SYNC_ORPHAN_PROCESS": C.DAOS_CHK_SYNC_ORPHAN_PROCESS, + "DAOS_CHK_FAIL_REPORT_POOL1": C.DAOS_CHK_FAIL_REPORT_POOL1, + "DAOS_CHK_FAIL_REPORT_POOL2": C.DAOS_CHK_FAIL_REPORT_POOL2, + "DAOS_CHK_ENGINE_DEATH": C.DAOS_CHK_ENGINE_DEATH, + "DAOS_CHK_VERIFY_CONT_SHARDS": C.DAOS_CHK_VERIFY_CONT_SHARDS, + "DAOS_CHK_ORPHAN_POOL_SHARD": C.DAOS_CHK_ORPHAN_POOL_SHARD, + } +) + +// FaultLocationFromString converts a string to a fault injection location value. +func FaultLocationFromString(str string) (uint64, error) { + loc, found := failLocMap[str] + if !found { + return 0, fmt.Errorf("invalid fault injection location %q", str) + } + return uint64(loc), nil +} diff --git a/src/control/lib/daos/api/handle.go b/src/control/lib/daos/api/handle.go index 34981c71ec6..34f4deb48f7 100644 --- a/src/control/lib/daos/api/handle.go +++ b/src/control/lib/daos/api/handle.go @@ -1,5 +1,6 @@ // // (C) Copyright 2025 Google LLC +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,7 +20,8 @@ import ( /* #include -#cgo LDFLAGS: -ldaos_common +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" diff --git a/src/control/lib/daos/api/libdaos.go b/src/control/lib/daos/api/libdaos.go index 58cf619f760..61d0e59db1e 100644 --- a/src/control/lib/daos/api/libdaos.go +++ b/src/control/lib/daos/api/libdaos.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -15,7 +16,9 @@ package api #include #include -#cgo LDFLAGS: -lcart -lgurt -ldaos -ldaos_common +#cgo LDFLAGS: -lcart -lgurt -ldaos +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" import "unsafe" diff --git a/src/control/lib/daos/api/libdaos_pool_stubs.go b/src/control/lib/daos/api/libdaos_pool_stubs.go index 675ac215ab2..0c24029839a 100644 --- a/src/control/lib/daos/api/libdaos_pool_stubs.go +++ b/src/control/lib/daos/api/libdaos_pool_stubs.go @@ -184,7 +184,6 @@ var ( } daos_default_PoolQueryTargetInfo daos.PoolQueryTargetInfo = daos.PoolQueryTargetInfo{ - Type: daos.PoolQueryTargetType(1), State: daos.PoolTargetStateUp, Space: func() []*daos.StorageUsageStats { tiStats := make([]*daos.StorageUsageStats, len(daos_default_PoolInfo.TierStats)) @@ -365,7 +364,6 @@ func daos_pool_query_target(poolHdl C.daos_handle_t, tgt C.uint32_t, rank C.uint daos_pool_query_target_SetTgt = tgt daos_pool_query_target_SetRank = rank - info.ta_type = C.daos_target_type_t(daos_pool_query_target_Info.Type) info.ta_state = C.daos_target_state_t(daos_pool_query_target_Info.State) info.ta_space = daos_gds2cds(daos_pool_query_target_Info.Space) diff --git a/src/control/lib/daos/api/pool.go b/src/control/lib/daos/api/pool.go index 36ccc7e1b4d..1f2556759cc 100644 --- a/src/control/lib/daos/api/pool.go +++ b/src/control/lib/daos/api/pool.go @@ -13,7 +13,6 @@ import ( "github.com/google/uuid" "github.com/pkg/errors" - "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" @@ -123,10 +122,11 @@ func newPoolRebuildStatus(drs *C.struct_daos_rebuild_status) *daos.PoolRebuildSt } return &daos.PoolRebuildStatus{ - Status: int32(drs.rs_errno), - Objects: 
uint64(drs.rs_obj_nr), - Records: uint64(drs.rs_rec_nr), - State: compatRebuildState(), + Status: int32(drs.rs_errno), + Objects: uint64(drs.rs_obj_nr), + Records: uint64(drs.rs_rec_nr), + State: compatRebuildState(), + Degraded: (drs.rs_flags & C.DAOS_RSF_DEGRADED) != 0, } } @@ -246,7 +246,9 @@ func PoolConnect(ctx context.Context, req PoolConnectReq) (*PoolConnectResp, err req.Flags = daos.PoolConnectFlagReadOnly } - var dpi C.daos_pool_info_t + dpi := (*C.daos_pool_info_t)(C.calloc(1, C.sizeof_daos_pool_info_t)) + defer C.free(unsafe.Pointer(dpi)) + if req.Query { dpi.pi_bits = C.ulong(daos.DefaultPoolQueryMask) } @@ -260,11 +262,14 @@ func PoolConnect(ctx context.Context, req PoolConnectReq) (*PoolConnectResp, err defer freeString(cSys) } - if err := daosError(daos_pool_connect(cPoolID, cSys, C.uint(req.Flags), &poolConn.daosHandle, &dpi, nil)); err != nil { + cHandle := (*C.daos_handle_t)(C.calloc(1, C.sizeof_daos_handle_t)) + defer C.free(unsafe.Pointer(cHandle)) + if err := daosError(daos_pool_connect(cPoolID, cSys, C.uint(req.Flags), cHandle, dpi, nil)); err != nil { return nil, errors.Wrap(err, "failed to connect to pool") } - poolInfo := newPoolInfo(&dpi) + poolInfo := newPoolInfo(dpi) + poolConn.daosHandle = *cHandle poolConn.connHandle.UUID = poolInfo.UUID if req.ID != poolInfo.UUID.String() { poolInfo.Label = req.ID @@ -407,12 +412,15 @@ func PoolQuery(ctx context.Context, sysName, poolID string, queryMask daos.PoolQ } } + if err := poolInfo.UpdateRebuildStatus(); err != nil { + return nil, err + } + return poolInfo, nil } func newPoolTargetInfo(ptinfo *C.daos_target_info_t) *daos.PoolQueryTargetInfo { return &daos.PoolQueryTargetInfo{ - Type: daos.PoolQueryTargetType(ptinfo.ta_type), State: daos.PoolQueryTargetState(ptinfo.ta_state), Space: []*daos.StorageUsageStats{ { @@ -462,17 +470,18 @@ func PoolQueryTargets(ctx context.Context, sysName, poolID string, rank ranklist defer disconnect() logging.FromContext(ctx).Debugf("PoolQueryTargets(%s:%d:[%s])", 
poolConn, rank, targets) - ptInfo := C.daos_target_info_t{} + ptInfo := (*C.daos_target_info_t)(C.calloc(1, C.sizeof_daos_target_info_t)) + defer C.free(unsafe.Pointer(ptInfo)) var rc C.int infos := make([]*daos.PoolQueryTargetInfo, 0, targets.Count()) for _, tgt := range targets.Ranks() { - rc = daos_pool_query_target(poolConn.daosHandle, C.uint32_t(tgt), C.uint32_t(rank), &ptInfo, nil) + rc = daos_pool_query_target(poolConn.daosHandle, C.uint32_t(tgt), C.uint32_t(rank), ptInfo, nil) if err := daosError(rc); err != nil { return nil, errors.Wrapf(err, "failed to query pool %s rank:target %d:%d", poolID, rank, tgt) } - infos = append(infos, newPoolTargetInfo(&ptInfo)) + infos = append(infos, newPoolTargetInfo(ptInfo)) } return infos, nil @@ -666,11 +675,11 @@ func GetPoolList(ctx context.Context, req GetPoolListReq) ([]*daos.PoolInfo, err log := logging.FromContext(ctx) log.Debugf("GetPoolList(%+v)", req) - if req.SysName == "" { - req.SysName = build.DefaultSystemName + var cSysName *C.char + if req.SysName != "" { + cSysName = C.CString(req.SysName) + defer freeString(cSysName) } - cSysName := C.CString(req.SysName) - defer freeString(cSysName) var cPools []C.daos_mgmt_pool_info_t for { diff --git a/src/control/lib/daos/api/pool_test.go b/src/control/lib/daos/api/pool_test.go index cc1c7e59f81..19cb4935cb1 100644 --- a/src/control/lib/daos/api/pool_test.go +++ b/src/control/lib/daos/api/pool_test.go @@ -955,11 +955,11 @@ func TestAPI_GetPoolList(t *testing.T) { ctx: test.Context(t), expPools: defaultPoolInfoResp, }, - "default system name supplied": { + "empty system name supplied": { ctx: test.Context(t), req: GetPoolListReq{}, checkParams: func(t *testing.T) { - test.CmpAny(t, "sysName", build.DefaultSystemName, daos_mgmt_list_pools_SetSys) + test.CmpAny(t, "sysName", "", daos_mgmt_list_pools_SetSys) }, expPools: defaultPoolInfoResp, }, diff --git a/src/control/lib/daos/api/system.go b/src/control/lib/daos/api/system.go index 73001363a25..6a631740bdf 100644 --- 
a/src/control/lib/daos/api/system.go +++ b/src/control/lib/daos/api/system.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,7 +20,9 @@ import ( /* #include -#cgo LDFLAGS: -lcart -lgurt -ldaos -ldaos_common +#cgo LDFLAGS: -lcart -lgurt -ldaos +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" diff --git a/src/control/lib/daos/container.go b/src/control/lib/daos/container.go index a5c1bae6c42..fc3b5ece6f2 100644 --- a/src/control/lib/daos/container.go +++ b/src/control/lib/daos/container.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -22,7 +23,8 @@ import ( #include #include -#cgo LDFLAGS: -ldaos_common +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" diff --git a/src/control/lib/daos/container_property.go b/src/control/lib/daos/container_property.go index b123b151efd..c4530223182 100644 --- a/src/control/lib/daos/container_property.go +++ b/src/control/lib/daos/container_property.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2023 Intel Corporation. 
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -45,7 +46,9 @@ daos_prop_co_status_val(uint32_t status, uint32_t flag, uint32_t ver) return DAOS_PROP_CO_STATUS_VAL(status, flag, ver); } -#cgo LDFLAGS: -ldaos_common -lgurt -lcart +#cgo LDFLAGS: -lgurt -lcart +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" @@ -368,7 +371,7 @@ var propHdlrs = propHdlrMap{ }, C.DAOS_PROP_ENTRY_REDUN_FAC: { C.DAOS_PROP_CO_REDUN_FAC, - "Redundancy Factor", + "Redundancy Factor (0-4)", nil, valHdlrMap{ "0": genSetValHdlr(C.DAOS_PROP_CO_REDUN_RF0), @@ -381,15 +384,15 @@ var propHdlrs = propHdlrMap{ func(p *ContainerProperty) string { switch p.GetValue() { case C.DAOS_PROP_CO_REDUN_RF0: - return "rd_fac0" + return "0" case C.DAOS_PROP_CO_REDUN_RF1: - return "rd_fac1" + return "1" case C.DAOS_PROP_CO_REDUN_RF2: - return "rd_fac2" + return "2" case C.DAOS_PROP_CO_REDUN_RF3: - return "rd_fac3" + return "3" case C.DAOS_PROP_CO_REDUN_RF4: - return "rd_fac4" + return "4" default: return propInvalidValue(p) } @@ -500,7 +503,7 @@ var propHdlrs = propHdlrMap{ }, C.DAOS_PROP_ENTRY_REDUN_LVL: { C.DAOS_PROP_CO_REDUN_LVL, - "Redundancy Level", + "Redundancy Level (rank=1, node=2)", nil, valHdlrMap{ "1": genSetValHdlr(C.DAOS_PROP_CO_REDUN_RANK), @@ -510,14 +513,13 @@ var propHdlrs = propHdlrMap{ }, []string{"rf_lvl"}, func(p *ContainerProperty) string { - lvl := p.GetValue() - switch lvl { + switch p.GetValue() { case C.DAOS_PROP_CO_REDUN_RANK: - return fmt.Sprintf("rank (%d)", lvl) + return "rank" case C.DAOS_PROP_CO_REDUN_NODE: - return fmt.Sprintf("node (%d)", lvl) + return "node" default: - return fmt.Sprintf("(%d)", lvl) + return propInvalidValue(p) } }, false, @@ -549,7 +551,16 @@ var propHdlrs = propHdlrMap{ // ---------------------------------------- C.DAOS_PROP_ENTRY_LAYOUT_TYPE: { C.DAOS_PROP_CO_LAYOUT_TYPE, - "Layout Type", + func() string { + 
acc := []string{} + for i := 0; i < C.DAOS_PROP_CO_LAYOUT_MAX; i++ { + var loStr [10]C.char + + C.daos_unparse_ctype(C.ushort(i), &loStr[0]) + acc = append(acc, C.GoString(&loStr[0])) + } + return "Layout Type (" + strings.Join(acc, ", ") + ")" + }(), nil, nil, nil, @@ -558,7 +569,7 @@ var propHdlrs = propHdlrMap{ loInt := C.ushort(p.GetValue()) C.daos_unparse_ctype(loInt, &loStr[0]) - return fmt.Sprintf("%s (%d)", C.GoString(&loStr[0]), loInt) + return fmt.Sprintf("%s", C.GoString(&loStr[0])) }, true, }, diff --git a/src/control/lib/daos/container_property_test.go b/src/control/lib/daos/container_property_test.go index abf98ead5ef..9f0d6fc164c 100644 --- a/src/control/lib/daos/container_property_test.go +++ b/src/control/lib/daos/container_property_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -201,13 +202,13 @@ func TestDaos_ContainerProperty_RedunLevel(t *testing.T) { var expStr string switch inputKey { case "1": - expStr = "rank (1)" + expStr = "rank" case "2": - expStr = "node (2)" + expStr = "node" case "rank": - expStr = "rank (1)" + expStr = "rank" case "node": - expStr = "node (2)" + expStr = "node" default: t.Fatalf("untested key %q", inputKey) } @@ -218,7 +219,7 @@ func TestDaos_ContainerProperty_RedunLevel(t *testing.T) { t.Run("unexpected level", func(t *testing.T) { testProp := newTestContainerProperty(ContainerPropRedunLevel) testProp.SetValue(42) - test.AssertEqual(t, "(42)", testProp.StringValue(), "unexpected string value") + test.AssertEqual(t, fmt.Sprintf("property %q: invalid value 0x2a", testProp.Name), testProp.StringValue(), "unexpected string value") }) } @@ -233,15 +234,15 @@ func TestDaos_ContainerProperty_RedunFactor(t *testing.T) { var expStr string switch inputKey { case "0": - expStr = "rd_fac0" + expStr = "0" case "1": - expStr = "rd_fac1" + expStr = "1" 
case "2": - expStr = "rd_fac2" + expStr = "2" case "3": - expStr = "rd_fac3" + expStr = "3" case "4": - expStr = "rd_fac4" + expStr = "4" default: t.Fatalf("untested key %q", inputKey) } @@ -323,19 +324,64 @@ func testReadOnlyContainerProperty(t *testing.T, propType ContainerPropType) { test.CmpErr(t, errors.Errorf("property %q is read-only", testProp.Name), testProp.Set("whoops")) } -func TestDaos_ContainerProperty_Layout(t *testing.T) { +func TestDaos_ContainerProperty_LayoutValues(t *testing.T) { testReadOnlyContainerProperty(t, ContainerPropLayoutType) - t.Run("valid layout", func(t *testing.T) { - testProp := newTestContainerProperty(ContainerPropLayoutType) - testProp.SetValue(uint64(ContainerLayoutPOSIX)) - test.AssertEqual(t, testProp.StringValue(), fmt.Sprintf("%s (%d)", ContainerLayoutPOSIX, ContainerLayoutPOSIX), "unexpected string value") - }) - t.Run("unknown layout", func(t *testing.T) { - testProp := newTestContainerProperty(ContainerPropLayoutType) - testProp.SetValue(uint64(ContainerLayoutUnknown)) - test.AssertEqual(t, testProp.StringValue(), "unknown (0)", "unexpected string value") - }) + for name, tc := range map[string]struct { + propVal uint64 + expStr string + }{ + "Valid unknown layout": { + propVal: uint64(ContainerLayoutUnknown), + expStr: "unknown", + }, + "Valid POSIX layout": { + propVal: uint64(ContainerLayoutPOSIX), + expStr: "POSIX", + }, + "Valid HDF5 layout": { + propVal: uint64(ContainerLayoutHDF5), + expStr: "HDF5", + }, + "Valid PYTHON layout": { + propVal: uint64(ContainerLayoutPython), + expStr: "PYTHON", + }, + "Valid SPARK layout": { + propVal: uint64(ContainerLayoutSpark), + expStr: "SPARK", + }, + "Valid DATABASE layout": { + propVal: uint64(ContainerLayoutDatabase), + expStr: "DATABASE", + }, + "Valid ROOT layout": { + propVal: uint64(ContainerLayoutRoot), + expStr: "ROOT", + }, + "Valid SEISMIC layout": { + propVal: uint64(ContainerLayoutSeismic), + expStr: "SEISMIC", + }, + "Valid METEO layout": { + propVal: 
uint64(ContainerLayoutMeteo), + expStr: "METEO", + }, + } { + t.Run(name, func(t *testing.T) { + testProp := newTestContainerProperty(ContainerPropLayoutType) + testProp.SetValue(tc.propVal) + + test.AssertEqual(t, tc.expStr, testProp.StringValue(), "unexpected string value") + }) + } +} + +func TestDaos_ContainerProperty_LayoutDescription(t *testing.T) { + testReadOnlyContainerProperty(t, ContainerPropLayoutType) + + testProp := newTestContainerProperty(ContainerPropLayoutType) + test.AssertEqual(t, testProp.Description, "Layout Type (unknown, POSIX, HDF5, PYTHON, SPARK, DATABASE, ROOT, SEISMIC, METEO)", "unexpected description") } func TestDaos_ContainerProperty_ACL(t *testing.T) { diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index 90e669bd559..f6d7835d948 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -57,9 +57,11 @@ type ( PoolRebuildStatus struct { Status int32 `json:"status"` State PoolRebuildState `json:"state"` + DerivedState PoolRebuildState `json:"derived_state"` Objects uint64 `json:"objects"` Records uint64 `json:"records"` TotalObjects uint64 `json:"total_objects"` + Degraded bool `json:"degraded"` } // PoolInfo contains information about the pool. @@ -87,12 +89,11 @@ type ( SelfHealPolicy string `json:"self_heal_policy"` } - PoolQueryTargetType int32 + // PoolQueryTargetState represents the current state of the pool target. 
PoolQueryTargetState int32 // PoolQueryTargetInfo contains information about a single target PoolQueryTargetInfo struct { - Type PoolQueryTargetType `json:"target_type"` State PoolQueryTargetState `json:"target_state"` Space []*StorageUsageStats `json:"space"` MemFileBytes uint64 `json:"mem_file_bytes"` @@ -314,6 +315,40 @@ func (pi *PoolInfo) RebuildState() string { return pi.Rebuild.State.String() } +// UpdateRebuildStatus evaluates a derived state to indicate transient rebuild conditions. +func (pi *PoolInfo) UpdateRebuildStatus() error { + if pi.Rebuild == nil { + return nil + } + if pi.Rebuild.State > PoolRebuildStateDone { + return errors.New("illegal rebuild state value") + } + ds := pi.Rebuild.State + + switch pi.Rebuild.State { + case PoolRebuildStateIdle: + if pi.Rebuild.Status == int32(OpCanceled) { + ds = PoolRebuildStateStopped + } else if pi.Rebuild.Status != 0 { + ds = PoolRebuildStateFailed + } + case PoolRebuildStateDone: + if pi.Rebuild.Status != 0 { + ds = PoolRebuildStateFailed + } + case PoolRebuildStateBusy: + if pi.Rebuild.Status == int32(OpCanceled) { + ds = PoolRebuildStateStopping + } else if pi.Rebuild.Status != 0 { + ds = PoolRebuildStateFailing + } + } + + pi.Rebuild.DerivedState = ds + + return nil +} + // Name retrieves effective name for pool from either label or UUID. func (pi *PoolInfo) Name() string { name := pi.Label @@ -429,6 +464,14 @@ const ( PoolRebuildStateDone = PoolRebuildState(mgmtpb.PoolRebuildStatus_DONE) // PoolRebuildStateBusy indicates that the rebuild process is in progress. PoolRebuildStateBusy = PoolRebuildState(mgmtpb.PoolRebuildStatus_BUSY) + // PoolRebuildStateStopping indicates that the rebuild process is stopping (transient). + PoolRebuildStateStopping = PoolRebuildState(mgmtpb.PoolRebuildStatus_STOPPING) + // PoolRebuildStateStopped indicates that the rebuild process has stopped. 
+ PoolRebuildStateStopped = PoolRebuildState(mgmtpb.PoolRebuildStatus_STOPPED) + // PoolRebuildStateFailing indicates that the rebuild process is failing (transient). + PoolRebuildStateFailing = PoolRebuildState(mgmtpb.PoolRebuildStatus_FAILING) + // PoolRebuildStateFailed indicates that the rebuild process has failed. + PoolRebuildStateFailed = PoolRebuildState(mgmtpb.PoolRebuildStatus_FAILED) ) func (prs PoolRebuildState) String() string { @@ -455,18 +498,6 @@ func (prs *PoolRebuildState) UnmarshalJSON(data []byte) error { return nil } -func (ptt PoolQueryTargetType) String() string { - ptts, ok := mgmtpb.PoolQueryTargetInfo_TargetType_name[int32(ptt)] - if !ok { - return "invalid" - } - return strings.ToLower(ptts) -} - -func (pqtt PoolQueryTargetType) MarshalJSON() ([]byte, error) { - return []byte(`"` + pqtt.String() + `"`), nil -} - const ( PoolTargetStateUnknown = PoolQueryTargetState(mgmtpb.PoolQueryTargetInfo_STATE_UNKNOWN) // PoolTargetStateDownOut indicates the target is not available diff --git a/src/control/lib/daos/pool_property.go b/src/control/lib/daos/pool_property.go index c9364377a51..59d642e1dba 100644 --- a/src/control/lib/daos/pool_property.go +++ b/src/control/lib/daos/pool_property.go @@ -1,7 +1,7 @@ // // (C) Copyright 2021-2023 Intel Corporation. // (C) Copyright 2025 Google LLC -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -30,7 +30,9 @@ import ( #include #include -#cgo LDFLAGS: -ldaos_common -lgurt -lcart +#cgo LDFLAGS: -lgurt -lcart +#cgo !server LDFLAGS: -ldaos_common +#cgo server LDFLAGS: -ldaos_common_pmem */ import "C" @@ -209,6 +211,9 @@ func DataThreshIsValid(size uint64) bool { return bool(C.daos_data_thresh_valid(C.uint32_t(size))) } +// DefaultPoolSelfHealStr describes the default self_heal flags. 
+const DefaultPoolSelfHealStr = "exclude;rebuild" + // PoolPropertySelfHealUnsetFlags returns disabled flags in the self-heal pool property as a // string slice. func PoolPropertySelfHealUnsetFlags(value string) []string { @@ -290,7 +295,7 @@ func PoolProperties() PoolPropertyMap { case PoolSelfHealingDelayRebuild: return "delay_rebuild" case PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild: - return "exclude;rebuild" + return DefaultPoolSelfHealStr case PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild: return "exclude;delay_rebuild" default: @@ -303,7 +308,7 @@ func PoolProperties() PoolPropertyMap { "exclude": PoolSelfHealingAutoExclude, "rebuild": PoolSelfHealingAutoRebuild, "delay_rebuild": PoolSelfHealingDelayRebuild, - "exclude;rebuild": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild, + DefaultPoolSelfHealStr: PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild, "rebuild;exclude": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild, "delay_rebuild;exclude": PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild, "exclude;delay_rebuild": PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild, diff --git a/src/control/lib/daos/pool_test.go b/src/control/lib/daos/pool_test.go index 39ed135858c..5c90e03784d 100644 --- a/src/control/lib/daos/pool_test.go +++ b/src/control/lib/daos/pool_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -338,3 +338,226 @@ func TestDaos_PoolQueryMaskUnmarshalJSON(t *testing.T) { }) } } + +func TestDaos_PoolRebuildState_String(t *testing.T) { + for name, tc := range map[string]struct { + state PoolRebuildState + expString string + }{ + "idle": {PoolRebuildStateIdle, "idle"}, + "busy": {PoolRebuildStateBusy, "busy"}, + "done": {PoolRebuildStateDone, "done"}, + "stopping": {PoolRebuildStateStopping, "stopping"}, + "stopped": {PoolRebuildStateStopped, "stopped"}, + "failing": {PoolRebuildStateFailing, "failing"}, + "failed": {PoolRebuildStateFailed, "failed"}, + "unknown": {PoolRebuildState(999), "unknown"}, + } { + t.Run(name, func(t *testing.T) { + gotString := tc.state.String() + + test.AssertEqual(t, tc.expString, gotString, "unexpected string value") + }) + } +} + +func TestDaos_PoolRebuildState_MarshalJSON(t *testing.T) { + for name, tc := range map[string]struct { + state PoolRebuildState + expJSON string + expErr error + }{ + "idle": {PoolRebuildStateIdle, `"idle"`, nil}, + "busy": {PoolRebuildStateBusy, `"busy"`, nil}, + "done": {PoolRebuildStateDone, `"done"`, nil}, + "stopping": {PoolRebuildStateStopping, `"stopping"`, nil}, + "stopped": {PoolRebuildStateStopped, `"stopped"`, nil}, + "failing": {PoolRebuildStateFailing, `"failing"`, nil}, + "failed": {PoolRebuildStateFailed, `"failed"`, nil}, + "unknown": {PoolRebuildState(999), `"unknown"`, nil}, + } { + t.Run(name, func(t *testing.T) { + gotJSON, gotErr := tc.state.MarshalJSON() + + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + test.AssertEqual(t, tc.expJSON, string(gotJSON), "unexpected JSON") + }) + } +} + +func TestDaos_PoolRebuildState_UnmarshalJSON(t *testing.T) { + for name, tc := range map[string]struct { + json string + expState PoolRebuildState + expErr error + }{ + "idle": 
{`"idle"`, PoolRebuildStateIdle, nil}, + "busy": {`"busy"`, PoolRebuildStateBusy, nil}, + "done": {`"done"`, PoolRebuildStateDone, nil}, + "stopping": {`"stopping"`, PoolRebuildStateStopping, nil}, + "stopped": {`"stopped"`, PoolRebuildStateStopped, nil}, + "failing": {`"failing"`, PoolRebuildStateFailing, nil}, + "failed": {`"failed"`, PoolRebuildStateFailed, nil}, + "uppercase idle": {`"IDLE"`, PoolRebuildStateIdle, nil}, + "uppercase busy": {`"BUSY"`, PoolRebuildStateBusy, nil}, + "uppercase done": {`"DONE"`, PoolRebuildStateDone, nil}, + "uppercase stopping": {`"STOPPING"`, PoolRebuildStateStopping, nil}, + "uppercase stopped": {`"STOPPED"`, PoolRebuildStateStopped, nil}, + "uppercase failing": {`"FAILING"`, PoolRebuildStateFailing, nil}, + "uppercase failed": {`"FAILED"`, PoolRebuildStateFailed, nil}, + "mixed case stopped": {`"StOpPeD"`, PoolRebuildStateStopped, nil}, + "invalid": {`"invalid"`, PoolRebuildState(0), errors.New("failed to unmarshal")}, + "empty": {`""`, PoolRebuildState(0), errors.New("failed to unmarshal")}, + } { + t.Run(name, func(t *testing.T) { + var gotState PoolRebuildState + gotErr := gotState.UnmarshalJSON([]byte(tc.json)) + + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + test.AssertEqual(t, tc.expState, gotState, "unexpected state") + }) + } +} + +func TestDaos_PoolInfo_UpdateRebuildStatus(t *testing.T) { + for name, tc := range map[string]struct { + poolInfo *PoolInfo + expDerivedState PoolRebuildState + expErr error + }{ + "nil rebuild status": { + poolInfo: &PoolInfo{}, + }, + "idle state with status 0": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateIdle, + }, + }, + expDerivedState: PoolRebuildStateIdle, + }, + "idle state with canceled status": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateIdle, + Status: int32(OpCanceled), + }, + }, + expDerivedState: PoolRebuildStateStopped, + }, + "idle state with non-zero non-canceled status": { 
+ poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateIdle, + Status: -1008, + }, + }, + expDerivedState: PoolRebuildStateFailed, + }, + "done state with status 0": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateDone, + }, + }, + expDerivedState: PoolRebuildStateDone, + }, + "done state with non-zero status": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateDone, + Status: -1009, + }, + }, + expDerivedState: PoolRebuildStateFailed, + }, + "busy state with status 0": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateBusy, + }, + }, + expDerivedState: PoolRebuildStateBusy, + }, + "busy state with canceled status": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateBusy, + Status: int32(OpCanceled), + }, + }, + expDerivedState: PoolRebuildStateStopping, + }, + "busy state with non-zero non-canceled status": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateBusy, + Status: -1010, + }, + }, + expDerivedState: PoolRebuildStateFailing, + }, + "illegal stopped state": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateStopped, + }, + }, + expErr: errors.New("illegal rebuild state"), + }, + "illegal stopping state": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateStopping, + }, + }, + expErr: errors.New("illegal rebuild state"), + }, + "illegal failed state": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateFailed, + }, + }, + expErr: errors.New("illegal rebuild state"), + }, + "illegal failing state": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildStateFailing, + }, + }, + expErr: errors.New("illegal rebuild state"), + }, + "illegal rebuild state value": { + poolInfo: &PoolInfo{ + Rebuild: &PoolRebuildStatus{ + State: PoolRebuildState(999), + }, + }, + expErr: 
errors.New("illegal rebuild state value"), + }, + } { + t.Run(name, func(t *testing.T) { + gotErr := tc.poolInfo.UpdateRebuildStatus() + + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + if tc.poolInfo.Rebuild != nil { + test.AssertEqual(t, tc.expDerivedState, tc.poolInfo.Rebuild.DerivedState, + "unexpected derived state") + } + }) + } +} diff --git a/src/control/lib/daos/status.go b/src/control/lib/daos/status.go index 54099f31a2f..6a597d461ef 100644 --- a/src/control/lib/daos/status.go +++ b/src/control/lib/daos/status.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -120,6 +121,8 @@ const ( MercuryFatalError Status = -C.DER_HG_FATAL // NoService indicates the pool service is not up and didn't process the pool request NoService Status = -C.DER_NO_SERVICE + // OpCanceled indicates that an operation was cancelled (non-crt). + OpCanceled = -C.DER_OP_CANCELED ) const ( @@ -175,4 +178,6 @@ const ( RedundancyFactorExceeded Status = -C.DER_RF // AgentCommFailed indicates that client/agent communication failed. AgentCommFailed Status = -C.DER_AGENT_COMM + // NotSupported indicates that operation is unsupported. + NotSupported Status = -C.DER_NOTSUPPORTED ) diff --git a/src/control/lib/hardware/defaults/topology/defaults.go b/src/control/lib/hardware/defaults/topology/defaults.go index b7e764aeca7..d9aade97ce4 100644 --- a/src/control/lib/hardware/defaults/topology/defaults.go +++ b/src/control/lib/hardware/defaults/topology/defaults.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. 
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -42,3 +43,9 @@ func DefaultProcessNUMAProvider(log logging.Logger) hardware.ProcessNUMAProvider func DefaultIOMMUDetector(log logging.Logger) hardware.IOMMUDetector { return sysfs.NewProvider(log) } + +// DefaultTHPDetector gets the default provider for the transparent hugepage +// detector. +func DefaultTHPDetector(log logging.Logger) hardware.THPDetector { + return sysfs.NewProvider(log) +} diff --git a/src/control/lib/hardware/sysfs/provider.go b/src/control/lib/hardware/sysfs/provider.go index 0a253188344..7ed420bdc21 100644 --- a/src/control/lib/hardware/sysfs/provider.go +++ b/src/control/lib/hardware/sysfs/provider.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -601,3 +602,23 @@ func (s *Provider) IsIOMMUEnabled() (bool, error) { return err == nil && len(dmars) > 0, nil } + +// IsTHPEnabled checks whether transparent hugepages is enabled by interrogating sysfs and +// implements the THPDetector interface on sysfs provider. 
+func (s *Provider) IsTHPEnabled() (bool, error) { + if s == nil { + return false, errors.New("sysfs provider is nil") + } + + thpStatePath := s.sysPath("kernel", "mm", "transparent_hugepage", "enabled") + thpState, err := os.ReadFile(thpStatePath) + if err != nil { + return false, errors.Wrap(err, "unable to get transparent hugepage state") + } + + thpStateStr := strings.TrimSuffix(string(thpState), "\n") + strToks := strings.Split(thpStateStr, " ") + isDisabled := common.Includes(strToks, "[never]") + + return !isDisabled, nil +} diff --git a/src/control/lib/hardware/sysfs/provider_test.go b/src/control/lib/hardware/sysfs/provider_test.go index 6375654d815..791c7de49f9 100644 --- a/src/control/lib/hardware/sysfs/provider_test.go +++ b/src/control/lib/hardware/sysfs/provider_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1348,3 +1349,65 @@ func TestSysfs_Provider_IsIOMMUEnabled(t *testing.T) { }) } } + +func setupTestIsTHPEnabled(t *testing.T, root, txt string, extraDirs ...string) { + t.Helper() + + dirs := append([]string{root}, extraDirs...) + + path := filepath.Join(dirs...) 
+ os.MkdirAll(path, 0755) + + contents := []byte(txt + "\n") + if err := os.WriteFile(filepath.Join(path, "enabled"), contents, 0644); err != nil { + t.Fatal(err) + } +} + +func TestSysfs_Provider_IsTHPEnabled(t *testing.T) { + for name, tc := range map[string]struct { + nilProvider bool + extraDirs []string + enableText string + expResult bool + expErr error + }{ + "nil provider": { + nilProvider: true, + expErr: errors.New("provider is nil"), + }, + "missing thp dir": { + extraDirs: []string{"kernel", "mm"}, + expErr: errors.New("no such file or directory"), + }, + "thp enabled": { + extraDirs: []string{"kernel", "mm", "transparent_hugepage"}, + enableText: "[always] madvise never", + expResult: true, + }, + "thp disabled": { + extraDirs: []string{"kernel", "mm", "transparent_hugepage"}, + enableText: "always madvise [never]", + }, + } { + t.Run(name, func(t *testing.T) { + testDir, cleanupTestDir := test.CreateTestDir(t) + defer cleanupTestDir() + + log, buf := logging.NewTestLogger(name) + defer test.ShowBufferOnFailure(t, buf) + + var p *Provider + if !tc.nilProvider { + p = NewProvider(log) + p.root = testDir + setupTestIsTHPEnabled(t, testDir, tc.enableText, tc.extraDirs...) + } + + result, err := p.IsTHPEnabled() + + test.CmpErr(t, tc.expErr, err) + test.AssertEqual(t, tc.expResult, result, "") + }) + } +} diff --git a/src/control/lib/hardware/thp.go b/src/control/lib/hardware/thp.go new file mode 100644 index 00000000000..cea54e5dd7c --- /dev/null +++ b/src/control/lib/hardware/thp.go @@ -0,0 +1,14 @@ +// +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package hardware + +type ( + // THPDetector is an interface for detecting if transparent hugepages is enabled on a + // system. 
+ THPDetector interface { + IsTHPEnabled() (bool, error) + } +) diff --git a/src/control/lib/ranklist/ranklist.go b/src/control/lib/ranklist/ranklist.go index add82be22aa..a677f4bf172 100644 --- a/src/control/lib/ranklist/ranklist.go +++ b/src/control/lib/ranklist/ranklist.go @@ -25,11 +25,7 @@ func init() { } } -func fixBrackets(stringRanks string, remove bool) string { - if remove { - return strings.Trim(stringRanks, "[]") - } - +func addBrackets(stringRanks string) string { if !strings.HasPrefix(stringRanks, "[") { stringRanks = "[" + stringRanks } @@ -40,6 +36,10 @@ func fixBrackets(stringRanks string, remove bool) string { return stringRanks } +func removeBrackets(stringRanks string) string { + return strings.Trim(stringRanks, "[]") +} + // RankList provides convenience methods for working with Rank slices. type RankList []Rank @@ -65,7 +65,7 @@ func (rs *RankSet) String() string { if rs == nil || rs.ns == nil { return "" } - return fixBrackets(rs.ns.String(), true) + return removeBrackets(rs.ns.String()) } // RangedString returns a ranged string representation of the RankSet. 
@@ -201,14 +201,11 @@ func MustCreateRankSet(stringRanks string) *RankSet { func CreateRankSet(stringRanks string) (*RankSet, error) { rs := NewRankSet() - if len(stringRanks) < 1 { + if len(removeBrackets(stringRanks)) < 1 { return rs, nil } - stringRanks = fixBrackets(stringRanks, false) - - // add enclosing brackets to input so CreateSet works without hostnames - ns, err := hostlist.CreateNumericSet(stringRanks) + ns, err := hostlist.CreateNumericSet(addBrackets(stringRanks)) if err != nil { return nil, err } diff --git a/src/control/lib/ranklist/ranklist_test.go b/src/control/lib/ranklist/ranklist_test.go index 07ddef38396..a4d773afad5 100644 --- a/src/control/lib/ranklist/ranklist_test.go +++ b/src/control/lib/ranklist/ranklist_test.go @@ -31,10 +31,22 @@ func TestRanklist_RankSet(t *testing.T) { expCount: 0, expRanks: []Rank{}, }, + "empty bracketed start list": { + ranks: "[]", + expOut: "", + expCount: 0, + expRanks: []Rank{}, + }, "invalid with hostnames": { ranks: "node2-1,node1-2.suffix1,node1-[45,47].suffix2,node3,node1-3", expErr: errors.New("unexpected alphabetic character(s)"), }, + "simple bracketed ranged rank list": { + ranks: "[0-10]", + expOut: "0-10", + expCount: 11, + expRanks: []Rank{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + }, "simple ranged rank list": { ranks: "0-10", expOut: "0-10", diff --git a/src/control/lib/spdk/ctests/SConscript b/src/control/lib/spdk/ctests/SConscript index a8bd196fed2..a19376392f4 100644 --- a/src/control/lib/spdk/ctests/SConscript +++ b/src/control/lib/spdk/ctests/SConscript @@ -23,7 +23,7 @@ def scons(): libs += ['rte_mempool_ring', 'rte_bus_pci', 'nvme_control'] # Other libs - libs += ['numa', 'dl', 'isal', 'cmocka', 'pthread'] + libs += ['numa', 'dl', 'isal', 'cmocka', 'pthread', 'ssl'] if GetOption('help'): return diff --git a/src/control/lib/spdk/ctests/nvme_control_ut.c b/src/control/lib/spdk/ctests/nvme_control_ut.c index 0863d4f4509..4bdcbe16d29 100644 --- a/src/control/lib/spdk/ctests/nvme_control_ut.c +++ 
b/src/control/lib/spdk/ctests/nvme_control_ut.c @@ -1,8 +1,9 @@ /** -* (C) Copyright 2019-2021 Intel Corporation. -* -* SPDX-License-Identifier: BSD-2-Clause-Patent -*/ + * (C) Copyright 2019-2021 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ #include #include @@ -102,6 +103,13 @@ mock_spdk_pci_device_get_socket_id(struct spdk_pci_device *dev) return 1; } +static const char * +mock_spdk_pci_device_get_type(const struct spdk_pci_device *dev) +{ + (void)dev; + return "pci"; +} + /** * =================== * Test functions @@ -208,9 +216,8 @@ test_collect(void **state) test_ret = init_ret(); assert_null(test_ret->ctrlrs); - _collect(test_ret, &mock_copy_ctrlr_data, - &mock_spdk_nvme_ctrlr_get_pci_device, - &mock_spdk_pci_device_get_socket_id); + _collect(test_ret, &mock_copy_ctrlr_data, &mock_spdk_nvme_ctrlr_get_pci_device, + &mock_spdk_pci_device_get_socket_id, &mock_spdk_pci_device_get_type); if (test_ret->rc != 0) fprintf(stderr, "collect err: %s\n", test_ret->info); diff --git a/src/control/lib/spdk/include/nvme_control_common.h b/src/control/lib/spdk/include/nvme_control_common.h index ae8780ad911..5bb0fd7850f 100644 --- a/src/control/lib/spdk/include/nvme_control_common.h +++ b/src/control/lib/spdk/include/nvme_control_common.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -170,8 +171,10 @@ typedef struct spdk_pci_device * typedef int (*socket_id_getter)(struct spdk_pci_device *); +typedef const char *(*pci_type_getter)(const struct spdk_pci_device *); + void -_collect(struct ret_t *, data_copier, pci_getter, socket_id_getter); +_collect(struct ret_t *, data_copier, pci_getter, socket_id_getter, pci_type_getter); /** * Collect controller and namespace information of the NVMe devices. 
diff --git a/src/control/lib/spdk/nvme_default.go b/src/control/lib/spdk/nvme_default.go index 0b0fc935738..09e50027b2d 100644 --- a/src/control/lib/spdk/nvme_default.go +++ b/src/control/lib/spdk/nvme_default.go @@ -1,6 +1,7 @@ // // (C) Copyright 2022-2023 Intel Corporation. // (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -14,7 +15,7 @@ package spdk /* #cgo CFLAGS: -I . -#cgo LDFLAGS: -L . -lnvme_control +#cgo LDFLAGS: -L . -lnvme_control -lssl #cgo LDFLAGS: -lspdk_env_dpdk -lspdk_nvme -lspdk_vmd -lspdk_util #cgo LDFLAGS: -lrte_mempool -lrte_mempool_ring -lrte_bus_pci diff --git a/src/control/lib/spdk/spdk_default.go b/src/control/lib/spdk/spdk_default.go index 77f382f0268..4f755d4d568 100644 --- a/src/control/lib/spdk/spdk_default.go +++ b/src/control/lib/spdk/spdk_default.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -98,7 +99,8 @@ func (ei *EnvImpl) InitSPDKEnv(log logging.Logger, opts *EnvOptions) error { C.setArrayString(cAllowList, C.CString(s), C.int(i)) } - envCtx := C.dpdk_cli_override_opts + // Use default logging level for all DPDK facilities. + envCtx := C.dpdk_cli_build_opts(C.DAOS_DPDK_LOG_DEFAULT, C.DAOS_DPDK_LOG_DEFAULT) retPtr := C.daos_spdk_init(0, envCtx, C.ulong(opts.PCIAllowList.Len()), cAllowList) diff --git a/src/control/lib/spdk/src/nvme_control.c b/src/control/lib/spdk/src/nvme_control.c index 137d3b91462..d609e485726 100644 --- a/src/control/lib/spdk/src/nvme_control.c +++ b/src/control/lib/spdk/src/nvme_control.c @@ -1,8 +1,9 @@ /** -* (C) Copyright 2018-2022 Intel Corporation. -* -* SPDX-License-Identifier: BSD-2-Clause-Patent -*/ + * (C) Copyright 2018-2022 Intel Corporation. 
+ * (C) Copyright 2025 Google LLC + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ #include #include @@ -488,6 +489,7 @@ daos_spdk_init(int mem_sz, char *env_ctx, size_t nr_pcil, char **pcil) struct spdk_env_opts opts = {}; int rc, i; + opts.opts_size = sizeof(opts); spdk_env_opts_init(&opts); if (mem_sz > 0) diff --git a/src/control/lib/spdk/src/nvme_control_common.c b/src/control/lib/spdk/src/nvme_control_common.c index 4d7d138fd08..41d859e1a19 100644 --- a/src/control/lib/spdk/src/nvme_control_common.c +++ b/src/control/lib/spdk/src/nvme_control_common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -445,7 +446,7 @@ populate_dev_health(struct nvme_stats *stats, void _collect(struct ret_t *ret, data_copier copy_data, pci_getter get_pci, - socket_id_getter get_socket_id) + socket_id_getter get_socket_id, pci_type_getter get_pci_type) { struct ctrlr_entry *ctrlr_entry; const struct spdk_nvme_ctrlr_data *cdata; @@ -499,8 +500,7 @@ _collect(struct ret_t *ret, data_copier copy_data, pci_getter get_pci, ctrlr_tmp->socket_id = get_socket_id(pci_dev); - pci_type = spdk_pci_device_get_type(pci_dev); - free(pci_dev); + pci_type = get_pci_type(pci_dev); ctrlr_tmp->pci_type = strndup(pci_type, NVME_DETAIL_BUFLEN); if (ctrlr_tmp->pci_type == NULL) { rc = -NVMEC_ERR_GET_PCI_TYPE; @@ -555,7 +555,7 @@ collect(void) ret = init_ret(); _collect(ret, ©_ctrlr_data, &spdk_nvme_ctrlr_get_pci_device, - &spdk_pci_device_get_socket_id); + &spdk_pci_device_get_socket_id, &spdk_pci_device_get_type); return ret; } diff --git a/src/control/lib/support/log.go b/src/control/lib/support/log.go index 721fc18fb50..dc22e8d915f 100644 --- a/src/control/lib/support/log.go +++ b/src/control/lib/support/log.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. 
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -50,8 +51,8 @@ type CollectLogSubCmd struct { TargetFolder string `short:"t" long:"target-folder" description:"Target Folder location where log will be copied"` Archive bool `short:"z" long:"archive" description:"Archive the log/config files"` ExtraLogsDir string `short:"c" long:"extra-logs-dir" description:"Collect the Logs from given directory"` - LogStartDate string `short:"D" long:"start-date" description:"Specify the start date, the day from log will be collected, Format: MM-DD"` - LogEndDate string `short:"F" long:"end-date" description:"Specify the end date, the day till the log will be collected, Format: MM-DD"` + LogStartDate string `short:"D" long:"start-date" description:"Specify the start date, the day from log will be collected, Format: YYYY-MM-DD"` + LogEndDate string `short:"F" long:"end-date" description:"Specify the end date, the day till the log will be collected, Format: YYYY-MM-DD"` LogStartTime string `short:"S" long:"log-start-time" description:"Specify the log collection start time, Format: HH:MM:SS"` LogEndTime string `short:"E" long:"log-end-time" description:"Specify the log collection end time, Format: HH:MM:SS"` FileTransferExecArgs string `short:"T" long:"transfer-args" description:"Extra arguments for alternate file transfer tool"` @@ -62,11 +63,10 @@ type LogTypeSubCmd struct { } const ( - MMDDYYYY = "1-2-2006" - HHMMSS = "15:4:5" - MMDDHHMMSS = "1/2-15:4:5" - MMDDYYYY_HHMMSS = "1-2-2006 15:4:5" - YYYYMMDD_HHMMSS = "2006/1/2 15:4:5" + YYYYMMDD = "2006-01-02" // Date format as it is expected from commandline argument + HHMMSS = "15:04:05" // Time format as it is expected from commandline argument + YYYYMMDD_HHMMSS = "2006-01-02 15:04:05" // Date/Time format as it is defined by ISO 8601 + YYYYMMDD_HHMMSS_LOG = "2006/01/02 15:04:05" // Date/Time format as it is used in DAOS logs ) // Folder names to copy logs and 
configs @@ -164,14 +164,14 @@ type logCopy struct { // Verify if the date and time argument is valid and return error if it's invalid func (cmd *CollectLogSubCmd) DateTimeValidate() error { if cmd.LogStartDate != "" || cmd.LogEndDate != "" { - startDate, err := time.Parse(MMDDYYYY, cmd.LogStartDate) + startDate, err := time.Parse(YYYYMMDD, cmd.LogStartDate) if err != nil { - return errors.New("Invalid date, please provide the startDate in MM-DD-YYYY format") + return errors.New("Invalid date, please provide the startDate in YYYY-MM-DD format") } - endDate, err := time.Parse(MMDDYYYY, cmd.LogEndDate) + endDate, err := time.Parse(YYYYMMDD, cmd.LogEndDate) if err != nil { - return errors.New("Invalid date, please provide the endDate in MM-DD-YYYY format") + return errors.New("Invalid date, please provide the endDate in YYYY-MM-DD format") } if startDate.After(endDate) { @@ -719,12 +719,12 @@ func getDateTime(log logging.Logger, opts ...CollectLogsParams) (time.Time, time startTimeStr := fmt.Sprintf("%s %s", opts[0].LogStartDate, opts[0].LogStartTime) endTimeStr := fmt.Sprintf("%s %s", opts[0].LogEndDate, opts[0].LogEndTime) - actStartTime, err := time.Parse(MMDDYYYY_HHMMSS, startTimeStr) + actStartTime, err := time.Parse(YYYYMMDD_HHMMSS, startTimeStr) if err != nil { return time.Time{}, time.Time{}, err } - actEndTime, err := time.Parse(MMDDYYYY_HHMMSS, endTimeStr) + actEndTime, err := time.Parse(YYYYMMDD_HHMMSS, endTimeStr) if err != nil { return time.Time{}, time.Time{}, err } @@ -766,17 +766,16 @@ func cpLinesFromLog(log logging.Logger, srcFile string, destFile string, opts .. scanner := bufio.NewScanner(readFile) var cpLogLine bool if opts[0].LogCmd == "EngineLog" { - // Remove year as engine log does not store the year information. 
- actStartTime, _ = time.Parse(MMDDHHMMSS, actStartTime.Format(MMDDHHMMSS)) - actEndTime, _ = time.Parse(MMDDHHMMSS, actEndTime.Format(MMDDHHMMSS)) + actStartTime, _ = time.Parse(YYYYMMDD_HHMMSS_LOG, actStartTime.Format(YYYYMMDD_HHMMSS_LOG)) + actEndTime, _ = time.Parse(YYYYMMDD_HHMMSS_LOG, actEndTime.Format(YYYYMMDD_HHMMSS_LOG)) - var validDateTime = regexp.MustCompile(`^\d\d\/\d\d-\d\d:\d\d:\d\d.\d\d`) + var validDateTime = regexp.MustCompile(`^\d\d\d\d\/\d\d\/\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d`) for scanner.Scan() { lineData := scanner.Text() lineDataSlice := strings.Split(lineData, " ") // Verify if log line has date/time stamp and copy line if it's in range. - if validDateTime.MatchString(lineData) == false { + if !validDateTime.MatchString(lineData) { if cpLogLine { _, err = writeFile.WriteString(lineData + "\n") if err != nil { @@ -786,10 +785,10 @@ func cpLinesFromLog(log logging.Logger, srcFile string, destFile string, opts .. continue } - dateTime := strings.Split(lineDataSlice[0], "-") - timeOnly := strings.Split(dateTime[1], ".") - expDateTime := fmt.Sprintf("%s-%s", dateTime[0], timeOnly[0]) - expLogTime, _ := time.Parse(MMDDHHMMSS, expDateTime) + dateTime := lineDataSlice[0] + timeOnly := lineDataSlice[1] + expDateTime := fmt.Sprintf("%s %s", dateTime, timeOnly) + expLogTime, _ := time.Parse(YYYYMMDD_HHMMSS_LOG, expDateTime) // Copy line, if the log line has time stamp between the given range of start/end date and time. if expLogTime.After(actStartTime) && expLogTime.Before(actEndTime) { @@ -818,7 +817,7 @@ func cpLinesFromLog(log logging.Logger, srcFile string, destFile string, opts .. lineData := scanner.Text() // Verify if log line has date/time stamp and copy line if it's in range. 
- if validDateTime.MatchString(lineData) == false { + if !validDateTime.MatchString(lineData) { if cpLogLine { _, err = writeFile.WriteString(lineData + "\n") if err != nil { @@ -829,7 +828,7 @@ func cpLinesFromLog(log logging.Logger, srcFile string, destFile string, opts .. } data := validDateTime.FindAllString(lineData, -1) - expLogTime, _ := time.Parse(YYYYMMDD_HHMMSS, data[0]) + expLogTime, _ := time.Parse(YYYYMMDD_HHMMSS_LOG, data[0]) // Copy line, if the log line has time stamp between the given range of start/end date and time. if expLogTime.After(actStartTime) && expLogTime.Before(actEndTime) { cpLogLine = true diff --git a/src/control/lib/support/log_test.go b/src/control/lib/support/log_test.go index db7a5c048eb..1b3ebf8d32f 100644 --- a/src/control/lib/support/log_test.go +++ b/src/control/lib/support/log_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -893,30 +894,30 @@ func TestSupport_DateTimeValidate(t *testing.T) { expErr: nil, }, "Valid StartDate No EndDate": { - logStartDate: "12-01-2024", - expErr: errors.New("Invalid date, please provide the endDate in MM-DD-YYYY format"), + logStartDate: "2024-12-01", + expErr: errors.New("Invalid date, please provide the endDate in YYYY-MM-DD format"), }, "No StartDate Valid EndDate": { - logEndDate: "12-31-2024", - expErr: errors.New("Invalid date, please provide the startDate in MM-DD-YYYY format"), + logEndDate: "2024-12-31", + expErr: errors.New("Invalid date, please provide the startDate in YYYY-MM-DD format"), }, "Invalid StartDate No EndDate": { - logStartDate: "44-22-2024", - expErr: errors.New("Invalid date, please provide the startDate in MM-DD-YYYY format"), + logStartDate: "2024-44-22", + expErr: errors.New("Invalid date, please provide the startDate in YYYY-MM-DD format"), }, "Invalid EndDate": { - logStartDate: "12-01-2024", - logEndDate: "44-22-2024", 
- expErr: errors.New("Invalid date, please provide the endDate in MM-DD-YYYY format"), + logStartDate: "2024-12-01", + logEndDate: "2024-44-22", + expErr: errors.New("Invalid date, please provide the endDate in YYYY-MM-DD format"), }, "StartDate after EndDate": { - logStartDate: "10-01-2024", - logEndDate: "05-06-2024", + logStartDate: "2024-10-01", + logEndDate: "2024-05-06", expErr: errors.New("start-date can not be after end-date"), }, "Valid StartDate and EndDate": { - logStartDate: "12-01-2024", - logEndDate: "12-31-2024", + logStartDate: "2024-12-01", + logEndDate: "2024-12-31", expErr: nil, }, "Valid StartTime No EndTime": { @@ -942,8 +943,8 @@ func TestSupport_DateTimeValidate(t *testing.T) { expErr: nil, }, "Valid Date Time": { - logStartDate: "12-01-2024", - logEndDate: "12-31-2024", + logStartDate: "2024-12-01", + logEndDate: "2024-12-31", logStartTime: "13:15:59", logEndTime: "20:30:50", expErr: nil, @@ -1034,19 +1035,19 @@ func TestSupport_cpLinesFromLog(t *testing.T) { collLogParams := CollectLogsParams{} - DummyEngineLog := `01/01-01:01:01.90 system-01 LOG LINE 1 -02/02-04:04:04.90 system-02 LOG LINE 2 -03/03-06:06:06.90 system-02 LOG LINE 3 -04/04-08:08:08.90 system-02 LOG LINE 4 -05/05-10:10:10.90 system-02 LOG LINE 5 -06/06-12:12:12.90 system-02 LOG LINE 6 -07/07-14:14:14.90 system-02 LOG LINE 7 + DummyEngineLog := `2023/01/01-01:01:01.90 system-01 LOG LINE 1 +2023/02/02 04:04:04.908070 system-02 LOG LINE 2 +2023/03/03 06:06:06.907060 system-02 LOG LINE 3 +2023/04/04 08:08:08.905040 system-02 LOG LINE 4 +2023/05/05 10:10:10.904030 system-02 LOG LINE 5 +2023/06/06 12:12:12.903020 system-02 LOG LINE 6 +2023/07/07 14:14:14.902010 system-02 LOG LINE 7 LINE WITHOUT DATE AND TIME -08/08-16:16:16.90 system-02 LOG LINE 8 -09/09-18:18:18.90 system-02 LOG LINE 9 -10/10-20:20:20.90 system-02 LOG LINE 10 -11/11-22:22:22.90 system-02 LOG LINE 11 -12/12-23:59:59.90 system-02 LOG LINE 12 +2023/08/08 16:16:16.901090 system-02 LOG LINE 8 +2023/09/09 
18:18:18.909080 system-02 LOG LINE 9 +2023/10/10 20:20:20.908070 system-02 LOG LINE 10 +2023/11/11 22:22:22.907060 system-02 LOG LINE 11 +2023/12/12 23:59:59.906050 system-02 LOG LINE 12 ` MockEngineLogFile := test.CreateTestFile(t, targetTestDir, DummyEngineLog) @@ -1108,42 +1109,42 @@ INFO 2023/12/12 23:59:59.441241 LOG LINE 12 expErr: errors.New("unable to Copy File"), }, "Invalid Source File": { - logStartDate: "01-01-2023", - logEndDate: "12-31-2023", + logStartDate: "2023-01-01", + logEndDate: "2023-12-31", srcFile: srcPath + "unknownFile", destFile: dstTestDir, expErr: errors.New("no such file or directory"), }, "Valid date without any time": { - logStartDate: "01-01-2023", - logEndDate: "12-31-2023", + logStartDate: "2023-01-01", + logEndDate: "2023-12-31", srcFile: srcPath, destFile: dstTestDir, expErr: nil, }, "Verify the content of Engine log line based on date": { - logStartDate: "04-01-2023", - logEndDate: "08-08-2023", + logStartDate: "2023-04-01", + logEndDate: "2023-08-08", srcFile: MockEngineLogFile, destFile: dstTestDir, logCmd: "EngineLog", expErr: nil, - verifyLog: "08/08-16:16:16.90 system-02 LOG LINE 8", + verifyLog: "2023/08/08 16:16:16.901090 system-02 LOG LINE 8", }, "Verify the content of Engine log line based on date and time": { - logStartDate: "09-09-2023", - logEndDate: "11-11-2023", + logStartDate: "2023-09-09", + logEndDate: "2023-11-11", logStartTime: "12:00:00", logEndTime: "23:23:23", srcFile: MockEngineLogFile, destFile: dstTestDir, logCmd: "EngineLog", expErr: nil, - verifyLog: "11/11-22:22:22.90 system-02 LOG LINE 11", + verifyLog: "2023/11/11 22:22:22.907060 system-02 LOG LINE 11", }, "Verify the content of Control log line based on date": { - logStartDate: "04-01-2023", - logEndDate: "08-08-2023", + logStartDate: "2023-04-01", + logEndDate: "2023-08-08", srcFile: MockControlLogFile, destFile: dstTestDir, logCmd: "ControlLog", @@ -1151,8 +1152,8 @@ INFO 2023/12/12 23:59:59.441241 LOG LINE 12 verifyLog: "hostname INFO 
2023/08/08 16:16:16 LOG LINE 8", }, "Verify the content of Control log line based on date and time": { - logStartDate: "09-09-2023", - logEndDate: "11-11-2023", + logStartDate: "2023-09-09", + logEndDate: "2023-11-11", logStartTime: "12:00:00", logEndTime: "23:23:23", srcFile: MockControlLogFile, @@ -1162,8 +1163,8 @@ INFO 2023/12/12 23:59:59.441241 LOG LINE 12 verifyLog: "hostname INFO 2023/11/11 22:22:22 LOG LINE 11", }, "Verify the content of Admin log line based on date": { - logStartDate: "04-01-2023", - logEndDate: "08-08-2023", + logStartDate: "2023-04-01", + logEndDate: "2023-08-08", srcFile: MockAdminLogFile, destFile: dstTestDir, logCmd: "HelperLog", @@ -1171,8 +1172,8 @@ INFO 2023/12/12 23:59:59.441241 LOG LINE 12 verifyLog: "INFO 2023/08/08 16:16:16.441237 LOG LINE 8", }, "Verify the content of Admin log line based on date and time": { - logStartDate: "09-09-2023", - logEndDate: "11-11-2023", + logStartDate: "2023-09-09", + logEndDate: "2023-11-11", logStartTime: "12:00:00", logEndTime: "23:23:23", srcFile: MockAdminLogFile, @@ -1223,39 +1224,39 @@ func TestSupport_getDateTime(t *testing.T) { expErr error }{ "No StartTime": { - logStartDate: "1-2-2023", - logEndDate: "1-3-2023", + logStartDate: "2023-01-02", + logEndDate: "2023-01-03", expErr: nil, }, "No EndTime": { - logStartDate: "1-2-2023", - logEndDate: "1-3-2023", + logStartDate: "2023-01-02", + logEndDate: "2023-01-03", logStartTime: "10:10:10", - expStartTime: "01-02-2023 10:10:10", - expEndTime: "01-03-2023 23:59:59", + expStartTime: "2023-01-02 10:10:10", + expEndTime: "2023-01-03 23:59:59", expErr: nil, }, "Valid Date and Invalid Start Time": { - logStartDate: "1-2-2023", - logEndDate: "1-3-2023", + logStartDate: "2023-01-02", + logEndDate: "2023-01-03", logStartTime: "99:99:99", logEndTime: "12:12:12", - expErr: errors.New("parsing time \"1-2-2023 99:99:99\": hour out of range"), + expErr: errors.New("parsing time \"2023-01-02 99:99:99\": hour out of range"), }, "Valid Date and Invalid End 
Time": { - logStartDate: "1-2-2023", - logEndDate: "1-3-2023", + logStartDate: "2023-01-02", + logEndDate: "2023-01-03", logStartTime: "10:10:10", logEndTime: "99:99:99", - expErr: errors.New("parsing time \"1-3-2023 99:99:99\": hour out of range"), + expErr: errors.New("parsing time \"2023-01-03 99:99:99\": hour out of range"), }, "Valid Date and Time": { - logStartDate: "1-2-2023", - logEndDate: "1-3-2023", + logStartDate: "2023-01-02", + logEndDate: "2023-01-03", logStartTime: "10:10:10", logEndTime: "12:12:12", - expStartTime: "01-02-2023 10:10:10", - expEndTime: "01-03-2023 12:12:12", + expStartTime: "2023-01-02 10:10:10", + expEndTime: "2023-01-03 12:12:12", expErr: nil, }, } { @@ -1267,13 +1268,13 @@ func TestSupport_getDateTime(t *testing.T) { startTime, endTime, gotErr := getDateTime(log, collLogParams) test.CmpErr(t, tc.expErr, gotErr) if tc.expStartTime != "" { - tmpStartTime, _ := time.Parse(MMDDYYYY_HHMMSS, tc.expStartTime) + tmpStartTime, _ := time.Parse(YYYYMMDD_HHMMSS, tc.expStartTime) if tmpStartTime.Equal(startTime) == false { t.Fatalf("Expected StartTime:=%s But Got :=%s", tmpStartTime, startTime) } } if tc.expEndTime != "" { - tmpEndTime, _ := time.Parse(MMDDYYYY_HHMMSS, tc.expEndTime) + tmpEndTime, _ := time.Parse(YYYYMMDD_HHMMSS, tc.expEndTime) if tmpEndTime.Equal(endTime) == false { t.Fatalf("Expected EndTime:=%s But Got :=%s", tmpEndTime, endTime) } diff --git a/src/control/lib/telemetry/promexp/httpd.go b/src/control/lib/telemetry/promexp/httpd.go index 2f4c86d485d..238e4d69fe9 100644 --- a/src/control/lib/telemetry/promexp/httpd.go +++ b/src/control/lib/telemetry/promexp/httpd.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -29,9 +30,10 @@ type ( // ExporterConfig defines the configuration for the Prometheus exporter. 
ExporterConfig struct { - Port int - Title string - Register RegMonFn + Port int + BindAddress string // optional: IP address to bind to (default: 0.0.0.0) + Title string + Register RegMonFn } ) @@ -60,7 +62,11 @@ func StartExporter(ctx context.Context, log logging.Logger, cfg *ExporterConfig) return nil, errors.Wrap(err, "failed to register client monitor") } - listenAddress := fmt.Sprintf("0.0.0.0:%d", cfg.Port) + bindAddr := cfg.BindAddress + if bindAddr == "" { + bindAddr = "0.0.0.0" + } + listenAddress := fmt.Sprintf("%s:%d", bindAddr, cfg.Port) srv := http.Server{Addr: listenAddress} http.Handle("/metrics", promhttp.HandlerFor( diff --git a/src/control/provider/system/mocks.go b/src/control/provider/system/mocks.go index dc52d7b3b96..52384054f8a 100644 --- a/src/control/provider/system/mocks.go +++ b/src/control/provider/system/mocks.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -32,31 +33,33 @@ type ( // MockSysConfig alters mock SystemProvider behavior. 
MockSysConfig struct { - IsMountedBool bool - IsMountedErr error - MountErr error - UnmountErr error - MkfsErr error - ChmodErr error - ChownErr error - GetfsStr string - GetfsErr error - SourceToTarget map[string]string - GetfsIndex int - GetfsUsageResps []GetfsUsageRetval - GetfsTypeRes *FsType - GetfsTypeErr []error - StatErrors map[string]error - RealStat bool - ReadFileResults map[string][]byte - ReadFileErrors map[string]error - RealReadFile bool - GeteuidRes int - GetegidRes int - MkdirErr error - RealMkdir bool - RemoveAllErr error - RealRemoveAll bool + IsMountedBool bool + IsMountedErr error + MountErr error + UnmountErr error + MkfsErr error + ChmodErr error + ChownErr error + GetfsStr string + GetfsErr error + SourceToTarget map[string]string + GetfsIndex int + GetfsUsageResps []GetfsUsageRetval + GetfsTypeRes *FsType + GetfsTypeErr []error + GetDeviceLabelRes string + GetDeviceLabelErr error + StatErrors map[string]error + RealStat bool + ReadFileResults map[string][]byte + ReadFileErrors map[string]error + RealReadFile bool + GeteuidRes int + GetegidRes int + MkdirErr error + RealMkdir bool + RemoveAllErr error + RealRemoveAll bool } // MockSysProvider gives a mock SystemProvider implementation. 
@@ -67,6 +70,7 @@ type ( isMounted MountMap IsMountedInputs []string GetfsTypeCount int + MkfsReqs []MkfsReq } ) @@ -146,7 +150,10 @@ func (msp *MockSysProvider) Unmount(target string, _ int) error { return msp.cfg.UnmountErr } -func (msp *MockSysProvider) Mkfs(_ MkfsReq) error { +func (msp *MockSysProvider) Mkfs(in MkfsReq) error { + msp.Lock() + msp.MkfsReqs = append(msp.MkfsReqs, in) + msp.Unlock() return msp.cfg.MkfsErr } @@ -187,6 +194,10 @@ func (msp *MockSysProvider) GetfsType(path string) (*FsType, error) { return result, err } +func (msp *MockSysProvider) GetDeviceLabel(device string) (string, error) { + return msp.cfg.GetDeviceLabelRes, msp.cfg.GetDeviceLabelErr +} + func (msp *MockSysProvider) Stat(path string) (os.FileInfo, error) { msp.RLock() defer msp.RUnlock() @@ -257,6 +268,7 @@ func NewMockSysProvider(log logging.Logger, cfg *MockSysConfig) *MockSysProvider isMounted: MountMap{ mounted: make(map[string]string), }, + MkfsReqs: make([]MkfsReq, 0), } log.Debugf("creating MockSysProvider with cfg: %+v", msp.cfg) return msp diff --git a/src/control/provider/system/system_linux.go b/src/control/provider/system/system_linux.go index e3fb439c0d8..846e2f4ca50 100644 --- a/src/control/provider/system/system_linux.go +++ b/src/control/provider/system/system_linux.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -55,12 +56,18 @@ var magicToStr = map[int64]string{ // DefaultProvider returns the package-default provider implementation. func DefaultProvider() *LinuxProvider { - return &LinuxProvider{} + return &LinuxProvider{ + runCommand: func(name string, args ...string) ([]byte, error) { + return exec.Command(name, args...).Output() + }, + } } // LinuxProvider encapsulates Linux-specific implementations of system // interfaces. 
-type LinuxProvider struct{} +type LinuxProvider struct { + runCommand func(string, ...string) ([]byte, error) +} // mountId,parentId,major:minor,root,mountPoint const ( @@ -253,6 +260,10 @@ type MkfsReq struct { // Mkfs attempts to create a filesystem of the supplied type, on the // supplied device. func (s LinuxProvider) Mkfs(req MkfsReq) error { + if req.Filesystem == "" { + return errors.New("no filesystem type specified") + } + cmdPath, err := exec.LookPath(fmt.Sprintf("mkfs.%s", req.Filesystem)) if err != nil { return errors.Wrapf(err, "unable to find mkfs.%s", req.Filesystem) @@ -262,7 +273,7 @@ func (s LinuxProvider) Mkfs(req MkfsReq) error { return err } - args := make([]string, 0, len(req.Options)) + args := make([]string, len(req.Options)) _ = copy(args, req.Options) // TODO: Think about a way to allow for some kind of progress // callback so that the user has some visibility into long-running @@ -273,7 +284,7 @@ func (s LinuxProvider) Mkfs(req MkfsReq) error { if req.Force { args = append([]string{"-F"}, args...) } - out, err := exec.Command(cmdPath, args...).Output() + out, err := s.runCommand(cmdPath, args...) if err != nil { return &RunCmdError{ Wrapped: err, @@ -284,6 +295,33 @@ func (s LinuxProvider) Mkfs(req MkfsReq) error { return nil } +// GetDeviceLabel retrieves the filesystem label for the specified device. +func (s LinuxProvider) GetDeviceLabel(device string) (string, error) { + if device == "" { + return "", errors.New("empty path") + } + + cmdPath, err := exec.LookPath("lsblk") + if err != nil { + return "", errors.Wrap(err, "unable to find lsblk") + } + + if err := s.checkDevice(device); err != nil { + return "", err + } + + args := []string{"-o", "label", "--noheadings", device} + out, err := s.runCommand(cmdPath, args...) 
+ if err != nil { + return "", &RunCmdError{ + Wrapped: err, + Stdout: string(out), + } + } + + return strings.TrimSpace(string(out)), nil +} + // Getfs probes the specified device in an attempt to determine the // formatted filesystem type, if any. func (s LinuxProvider) Getfs(device string) (string, error) { @@ -297,7 +335,7 @@ func (s LinuxProvider) Getfs(device string) (string, error) { } args := []string{"-s", device} - out, err := exec.Command(cmdPath, args...).Output() + out, err := s.runCommand(cmdPath, args...) if err != nil { return FsTypeNone, &RunCmdError{ Wrapped: err, diff --git a/src/control/provider/system/system_linux_test.go b/src/control/provider/system/system_linux_test.go index dc9e6b21a04..3cdcb2652cf 100644 --- a/src/control/provider/system/system_linux_test.go +++ b/src/control/provider/system/system_linux_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,6 +9,9 @@ package system import ( "errors" + "os" + "path/filepath" + "regexp" "strings" "syscall" "testing" @@ -72,6 +76,14 @@ func TestScanMountInfo(t *testing.T) { func TestIsMounted(t *testing.T) { provider := LinuxProvider{} + tmpDir, cleanup := test.CreateTestDir(t) + defer cleanup() + + testFilePath := tmpDir + "/testfile" + if err := os.WriteFile(testFilePath, []byte("test"), 0644); err != nil { + t.Fatalf("unable to create test file %q: %v", testFilePath, err) + } + for name, tc := range map[string]struct { target string expMounted bool @@ -97,7 +109,7 @@ func TestIsMounted(t *testing.T) { expErr: errors.New("no such file or directory"), }, "neither dir nor device": { - target: "/dev/stderr", + target: testFilePath, expErr: errors.New("not a valid mount target"), }, } { @@ -172,13 +184,6 @@ func TestSystemLinux_GetfsType(t *testing.T) { path: "notreal", expErr: syscall.ENOENT, }, - "temp dir": { - path: "/dev", - expResult: 
&FsType{ - Name: "tmpfs", - NoSUID: true, - }, - }, } { t.Run(name, func(t *testing.T) { result, err := DefaultProvider().GetfsType(tc.path) @@ -191,6 +196,75 @@ func TestSystemLinux_GetfsType(t *testing.T) { } } +func validDev(t *testing.T) string { + t.Helper() + + // Only want numbered partitions, not whole disks. + // Exclude loop/nbd devices which may not be attached. + re := regexp.MustCompile(`^[a-zA-Z]+[0-9]+$`) + exclude := regexp.MustCompile(`^(loop|nbd|zram)`) + + sysRoot := "/sys/class/block/" + entries, err := os.ReadDir(sysRoot) + if err != nil { + t.Fatalf("unable to read %q: %v", sysRoot, err) + } + + for _, entry := range entries { + if !re.MatchString(entry.Name()) || exclude.MatchString(entry.Name()) { + continue + } + + devPath := "/dev/" + entry.Name() + info, err := os.Stat(devPath) + if err != nil { + continue + } + if (info.Mode()&os.ModeDevice) != 0 && (info.Mode()&os.ModeCharDevice) == 0 { + t.Logf("using block device %q for test", devPath) + return devPath + } + } + + t.Fatal("no valid block device found for test") + return "" +} + +func TestSystemLinux_GetDeviceLabel(t *testing.T) { + for name, tc := range map[string]struct { + path string + expErr error + }{ + "no path": { + expErr: errors.New("empty path"), + }, + "nonexistent": { + path: "fake", + expErr: syscall.ENOENT, + }, + "not a device": { + path: "/tmp", + expErr: errors.New("not a device file"), + }, + "valid block device": { + path: validDev(t), + }, + } { + t.Run(name, func(t *testing.T) { + result, err := DefaultProvider().GetDeviceLabel(tc.path) + + test.CmpErr(t, tc.expErr, err) + + if tc.expErr != nil { + test.AssertEqual(t, "", result, "") + } else { + // We can't predict the label since it's system dependent. It might even be empty. 
+ t.Logf("got label %q", result) + } + }) + } +} + func TestSystemLinux_fsStrFromMagic(t *testing.T) { for name, tc := range map[string]struct { magic int64 @@ -234,3 +308,79 @@ func TestSystemLinux_fsStrFromMagic(t *testing.T) { }) } } + +func TestSystemLinux_Mkfs(t *testing.T) { + for name, tc := range map[string]struct { + req MkfsReq + expErr error + expCmdName string + expCmdArgs []string + }{ + "empty": { + req: MkfsReq{}, + expErr: errors.New("no filesystem"), + }, + "bad filesystem": { + req: MkfsReq{ + Filesystem: "moo", + }, + expErr: errors.New("unable to find mkfs.moo"), + }, + "bad device": { + req: MkfsReq{ + Filesystem: "ext4", + Device: "/notreal", + }, + expErr: syscall.ENOENT, + }, + "success": { + req: MkfsReq{ + Filesystem: "ext4", + Device: validDev(t), // real device, but actual mkfs command is mocked + }, + expCmdName: "mkfs.ext4", + expCmdArgs: []string{validDev(t)}, + }, + "force": { + req: MkfsReq{ + Filesystem: "ext4", + Device: validDev(t), + Force: true, + }, + expCmdName: "mkfs.ext4", + expCmdArgs: []string{"-F", validDev(t)}, + }, + "options": { + req: MkfsReq{ + Filesystem: "ext4", + Device: validDev(t), + Options: []string{"-L", "my_device"}, + }, + expCmdName: "mkfs.ext4", + expCmdArgs: []string{"-L", "my_device", validDev(t)}, + }, + } { + t.Run(name, func(t *testing.T) { + p := DefaultProvider() + + var seenName string + var seenArgs []string + p.runCommand = func(name string, args ...string) ([]byte, error) { + seenName = name + seenArgs = args + return []byte{}, nil + } + + err := p.Mkfs(tc.req) + + test.CmpErr(t, tc.expErr, err) + + if seenName != "" { + // don't care where the binary was found, just that it was + seenName = filepath.Base(seenName) + } + test.AssertEqual(t, tc.expCmdName, seenName, "mkfs command name") + test.AssertEqual(t, tc.expCmdArgs, seenArgs, "mkfs args") + }) + } +} diff --git a/src/control/security/grpc_authorization.go b/src/control/security/grpc_authorization.go index ed178b62214..50104b5b475 100644 
--- a/src/control/security/grpc_authorization.go +++ b/src/control/security/grpc_authorization.go @@ -66,14 +66,14 @@ var methodAuthorizations = map[string][]Component{ "/mgmt.MgmtSvc/PoolUpdateACL": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolDeleteACL": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolExclude": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolDrain": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolReintegrate": {ComponentAdmin}, + "/mgmt.MgmtSvc/PoolDrain": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolReintegrate": {ComponentAdmin, ComponentServer}, "/mgmt.MgmtSvc/PoolEvict": {ComponentAdmin, ComponentAgent}, "/mgmt.MgmtSvc/PoolExtend": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolUpgrade": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolRebuildStart": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolRebuildStop": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolSelfHealEval": {ComponentAdmin}, + "/mgmt.MgmtSvc/PoolRebuildStart": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolRebuildStop": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolSelfHealEval": {ComponentAdmin, ComponentServer}, "/mgmt.MgmtSvc/GetAttachInfo": {ComponentAgent}, "/mgmt.MgmtSvc/ListPools": {ComponentAdmin}, "/mgmt.MgmtSvc/ListContainers": {ComponentAdmin}, diff --git a/src/control/security/grpc_authorization_test.go b/src/control/security/grpc_authorization_test.go index eeef1167f39..3bd1d3a8152 100644 --- a/src/control/security/grpc_authorization_test.go +++ b/src/control/security/grpc_authorization_test.go @@ -91,14 +91,14 @@ func TestSecurity_ComponentHasAccess(t *testing.T) { "/mgmt.MgmtSvc/PoolUpdateACL": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolDeleteACL": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolExclude": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolDrain": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolReintegrate": {ComponentAdmin}, + "/mgmt.MgmtSvc/PoolDrain": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolReintegrate": {ComponentAdmin, ComponentServer}, "/mgmt.MgmtSvc/PoolEvict": {ComponentAdmin, ComponentAgent}, 
"/mgmt.MgmtSvc/PoolExtend": {ComponentAdmin}, "/mgmt.MgmtSvc/PoolUpgrade": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolRebuildStart": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolRebuildStop": {ComponentAdmin}, - "/mgmt.MgmtSvc/PoolSelfHealEval": {ComponentAdmin}, + "/mgmt.MgmtSvc/PoolRebuildStart": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolRebuildStop": {ComponentAdmin, ComponentServer}, + "/mgmt.MgmtSvc/PoolSelfHealEval": {ComponentAdmin, ComponentServer}, "/mgmt.MgmtSvc/GetAttachInfo": {ComponentAgent}, "/mgmt.MgmtSvc/ListPools": {ComponentAdmin}, "/mgmt.MgmtSvc/ListContainers": {ComponentAdmin}, diff --git a/src/control/server/config/faults.go b/src/control/server/config/faults.go index 4a2088f4220..f295b413254 100644 --- a/src/control/server/config/faults.go +++ b/src/control/server/config/faults.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -167,6 +167,15 @@ func FaultConfigScmDiffClass(curIdx, seenIdx int) *fault.Fault { ) } +func FaultConfigScmDiffHugeEnabled(curIdx, seenIdx int) *fault.Fault { + return serverConfigFault( + code.ServerConfigScmHugeEnabled, + fmt.Sprintf("the scm_hugepages_disabled in engine %d is different from engine %d", + curIdx, seenIdx), + "ensure that each I/O Engine has the same setting for this parameter and restart", + ) +} + func FaultConfigOverlappingBdevDeviceList(curIdx, seenIdx int) *fault.Fault { return serverConfigFault( code.ServerConfigOverlappingBdevDeviceList, @@ -274,6 +283,25 @@ func FaultConfigEngineNUMAImbalance(nodeMap map[int]int) *fault.Fault { ) } +// FaultConfigBadControlInterface creates a fault for an invalid control plane network interface. 
+func FaultConfigBadControlInterface(iface string, err error) *fault.Fault { + return serverConfigFault( + code.ServerConfigBadControlInterface, + fmt.Sprintf("control_iface %q is invalid: %s", iface, err), + "update the 'control_iface' parameter with a valid network interface and restart", + ) +} + +// FaultConfigControlInterfaceMismatch creates a fault when the control interface address +// doesn't match the configured MS replica address. +func FaultConfigControlInterfaceMismatch(ifaceAddr, replicaAddr string) *fault.Fault { + return serverConfigFault( + code.ServerConfigControlInterfaceMismatch, + fmt.Sprintf("control_iface address %s doesn't match configured MS replica address %s", ifaceAddr, replicaAddr), + "ensure 'control_iface' specifies an interface with an address matching this server's entry in 'mgmt_svc_replicas'", + ) +} + func serverConfigFault(code code.Code, desc, res string) *fault.Fault { return &fault.Fault{ Domain: "serverconfig", diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index f61d7cd58d2..c6a808baf29 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -58,6 +58,7 @@ type deprecatedParams struct { type Server struct { // control-specific ControlPort int `yaml:"port"` + ControlInterface string `yaml:"control_iface,omitempty"` TransportConfig *security.TransportConfig `yaml:"transport_config"` Engines []*engine.Config `yaml:"engines"` BdevExclude []string `yaml:"bdev_exclude,omitempty"` @@ -68,6 +69,7 @@ type Server struct { SystemRamReserved int `yaml:"system_ram_reserved"` // total for all engines DisableHugepages bool `yaml:"disable_hugepages"` AllowNumaImbalance bool `yaml:"allow_numa_imbalance"` + AllowTHP bool `yaml:"allow_thp"` ControlLogMask common.ControlLogLevel `yaml:"control_log_mask"` ControlLogFile string `yaml:"control_log_file,omitempty"` ControlLogJSON bool `yaml:"control_log_json,omitempty"` @@ -146,7 +148,6 @@ func (cfg *Server) WithFabricProvider(provider string) *Server { // WithFabricAuthKey sets the top-level fabric authorization key. func (cfg *Server) WithFabricAuthKey(key string) *Server { cfg.Fabric.AuthKey = key - cfg.ClientEnvVars = common.MergeKeyValues(cfg.ClientEnvVars, []string{cfg.Fabric.GetAuthKeyEnv()}) for _, engine := range cfg.Engines { engine.Fabric.AuthKey = cfg.Fabric.AuthKey } @@ -230,6 +231,12 @@ func (cfg *Server) WithControlPort(port int) *Server { return cfg } +// WithControlInterface sets the network interface for the control plane listener. +func (cfg *Server) WithControlInterface(iface string) *Server { + cfg.ControlInterface = iface + return cfg +} + // WithTransportConfig sets the gRPC transport configuration. 
func (cfg *Server) WithTransportConfig(cfgTransport *security.TransportConfig) *Server { cfg.TransportConfig = cfgTransport @@ -299,6 +306,12 @@ func (cfg *Server) WithAllowNumaImbalance(allowed bool) *Server { return cfg } +// WithAllowTHP allows DAOS server to run with transparent hugepage support enabled. +func (cfg *Server) WithAllowTHP(allowed bool) *Server { + cfg.AllowTHP = allowed + return cfg +} + // WithSystemRamReserved sets the amount of system memory to reserve for system (non-DAOS) // use. In units of GiB. func (cfg *Server) WithSystemRamReserved(nr int) *Server { @@ -390,10 +403,6 @@ func (cfg *Server) Load(log logging.Logger) error { cfg.updateServerConfig(&cfg.Engines[i]) } - if cfg.Fabric.AuthKey != "" { - cfg.ClientEnvVars = common.MergeKeyValues(cfg.ClientEnvVars, []string{cfg.Fabric.GetAuthKeyEnv()}) - } - if len(cfg.deprecatedParams.AccessPoints) > 0 { if len(cfg.MgmtSvcReplicas) > 0 { return errors.New(msgAPsMSReps) @@ -897,6 +906,8 @@ func (cfg *Server) validateMultiEngineConfig(log logging.Logger) error { seenHelperStreamCount := -1 seenScmCls := storage.ClassNone seenScmClsIdx := -1 + var seenScmHuge *bool + seenScmHugeIdx := -1 for idx, engine := range cfg.Engines { fabricConfig := fmt.Sprintf("fabric:%q-%q-%q", @@ -943,6 +954,22 @@ func (cfg *Server) validateMultiEngineConfig(log logging.Logger) error { } seenScmCls = scmConf.Class seenScmClsIdx = idx + + if seenScmHugeIdx != -1 { + switch { + case scmConf.Scm.DisableHugepages == nil && seenScmHuge == nil: + case scmConf.Scm.DisableHugepages != nil && seenScmHuge == nil: + return FaultConfigScmDiffHugeEnabled(idx, seenScmHugeIdx) + case scmConf.Scm.DisableHugepages == nil && seenScmHuge != nil: + return FaultConfigScmDiffHugeEnabled(idx, seenScmHugeIdx) + case *scmConf.Scm.DisableHugepages != *seenScmHuge: + log.Debugf("scm_hugepages_disabled entry %v in %d doesn't match %d", + *scmConf.Scm.DisableHugepages, idx, seenScmHugeIdx) + return FaultConfigScmDiffHugeEnabled(idx, seenScmHugeIdx) 
+ } + } + seenScmHuge = scmConf.Scm.DisableHugepages + seenScmHugeIdx = idx } bdevs := engine.Storage.GetBdevs() diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index 4c8cac655d4..08b53010c10 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -240,6 +240,7 @@ func TestServerConfig_Constructed(t *testing.T) { // possible to construct an identical configuration with the helpers. constructed := DefaultServer(). WithControlPort(10001). + WithControlInterface("eth0"). WithControlMetadata(storage.ControlMetadata{ Path: "/home/daos_server/control_meta", DevicePath: "/dev/sdb1", @@ -264,7 +265,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithFabricAuthKey("foo:bar"). WithHyperthreads(true). // hyper-threads disabled by default WithSystemRamReserved(5). - WithAllowNumaImbalance(true) + WithAllowNumaImbalance(true). + WithAllowTHP(true) // add engines explicitly to test functionality applied in WithEngines() constructed.Engines = []*engine.Config{ @@ -278,7 +280,7 @@ func TestServerConfig_Constructed(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(false), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -296,7 +298,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithLogFile("/var/log/daos/daos_engine.0.log"). WithLogMask("INFO"). WithStorageEnableHotplug(false). - WithStorageAutoFaultyCriteria(true, 100, 200), + WithStorageAutoFaultyCriteria(true, 100, 200). + WithStorageSpdkIobufProps(16384, 2048), engine.MockConfig(). 
WithSystemName("daos_server"). WithSocketDir("./.daos/daos_server"). @@ -306,7 +309,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithStorage( storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/2"). - WithStorageClass("ram"), + WithStorageClass("ram"). + WithScmHugepagesDisabled(false), storage.NewTierConfig(). WithStorageClass("file"). WithBdevDeviceList("/tmp/daos-bdev1", "/tmp/daos-bdev2"). @@ -323,7 +327,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithLogFile("/var/log/daos/daos_engine.1.log"). WithLogMask("INFO"). WithStorageEnableHotplug(false). - WithStorageAutoFaultyCriteria(false, 0, 0), + WithStorageAutoFaultyCriteria(false, 0, 0). + WithStorageSpdkIobufProps(0, 0), } constructed.Path = testFile // just to avoid failing the cmp @@ -332,7 +337,7 @@ func TestServerConfig_Constructed(t *testing.T) { t.Logf("default: %+v", defaultCfg.Engines[i]) } - if diff := cmp.Diff(defaultCfg, constructed, defConfigCmpOpts...); diff != "" { + if diff := cmp.Diff(constructed, defaultCfg, defConfigCmpOpts...); diff != "" { t.Fatalf("(-want, +got): %s", diff) } } @@ -694,7 +699,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -707,7 +712,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/2"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:91:00.0", "0000:92:00.0"). @@ -729,7 +734,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). 
WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -752,7 +757,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/2"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:91:00.0", "0000:92:00.0"). @@ -783,7 +788,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -804,7 +809,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -834,7 +839,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/0"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:80:00.0"). @@ -846,7 +851,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0"), @@ -867,7 +872,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). 
@@ -886,7 +891,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -930,7 +935,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0"), @@ -949,7 +954,7 @@ func TestServerConfig_Validation(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0"). @@ -1135,7 +1140,7 @@ func TestServerConfig_getMinNrHugepages(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -1271,7 +1276,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -1291,7 +1296,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). 
@@ -1475,7 +1480,7 @@ func TestServerConfig_SetRamdiskSize(t *testing.T) { storage.NewTierConfig(). WithScmMountPoint("/mnt/daos/1"). WithStorageClass("ram"). - WithScmDisableHugepages(), + WithScmHugepagesDisabled(true), storage.NewTierConfig(). WithStorageClass("nvme"). WithBdevDeviceList("0000:81:00.0", "0000:82:00.0"). @@ -1570,6 +1575,7 @@ func replaceFile(t *testing.T, name, oldTxt, newTxt string) { if linesChanged == 0 { t.Fatalf("no occurrences of %q in file %q", oldTxt, name) } + t.Logf("replaceFile: %d lines changed", linesChanged) // make sure the tmp file was successfully written to if err := tmp.Close(); err != nil { @@ -1730,6 +1736,30 @@ func TestServerConfig_Parsing(t *testing.T) { return nil }, }, + "scm_hugepages_disabled unset": { + inTxt: " scm_hugepages_disabled: false", + outTxt: "", + expCheck: func(c *Server) error { + for _, e := range c.Engines { + if e.Storage.Tiers.ScmConfigs()[0].Scm.DisableHugepages != nil { + return errors.New("expecting scm hugepages to be enabled") + } + } + return nil + }, + }, + "explicitly set scm_hugepages_disabled true": { + inTxt: " scm_hugepages_disabled: false", + outTxt: " scm_hugepages_disabled: true", + expCheck: func(c *Server) error { + for _, e := range c.Engines { + if !*e.Storage.Tiers.ScmConfigs()[0].Scm.DisableHugepages { + return errors.New("expecting scm hugepages to be enabled") + } + } + return nil + }, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -1988,12 +2018,24 @@ func TestServerConfig_validateMultiEngineConfig(t *testing.T) { ), expLog: "engine 1 has 2 but engine 0 has 1", }, + "mismatched scm_hugepages_disabled": { + configA: configA(), + configB: configB(). + WithStorage( + storage.NewTierConfig(). + WithStorageClass("ram"). + WithScmMountPoint("b"). 
+ WithScmHugepagesDisabled(true), + ), + expErr: FaultConfigScmDiffHugeEnabled(1, 0), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) conf := DefaultServer(). + WithAllowTHP(true). // Enable differences between scm_hugepages_disabled. WithFabricProvider("test"). WithMgmtSvcReplicas( fmt.Sprintf("localhost:%d", build.DefaultControlPort)). diff --git a/src/control/server/ctl_smd_rpc.go b/src/control/server/ctl_smd_rpc.go index 16d5b6486c3..2cd75390f76 100644 --- a/src/control/server/ctl_smd_rpc.go +++ b/src/control/server/ctl_smd_rpc.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2023 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -174,7 +174,7 @@ func extractReqIDs(log logging.Logger, ids string, addrs idMap, uuids idMap) err tokens := strings.Split(ids, ",") for _, token := range tokens { - if addr, e := hardware.NewPCIAddress(token); e == nil && addr.IsVMDBackingAddress() { + if addr, e := hardware.NewPCIAddress(token); e == nil { addrs[addr.String()] = true continue } @@ -184,7 +184,7 @@ func extractReqIDs(log logging.Logger, ids string, addrs idMap, uuids idMap) err continue } - return errors.Errorf("req id entry %q is neither a valid vmd backing device pci "+ + return errors.Errorf("req id entry %q is neither a valid device pci "+ "address or uuid", token) } @@ -240,7 +240,7 @@ func (svc *ControlService) mapIDsToEngine(ctx context.Context, ids string, useTr matchAll := false if ids == "" { - // Selecting all is not supported unless using transport addresses. + // Selecting all not supported unless using transport addresses. 
if !useTrAddr { return nil, errors.New("empty id string") } diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 06f1276fa25..bbc7de54a7f 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -919,7 +920,7 @@ func TestServer_CtlSvc_SmdManage(t *testing.T) { }, expErr: errors.New("neither a valid"), }, - "led-manage; pci address not of a vmd backing device": { + "led-manage; pci address of a non-vmd device": { req: &ctlpb.SmdManageReq{ Op: &ctlpb.SmdManageReq_Led{ Led: &ctlpb.LedManageReq{ @@ -927,7 +928,29 @@ func TestServer_CtlSvc_SmdManage(t *testing.T) { }, }, }, - expErr: errors.New("neither a valid"), + drpcResps: map[int][]*mockDrpcResponse{ + 0: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{pbNormDev(1)}, + }, + }, + { + Message: &ctlpb.DevManageResp{ + Device: pbIdentDev(1), + }, + }, + }, + }, + expResp: &ctlpb.SmdManageResp{ + Ranks: []*ctlpb.SmdManageResp_RankResp{ + { + Results: []*ctlpb.SmdManageResp_Result{ + {Device: pbIdentDev(1)}, + }, + }, + }, + }, }, "led-manage; valid pci address of vmd backing device": { req: &ctlpb.SmdManageReq{ diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 21fdc770bfc..4efefd6c32f 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -108,14 +108,18 @@ func findBdevTier(pciAddr string, tcs storage.TierConfigs) *storage.TierConfig { } // Convert bdev scan results to protobuf response. 
-func bdevScanToProtoResp(scan scanBdevsFn, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) { +func bdevScanToProtoResp(log logging.DebugLogger, scan scanBdevsFn, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) { req := storage.BdevScanRequest{DeviceList: bdevCfgs.Bdevs()} + log.Debugf("bdevScanToProtoResp: bdev provider scan, req: %+v", req) + resp, err := scan(req) if err != nil { - return nil, err + return nil, errors.Wrap(err, "bdev provider scan") } + log.Debugf("bdevScanToProtoResp: bdev provider scan, resp: %+v", resp) + pbCtrlrs := make(proto.NvmeControllers, 0, len(resp.Controllers)) if err := pbCtrlrs.FromNative(resp.Controllers); err != nil { @@ -230,7 +234,13 @@ func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNv return nil, errors.New("meta smd usage info unavailable as engines stopped") } - return bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs) + resp, err := bdevScanToProtoResp(cs.log, cs.storage.ScanBdevs, bdevCfgs) + if err != nil { + return nil, errors.Wrap(err, "bdevScanAssigned: bdevScanToProtoResp") + } + + cs.log.Debugf("bdevScanAssigned: bdevScanToProtoResp returned: %+v", resp) + return resp, nil } // Delegate scan to engine instances as soon as one engine with assigned bdevs has started. @@ -264,11 +274,12 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n cs.log.Debugf("scan bdevs from control service as no bdevs in cfg") // No bdevs configured for engines to claim so scan through control service. 
- resp, err = bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs) + resp, err = bdevScanToProtoResp(cs.log, cs.storage.ScanBdevs, bdevCfgs) if err != nil { - return nil, err + return nil, errors.Wrap(err, "bdevScan: bdevScanToProtoResp") } + cs.log.Debugf("bdevScan: bdevScanToProtoResp returned: %+v", resp) return bdevScanTrimResults(req, resp), nil } @@ -287,7 +298,7 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n return nil, err } - cs.log.Tracef("bdevScanAssigned returned %d, want %d", nrScannedBdevs, nrCfgBdevs) + cs.log.Debugf("bdevScanAssigned returned %d, want %d", nrScannedBdevs, nrCfgBdevs) if nrScannedBdevs == nrCfgBdevs { return bdevScanTrimResults(req, resp), nil @@ -774,8 +785,9 @@ func (cs *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageSca } else { respNvme, err := scanBdevs(ctx, cs, req.Nvme, respScm.Namespaces) if err != nil { - return nil, err + return nil, errors.Wrap(err, "scan bdevs") } + cs.log.Debugf("scanBdevs returned respNvme: %+v", respNvme) resp.Nvme = respNvme } diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index d34c6732d9d..e8c16ee15fb 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -215,11 +215,6 @@ func (fc *FabricConfig) Validate() error { return nil } -// GetAuthKeyEnv returns the environment variable string for the auth key. -func (fc *FabricConfig) GetAuthKeyEnv() string { - return fmt.Sprintf("D_PROVIDER_AUTH_KEY=%s", fc.AuthKey) -} - // cleanEnvVars scrubs the supplied slice of environment // variables by removing all variables not included in the // allow list. 
@@ -790,6 +785,13 @@ func (c *Config) WithStorageAutoFaultyCriteria(enable bool, maxIoErrs, maxCsumEr return c } +// WithStorageSpdkIobufProps specifies SPDK I/O buffer pool settings in the I/O Engine. +func (c *Config) WithStorageSpdkIobufProps(smallPoolCount, largePoolCount uint32) *Config { + c.Storage.SpdkIobufProps.SmallPoolCount = smallPoolCount + c.Storage.SpdkIobufProps.LargePoolCount = largePoolCount + return c +} + // WithIndex sets the I/O Engine instance index. func (c *Config) WithIndex(i uint32) *Config { c.Index = i diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go index 5ad23861e0f..48c29a7b794 100644 --- a/src/control/server/engine/config_test.go +++ b/src/control/server/engine/config_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1419,3 +1419,37 @@ func TestConfig_SetNUMAAffinity(t *testing.T) { }) } } + +func TestConfig_WithStorageSpdkIobufProps(t *testing.T) { + for name, tc := range map[string]struct { + smallPoolCount uint32 + largePoolCount uint32 + }{ + "zero values": { + smallPoolCount: 0, + largePoolCount: 0, + }, + "small pool count only": { + smallPoolCount: 1024, + largePoolCount: 0, + }, + "large pool count only": { + smallPoolCount: 0, + largePoolCount: 512, + }, + "both pool counts set": { + smallPoolCount: 2048, + largePoolCount: 1024, + }, + } { + t.Run(name, func(t *testing.T) { + cfg := NewConfig(). 
+ WithStorageSpdkIobufProps(tc.smallPoolCount, tc.largePoolCount) + + test.AssertEqual(t, tc.smallPoolCount, cfg.Storage.SpdkIobufProps.SmallPoolCount, + "unexpected small pool count") + test.AssertEqual(t, tc.largePoolCount, cfg.Storage.SpdkIobufProps.LargePoolCount, + "unexpected large pool count") + }) + } +} diff --git a/src/control/server/engine/exec.go b/src/control/server/engine/exec.go index c7abed7e2b4..c1efe634e3f 100644 --- a/src/control/server/engine/exec.go +++ b/src/control/server/engine/exec.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -161,21 +162,22 @@ func processLogEnvs(env []string) ([]string, error) { func (r *Runner) Start(ctx context.Context) (RunnerExitChan, error) { args, err := r.Config.CmdLineArgs() if err != nil { - return nil, err + return nil, errors.Wrap(err, "CmdLineArgs") } env, err := r.Config.CmdLineEnv() if err != nil { - return nil, err + return nil, errors.Wrap(err, "CmdLineEnv") } env = common.MergeKeyValues(cleanEnvVars(os.Environ(), r.Config.EnvPassThrough), env) env, err = processLogEnvs(env) if err != nil { + return nil, errors.Wrap(err, "processLogEnvs") return nil, err } exitCh := make(RunnerExitChan) - return exitCh, r.run(ctx, args, env, exitCh) + return exitCh, errors.Wrap(r.run(ctx, args, env, exitCh), "Runner run") } // IsRunning indicates whether the Runner process is running or not. 
diff --git a/src/control/server/faults.go b/src/control/server/faults.go index a866de34767..a2e51760026 100644 --- a/src/control/server/faults.go +++ b/src/control/server/faults.go @@ -38,6 +38,11 @@ var ( "disable_vfio: true in config while running as non-root user with NVMe devices", "set disable_vfio: false or run daos_server as root", ) + FaultTransparentHugepageEnabled = serverFault( + code.ServerTransparentHugepageEnabled, + "transparent hugepage (THP) enabled on storage server, DAOS requires THP to be disabled", + "disable THP by adding 'transparent_hugepage=never' kernel parameter in the grub configuration file then reboot and restart daos_server", + ) FaultHarnessNotStarted = serverFault( code.ServerHarnessNotStarted, fmt.Sprintf("%s harness not started", build.DataPlaneName), diff --git a/src/control/server/init/setup_spdk.sh b/src/control/server/init/setup_spdk.sh index 059d1d3c4b0..845baf0eb24 100755 --- a/src/control/server/init/setup_spdk.sh +++ b/src/control/server/init/setup_spdk.sh @@ -92,8 +92,12 @@ else set +x if [ -d "/dev/hugepages/" ]; then - echo "RUN: chown -R ${_TARGET_USER} /dev/hugepages" - chown -R "${_TARGET_USER}" "/dev/hugepages" + echo "RUN: chown -R ${_TARGET_USER}:${_TARGET_USER} /dev/hugepages" + chown -R "${_TARGET_USER}:${_TARGET_USER}" /dev/hugepages + fi + if [ -d "/tmp/dpdk/" ]; then + echo "RUN: chmod -R g+rwx /tmp/dpdk" + chmod -R g+rwx /tmp/dpdk fi echo "Setting VFIO file permissions for unprivileged access" diff --git a/src/control/server/instance_exec.go b/src/control/server/instance_exec.go index b153d1b09ac..d38ab1613fe 100644 --- a/src/control/server/instance_exec.go +++ b/src/control/server/instance_exec.go @@ -36,10 +36,10 @@ func (ei *EngineInstance) format(ctx context.Context) error { ei.log.Debugf("instance %d: checking if storage is formatted", idx) if err := ei.awaitStorageReady(ctx); err != nil { - return err + return errors.Wrap(err, "awaitStorageReady") } if err := ei.createSuperblock(); err != nil { - 
return err + return errors.Wrap(err, "createSuperblock") } if !ei.hasSuperblock() { @@ -50,7 +50,7 @@ func (ei *EngineInstance) format(ctx context.Context) error { // any callbacks that were waiting for this state. for _, readyFn := range ei.onStorageReady { if err := readyFn(ctx); err != nil { - return err + return errors.Wrap(err, "onStorageReady readyFn") } } @@ -82,10 +82,11 @@ func (ei *EngineInstance) start(ctx context.Context) (chan *engine.RunnerExitInf } if err := ei.initIncarnationFromSuperblock(); err != nil { - return nil, err + return nil, errors.Wrap(err, "initIncarnationFromSuperblock") } - return ei.runner.Start(ctx) + ch, err := ei.runner.Start(ctx) + return ch, errors.Wrap(err, "runner Start") } // waitReady awaits ready signal from I/O Engine before starting @@ -235,7 +236,7 @@ func (ei *EngineInstance) Run(ctx context.Context) { runnerExitCh, err = ei.startRunner(ctx) if err != nil { - ei.log.Errorf("runner exited without starting process: %s", err) + ei.log.Errorf("runner exited without starting process: %+v", err) ei.handleExit(ctx, 0, err) continue } diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index 1d6f3cd6cca..ab8f8adcb98 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -150,7 +150,7 @@ func (ei *EngineInstance) awaitStorageReady(ctx context.Context) error { if !needsSuperblock { ei.log.Debugf("%s: superblock not needed", msgIdx) - if ei.storage.HasBlockDevices() { + if ei.storage.HasBlockDevices() && !ei.storage.AllowSpdkConfOverride() { ei.log.Debugf("%s: checking bdev config", msgIdx) ctrlrs, err := getEngineBdevCtrlrs(ctx, ei) diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index b5d482e31bf..c0a34a6c1f5 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -452,7 +452,13 @@ func bdevScanEngineAssigned(ctx context.Context, engine Engine, req *ctlpb.ScanN if !*isStarted { engine.Debugf("scanning engine-%d bdevs while engine is down", engine.Index()) - return bdevScanToProtoResp(engine.GetStorage().ScanBdevs, bdevCfgs) + resp, err := bdevScanToProtoResp(engine, engine.GetStorage().ScanBdevs, bdevCfgs) + if err != nil { + return nil, errors.Wrap(err, "bdevScanEngineAssigned: bdevScanToProtoResp") + } + + engine.Debugf("bdevScanEngineAssigned: bdevScanToProtoResp returned: %+v", resp) + return resp, err } engine.Debugf("scanning engine-%d bdevs while engine is up", engine.Index()) diff --git a/src/control/server/mgmt_check.go b/src/control/server/mgmt_check.go index ce4ba773a57..a9266d47e78 100644 --- a/src/control/server/mgmt_check.go +++ b/src/control/server/mgmt_check.go @@ -263,24 +263,72 @@ func (svc *mgmtSvc) SystemCheckStart(ctx context.Context, req *mgmtpb.CheckStart } if resp.Status > 0 { - if len(req.Uuids) == 0 { - svc.log.Debug("resetting checker findings DB") - if err := svc.sysdb.ResetCheckerData(); err != nil { - return nil, errors.Wrap(err, "failed to reset checker finding database") - } - } 
else { - pools := strings.Join(req.Uuids, ", ") - svc.log.Debugf("removing old checker findings for pools: %s", pools) - if err := svc.sysdb.RemoveCheckerFindingsForPools(req.Uuids...); err != nil { - return nil, errors.Wrapf(err, "failed to remove old findings for pools: %s", pools) - } + // Checker instance was reset. We can safely clear all findings related to any pools + // requested. + if err := svc.resetFindings(req.Uuids); err != nil { + return nil, err } resp.Status = 0 // reset status to indicate success } + // If either the checker was not reset, or it was only reset against specified pools above, + // there may still be unresolved findings in the DB that need to be marked stale or removed. + if resp.Status == 0 { + svc.handleUnresolvedInteractions(req.Uuids) + } + return resp, nil } +func (svc *mgmtSvc) resetFindings(uuids []string) error { + if len(uuids) == 0 { + svc.log.Debug("resetting checker findings DB") + if err := svc.sysdb.ResetCheckerData(); err != nil { + return errors.Wrap(err, "failed to reset checker finding database") + } + } else { + pools := strings.Join(uuids, ", ") + svc.log.Debugf("removing old checker findings for pools: %s", pools) + if err := svc.sysdb.RemoveCheckerFindingsForPools(uuids...); err != nil { + return errors.Wrapf(err, "failed to remove old findings for pools: %s", pools) + } + } + return nil +} + +// handleUnresolvedInteractions goes through all unresolved (INTERACT/STALE) findings in the database. +// Those that will be rediscovered in the next run can be removed. All others must be marked stale +// as the user will be unable to act on them after the new check instance started. To fix the +// inconsistency, they'll need to re-run the checker on the affected pool. 
+func (svc *mgmtSvc) handleUnresolvedInteractions(uuids []string) { + findings, err := svc.sysdb.GetCheckerFindings() + if err != nil { + svc.log.Errorf("unable to fetch old checker findings: %s", err.Error()) + return + } + + uuidSet := common.NewStringSet(uuids...) + for _, f := range findings { + switch f.Action { + case chkpb.CheckInconsistAction_CIA_INTERACT, chkpb.CheckInconsistAction_CIA_STALE: + if len(uuids) == 0 || uuidSet.Has(f.PoolUuid) { + // Unresolved interactive and stale findings for pools that will be scanned will be re-discovered. + svc.log.Debugf("removing unresolved %s finding %d for pool %s", f.Action, f.Seq, f.PoolUuid) + if err := svc.sysdb.RemoveCheckerFinding(f); err != nil { + svc.log.Errorf("unable to remove stale checker finding %s: %s", f, err.Error()) + } + } else if f.Action != chkpb.CheckInconsistAction_CIA_STALE { // No need to re-mark stale interactions + // If the pool isn't being re-checked, we should keep the unresolved finding, but the user + // won't be able to act on it anymore. 
+ svc.log.Debugf("marking unresolved interaction %d stale for pool %s", f.Seq, f.PoolUuid) + if err := svc.sysdb.SetCheckerFindingAction(f.Seq, int32(chkpb.CheckInconsistAction_CIA_STALE)); err != nil { + svc.log.Errorf("unable to mark interactive finding %s stale: %s", f, err.Error()) + } + } + } + } +} + func (svc *mgmtSvc) mergePoliciesWithCurrent(policies []*mgmtpb.CheckInconsistPolicy) ([]*mgmtpb.CheckInconsistPolicy, error) { pm, err := svc.getCheckerPolicyMap() if err != nil { diff --git a/src/control/server/mgmt_check_test.go b/src/control/server/mgmt_check_test.go index ba0662e5f2e..b1d381340fd 100644 --- a/src/control/server/mgmt_check_test.go +++ b/src/control/server/mgmt_check_test.go @@ -118,18 +118,54 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { testPolicies := testPoliciesWithAction(chkpb.CheckInconsistAction_CIA_INTERACT) uuids := testPoolUUIDs(3) - testFindings := func() []*checker.Finding { + testFindings := func(act chkpb.CheckInconsistAction) []*checker.Finding { findings := []*checker.Finding{} for i, uuid := range uuids { f := &checker.Finding{CheckReport: chkpb.CheckReport{ Seq: uint64(i + 1), PoolUuid: uuid, + Action: act, }} findings = append(findings, f) } return findings } + defaultTestFindings := func() []*checker.Finding { + return testFindings(chkpb.CheckInconsistAction_CIA_TRUST_MS) + } + + actionTestFindings := func(act chkpb.CheckInconsistAction, idx ...int) []*checker.Finding { + findings := defaultTestFindings() + for _, i := range idx { + t.Logf("findings[%d].Action: %s -> %s", i, findings[i].Action, act) + findings[i].Action = act + } + t.Logf("findings: %+v", findings) + return findings + } + + interactTestFindings := func(idx ...int) []*checker.Finding { + return actionTestFindings(chkpb.CheckInconsistAction_CIA_INTERACT, idx...) + } + + staleTestFindings := func(idx ...int) []*checker.Finding { + return actionTestFindings(chkpb.CheckInconsistAction_CIA_STALE, idx...) 
+ } + + createMSWithFindings := func(t *testing.T, log logging.Logger, findings []*checker.Finding) *mgmtSvc { + svc := testSvcCheckerEnabled(t, log, system.MemberStateCheckerStarted, uuids) + if err := svc.setCheckerPolicyMap(testPolicies); err != nil { + t.Fatal(err) + } + for _, f := range findings { + if err := svc.sysdb.AddCheckerFinding(f); err != nil { + t.Fatal(err) + } + } + return svc + } + for name, tc := range map[string]struct { createMS func(*testing.T, logging.Logger) *mgmtSvc getMockDrpc func() *mockDrpcClient @@ -178,7 +214,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { Sys: "daos_server", }, expErr: errors.New("mock dRPC"), - expFindings: testFindings(), + expFindings: defaultTestFindings(), expPolicies: testPolicies, }, "bad resp": { @@ -189,7 +225,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { Sys: "daos_server", }, expErr: errors.New("unmarshal CheckStart response"), - expFindings: testFindings(), + expFindings: defaultTestFindings(), expPolicies: testPolicies, }, "request failed": { @@ -200,7 +236,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { Sys: "daos_server", }, expResp: &mgmt.CheckStartResp{Status: int32(daos.MiscError)}, - expFindings: testFindings(), + expFindings: defaultTestFindings(), expPolicies: testPolicies, }, "no reset": { @@ -208,7 +244,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { Sys: "daos_server", }, expResp: &mgmtpb.CheckStartResp{}, - expFindings: testFindings(), + expFindings: defaultTestFindings(), expPolicies: testPolicies, }, "reset": { @@ -236,12 +272,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { }, expResp: &mgmtpb.CheckStartResp{}, expFindings: []*checker.Finding{ - { - CheckReport: chkpb.CheckReport{ - Seq: 2, - PoolUuid: uuids[1], - }, - }, + defaultTestFindings()[1], }, expPolicies: testPolicies, }, @@ -263,6 +294,63 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { expResp: &mgmtpb.CheckStartResp{}, expPolicies: 
mergeTestPolicies(testPolicies, specificPolicies), }, + "interactive findings removed for all pools": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return createMSWithFindings(t, log, interactTestFindings(0, 2)) + }, + req: &mgmtpb.CheckStartReq{ + Sys: "daos_server", + }, + expResp: &mgmtpb.CheckStartResp{}, // non-reset + expPolicies: testPolicies, + expFindings: []*checker.Finding{ + defaultTestFindings()[1], // non-interactive is left alone + }, + }, + "interactive findings stale for unspecified pool": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return createMSWithFindings(t, log, interactTestFindings(0, 2)) + }, + req: &mgmtpb.CheckStartReq{ + Sys: "daos_server", + Uuids: []string{uuids[0]}, + }, + expResp: &mgmtpb.CheckStartResp{}, // non-reset + expPolicies: testPolicies, + expFindings: []*checker.Finding{ + defaultTestFindings()[1], // non-interactive is left alone + // interactive for unspecified pool is marked stale and re-annotated + checker.AnnotateFinding(testFindings(chkpb.CheckInconsistAction_CIA_STALE)[2]), + }, + }, + "stale findings removed for all pools": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return createMSWithFindings(t, log, staleTestFindings(0, 2)) + }, + req: &mgmtpb.CheckStartReq{ + Sys: "daos_server", + }, + expResp: &mgmtpb.CheckStartResp{}, // non-reset + expPolicies: testPolicies, + expFindings: []*checker.Finding{ + defaultTestFindings()[1], // non-stale is left alone + }, + }, + "stale finding ignored for unspecified pool": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return createMSWithFindings(t, log, staleTestFindings(0, 2)) + }, + req: &mgmtpb.CheckStartReq{ + Sys: "daos_server", + Uuids: []string{uuids[0]}, + }, + expResp: &mgmtpb.CheckStartResp{}, // non-reset + expPolicies: testPolicies, + expFindings: []*checker.Finding{ + defaultTestFindings()[1], // non-stale is left alone + testFindings(chkpb.CheckInconsistAction_CIA_STALE)[2], // 
stale for unspecified pool remains + }, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -270,16 +358,7 @@ func TestServer_mgmtSvc_SystemCheckStart(t *testing.T) { if tc.createMS == nil { tc.createMS = func(t *testing.T, log logging.Logger) *mgmtSvc { - svc := testSvcCheckerEnabled(t, log, system.MemberStateCheckerStarted, uuids) - if err := svc.setCheckerPolicyMap(testPolicies); err != nil { - t.Fatal(err) - } - for _, f := range testFindings() { - if err := svc.sysdb.AddCheckerFinding(f); err != nil { - t.Fatal(err) - } - } - return svc + return createMSWithFindings(t, log, defaultTestFindings()) } } svc := tc.createMS(t, log) diff --git a/src/control/server/mgmt_drpc.go b/src/control/server/mgmt_drpc.go index 9dc06d9f6cf..3789b0dec8c 100644 --- a/src/control/server/mgmt_drpc.go +++ b/src/control/server/mgmt_drpc.go @@ -79,20 +79,20 @@ type srvModule struct { checkerDB checker.FindingStore engines []Engine events *events.PubSub - client *control.Client + rpcClient control.UnaryInvoker msReplicas []string } // newSrvModule creates a new srv module references to the system database, // resident EngineInstances and event publish subscribe reference. 
-func newSrvModule(log logging.Logger, pdb poolDatabase, cdb checker.FindingStore, engines []Engine, events *events.PubSub, client *control.Client, msReplicas []string) *srvModule { +func newSrvModule(log logging.Logger, pdb poolDatabase, cdb checker.FindingStore, engines []Engine, events *events.PubSub, client control.UnaryInvoker, msReplicas []string) *srvModule { return &srvModule{ log: log, poolDB: pdb, checkerDB: cdb, engines: engines, events: events, - client: client, + rpcClient: client, msReplicas: msReplicas, } } @@ -297,8 +297,9 @@ func (mod *srvModule) handleGetSysProps(reqb []byte) ([]byte, error) { msReq.Keys = append(msReq.Keys, t) } msReq.SetHostList(mod.msReplicas) + msReq.SetSystem(req.Sys) - msResp, err := control.SystemGetProp(ctx, mod.client, msReq) + msResp, err := control.SystemGetProp(ctx, mod.rpcClient, msReq) if err != nil { return nil, errors.Wrap(err, "failed to get system properties from MS") } diff --git a/src/control/server/mgmt_drpc_test.go b/src/control/server/mgmt_drpc_test.go index d6d5cff2139..f2a229d6b67 100644 --- a/src/control/server/mgmt_drpc_test.go +++ b/src/control/server/mgmt_drpc_test.go @@ -13,12 +13,16 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/testing/protocmp" + mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" srvpb "github.com/daos-stack/daos/src/control/common/proto/srv" "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/drpc" + "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" @@ -60,7 +64,7 @@ func addEngineInstances(mod *srvModule, numInstances int, log logging.Logger) { } } -func TestSrvModule_HandleNotifyReady_Invalid(t *testing.T) { +func 
TestSrvModule_handleNotifyReady_Invalid(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -86,7 +90,7 @@ func TestSrvModule_HandleNotifyReady_Invalid(t *testing.T) { } } -func TestSrvModule_HandleNotifyReady_BadSockPath(t *testing.T) { +func TestSrvModule_handleNotifyReady_BadSockPath(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -108,7 +112,7 @@ func TestSrvModule_HandleNotifyReady_BadSockPath(t *testing.T) { } } -func TestSrvModule_HandleNotifyReady_Success_Single(t *testing.T) { +func TestSrvModule_handleNotifyReady_Success_Single(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -134,7 +138,7 @@ func TestSrvModule_HandleNotifyReady_Success_Single(t *testing.T) { waitForEngineReady(t, mod.engines[0].(*EngineInstance)) } -func TestSrvModule_HandleNotifyReady_Success_Multi(t *testing.T) { +func TestSrvModule_handleNotifyReady_Success_Multi(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -171,7 +175,7 @@ func TestSrvModule_HandleNotifyReady_Success_Multi(t *testing.T) { } } -func TestSrvModule_HandleNotifyReady_IdxOutOfRange(t *testing.T) { +func TestSrvModule_handleNotifyReady_IdxOutOfRange(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -204,7 +208,7 @@ func TestSrvModule_HandleNotifyReady_IdxOutOfRange(t *testing.T) { } } -func TestSrvModule_HandleClusterEvent_Invalid(t *testing.T) { +func TestSrvModule_handleClusterEvent_Invalid(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) @@ -238,64 +242,76 @@ func getTestBytes(t *testing.T, msg proto.Message) []byte { return testBytes } -func TestSrvModule_handleGetPoolServiceRanks(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) +func cmpTestResp(t 
*testing.T, respBytes []byte, resp, expResp proto.Message) { + t.Helper() + + if err := proto.Unmarshal(respBytes, resp); err != nil { + t.Fatal(err) + } + if diff := cmp.Diff(expResp, resp, protocmp.Transform()); diff != "" { + t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + } +} +func TestSrvModule_handleGetPoolServiceRanks(t *testing.T) { for name, tc := range map[string]struct { - reqBytes []byte + req *srvpb.GetPoolSvcReq + badReq bool testPool *system.PoolService - expResp []byte + expResp *srvpb.GetPoolSvcResp expErr error }{ "bad request bytes": { - reqBytes: []byte("bad bytes"), - expErr: drpc.UnmarshalingPayloadFailure(), + badReq: true, + expErr: drpc.UnmarshalingPayloadFailure(), }, "bad pool uuid in request": { - reqBytes: getTestBytes(t, &srvpb.GetPoolSvcReq{ + req: &srvpb.GetPoolSvcReq{ Uuid: "bad-uuid", - }), + }, expErr: errors.New("invalid pool uuid"), }, "not found": { - reqBytes: getTestBytes(t, &srvpb.GetPoolSvcReq{ + req: &srvpb.GetPoolSvcReq{ Uuid: test.MockUUID(), - }), - expResp: getTestBytes(t, &srvpb.GetPoolSvcResp{ + }, + expResp: &srvpb.GetPoolSvcResp{ Status: int32(daos.Nonexistent), - }), + }, }, "found, but not Ready": { - reqBytes: getTestBytes(t, &srvpb.GetPoolSvcReq{ + req: &srvpb.GetPoolSvcReq{ Uuid: test.MockUUID(), - }), + }, testPool: &system.PoolService{ PoolUUID: test.MockPoolUUID(), PoolLabel: "testlabel", State: system.PoolServiceStateCreating, Replicas: []ranklist.Rank{0, 1, 2}, }, - expResp: getTestBytes(t, &srvpb.GetPoolSvcResp{ + expResp: &srvpb.GetPoolSvcResp{ Status: int32(daos.Nonexistent), - }), + }, }, "success": { - reqBytes: getTestBytes(t, &srvpb.GetPoolSvcReq{ + req: &srvpb.GetPoolSvcReq{ Uuid: test.MockUUID(), - }), + }, testPool: &system.PoolService{ PoolUUID: test.MockPoolUUID(), PoolLabel: "testlabel", State: system.PoolServiceStateReady, Replicas: []ranklist.Rank{0, 1, 2}, }, - expResp: getTestBytes(t, &srvpb.GetPoolSvcResp{ + expResp: &srvpb.GetPoolSvcResp{ Svcreps: []uint32{0, 1, 2}, - }), 
+ }, }, } { t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + ctx := test.Context(t) db := raft.MockDatabase(t, log) @@ -314,72 +330,76 @@ func TestSrvModule_handleGetPoolServiceRanks(t *testing.T) { } } - resp, err := mod.handleGetPoolServiceRanks(tc.reqBytes) + reqBytes := []byte("bad bytes") + if !tc.badReq { + reqBytes = getTestBytes(t, tc.req) + } + + respBytes, err := mod.handleGetPoolServiceRanks(reqBytes) test.CmpErr(t, tc.expErr, err) if err != nil { return } - if diff := cmp.Diff(tc.expResp, resp); diff != "" { - t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) - } + cmpTestResp(t, respBytes, new(srvpb.GetPoolSvcResp), tc.expResp) }) } } func TestSrvModule_handlePoolFindByLabel(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - for name, tc := range map[string]struct { - reqBytes []byte + req *srvpb.PoolFindByLabelReq + badReq bool testPool *system.PoolService - expResp []byte + expResp *srvpb.PoolFindByLabelResp expErr error }{ "bad request bytes": { - reqBytes: []byte("bad bytes"), - expErr: drpc.UnmarshalingPayloadFailure(), + badReq: true, + expErr: drpc.UnmarshalingPayloadFailure(), }, "not found": { - reqBytes: getTestBytes(t, &srvpb.PoolFindByLabelReq{ + req: &srvpb.PoolFindByLabelReq{ Label: "testlabel", - }), - expResp: getTestBytes(t, &srvpb.PoolFindByLabelResp{ + }, + expResp: &srvpb.PoolFindByLabelResp{ Status: int32(daos.Nonexistent), - }), + }, }, "found, but not Ready": { - reqBytes: getTestBytes(t, &srvpb.PoolFindByLabelReq{ + req: &srvpb.PoolFindByLabelReq{ Label: "testlabel", - }), + }, testPool: &system.PoolService{ PoolUUID: test.MockPoolUUID(), PoolLabel: "testlabel", State: system.PoolServiceStateCreating, Replicas: []ranklist.Rank{0, 1, 2}, }, - expResp: getTestBytes(t, &srvpb.PoolFindByLabelResp{ + expResp: &srvpb.PoolFindByLabelResp{ Status: int32(daos.Nonexistent), - }), + }, }, "success": { - 
reqBytes: getTestBytes(t, &srvpb.PoolFindByLabelReq{ + req: &srvpb.PoolFindByLabelReq{ Label: "testlabel", - }), + }, testPool: &system.PoolService{ PoolUUID: test.MockPoolUUID(), PoolLabel: "testlabel", State: system.PoolServiceStateReady, Replicas: []ranklist.Rank{0, 1, 2}, }, - expResp: getTestBytes(t, &srvpb.PoolFindByLabelResp{ + expResp: &srvpb.PoolFindByLabelResp{ Uuid: test.MockPoolUUID().String(), Svcreps: []uint32{0, 1, 2}, - }), + }, }, } { t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + ctx := test.Context(t) db := raft.MockDatabase(t, log) @@ -398,14 +418,308 @@ func TestSrvModule_handlePoolFindByLabel(t *testing.T) { } } - resp, err := mod.handlePoolFindByLabel(tc.reqBytes) + reqBytes := []byte("bad bytes") + if !tc.badReq { + reqBytes = getTestBytes(t, tc.req) + } + + respBytes, err := mod.handlePoolFindByLabel(reqBytes) + test.CmpErr(t, tc.expErr, err) + if err != nil { + return + } + + cmpTestResp(t, respBytes, new(srvpb.PoolFindByLabelResp), tc.expResp) + }) + } +} + +func TestSrvModule_handleListPools(t *testing.T) { + for name, tc := range map[string]struct { + req *srvpb.ListPoolsReq + badReq bool + testPools []*system.PoolService + expResp *srvpb.ListPoolsResp + expErr error + }{ + "bad request bytes": { + badReq: true, + expErr: drpc.UnmarshalingPayloadFailure(), + }, + "no pools": { + req: &srvpb.ListPoolsReq{ + IncludeAll: false, + }, + expResp: &srvpb.ListPoolsResp{ + Pools: []*srvpb.ListPoolsResp_Pool{}, + }, + }, + "single pool": { + req: &srvpb.ListPoolsReq{ + IncludeAll: false, + }, + testPools: []*system.PoolService{ + { + PoolUUID: test.MockPoolUUID(1), + PoolLabel: "pool1", + State: system.PoolServiceStateReady, + Replicas: []ranklist.Rank{0, 1, 2}, + }, + }, + expResp: &srvpb.ListPoolsResp{ + Pools: []*srvpb.ListPoolsResp_Pool{ + { + Uuid: test.MockPoolUUID(1).String(), + Label: "pool1", + Svcreps: []uint32{0, 1, 2}, + }, + }, + }, + }, + "multiple pools": { 
+ req: &srvpb.ListPoolsReq{ + IncludeAll: true, + }, + testPools: []*system.PoolService{ + { + PoolUUID: test.MockPoolUUID(1), + PoolLabel: "pool1", + State: system.PoolServiceStateReady, + Replicas: []ranklist.Rank{0, 1, 2}, + }, + { + PoolUUID: test.MockPoolUUID(2), + PoolLabel: "pool2", + State: system.PoolServiceStateCreating, + Replicas: []ranklist.Rank{3, 4, 5}, + }, + }, + expResp: &srvpb.ListPoolsResp{ + Pools: []*srvpb.ListPoolsResp_Pool{ + { + Uuid: test.MockPoolUUID(1).String(), + Label: "pool1", + Svcreps: []uint32{0, 1, 2}, + }, + { + Uuid: test.MockPoolUUID(2).String(), + Label: "pool2", + Svcreps: []uint32{3, 4, 5}, + }, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx := test.Context(t) + + db := raft.MockDatabase(t, log) + mod := &srvModule{ + log: log, + poolDB: db, + } + for _, pool := range tc.testPools { + lock, err := db.TakePoolLock(ctx, pool.PoolUUID) + if err != nil { + t.Fatal(err) + } + if err := db.AddPoolService(lock.InContext(ctx), pool); err != nil { + lock.Release() + t.Fatal(err) + } + lock.Release() + } + + reqBytes := []byte("bad bytes") + if !tc.badReq { + reqBytes = getTestBytes(t, tc.req) + } + + respBytes, err := mod.handleListPools(reqBytes) + test.CmpErr(t, tc.expErr, err) + if err != nil { + return + } + + resp := new(srvpb.ListPoolsResp) + if err := proto.Unmarshal(respBytes, resp); err != nil { + t.Fatal(err) + } + + if len(tc.expResp.Pools) != len(resp.Pools) { + t.Fatal("unexpected number of pools returned") + } + for _, pool := range tc.expResp.Pools { + found := false + for _, expPool := range resp.Pools { + if pool.Uuid != expPool.Uuid { + continue + } + if diff := cmp.Diff(expPool, pool, protocmp.Transform()); diff != "" { + t.Fatalf("unexpected pool in response (-want, +got):\n%s\n", diff) + } + found = true + break + } + if !found { + t.Fatalf("pool %v not found", pool) + } + } + }) + } +} + +func 
TestSrvModule_handleGetSysProps(t *testing.T) { + mockMSReplicas := []string{"host1:10001"} + + for name, tc := range map[string]struct { + req *mgmtpb.SystemGetPropReq + badReq bool + mic *control.MockInvokerConfig // For control-API SystemGetProp + expCtlCall *control.SystemGetPropReq + expResp *mgmtpb.SystemGetPropResp + expErr error + }{ + "bad request bytes": { + badReq: true, + expErr: drpc.UnmarshalingPayloadFailure(), + }, + "invalid system property key": { + req: &mgmtpb.SystemGetPropReq{ + Sys: "daos_server", + Keys: []string{"invalid-key"}, + }, + expErr: errors.New("invalid system property key"), + }, + "control API error": { + req: &mgmtpb.SystemGetPropReq{ + Sys: "daos_server", + Keys: []string{"self_heal"}, + }, + mic: &control.MockInvokerConfig{ + UnaryError: errors.New("control API failed"), + }, + expCtlCall: &control.SystemGetPropReq{}, + expErr: errors.New("failed to get system properties from MS"), + }, + "success with single property": { + req: &mgmtpb.SystemGetPropReq{ + Sys: "daos_server", + Keys: []string{"self_heal"}, + }, + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1:10001", nil, + &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{ + "self_heal": "exclude", + }, + }), + }, + expCtlCall: &control.SystemGetPropReq{ + Keys: []daos.SystemPropertyKey{ + daos.SystemPropertySelfHeal, + }, + }, + expResp: &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{ + "self_heal": "exclude", + }, + }, + }, + "success with multiple properties": { + req: &mgmtpb.SystemGetPropReq{ + Sys: "marigolds", + Keys: []string{"self_heal", "pool_scrub_thresh"}, + }, + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1:10001", nil, + &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{ + "self_heal": "exclude", + "pool_scrub_thresh": "0", + }, + }), + }, + expCtlCall: &control.SystemGetPropReq{ + Keys: []daos.SystemPropertyKey{ + daos.SystemPropertySelfHeal, + 
daos.SystemPropertyPoolScrubThresh, + }, + }, + expResp: &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{ + "self_heal": "exclude", + "pool_scrub_thresh": "0", + }, + }, + }, + "empty request returns empty response": { + req: &mgmtpb.SystemGetPropReq{ + Sys: "daos_server", + Keys: []string{}, + }, + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1:10001", nil, + &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{}, + }), + }, + expCtlCall: &control.SystemGetPropReq{ + Keys: []daos.SystemPropertyKey{}, + }, + expResp: &mgmtpb.SystemGetPropResp{ + Properties: map[string]string{}, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + mi := control.NewMockInvoker(log, tc.mic) + mod := &srvModule{ + log: log, + rpcClient: mi, + msReplicas: mockMSReplicas, + } + + reqBytes := []byte("bad bytes") + if !tc.badReq { + reqBytes = getTestBytes(t, tc.req) + } + + respBytes, err := mod.handleGetSysProps(reqBytes) test.CmpErr(t, tc.expErr, err) if err != nil { return } - if diff := cmp.Diff(tc.expResp, resp); diff != "" { - t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + cmpTestResp(t, respBytes, new(mgmtpb.SystemGetPropResp), tc.expResp) + + switch mi.GetInvokeCount() { + case 0: + if tc.expCtlCall != nil { + t.Fatal("expected control API call but got none") + } + case 1: + if tc.expCtlCall == nil { + t.Fatal("unexpected control API call") + } + getPropReqSent := mi.SentReqs[0].(*control.SystemGetPropReq) + cmpOpt := cmpopts.IgnoreFields(control.SystemGetPropReq{}, + "unaryRequest", "msRequest") + if diff := cmp.Diff(tc.expCtlCall, getPropReqSent, cmpOpt); diff != "" { + t.Fatalf("unexpected control API call (-want, +got):\n%s\n", + diff) + } + test.AssertEqual(t, tc.req.Sys, getPropReqSent.Sys, + "system name mismatch") + default: + t.Fatalf("unexpected number of control API calls: %d", + mi.GetInvokeCount()) } }) } diff --git 
a/src/control/server/mgmt_pool_test.go b/src/control/server/mgmt_pool_test.go index 54a1c7621a7..51a3bdc2813 100644 --- a/src/control/server/mgmt_pool_test.go +++ b/src/control/server/mgmt_pool_test.go @@ -2344,11 +2344,12 @@ func TestServer_MgmtSvc_PoolQuery(t *testing.T) { } for name, tc := range map[string]struct { - mgmtSvc *mgmtSvc - setupMockDrpc func(_ *mgmtSvc, _ error) - req *mgmtpb.PoolQueryReq - expResp *mgmtpb.PoolQueryResp - expErr error + mgmtSvc *mgmtSvc + setupMockDrpc func(_ *mgmtSvc, _ error) + req *mgmtpb.PoolQueryReq + missingSelfHealSysProp bool + expResp *mgmtpb.PoolQueryResp + expErr error }{ "nil request": { expErr: errors.New("nil request"), @@ -2449,6 +2450,18 @@ func TestServer_MgmtSvc_PoolQuery(t *testing.T) { SysSelfHealPolicy: "pool_rebuild", }, }, + "successful query; sys self-heal prop fetch; missing system property": { + missingSelfHealSysProp: true, + req: &mgmtpb.PoolQueryReq{ + Id: mockUUID, + QueryMask: uint64(daos.MustNewPoolQueryMask(daos.PoolQueryOptionSelfHealPolicy)), + }, + expResp: &mgmtpb.PoolQueryResp{ + State: mgmtpb.PoolServiceState_Ready, + Uuid: mockUUID, + SysSelfHealPolicy: daos.DefaultSysSelfHealFlagsStr, + }, + }, } { t.Run(name, func(t *testing.T) { buf.Reset() @@ -2479,10 +2492,12 @@ func TestServer_MgmtSvc_PoolQuery(t *testing.T) { tc.req.Sys = build.DefaultSystemName } - // Change stored value to something different from the default. - if err := system.SetUserProperty(tc.mgmtSvc.sysdb, tc.mgmtSvc.systemProps, - "self_heal", "pool_rebuild"); err != nil { - t.Fatal(err) + if !tc.missingSelfHealSysProp { + // Change stored value to something different from the default. 
+ if err := system.SetUserProperty(tc.mgmtSvc.sysdb, tc.mgmtSvc.systemProps, + "self_heal", "pool_rebuild"); err != nil { + t.Fatal(err) + } } gotResp, gotErr := tc.mgmtSvc.PoolQuery(test.Context(t), tc.req) diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index c0243810c50..7eb43f6232e 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -57,7 +57,7 @@ const ( // the client network autoconfiguration hints, and the set of ranks associated with MS // replicas. If req.AllRanks is true, all ranks' fabric URIs are also given the client. func (svc *mgmtSvc) GetAttachInfo(ctx context.Context, req *mgmtpb.GetAttachInfoReq) (*mgmtpb.GetAttachInfoResp, error) { - if err := svc.checkReplicaRequest(req); err != nil { + if err := svc.checkReplicaRequest(wrapCheckerReq(req)); err != nil { return nil, err } if len(svc.clientNetworkHint) == 0 { @@ -1316,6 +1316,11 @@ type poolRanksOpSig func(context.Context, control.UnaryInvoker, *control.PoolRan func (svc *mgmtSvc) getPoolRanksResps(ctx context.Context, sys string, poolIDs []string, poolRanks poolRanksMap, ctlApiCall poolRanksOpSig) ([]*control.PoolRanksResp, error) { resps := []*control.PoolRanksResp{} + _, replicas, err := svc.sysdb.LeaderQuery() + if err != nil { + return nil, err + } + for _, id := range poolIDs { rs := poolRanks[id] if rs.Count() == 0 { @@ -1327,6 +1332,9 @@ func (svc *mgmtSvc) getPoolRanksResps(ctx context.Context, sys string, poolIDs [ Ranks: rs.Ranks(), } req.Sys = sys + // Set request hostlist from leader query as we don't have + // access to the server config from here. 
+ req.SetHostList(replicas) svc.log.Tracef("%T: %+v", req, req) @@ -1432,6 +1440,11 @@ func (svc *mgmtSvc) SystemRebuildManage(ctx context.Context, pbReq *mgmtpb.Syste return &mgmtpb.SystemRebuildManageResp{}, nil // Successful no-op. } + _, replicas, err := svc.sysdb.LeaderQuery() + if err != nil { + return nil, err + } + var results []*control.PoolRebuildManageResult for _, id := range poolIDs { opCode := control.PoolRebuildOpCode(pbReq.OpCode) @@ -1441,6 +1454,10 @@ func (svc *mgmtSvc) SystemRebuildManage(ctx context.Context, pbReq *mgmtpb.Syste OpCode: opCode, Force: pbReq.Force, } + // Set request hostlist from leader query as we don't have + // access to the server config from here. + req.SetHostList(replicas) + svc.log.Tracef("%T: %+v", req, req) result := &control.PoolRebuildManageResult{ @@ -1468,7 +1485,7 @@ func (svc *mgmtSvc) SystemRebuildManage(ctx context.Context, pbReq *mgmtpb.Syste // selfHealExcludeRanks fetches a list of detected dead ranks from the leader's engine and updates // states within the control-plane membership appropriately. func (svc *mgmtSvc) selfHealExcludeRanks(ctx context.Context) error { - // TODO: Pass a real, nonzero map version. + // DAOS-18163 TODO: Pass a real, nonzero map version. req := &mgmtpb.GetGroupStatusReq{} // Fetch dead rank list from leader's engine with group status dRPC call. @@ -1528,12 +1545,21 @@ func (svc *mgmtSvc) selfHealNotifyPSes(ctx context.Context, propVal string) erro return nil // Successful no-op. } + _, replicas, err := svc.sysdb.LeaderQuery() + if err != nil { + return err + } + var successes, failures []string for _, id := range poolIDs { req := &control.PoolSelfHealEvalReq{ ID: id, SysPropVal: propVal, } + // Set request hostlist from leader query as we don't have + // access to the server config from here. 
+ req.SetHostList(replicas) + svc.log.Tracef("%T: %+v", req, req) if err := control.PoolSelfHealEval(ctx, svc.rpcClient, req); err != nil { @@ -1591,7 +1617,6 @@ func (svc *mgmtSvc) SystemSelfHealEval(ctx context.Context, pbReq *mgmtpb.System !daos.SystemPropertySelfHealHasFlag(selfHeal, daos.SysSelfHealFlagPoolExclude) { return new(mgmtpb.DaosResp), nil } - if err := svc.selfHealNotifyPSes(ctx, selfHeal); err != nil { return nil, errors.Wrapf(err, "notify pool services of self_heal=%q", selfHeal) } diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index 1d25ddcfc7d..46e7266af6d 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -252,6 +252,26 @@ func stateString(s system.MemberState) string { return strings.ToLower(s.String()) } +func startSysDB(t *testing.T, ctx context.Context, log logging.Logger, replicas []*net.TCPAddr, svc *mgmtSvc) func() { + db, cleanup := raft.TestDatabase(t, log, replicas...) 
+ svc.sysdb = db + + if err := db.Start(ctx); err != nil { + cleanup() + t.Fatal(err) + } + + // wait for the bootstrap to finish + for { + if leader, _, _ := db.LeaderQuery(); leader != "" { + break + } + time.Sleep(250 * time.Millisecond) + } + + return cleanup +} + func TestServer_MgmtSvc_LeaderQuery(t *testing.T) { localhost := common.LocalhostCtrlAddr() @@ -282,22 +302,11 @@ func TestServer_MgmtSvc_LeaderQuery(t *testing.T) { defer test.ShowBufferOnFailure(t, buf) svc := newTestMgmtSvc(t, log) - db, cleanup := raft.TestDatabase(t, log) - defer cleanup() - svc.sysdb = db - ctx := test.Context(t) - if err := db.Start(ctx); err != nil { - t.Fatal(err) - } + replicas := []*net.TCPAddr{common.LocalhostCtrlAddr()} - // wait for the bootstrap to finish - for { - if leader, _, _ := db.LeaderQuery(); leader != "" { - break - } - time.Sleep(250 * time.Millisecond) - } + cleanup := startSysDB(t, ctx, log, replicas, svc) + defer cleanup() gotResp, gotErr := svc.LeaderQuery(test.Context(t), tc.req) test.CmpErr(t, tc.expErr, gotErr) @@ -595,6 +604,46 @@ func TestServer_MgmtSvc_getPoolRanks(t *testing.T) { }, expDrpcCount: 2, }, + "two pools; bracketed zero disabled ranks": { + pools: []string{test.MockUUID(1), test.MockUUID(2)}, + inRanks: ranklist.MustCreateRankSet("1,8"), + getEnabled: false, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolQueryResp{ + EnabledRanks: "0-4", + DisabledRanks: "[]", + }, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolQueryResp{ + EnabledRanks: "1-7", + DisabledRanks: "[]", + }, + }, + }, + expDrpcCount: 2, + }, + "two pools; bracketed zero enabled ranks": { + pools: []string{test.MockUUID(1), test.MockUUID(2)}, + inRanks: ranklist.MustCreateRankSet("1,8"), + getEnabled: true, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolQueryResp{ + EnabledRanks: "[]", + DisabledRanks: "0-4", + }, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolQueryResp{ + EnabledRanks: "[]", + DisabledRanks: 
"1-7", + }, + }, + }, + expDrpcCount: 2, + }, "match zero ranks; two pools": { pools: []string{test.MockUUID(1), test.MockUUID(2)}, inRanks: ranklist.MustCreateRankSet("8-10"), @@ -2285,6 +2334,7 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { useLabels bool pools []string members system.Members + replica *net.TCPAddr drpcResps []*mockDrpcResponse // For dRPC PoolQuery expDrpcCount int mic *control.MockInvokerConfig // For control-API PoolDrain/Reint @@ -2472,14 +2522,17 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { expDrpcCount: 2, expCtlApiCount: 1, }, - "drain multiple ranks on multiple pools": { - req: &mgmtpb.SystemDrainReq{Ranks: "0-3"}, + "drain rank on multiple pools; pool requests contain replica address": { + req: &mgmtpb.SystemDrainReq{ + Ranks: "0-3", + }, members: system.Members{ system.MockMember(t, 1, system.MemberStateJoined), system.MockMember(t, 2, system.MemberStateJoined), system.MockMember(t, 3, system.MemberStateJoined), }, - pools: []string{test.MockUUID(1), test.MockUUID(2)}, + pools: []string{test.MockUUID(1), test.MockUUID(2)}, + replica: test.MockTCPAddr(10003, 5), drpcResps: []*mockDrpcResponse{ &mockDrpcResponse{ Message: &mgmtpb.PoolQueryResp{ @@ -2513,14 +2566,15 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { expDrpcCount: 2, expCtlApiCount: 5, }, - "reintegrate multiple ranks on multiple pools": { + "reintegrate rank on multiple pools; pool requests contain replica address": { req: &mgmtpb.SystemDrainReq{Ranks: "0-3", Reint: true}, members: system.Members{ system.MockMember(t, 1, system.MemberStateJoined), system.MockMember(t, 2, system.MemberStateJoined), system.MockMember(t, 3, system.MemberStateJoined), }, - pools: []string{test.MockUUID(1), test.MockUUID(2)}, + pools: []string{test.MockUUID(1), test.MockUUID(2)}, + replica: test.MockTCPAddr(10003, 5), drpcResps: []*mockDrpcResponse{ &mockDrpcResponse{ Message: &mgmtpb.PoolQueryResp{ @@ -2616,8 +2670,8 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { 
req: &mgmtpb.SystemDrainReq{ Reint: true, // Resolves to ranks 1-2. - Hosts: fmt.Sprintf("%s,%s", test.MockHostAddr(1), - test.MockHostAddr(2)), + Hosts: fmt.Sprintf("%s,%s", system.MockControlAddr(t, 1), + system.MockControlAddr(t, 2)), }, members: system.Members{ system.MockMember(t, 1, system.MemberStateJoined), @@ -2652,21 +2706,36 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { Id: "00000002", Results: []*sharedpb.RankResult{ {Rank: 1}, + {Rank: 2}, }, }, }, }, expDrpcCount: 2, - expCtlApiCount: 2, + expCtlApiCount: 3, // One per pool-rank }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + harness := NewEngineHarness(log) + sp := storage.NewProvider(log, 0, nil, nil, nil, nil, nil) + e := newTestEngine(log, true, sp) + if err := harness.AddInstance(e); err != nil { + t.Fatal(err) + } + harness.started.SetTrue() + ctx := test.MustLogContext(t) - svc := newTestMgmtSvc(t, log) + if tc.replica == nil { + tc.replica = common.LocalhostCtrlAddr() + } + + db := raft.MockDatabaseWithAddr(t, log, tc.replica) + ms := system.NewMembership(log, db) + svc := newMgmtSvc(harness, ms, db, nil, nil) for _, m := range tc.members { if _, err := svc.membership.Add(m); err != nil { t.Fatal(err) @@ -2721,6 +2790,19 @@ func TestServer_MgmtSvc_SystemDrain(t *testing.T) { "dRPC invoke count") test.AssertEqual(t, tc.expCtlApiCount, mi.GetInvokeCount(), "rpc client invoke count") + + if tc.expCtlApiCount > 0 { + for _, sr := range mi.SentReqs { + // Mock database implementation will only return first + // replica so make sure the hostlist sent in pool drain or + // reint requests matches what leader query returns as the + // first replica. 
+ reqSent := sr.(*control.PoolRanksReq) + exp := fmt.Sprintf("%v", tc.replica) + got := fmt.Sprintf("%v", reqSent.HostList[0]) + test.AssertEqual(t, exp, got, "first request host") + } + } }) } } @@ -2730,6 +2812,7 @@ func TestServer_MgmtSvc_SystemRebuildManage(t *testing.T) { req *mgmtpb.SystemRebuildManageReq useLabels bool pools []string + replica *net.TCPAddr mic *control.MockInvokerConfig // For control-API PoolRebuildStart/Stop expCtlApiCount int expErr error @@ -2843,12 +2926,13 @@ func TestServer_MgmtSvc_SystemRebuildManage(t *testing.T) { }, expCtlApiCount: 1, }, - "start pool rebuild results on multiple pools; use label identifiers": { + "start pool rebuild results on multiple pools; use label identifiers; sent to replicas": { req: &mgmtpb.SystemRebuildManageReq{ OpCode: uint32(control.PoolRebuildOpCodeStart), }, useLabels: true, pools: []string{test.MockUUID(3), test.MockUUID(2), test.MockUUID(1)}, + replica: test.MockTCPAddr(10003, 5), mic: &control.MockInvokerConfig{ UnaryResponseSet: []*control.UnaryResponse{ control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), @@ -2885,8 +2969,23 @@ func TestServer_MgmtSvc_SystemRebuildManage(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + harness := NewEngineHarness(log) + sp := storage.NewProvider(log, 0, nil, nil, nil, nil, nil) + e := newTestEngine(log, true, sp) + if err := harness.AddInstance(e); err != nil { + t.Fatal(err) + } + harness.started.SetTrue() + ctx := test.MustLogContext(t) - svc := newTestMgmtSvc(t, log) + + if tc.replica == nil { + tc.replica = common.LocalhostCtrlAddr() + } + + db := raft.MockDatabaseWithAddr(t, log, tc.replica) + m := system.NewMembership(log, db) + svc := newMgmtSvc(harness, m, db, nil, nil) cfg := new(mockDrpcClientConfig) mdc := newMockDrpcClient(cfg) @@ -2919,10 +3018,14 @@ func TestServer_MgmtSvc_SystemRebuildManage(t *testing.T) { gotResp, gotErr := svc.SystemRebuildManage(ctx, tc.req) test.CmpErr(t, tc.expErr, 
gotErr) + cmpOpts := []cmp.Option{ + cmpopts.IgnoreUnexported(mgmtpb.SystemRebuildManageResp{}, + mgmtpb.PoolRebuildManageResult{}, control.PoolRebuildManageReq{}), + } + if tc.expErr == nil { - cmpOpts := []cmp.Option{ - cmpopts.IgnoreUnexported(mgmtpb.SystemRebuildManageResp{}, - mgmtpb.PoolRebuildManageResult{}), + if gotResp == nil { + t.Fatal("expected non-nil response") } if diff := cmp.Diff(tc.expResp, gotResp, cmpOpts...); diff != "" { t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) @@ -2931,6 +3034,520 @@ func TestServer_MgmtSvc_SystemRebuildManage(t *testing.T) { test.AssertEqual(t, tc.expCtlApiCount, mi.GetInvokeCount(), "rpc client invoke count") + + if tc.expCtlApiCount > 0 { + for _, sr := range mi.SentReqs { + // Mock database implementation will only return first + // replica so make sure the hostlist sent in pool rebuild + // manage requests matches what leader query returns as the + // first replica. + rbldReqSent := sr.(*control.PoolRebuildManageReq) + exp := fmt.Sprintf("%v", tc.replica) + got := fmt.Sprintf("%v", rbldReqSent.HostList[0]) + test.AssertEqual(t, exp, got, "first request host") + } + } + }) + } +} + +func TestServer_MgmtSvc_getSysSelfHeal(t *testing.T) { + for name, tc := range map[string]struct { + selfHealProp string + propErr error + expSetPropErr error + expResult string + expErr error + }{ + "property set to empty": { + selfHealProp: "", + expResult: daos.DefaultSysSelfHealFlagsStr, + }, + "property set to exclude": { + selfHealProp: "exclude", + expResult: "exclude", + }, + "property set to pool_rebuild": { + selfHealProp: "pool_rebuild", + expResult: "pool_rebuild", + }, + "property set to invalid flag combination": { + selfHealProp: "exclude;pool_rebuild;pool_exclude", + expSetPropErr: errors.New("invalid value"), + }, + "property set to multiple flags": { + selfHealProp: "exclude;pool_exclude;pool_rebuild", + expResult: "exclude;pool_exclude;pool_rebuild", + }, + "property error": { + propErr: 
errors.New("database error"), + expErr: errors.New("unknown property \"self_heal\""), + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + svc := newTestMgmtSvc(t, log) + + if tc.selfHealProp != "" { + gotErr := system.SetUserProperty(svc.sysdb, svc.systemProps, + "self_heal", tc.selfHealProp) + test.CmpErr(t, tc.expSetPropErr, gotErr) + if tc.expSetPropErr != nil { + return + } + } + + if tc.propErr != nil { + // Simulate error by clearing the systemProps + svc.systemProps = nil + } + + gotResult, gotErr := svc.getSysSelfHeal() + + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr == nil { + test.AssertEqual(t, tc.expResult, gotResult, "self_heal property value") + } + }) + } +} + +func TestServer_MgmtSvc_selfHealExcludeRanks(t *testing.T) { + for name, tc := range map[string]struct { + drpcResp proto.Message + drpcErr error + deadRanks []uint32 + members system.Members + expErr error + expDebug string + }{ + "drpc call fails": { + drpcErr: errors.New("drpc failed"), + expErr: errors.New("drpc failed"), + }, + "drpc returns error status": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: int32(daos.Nonexistent), + }, + expErr: daos.Nonexistent, + }, + "no dead ranks": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: 0, + }, + }, + "one dead rank": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: 0, + DeadRanks: []uint32{1}, + }, + members: system.Members{ + system.NewMember(1, test.MockUUID(1), nil, test.MockHostAddr(), + system.MemberStateJoined), + system.NewMember(2, test.MockUUID(2), nil, test.MockHostAddr(), + system.MemberStateJoined), + }, + expDebug: "do group update", + }, + "multiple dead ranks": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: 0, + DeadRanks: []uint32{1, 2, 3}, + }, + members: system.Members{ + system.NewMember(1, test.MockUUID(1), nil, test.MockHostAddr(), + system.MemberStateJoined), + system.NewMember(2, test.MockUUID(2), nil, 
test.MockHostAddr(), + system.MemberStateJoined), + system.NewMember(3, test.MockUUID(3), nil, test.MockHostAddr(), + system.MemberStateJoined), + }, + expDebug: "do group update", + }, + "dead rank not in membership": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: 0, + DeadRanks: []uint32{99}, + }, + }, + "dead rank already excluded": { + drpcResp: &mgmtpb.GetGroupStatusResp{ + Status: 0, + DeadRanks: []uint32{1}, + }, + members: system.Members{ + system.NewMember(1, test.MockUUID(1), nil, test.MockHostAddr(), + system.MemberStateExcluded), + system.NewMember(2, test.MockUUID(2), nil, test.MockHostAddr(), + system.MemberStateJoined), + system.NewMember(3, test.MockUUID(3), nil, test.MockHostAddr(), + system.MemberStateJoined), + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx := test.MustLogContext(t) + svc := newTestMgmtSvc(t, log) + + // Add members to the database + seenMembers := make(map[ranklist.Rank]system.MemberState) + for _, m := range tc.members { + if _, err := svc.membership.Add(m); err != nil { + t.Fatal(err) + } + seenMembers[m.Rank] = m.State + } + + cfg := new(mockDrpcClientConfig) + rb, _ := proto.Marshal(tc.drpcResp) + cfg.setSendMsgResponse(drpc.Status_SUCCESS, rb, tc.drpcErr) + mdc := newMockDrpcClient(cfg) + setupSvcDrpcClient(svc, 0, mdc) + + gotErr := svc.selfHealExcludeRanks(ctx) + + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + // Verify members were marked as dead if expected + if tc.drpcResp != nil { + resp := tc.drpcResp.(*mgmtpb.GetGroupStatusResp) + for _, deadRank := range resp.DeadRanks { + m, err := svc.membership.Get(ranklist.Rank(deadRank)) + if system.IsMemberNotFound(err) { + continue + } + if err != nil { + t.Fatal(err) + } + test.AssertEqual(t, system.MemberStateExcluded, m.State, + fmt.Sprintf("rank %d state", deadRank)) + seenMembers[ranklist.Rank(deadRank)] = system.MemberStateExcluded + } + } 
+ + // Verify members have expected states in the database + for rank, state := range seenMembers { + m, err := svc.membership.Get(ranklist.Rank(rank)) + if err != nil { + t.Fatal(err) + } + test.AssertEqual(t, state, m.State, + fmt.Sprintf("rank %d end state", rank)) + } + + if !strings.Contains(buf.String(), tc.expDebug) { + t.Fatalf("expected debug log output to contain %s, got %s\n", + tc.expDebug, buf.String()) + } + }) + } +} + +func TestServer_MgmtSvc_selfHealNotifyPSes(t *testing.T) { + for name, tc := range map[string]struct { + propVal string + pools []string + replica *net.TCPAddr + mic *control.MockInvokerConfig + expErr error + expCtlApiCount int + }{ + "no pools": { + propVal: "pool_rebuild", + }, + "one pool success": { + propVal: "pool_rebuild", + pools: []string{test.MockUUID(1)}, + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + }, + expCtlApiCount: 1, + }, + "multiple pools all succeed; replica address sent in pool request": { + propVal: "pool_exclude", + pools: []string{test.MockUUID(1), test.MockUUID(2), test.MockUUID(3)}, + replica: test.MockTCPAddr(10003, 5), + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + }, + expCtlApiCount: 3, + }, + "one pool fails": { + propVal: "pool_rebuild", + pools: []string{test.MockUUID(1), test.MockUUID(2)}, + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + control.MockMSResponse("host1", errors.New("pool failed"), nil), + }, + }, + expErr: errors.New("pool self-heal evaluate drpc failed for 1 pool"), + expCtlApiCount: 2, + }, + "multiple pools fail": { + propVal: "pool_exclude", + pools: 
[]string{test.MockUUID(1), test.MockUUID(2), test.MockUUID(3)}, + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", errors.New("fail1"), nil), + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + control.MockMSResponse("host1", errors.New("fail2"), nil), + }, + }, + expErr: errors.New("pool self-heal evaluate drpc failed for 2 pools"), + expCtlApiCount: 3, + }, + "empty propVal with pools": { + propVal: "", + pools: []string{test.MockUUID(1)}, + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + }, + expCtlApiCount: 1, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + harness := NewEngineHarness(log) + sp := storage.NewProvider(log, 0, nil, nil, nil, nil, nil) + e := newTestEngine(log, true, sp) + if err := harness.AddInstance(e); err != nil { + t.Fatal(err) + } + harness.started.SetTrue() + + ctx := test.MustLogContext(t) + + if tc.replica == nil { + tc.replica = common.LocalhostCtrlAddr() + } + + db := raft.MockDatabaseWithAddr(t, log, tc.replica) + m := system.NewMembership(log, db) + svc := newMgmtSvc(harness, m, db, nil, nil) + + mic := tc.mic + if mic == nil { + mic = control.DefaultMockInvokerConfig() + } + mi := control.NewMockInvoker(log, mic) + svc.rpcClient = mi + + for _, uuidStr := range tc.pools { + addTestPoolService(t, svc.sysdb, &system.PoolService{ + PoolUUID: uuid.MustParse(uuidStr), + State: system.PoolServiceStateReady, + Replicas: []ranklist.Rank{0}, + }) + } + + gotErr := svc.selfHealNotifyPSes(ctx, tc.propVal) + + test.CmpErr(t, tc.expErr, gotErr) + test.AssertEqual(t, tc.expCtlApiCount, mi.GetInvokeCount(), + "rpc client invoke count") + + if tc.expCtlApiCount > 0 { + for _, sr := range mi.SentReqs { + // Mock database implementation will only return first + // replica so make sure the 
hostlist sent in pool self-heal + // eval requests matches what leader query returns as the + // first replica. + reqSent := sr.(*control.PoolSelfHealEvalReq) + exp := fmt.Sprintf("%v", tc.replica) + got := fmt.Sprintf("%v", reqSent.HostList[0]) + test.AssertEqual(t, exp, got, "first request host") + } + } + }) + } +} + +func TestServer_MgmtSvc_SystemSelfHealEval(t *testing.T) { + for name, tc := range map[string]struct { + req *mgmtpb.SystemSelfHealEvalReq + selfHealProp string + mic *control.MockInvokerConfig // For control-API PoolSelfHealEval + drpcResp proto.Message + drpcErr error + expErr error + noPools bool + noMembers bool + expCtlApiCount int + expGrpUpd bool + }{ + "nil req": { + req: (*mgmtpb.SystemSelfHealEvalReq)(nil), + expErr: errors.New("nil *mgmt.SystemSelfHealEvalReq"), + }, + "not system leader": { + req: &mgmtpb.SystemSelfHealEvalReq{Sys: "quack"}, + expErr: FaultWrongSystem("quack", build.DefaultSystemName), + }, + "exclude flag set; drpc call fails": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "exclude", + drpcErr: errors.New("drpc failed"), + expErr: errors.New("excluding ranks based on self_heal.exclude"), + }, + "exclude flag set; no dead ranks": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "exclude", + drpcResp: &mgmtpb.GetGroupStatusResp{}, + }, + "exclude flag set; with dead ranks": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "exclude", + drpcResp: &mgmtpb.GetGroupStatusResp{ + DeadRanks: []uint32{1, 2}, + }, + expGrpUpd: true, + }, + "pool_rebuild flag set; no pools": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "pool_rebuild", + noPools: true, + }, + "pool_rebuild flag set; multiple pool success": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "pool_rebuild", + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + expCtlApiCount: 3, + }, + "pool_exclude flag set; multiple pool failures": { + req: 
&mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "pool_exclude", + mic: &control.MockInvokerConfig{ + UnaryResponseSet: []*control.UnaryResponse{ + control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + control.MockMSResponse("host1", errors.New("pool failed"), nil), + control.MockMSResponse("host1", errors.New("pool failed"), nil), + }, + }, + expErr: errors.New("pool self-heal evaluate drpc failed for 2 pools"), + expCtlApiCount: 3, + }, + "pool_rebuild and pool_exclude flags set; multiple pools": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "pool_exclude;pool_rebuild", + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + expCtlApiCount: 3, + }, + "all flags set; exclude with dead ranks and pool operations; pool requests sent to replica": { + req: &mgmtpb.SystemSelfHealEvalReq{}, + selfHealProp: "exclude;pool_exclude;pool_rebuild", + drpcResp: &mgmtpb.GetGroupStatusResp{ + DeadRanks: []uint32{0}, + }, + mic: &control.MockInvokerConfig{ + UnaryResponse: control.MockMSResponse("host1", nil, &mgmtpb.DaosResp{}), + }, + expCtlApiCount: 3, + expGrpUpd: true, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx := test.MustLogContext(t) + svc := newTestMgmtSvc(t, log) + + cfg := new(mockDrpcClientConfig) + rb, _ := proto.Marshal(tc.drpcResp) + cfg.setSendMsgResponse(drpc.Status_SUCCESS, rb, tc.drpcErr) + mdc := newMockDrpcClient(cfg) + setupSvcDrpcClient(svc, 0, mdc) + + mic := tc.mic + if mic == nil { + mic = control.DefaultMockInvokerConfig() + } + mi := control.NewMockInvoker(log, mic) + svc.rpcClient = mi + + // Set up system self_heal property + if tc.selfHealProp != "" { + if err := system.SetUserProperty(svc.sysdb, svc.systemProps, + "self_heal", tc.selfHealProp); err != nil { + t.Fatal(err) + } + } + + // Add pool service entries to the system database + if !tc.noPools { + pools := 
[]string{test.MockUUID(1), test.MockUUID(2), test.MockUUID(3)} + for _, uuidStr := range pools { + addTestPoolService(t, svc.sysdb, &system.PoolService{ + PoolUUID: uuid.MustParse(uuidStr), + State: system.PoolServiceStateReady, + Replicas: []ranklist.Rank{0}, + }) + } + } + + // Add members to the system membership + if !tc.noMembers { + members := system.Members{ + system.NewMember(1, test.MockUUID(1), nil, test.MockHostAddr(), + system.MemberStateExcluded), + system.NewMember(2, test.MockUUID(2), nil, test.MockHostAddr(), + system.MemberStateJoined), + system.NewMember(3, test.MockUUID(3), nil, test.MockHostAddr(), + system.MemberStateJoined), + } + for _, m := range members { + if _, err := svc.membership.Add(m); err != nil { + t.Fatal(err) + } + } + } + + if tc.req != nil && tc.req.Sys == "" { + tc.req.Sys = build.DefaultSystemName + } + + gotResp, gotErr := svc.SystemSelfHealEval(ctx, tc.req) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr == nil { + if gotResp == nil { + t.Fatal("expected non-nil response") + } + cmpOpts := []cmp.Option{ + cmpopts.IgnoreUnexported(mgmtpb.DaosResp{}), + } + if diff := cmp.Diff(&mgmtpb.DaosResp{}, gotResp, cmpOpts...); diff != "" { + t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + } + } + + test.AssertEqual(t, tc.expCtlApiCount, mi.GetInvokeCount(), + "rpc client invoke count") + + didGrpUpd := strings.Contains(buf.String(), "do group update") + test.AssertEqual(t, tc.expGrpUpd, didGrpUpd, "group update performed") }) } } diff --git a/src/control/server/mocks.go b/src/control/server/mocks.go index f2d5f0c1ebd..6b654034557 100644 --- a/src/control/server/mocks.go +++ b/src/control/server/mocks.go @@ -86,3 +86,23 @@ func (ms *mockSubscriber) getRx() []string { return ms.rx } + +type mockIOMMUDetector struct { + enabled bool + err error +} + +// IsIOMMUEnabled implements hardware.IOMMUDetector interface +func (mid mockIOMMUDetector) IsIOMMUEnabled() (bool, error) { + return mid.enabled, mid.err +} + +type 
mockTHPDetector struct { + enabled bool + err error +} + +// IsTHPEnabled implements hardware.THPDetector interface +func (mid mockTHPDetector) IsTHPEnabled() (bool, error) { + return mid.enabled, mid.err +} diff --git a/src/control/server/server.go b/src/control/server/server.go index 22131b3cb4e..4ea72603f3d 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -1,6 +1,6 @@ // // (C) Copyright 2018-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -308,21 +308,37 @@ func (srv *server) setCoreDumpFilter() error { func (srv *server) initNetwork() error { defer srv.logDuration(track("time to init network")) - ctlAddr, err := getControlAddr(ctlAddrParams{ + params := ctlAddrParams{ port: srv.cfg.ControlPort, replicaAddrSrc: srv.sysdb, lookupHost: net.LookupIP, - }) + } + + // If a control interface is configured, look it up and pass it to getControlAddr. + // Also track whether we should bind to a specific IP (only when control_iface is set). 
+ bindToCtlAddr := false + if srv.cfg.ControlInterface != "" { + iface, err := net.InterfaceByName(srv.cfg.ControlInterface) + if err != nil { + return config.FaultConfigBadControlInterface(srv.cfg.ControlInterface, err) + } + params.ctlIface = iface + bindToCtlAddr = true + srv.log.Debugf("using control interface %s for listener", srv.cfg.ControlInterface) + } + + ctlAddr, err := getControlAddr(params) if err != nil { return err } - listener, err := createListener(ctlAddr, net.Listen) + listener, err := createListener(ctlAddr, net.Listen, bindToCtlAddr) if err != nil { return err } srv.ctlAddr = ctlAddr srv.listener = listener + srv.log.Debugf("control plane listener bound to %s", ctlAddr) return nil } @@ -356,13 +372,11 @@ func (srv *server) addEngines(ctx context.Context, smi *common.SysMemInfo) error var allStarted sync.WaitGroup registerTelemetryCallbacks(ctx, srv) - iommuEnabled, err := topology.DefaultIOMMUDetector(srv.log).IsIOMMUEnabled() - if err != nil { - return err - } + iommuChecker := topology.DefaultIOMMUDetector(srv.log) + thpChecker := topology.DefaultTHPDetector(srv.log) // Allocate hugepages and rebind NVMe devices to userspace drivers. - if err := prepBdevStorage(srv, iommuEnabled, smi); err != nil { + if err := prepBdevStorage(srv, smi, iommuChecker, thpChecker); err != nil { return err } diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go index ea2933db373..b9b57481f50 100644 --- a/src/control/server/server_utils.go +++ b/src/control/server/server_utils.go @@ -1,6 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -150,11 +150,30 @@ type ctlAddrParams struct { port int replicaAddrSrc replicaAddrGetter lookupHost ipLookupFn + ctlIface netInterface // optional: if set, use this interface for bind address } func getControlAddr(params ctlAddrParams) (*net.TCPAddr, error) { - ipStr := "0.0.0.0" + // If a control interface is configured, use its first IPv4 address. + if params.ctlIface != nil { + ip, err := getFirstIPv4Addr(params.ctlIface) + if err != nil { + return nil, errors.Wrap(err, "getting control interface address") + } + + // If this node is a replica, verify the control interface address matches + // the configured replica address. A mismatch would break raft connectivity. + if repAddr, err := params.replicaAddrSrc.ReplicaAddr(); err == nil { + if !repAddr.IP.Equal(ip) { + return nil, config.FaultConfigControlInterfaceMismatch(ip.String(), repAddr.IP.String()) + } + } + + return &net.TCPAddr{IP: ip, Port: params.port}, nil + } + // Fall back to legacy behavior: use replica address if available, otherwise 0.0.0.0. + ipStr := "0.0.0.0" if repAddr, err := params.replicaAddrSrc.ReplicaAddr(); err == nil { ipStr = repAddr.IP.String() } @@ -167,11 +186,17 @@ func getControlAddr(params ctlAddrParams) (*net.TCPAddr, error) { return ctlAddr, nil } -func createListener(ctlAddr *net.TCPAddr, listen netListenFn) (net.Listener, error) { +func createListener(ctlAddr *net.TCPAddr, listen netListenFn, bindToCtlAddr bool) (net.Listener, error) { // Create and start listener on management network. - lis, err := listen("tcp4", fmt.Sprintf("0.0.0.0:%d", ctlAddr.Port)) + // Only bind to ctlAddr.IP if explicitly requested (i.e., control_iface is set), + // otherwise bind to all interfaces (0.0.0.0) for backwards compatibility. 
+ bindAddr := fmt.Sprintf("0.0.0.0:%d", ctlAddr.Port) + if bindToCtlAddr { + bindAddr = ctlAddr.String() + } + lis, err := listen("tcp4", bindAddr) if err != nil { - return nil, errors.Wrap(err, "unable to listen on management interface") + return nil, errors.Wrapf(err, "unable to listen on %s", bindAddr) } return lis, nil @@ -362,7 +387,7 @@ func SetHugeNodes(log logging.Logger, srvCfg *config.Server, smi *common.SysMemI // Prepare bdev storage. Assumes validation has already been performed on server config. Hugepages // are required for both emulated (AIO devices) and real NVMe bdevs. VFIO and IOMMU are not // mandatory requirements for emulated NVMe. -func prepBdevStorage(srv *server, iommuEnabled bool, smi *common.SysMemInfo) error { +func prepBdevStorage(srv *server, smi *common.SysMemInfo, iommuChecker hardware.IOMMUDetector, thpChecker hardware.THPDetector) error { defer srv.logDuration(track("time to prepare bdev storage")) if srv.cfg == nil { @@ -373,6 +398,22 @@ func prepBdevStorage(srv *server, iommuEnabled bool, smi *common.SysMemInfo) err return nil } + // Fail to start if transparent hugepages are enabled. DAOS requires exclusive control over + // hugepages and therefore needs feature disabled. AllowTHP override flag provided for + // edge cases. + if !srv.cfg.AllowTHP { + if thpEnabled, err := thpChecker.IsTHPEnabled(); err != nil { + return errors.Wrap(err, "transparent hugepage check") + } else if thpEnabled { + return FaultTransparentHugepageEnabled + } + } + + iommuEnabled, err := iommuChecker.IsIOMMUEnabled() + if err != nil { + return errors.Wrap(err, "iommu check") + } + bdevCfgs := srv.cfg.GetBdevConfigs() // Perform these checks only if non-emulated NVMe is used and user is unprivileged. @@ -714,9 +755,12 @@ func registerTelemetryCallbacks(ctx context.Context, srv *server) { return } + // Use the same bind address as the control plane listener. 
+ bindAddr := srv.ctlAddr.IP.String() + srv.OnEnginesStarted(func(ctxIn context.Context) error { srv.log.Debug("starting Prometheus exporter") - cleanup, err := startPrometheusExporter(ctxIn, srv.log, telemPort, srv.harness.Instances()) + cleanup, err := startPrometheusExporter(ctxIn, srv.log, telemPort, bindAddr, srv.harness.Instances()) if err != nil { return err } @@ -859,6 +903,35 @@ type netInterface interface { Addrs() ([]net.Addr, error) } +// getFirstIPv4Addr returns the first (lowest) IPv4 address from the interface. +// If multiple IPv4 addresses exist, the lowest one is returned for determinism. +func getFirstIPv4Addr(iface netInterface) (net.IP, error) { + addrs, err := iface.Addrs() + if err != nil { + return nil, errors.Wrap(err, "failed to get interface addresses") + } + + var ipv4s []net.IP + for _, a := range addrs { + if ipNet, ok := a.(*net.IPNet); ok && ipNet.IP != nil { + if v4 := ipNet.IP.To4(); v4 != nil { + ipv4s = append(ipv4s, v4) + } + } + } + + if len(ipv4s) == 0 { + return nil, errors.New("no IPv4 addresses on interface") + } + + // Sort for deterministic selection (lowest address first). + sort.Slice(ipv4s, func(i, j int) bool { + return bytes.Compare(ipv4s[i], ipv4s[j]) < 0 + }) + + return ipv4s[0], nil +} + func getSrxSetting(cfg *config.Server) (int32, error) { if len(cfg.Engines) == 0 { return -1, nil diff --git a/src/control/server/server_utils_test.go b/src/control/server/server_utils_test.go index db5bd957f67..9536c20d8aa 100644 --- a/src/control/server/server_utils_test.go +++ b/src/control/server/server_utils_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -173,6 +173,96 @@ func TestServer_checkFabricInterface(t *testing.T) { } } +func TestServer_getFirstIPv4Addr(t *testing.T) { + for name, tc := range map[string]struct { + iface netInterface + expIP net.IP + expErr error + }{ + "Addrs fails": { + iface: &mockInterface{ + err: errors.New("mock Addrs error"), + }, + expErr: errors.New("mock Addrs error"), + }, + "no addresses": { + iface: &mockInterface{ + addrs: []net.Addr{}, + }, + expErr: errors.New("no IPv4 addresses"), + }, + "only IPv6 addresses": { + iface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("::1")}, + &net.IPNet{IP: net.ParseIP("fe80::1")}, + }, + }, + expErr: errors.New("no IPv4 addresses"), + }, + "single IPv4 address": { + iface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("192.168.1.100")}, + }, + }, + expIP: net.ParseIP("192.168.1.100").To4(), + }, + "multiple IPv4 addresses - returns lowest": { + iface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("192.168.1.100")}, + &net.IPNet{IP: net.ParseIP("10.0.0.5")}, + &net.IPNet{IP: net.ParseIP("172.16.0.1")}, + }, + }, + expIP: net.ParseIP("10.0.0.5").To4(), + }, + "mixed IPv4 and IPv6 - returns lowest IPv4": { + iface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("::1")}, + &net.IPNet{IP: net.ParseIP("192.168.1.100")}, + &net.IPNet{IP: net.ParseIP("fe80::1")}, + &net.IPNet{IP: net.ParseIP("10.0.0.5")}, + }, + }, + expIP: net.ParseIP("10.0.0.5").To4(), + }, + "non-IPNet addresses ignored": { + iface: &mockInterface{ + addrs: []net.Addr{ + &mockAddr{}, // not a *net.IPNet + &net.IPNet{IP: net.ParseIP("192.168.1.100")}, + }, + }, + expIP: net.ParseIP("192.168.1.100").To4(), + }, + "nil IP in IPNet ignored": { + iface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: nil}, + 
&net.IPNet{IP: net.ParseIP("192.168.1.100")}, + }, + }, + expIP: net.ParseIP("192.168.1.100").To4(), + }, + } { + t.Run(name, func(t *testing.T) { + ip, err := getFirstIPv4Addr(tc.iface) + + test.CmpErr(t, tc.expErr, err) + if tc.expErr != nil { + return + } + + if !tc.expIP.Equal(ip) { + t.Fatalf("expected IP %v, got %v", tc.expIP, ip) + } + }) + } +} + func TestServer_getSrxSetting(t *testing.T) { defCfg := config.DefaultServer() @@ -320,7 +410,6 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { } for name, tc := range map[string]struct { - iommuDisabled bool srvCfgExtra func(*config.Server) *config.Server memInfo1 *common.SysMemInfo // Before prepBdevStorage() memInfo2 *common.SysMemInfo // After prepBdevStorage() @@ -328,6 +417,10 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { hugepagesTotal int // Values for all NUMA nodes, will be split per-node. bmbc *bdev.MockBackendConfig overrideUser string + iommuDisabled bool + iommuCheckErr error + thpEnabled bool + thpCheckErr error expPrepErr error expPrepCalls []storage.BdevPrepareRequest expMemSize int @@ -379,6 +472,13 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { expMemSize: 16384, expHugepageSize: 2, }, + "iommu check error": { + iommuCheckErr: errors.New("fail"), + srvCfgExtra: func(sc *config.Server) *config.Server { + return sc.WithEngines(pmemEngine(0), pmemEngine(1)) + }, + expPrepErr: errors.New("iommu check: fail"), + }, "iommu disabled": { iommuDisabled: true, srvCfgExtra: func(sc *config.Server) *config.Server { @@ -422,6 +522,40 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { expMemSize: 16384, expHugepageSize: 2, }, + "thp check error": { + thpCheckErr: errors.New("fail"), + srvCfgExtra: func(sc *config.Server) *config.Server { + return sc.WithEngines(pmemEngine(0), pmemEngine(1)) + }, + expPrepErr: errors.New("transparent hugepage check: fail"), + }, + "thp enabled": { + thpEnabled: true, + srvCfgExtra: func(sc 
*config.Server) *config.Server { + return sc.WithEngines(pmemEngine(0), pmemEngine(1)) + }, + expPrepErr: FaultTransparentHugepageEnabled, + }, + "thp enabled; override flag set": { + thpEnabled: true, + srvCfgExtra: func(sc *config.Server) *config.Server { + return sc.WithAllowTHP(true). + WithEngines(pmemEngine(0), pmemEngine(1)) + }, + expPrepCalls: []storage.BdevPrepareRequest{ + defCleanDualEngine, + { + HugeNodes: "nodes_hp[0]=8192,nodes_hp[1]=8192", + TargetUser: username, + PCIAllowList: fmt.Sprintf("%s%s%s", test.MockPCIAddr(0), + storage.BdevPciAddrSep, test.MockPCIAddr(1)), + EnableVMD: true, + }, + }, + expHugepageSize: 2, + // Allocation change logged. + expNotice: true, + }, "no bdevs configured; hugepages disabled": { srvCfgExtra: func(sc *config.Server) *config.Server { return sc.WithDisableHugepages(true). @@ -1064,6 +1198,16 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { cfg = tc.srvCfgExtra(cfg) } + // Defaults are IOMMU=ON and THP=OFF. + iommuChecker := mockIOMMUDetector{ + enabled: !tc.iommuDisabled, + err: tc.iommuCheckErr, + } + thpChecker := mockTHPDetector{ + enabled: tc.thpEnabled, + err: tc.thpCheckErr, + } + mockAffSrc := func(l logging.Logger, e *engine.Config) (uint, error) { iface := e.Fabric.Interface l.Debugf("eval affinity of iface %q", iface) @@ -1156,7 +1300,7 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { srv.runningUser = &user.User{Username: tc.overrideUser} } - gotPrepErr := prepBdevStorage(srv, !tc.iommuDisabled, tc.memInfo1) + gotPrepErr := prepBdevStorage(srv, tc.memInfo1, iommuChecker, thpChecker) mbb.RLock() if diff := cmp.Diff(tc.expPrepCalls, mbb.PrepareCalls, prepCmpOpt); diff != "" { @@ -1187,13 +1331,12 @@ func TestServer_prepBdevStorage_setEngineMemSize(t *testing.T) { test.AssertEqual(t, tc.expHugepageSize, ei.runner.GetConfig().HugepageSz, "unexpected huge page size") - txtMod := "" - if !tc.expNotice { - txtMod = "no " + gotNotice := strings.Contains(buf.String(), 
"NOTICE") + if tc.expNotice && !gotNotice { + t.Fatal("expected NOTICE level message but got none") + } else if !tc.expNotice && gotNotice { + t.Fatal("expected no NOTICE level message but got one") } - msg := fmt.Sprintf("expected %sNOTICE level message", txtMod) - test.AssertEqual(t, tc.expNotice, strings.Contains(buf.String(), "NOTICE"), - msg) }) } } @@ -1672,6 +1815,69 @@ func TestServerUtils_getControlAddr(t *testing.T) { }, expErr: errors.New("mock resolve"), }, + "with control interface": { + params: ctlAddrParams{ + port: testTCPAddr.Port, + ctlIface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("192.168.1.100")}, + }, + }, + }, + expAddr: &net.TCPAddr{IP: net.ParseIP("192.168.1.100").To4(), Port: 1234}, + }, + "control interface matches replica address": { + params: ctlAddrParams{ + port: testTCPAddr.Port, + ctlIface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("127.0.0.1")}, + }, + }, + replicaAddrSrc: &mockReplicaAddrSrc{ + replicaAddrResult: testTCPAddr, + }, + lookupHost: func(addr string) ([]net.IP, error) { + t.Fatal("lookupHost should not be called when ctlIface is set") + return nil, nil + }, + }, + expAddr: testTCPAddr, + }, + "control interface mismatches replica address": { + params: ctlAddrParams{ + port: testTCPAddr.Port, + ctlIface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("10.0.0.50")}, + }, + }, + replicaAddrSrc: &mockReplicaAddrSrc{ + replicaAddrResult: testTCPAddr, + }, + }, + expErr: config.FaultConfigControlInterfaceMismatch("10.0.0.50", "127.0.0.1"), + }, + "control interface fails to get address": { + params: ctlAddrParams{ + port: testTCPAddr.Port, + ctlIface: &mockInterface{ + err: errors.New("mock interface error"), + }, + }, + expErr: errors.New("mock interface error"), + }, + "control interface has no IPv4 addresses": { + params: ctlAddrParams{ + port: testTCPAddr.Port, + ctlIface: &mockInterface{ + addrs: []net.Addr{ + &net.IPNet{IP: net.ParseIP("::1")}, + }, + 
}, + }, + expErr: errors.New("no IPv4 addresses"), + }, } { t.Run(name, func(t *testing.T) { if tc.params.lookupHost == nil { @@ -1689,6 +1895,9 @@ func TestServerUtils_getControlAddr(t *testing.T) { addr, err := getControlAddr(tc.params) test.CmpErr(t, tc.expErr, err) + if tc.expErr != nil { + return + } test.AssertEqual(t, tc.expAddr.String(), addr.String(), "") }) } diff --git a/src/control/server/storage/bdev.go b/src/control/server/storage/bdev.go index d30ac6af487..a69023ab977 100644 --- a/src/control/server/storage/bdev.go +++ b/src/control/server/storage/bdev.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -61,6 +61,7 @@ const ( ConfSetAccelProps = C.NVME_CONF_SET_ACCEL_PROPS ConfSetSpdkRpcServer = C.NVME_CONF_SET_SPDK_RPC_SERVER ConfSetAutoFaultyProps = C.NVME_CONF_SET_AUTO_FAULTY + ConfIobufSetOptions = "iobuf_set_options" ) // Acceleration related constants for engine setting and optional capabilities. @@ -616,6 +617,7 @@ type ( AccelProps AccelProps SpdkRpcSrvProps SpdkRpcServer AutoFaultyProps BdevAutoFaulty + SpdkIobufProps SpdkIobuf VMDEnabled bool ScannedBdevs NvmeControllers // VMD needs address mapping for backing devices. } diff --git a/src/control/server/storage/bdev/backend.go b/src/control/server/storage/bdev/backend.go index d1fd8e56f07..0921f56d3e8 100644 --- a/src/control/server/storage/bdev/backend.go +++ b/src/control/server/storage/bdev/backend.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2023 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -320,8 +320,15 @@ func (sb *spdkBackend) prepare(req storage.BdevPrepareRequest, vmdDetect vmdDete // // Applies block (not allow) list if VMD is configured so specific NVMe devices can // be reserved for other use (bdev_exclude). - if err := sb.script.Unbind(&req); err != nil { - return resp, errors.Wrap(err, "un-binding devices") + // + // NOTE DAOS-18606: There may be a bug in SPDK setup script that results in backing + // device addresses being unbound despite relevant VMD address + // being supplied in blocklist. As a workaround, skip unbind in VMD + // mode if blocklist populated. + if req.PCIBlockList == "" { + if err := sb.script.Unbind(&req); err != nil { + return resp, errors.Wrap(err, "un-binding devices") + } } } else { if err := sb.script.Reset(&req); err != nil { diff --git a/src/control/server/storage/bdev/backend_json.go b/src/control/server/storage/bdev/backend_json.go index 020566df4d1..8caa2014717 100644 --- a/src/control/server/storage/bdev/backend_json.go +++ b/src/control/server/storage/bdev/backend_json.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -88,6 +89,15 @@ type AioCreateParams struct { func (_ AioCreateParams) isSpdkSubsystemConfigParams() {} +// IobufParams specifies details for a storage.ConfIobufSetOptions method. Zero values are not +// marshalled to JSON config output. 
+type IobufParams struct { + SmallPoolCount uint32 `json:"small_pool_count,omitzero"` + LargePoolCount uint32 `json:"large_pool_count,omitzero"` +} + +func (_ IobufParams) isSpdkSubsystemConfigParams() {} + // HotplugBusidRangeParams specifies details for a storage.ConfSetHotplugBusidRange method. type HotplugBusidRangeParams struct { Begin uint8 `json:"begin"` @@ -140,6 +150,8 @@ func (ssc *SpdkSubsystemConfig) UnmarshalJSON(data []byte) error { ssc.Params = &VmdEnableParams{} case storage.ConfBdevAioCreate: ssc.Params = &AioCreateParams{} + case storage.ConfIobufSetOptions: + ssc.Params = &IobufParams{} default: return errors.Errorf("unknown SPDK subsystem config method %q", ssc.Method) } @@ -324,6 +336,29 @@ func (sc *SpdkConfig) WithBdevConfigs(log logging.Logger, req *storage.BdevWrite return sc } +// WithSpdkIobufOpts adds custom SPDK iobuf options. No config entry is added if values are all +// zero. Only non-zero IobufParams field values are +func (sc *SpdkConfig) WithSpdkIobufOpts(req *storage.BdevWriteConfigRequest) *SpdkConfig { + if req.SpdkIobufProps.IsEmpty() { + return sc + } + + sc.Subsystems = append(sc.Subsystems, &SpdkSubsystem{ + Name: "iobuf", + Configs: []*SpdkSubsystemConfig{ + { + Method: storage.ConfIobufSetOptions, + Params: &IobufParams{ + SmallPoolCount: req.SpdkIobufProps.SmallPoolCount, + LargePoolCount: req.SpdkIobufProps.LargePoolCount, + }, + }, + }, + }) + + return sc +} + // Add hotplug bus-ID range to DAOS config data for use by non-SPDK consumers in // engine e.g. BIO or VOS. func hotplugPropSet(req *storage.BdevWriteConfigRequest, data *DaosData) { @@ -388,6 +423,7 @@ func newSpdkConfig(log logging.Logger, req *storage.BdevWriteConfigRequest) (*Sp rpcSrvSet(req, sc.DaosData) autoFaultySet(req, sc.DaosData) sc.WithBdevConfigs(log, req) + sc.WithSpdkIobufOpts(req) // SPDK-3370: Ensure hotplug config appears after attach directives to avoid race when VMD // with hotplug is enabled with multiple domains. 
diff --git a/src/control/server/storage/bdev/backend_json_test.go b/src/control/server/storage/bdev/backend_json_test.go index 33eff6f0a37..bce19effc37 100644 --- a/src/control/server/storage/bdev/backend_json_test.go +++ b/src/control/server/storage/bdev/backend_json_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -86,6 +87,8 @@ func TestBackend_newSpdkConfig(t *testing.T) { autoFaultyEnable bool autoFaultyIO uint32 autoFaultyCsum uint32 + iobufSmallPoolNr uint32 + iobufLargePoolNr uint32 expExtraSubsystems []*SpdkSubsystem expBdevCfgs []*SpdkSubsystemConfig expDaosCfgs []*DaosConfig @@ -196,7 +199,7 @@ func TestBackend_newSpdkConfig(t *testing.T) { }...), vosEnv: "AIO", }, - "multiple controllers; accel, rpc server & auto faulty settings": { + "accel, rpc server & auto faulty settings": { class: storage.ClassNvme, devList: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, accelEngine: storage.AccelEngineSPDK, @@ -232,6 +235,28 @@ func TestBackend_newSpdkConfig(t *testing.T) { }, }, }, + "iobuf custom settings provided": { + class: storage.ClassNvme, + devList: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, + devRoles: storage.BdevRoleAll, + iobufSmallPoolNr: 16384, + iobufLargePoolNr: 2048, + expBdevCfgs: multiCtrlrConfs(storage.BdevRoleAll, false), + expExtraSubsystems: []*SpdkSubsystem{ + { + Name: "iobuf", + Configs: []*SpdkSubsystemConfig{ + { + Method: storage.ConfIobufSetOptions, + Params: &IobufParams{ + SmallPoolCount: 16384, + LargePoolCount: 2048, + }, + }, + }, + }, + }, + }, } for name, tc := range tests { @@ -273,7 +298,8 @@ func TestBackend_newSpdkConfig(t *testing.T) { WithStorageAccelProps(tc.accelEngine, tc.accelOptMask). WithStorageSpdkRpcSrvProps(tc.rpcSrvEnable, tc.rpcSrvSockAddr). 
WithStorageAutoFaultyCriteria(tc.autoFaultyEnable, tc.autoFaultyIO, - tc.autoFaultyCsum) + tc.autoFaultyCsum). + WithStorageSpdkIobufProps(tc.iobufSmallPoolNr, tc.iobufLargePoolNr) if tc.devRoles != 0 { engineConfig.Storage.ControlMetadata = storage.ControlMetadata{ @@ -332,3 +358,150 @@ func TestBackend_unreadableSpdkConfig(t *testing.T) { t.Fatal("expected error") } } + +func TestBackend_IobufParams_JSONUnmarshal(t *testing.T) { + for name, tc := range map[string]struct { + input string + expOutput IobufParams + }{ + "empty": { + input: `{}`, + expOutput: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 0, + }, + }, + "small_pool_count only": { + input: `{"small_pool_count":1024}`, + expOutput: IobufParams{ + SmallPoolCount: 1024, + LargePoolCount: 0, + }, + }, + "large_pool_count only": { + input: `{"large_pool_count":512}`, + expOutput: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 512, + }, + }, + "both values set": { + input: `{"small_pool_count":2048,"large_pool_count":1024}`, + expOutput: IobufParams{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + }, + } { + t.Run(name, func(t *testing.T) { + var v IobufParams + if err := json.Unmarshal([]byte(tc.input), &v); err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(v, tc.expOutput); diff != "" { + t.Fatalf("unmarshal mismatch (-want +got):\n%s\nJSON: %s", diff, tc.input) + } + }) + } +} + +func TestBackend_IobufParams_JSON_RoundTrip(t *testing.T) { + for name, tc := range map[string]struct { + params IobufParams + }{ + "zero values": { + params: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 0, + }, + }, + "small_pool_count only": { + params: IobufParams{ + SmallPoolCount: 1024, + LargePoolCount: 0, + }, + }, + "large_pool_count only": { + params: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 512, + }, + }, + "both values set": { + params: IobufParams{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + }, + } { + t.Run(name, func(t *testing.T) { + // Marshal to JSON + buf, err 
:= json.Marshal(&tc.params) + if err != nil { + t.Fatal(err) + } + + // Unmarshal back + var unmarshaled IobufParams + if err := json.Unmarshal(buf, &unmarshaled); err != nil { + t.Fatal(err) + } + + // Verify round-trip preserves values + if diff := cmp.Diff(tc.params, unmarshaled); diff != "" { + t.Fatalf("round-trip mismatch (-want +got):\n%s\nJSON: %s", diff, string(buf)) + } + }) + } +} + +func TestBackend_IobufParams_JSONOutput(t *testing.T) { + for name, tc := range map[string]struct { + params IobufParams + expJSON string + }{ + "both zero produces empty object": { + params: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 0, + }, + expJSON: `{}`, + }, + "only small_pool_count": { + params: IobufParams{ + SmallPoolCount: 1024, + LargePoolCount: 0, + }, + expJSON: `{"small_pool_count":1024}`, + }, + "only large_pool_count": { + params: IobufParams{ + SmallPoolCount: 0, + LargePoolCount: 512, + }, + expJSON: `{"large_pool_count":512}`, + }, + "both non-zero": { + params: IobufParams{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + expJSON: `{"small_pool_count":2048,"large_pool_count":1024}`, + }, + } { + t.Run(name, func(t *testing.T) { + buf, err := json.Marshal(&tc.params) + if err != nil { + t.Fatal(err) + } + + jsonStr := string(buf) + + // Check exact JSON output + if diff := cmp.Diff(tc.expJSON, jsonStr); diff != "" { + t.Fatalf("unexpected JSON output (-want +got):\n%s", diff) + } + }) + } +} diff --git a/src/control/server/storage/bdev/backend_test.go b/src/control/server/storage/bdev/backend_test.go index 2a3d5d7dcad..24934d9db62 100644 --- a/src/control/server/storage/bdev/backend_test.go +++ b/src/control/server/storage/bdev/backend_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2018-2022 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -1112,6 +1112,7 @@ func TestBackend_prepare_reset(t *testing.T) { { Env: []string{ fmt.Sprintf("PATH=%s", os.Getenv("PATH")), + fmt.Sprintf("%s=%s", pciAllowListEnv, mockAddrList(1, 2)), fmt.Sprintf("%s=%s", driverOverrideEnv, noDriver), }, }, @@ -1174,6 +1175,7 @@ func TestBackend_prepare_reset(t *testing.T) { Env: []string{ fmt.Sprintf("PATH=%s", os.Getenv("PATH")), fmt.Sprintf("%s=%s", driverOverrideEnv, noDriver), + fmt.Sprintf("%s=%s", pciAllowListEnv, mockAddrList(3)), }, }, { @@ -1198,13 +1200,6 @@ func TestBackend_prepare_reset(t *testing.T) { }, vmdDetectRet: mockAddrList(3, 5), expScriptCalls: []scriptCall{ - { - Env: []string{ - fmt.Sprintf("PATH=%s", os.Getenv("PATH")), - fmt.Sprintf("%s=%s", driverOverrideEnv, noDriver), - fmt.Sprintf("%s=%s", pciBlockListEnv, mockAddrList(4)), - }, - }, { Env: []string{ fmt.Sprintf("PATH=%s", os.Getenv("PATH")), @@ -1245,6 +1240,7 @@ func TestBackend_prepare_reset(t *testing.T) { }, }, }, + // Populated blocklist results in unbind operation being skipped. 
"prepare setup; vmd enabled; vmd devices allowed and blocked": { req: storage.BdevPrepareRequest{ HugepageCount: testNrHugepages, @@ -1255,13 +1251,6 @@ func TestBackend_prepare_reset(t *testing.T) { }, vmdDetectRet: mockAddrList(3, 2), expScriptCalls: []scriptCall{ - { - Env: []string{ - fmt.Sprintf("PATH=%s", os.Getenv("PATH")), - fmt.Sprintf("%s=%s", driverOverrideEnv, noDriver), - fmt.Sprintf("%s=%s", pciBlockListEnv, mockAddrList(4)), - }, - }, { Env: []string{ fmt.Sprintf("PATH=%s", os.Getenv("PATH")), diff --git a/src/control/server/storage/bdev/provider.go b/src/control/server/storage/bdev/provider.go index d86c2611d09..d823dad185b 100644 --- a/src/control/server/storage/bdev/provider.go +++ b/src/control/server/storage/bdev/provider.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -52,9 +53,12 @@ func NewProvider(log logging.Logger, backend Backend) *Provider { // Scan calls into the backend to discover NVMe components in the // system. -func (p *Provider) Scan(req storage.BdevScanRequest) (resp *storage.BdevScanResponse, err error) { +func (p *Provider) Scan(req storage.BdevScanRequest) (*storage.BdevScanResponse, error) { p.log.Debugf("run bdev storage provider scan, req: %+v", req) - return p.backend.Scan(req) + resp, err := p.backend.Scan(req) + p.log.Debugf("run bdev storage provider scan, resp: %+v", resp) + + return resp, err } // Prepare attempts to perform all actions necessary to make NVMe components diff --git a/src/control/server/storage/bdev/runner.go b/src/control/server/storage/bdev/runner.go index 79a2207adf8..350af544c17 100644 --- a/src/control/server/storage/bdev/runner.go +++ b/src/control/server/storage/bdev/runner.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2022 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -131,6 +132,7 @@ func (s *spdkSetupScript) Prepare(req *storage.BdevPrepareRequest) error { func (s *spdkSetupScript) Unbind(req *storage.BdevPrepareRequest) error { s.env = map[string]string{ "PATH": os.Getenv("PATH"), + pciAllowListEnv: req.PCIAllowList, pciBlockListEnv: req.PCIBlockList, driverOverrideEnv: noDriver, } @@ -144,6 +146,7 @@ func (s *spdkSetupScript) Unbind(req *storage.BdevPrepareRequest) error { // is not set, otherwise PCI devices can be specified by passing in a allow list of PCI addresses. // // NOTE: will make the controller reappear in /dev. +// TODO DAOS-18606: Should allowlist not be sent so all non-blocked devices get bound back to kernel? func (s *spdkSetupScript) Reset(req *storage.BdevPrepareRequest) error { s.env = map[string]string{ "PATH": os.Getenv("PATH"), diff --git a/src/control/server/storage/config.go b/src/control/server/storage/config.go index 1468d2e7bdb..ecccde9e171 100644 --- a/src/control/server/storage/config.go +++ b/src/control/server/storage/config.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -59,8 +60,9 @@ const ( // ControlMetadata describes configuration options for control plane metadata storage on the // DAOS server. type ControlMetadata struct { - Path string `yaml:"path,omitempty"` - DevicePath string `yaml:"device,omitempty"` + Path string `yaml:"path,omitempty"` + DevicePath string `yaml:"device,omitempty"` + AllowSpdkConfOverride bool `yaml:"allow_spdk_conf_override"` } // Directory returns the full path to the directory where the control plane metadata is saved. @@ -171,9 +173,9 @@ func (tc *TierConfig) WithStorageClass(cls string) *TierConfig { return tc } -// WithScmDisableHugepages disables hugepages for tmpfs. 
-func (tc *TierConfig) WithScmDisableHugepages() *TierConfig { - tc.Scm.DisableHugepages = true +// WithScmHugepagesDisabled disables hugepages for tmpfs. +func (tc *TierConfig) WithScmHugepagesDisabled(b bool) *TierConfig { + tc.Scm.DisableHugepages = &b return tc } @@ -572,7 +574,7 @@ func (tcs *TierConfigs) UnmarshalYAML(unmarshal func(interface{}) error) error { type ScmConfig struct { MountPoint string `yaml:"scm_mount,omitempty" cmdLongFlag:"--storage" cmdShortFlag:"-s"` RamdiskSize uint `yaml:"scm_size,omitempty"` - DisableHugepages bool `yaml:"scm_hugepages_disabled,omitempty"` + DisableHugepages *bool `yaml:"scm_hugepages_disabled,omitempty"` DeviceList []string `yaml:"scm_list,omitempty"` NumaNodeIndex uint `yaml:"-"` } @@ -591,9 +593,6 @@ func (sc *ScmConfig) Validate(class Class) error { if len(sc.DeviceList) == 0 { return errors.New("scm_list must be set when class is dcpm") } - if sc.DisableHugepages { - return errors.New("scm_hugepages_disabled may not be set when class is dcpm") - } case ClassRam: if len(sc.DeviceList) > 0 { return errors.New("scm_list may not be set when class is ram") @@ -1147,6 +1146,18 @@ type BdevAutoFaulty struct { MaxCsumErrs uint32 `yaml:"max_csum_errs,omitempty" json:"max_csum_errs"` } +// SpdkIobuf struct describes settings for DAOS I/O buffer pool configuration within the BIO +// module of the engine process. +type SpdkIobuf struct { + SmallPoolCount uint32 `yaml:"small_pool_count,omitempty" json:"small_pool_count,omitempty"` + LargePoolCount uint32 `yaml:"large_pool_count,omitempty" json:"large_pool_count,omitempty"` +} + +// IsEmpty returns true if all struct values are zero. +func (si *SpdkIobuf) IsEmpty() bool { + return si.SmallPoolCount == 0 && si.LargePoolCount == 0 +} + // Config defines engine storage. 
type Config struct { ControlMetadata ControlMetadata `yaml:"-"` // inherited from server @@ -1159,6 +1170,7 @@ type Config struct { AccelProps AccelProps `yaml:"acceleration,omitempty"` SpdkRpcSrvProps SpdkRpcServer `yaml:"spdk_rpc_server,omitempty"` AutoFaultyProps BdevAutoFaulty `yaml:"bdev_auto_faulty,omitempty"` + SpdkIobufProps SpdkIobuf `yaml:"spdk_iobuf,omitempty"` } // SetNUMAAffinity enables the assignment of NUMA affinity to tier configs. diff --git a/src/control/server/storage/config_test.go b/src/control/server/storage/config_test.go index f595aaa4ffa..2d53a9f646e 100644 --- a/src/control/server/storage/config_test.go +++ b/src/control/server/storage/config_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -737,7 +738,8 @@ func TestStorage_BdevDeviceRoles_ToYAML(t *testing.T) { NewTierConfig(). WithStorageClass("ram"). WithScmRamdiskSize(16). - WithScmMountPoint("/mnt/daos"), + WithScmMountPoint("/mnt/daos"). + WithScmHugepagesDisabled(true), NewTierConfig(). WithTier(1). WithStorageClass("nvme"). 
@@ -750,6 +752,7 @@ storage: - class: ram scm_mount: /mnt/daos scm_size: 16 + scm_hugepages_disabled: true - class: nvme bdev_list: - 0000:80:00.0 @@ -961,6 +964,168 @@ acceleration: } } +func TestStorage_SpdkIobuf_FromYAML(t *testing.T) { + for name, tc := range map[string]struct { + input string + expProps SpdkIobuf + expErr error + }{ + "iobuf section missing": { + input: ``, + }, + "iobuf section empty": { + input: ` +spdk_iobuf: +`, + }, + "small_pool_count only": { + input: ` +spdk_iobuf: + small_pool_count: 1024 +`, + expProps: SpdkIobuf{ + SmallPoolCount: 1024, + }, + }, + "large_pool_count only": { + input: ` +spdk_iobuf: + large_pool_count: 512 +`, + expProps: SpdkIobuf{ + LargePoolCount: 512, + }, + }, + "both pool counts set": { + input: ` +spdk_iobuf: + small_pool_count: 2048 + large_pool_count: 1024 +`, + expProps: SpdkIobuf{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + }, + "zero values": { + input: ` +spdk_iobuf: + small_pool_count: 0 + large_pool_count: 0 +`, + expProps: SpdkIobuf{ + SmallPoolCount: 0, + LargePoolCount: 0, + }, + }, + } { + t.Run(name, func(t *testing.T) { + cfg := new(Config) + err := yaml.UnmarshalStrict([]byte(tc.input), cfg) + test.CmpErr(t, tc.expErr, err) + if tc.expErr != nil { + return + } + + if diff := cmp.Diff(tc.expProps, cfg.SpdkIobufProps, defConfigCmpOpts()...); diff != "" { + t.Fatalf("bad props (-want +got):\n%s", diff) + } + }) + } +} + +func TestStorage_SpdkIobuf_ToYAML(t *testing.T) { + for name, tc := range map[string]struct { + props SpdkIobuf + expOut string + }{ + "empty": { + expOut: "{}\n", + }, + "small_pool_count only": { + props: SpdkIobuf{ + SmallPoolCount: 1024, + }, + expOut: "small_pool_count: 1024\n", + }, + "large_pool_count only": { + props: SpdkIobuf{ + LargePoolCount: 512, + }, + expOut: "large_pool_count: 512\n", + }, + "both pool counts set": { + props: SpdkIobuf{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + expOut: "small_pool_count: 2048\nlarge_pool_count: 1024\n", + }, 
+ } { + t.Run(name, func(t *testing.T) { + buf, err := yaml.Marshal(&tc.props) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(tc.expOut, string(buf)); diff != "" { + t.Fatalf("bad output (-want +got):\n%s", diff) + } + }) + } +} + +func TestStorage_SpdkIobuf_JSON(t *testing.T) { + for name, tc := range map[string]struct { + props SpdkIobuf + expOut string + }{ + "empty": { + expOut: `{}`, + }, + "small_pool_count only": { + props: SpdkIobuf{ + SmallPoolCount: 1024, + }, + expOut: `{"small_pool_count":1024}`, + }, + "large_pool_count only": { + props: SpdkIobuf{ + LargePoolCount: 512, + }, + expOut: `{"large_pool_count":512}`, + }, + "both pool counts set": { + props: SpdkIobuf{ + SmallPoolCount: 2048, + LargePoolCount: 1024, + }, + expOut: `{"small_pool_count":2048,"large_pool_count":1024}`, + }, + } { + t.Run(name, func(t *testing.T) { + buf, err := json.Marshal(&tc.props) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(tc.expOut, string(buf)); diff != "" { + t.Fatalf("bad output (-want +got):\n%s", diff) + } + + // Test round-trip + var unmarshaled SpdkIobuf + if err := json.Unmarshal(buf, &unmarshaled); err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(tc.props, unmarshaled); diff != "" { + t.Fatalf("bad round-trip (-want +got):\n%s", diff) + } + }) + } +} + func TestStorage_ControlMetadata_Directory(t *testing.T) { for name, tc := range map[string]struct { cm ControlMetadata diff --git a/src/control/server/storage/metadata/provider.go b/src/control/server/storage/metadata/provider.go index ba3b3110ee8..07a25157817 100644 --- a/src/control/server/storage/metadata/provider.go +++ b/src/control/server/storage/metadata/provider.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. 
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -27,6 +28,7 @@ type ( Chown(string, int, int) error Getfs(device string) (string, error) GetfsType(path string) (*system.FsType, error) + GetDeviceLabel(string) (string, error) Mkdir(string, os.FileMode) error Mkfs(req system.MkfsReq) error RemoveAll(string) error @@ -99,10 +101,26 @@ func (p *Provider) setupMountPoint(req storage.MetadataFormatRequest) error { return errors.Wrap(err, "creating control metadata mount point") } - p.log.Debugf("formatting device %q", req.Device) + p.log.Debugf("checking existing device label for %q", req.Device) + label, err := p.sys.GetDeviceLabel(req.Device) + if err != nil { + return errors.Wrap(err, "checking existing device label") + } + + opts := []string{ + // Quiet mode + "-q", + } + if label != "" { + p.log.Debugf("preserving existing device label %q for %q", label, req.Device) + opts = append(opts, "-L", label) + } + + p.log.Debugf("mkfs.%s %q with options: %s", defaultDevFS, req.Device, strings.Join(opts, " ")) if err := p.sys.Mkfs(system.MkfsReq{ Filesystem: defaultDevFS, Device: req.Device, + Options: opts, Force: true, }); err != nil { return errors.Wrap(err, "formatting control metadata device filesystem") diff --git a/src/control/server/storage/metadata/provider_test.go b/src/control/server/storage/metadata/provider_test.go index a3262f78f41..2215187f1d6 100644 --- a/src/control/server/storage/metadata/provider_test.go +++ b/src/control/server/storage/metadata/provider_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2024 Intel Corporation. 
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -38,12 +39,14 @@ func TestMetadata_Provider_Format(t *testing.T) { } for name, tc := range map[string]struct { - nilProv bool - sysCfg *system.MockSysConfig - mountCfg *storage.MockMountProviderConfig - setup func(*testing.T, string) func() - req storage.MetadataFormatRequest - expErr error + nilProv bool + sysCfg *system.MockSysConfig + mountCfg *storage.MockMountProviderConfig + setup func(*testing.T, string) func() + req storage.MetadataFormatRequest + expErr error + expMkfs bool + expMkfsOpts []string }{ "nil provider": { nilProv: true, @@ -107,6 +110,8 @@ func TestMetadata_Provider_Format(t *testing.T) { sysCfg: &system.MockSysConfig{ GetfsTypeErr: []error{errors.New("mock GetfsType")}, }, + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "GetfsType retries with parent if dir doesn't exist": { req: pathReq, @@ -129,19 +134,30 @@ func TestMetadata_Provider_Format(t *testing.T) { }, expErr: errors.New("mock MakeMountPath"), }, + "get label fails": { + req: deviceReq, + sysCfg: &system.MockSysConfig{ + GetDeviceLabelErr: errors.New("mock GetDeviceLabel"), + }, + expErr: errors.New("mock GetDeviceLabel"), + }, "mkfs fails": { req: deviceReq, sysCfg: &system.MockSysConfig{ MkfsErr: errors.New("mock mkfs"), }, - expErr: errors.New("mock mkfs"), + expErr: errors.New("mock mkfs"), + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "Mount fails": { req: deviceReq, mountCfg: &storage.MockMountProviderConfig{ MountErr: errors.New("mock Mount"), }, - expErr: errors.New("mock Mount"), + expErr: errors.New("mock Mount"), + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "remove old data dir fails": { req: deviceReq, @@ -159,7 +175,9 @@ func TestMetadata_Provider_Format(t *testing.T) { } } }, - expErr: errors.New("removing old control metadata subdirectory"), + expErr: errors.New("removing old control metadata subdirectory"), + expMkfsOpts: 
[]string{"-q"}, + expMkfs: true, }, "create data dir fails": { req: deviceReq, @@ -177,14 +195,18 @@ func TestMetadata_Provider_Format(t *testing.T) { } } }, - expErr: errors.New("creating control metadata subdirectory"), + expErr: errors.New("creating control metadata subdirectory"), + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "chown data dir fails": { req: deviceReq, sysCfg: &system.MockSysConfig{ ChownErr: errors.New("mock chown"), }, - expErr: errors.New("mock chown"), + expErr: errors.New("mock chown"), + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "Unmount fails": { req: deviceReq, @@ -192,10 +214,22 @@ func TestMetadata_Provider_Format(t *testing.T) { IsMountedRes: true, UnmountErr: errors.New("mock Unmount"), }, - expErr: errors.New("mock Unmount"), + expErr: errors.New("mock Unmount"), + expMkfsOpts: []string{"-q"}, + expMkfs: true, }, "device success": { + req: deviceReq, + expMkfsOpts: []string{"-q"}, + expMkfs: true, + }, + "preserve existing label": { req: deviceReq, + sysCfg: &system.MockSysConfig{ + GetDeviceLabelRes: "old_label", + }, + expMkfsOpts: []string{"-q", "-L", "old_label"}, + expMkfs: true, }, "path only doesn't attempt device format": { req: pathReq, @@ -244,14 +278,23 @@ func TestMetadata_Provider_Format(t *testing.T) { defer teardown() var p *Provider + mockSys := system.NewMockSysProvider(log, tc.sysCfg) if !tc.nilProv { - p = NewProvider(log, system.NewMockSysProvider(log, tc.sysCfg), - storage.NewMockMountProvider(tc.mountCfg)) + p = NewProvider(log, mockSys, storage.NewMockMountProvider(tc.mountCfg)) } err := p.Format(tc.req) test.CmpErr(t, tc.expErr, err) + + if tc.expMkfs { + test.AssertEqual(t, 1, len(mockSys.MkfsReqs), "should have called mkfs") + if diff := cmp.Diff(tc.expMkfsOpts, mockSys.MkfsReqs[0].Options); diff != "" { + t.Errorf("unexpected mkfs options (-want +got):\n%s\n", diff) + } + } else { + test.AssertEqual(t, 0, len(mockSys.MkfsReqs), "should not have called mkfs") + } }) } } diff --git 
a/src/control/server/storage/mocks.go b/src/control/server/storage/mocks.go index 3d1647ad20c..f795c29dc29 100644 --- a/src/control/server/storage/mocks.go +++ b/src/control/server/storage/mocks.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -244,6 +244,7 @@ func MockProvider(log logging.Logger, idx int, engineStorage *Config, sys System p.scm = scm p.bdev = bdev p.metadata = meta + p.getTopology = MockGetTopology return p } diff --git a/src/control/server/storage/provider.go b/src/control/server/storage/provider.go index 625b95c9849..9749739f862 100644 --- a/src/control/server/storage/provider.go +++ b/src/control/server/storage/provider.go @@ -1,6 +1,6 @@ // // (C) Copyright 2021-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -45,6 +45,7 @@ type Provider struct { scm ScmProvider bdev BdevProvider vmdEnabled bool + getTopology topologyGetter } // DefaultProvider returns a provider populated with default parameters. @@ -52,8 +53,10 @@ func DefaultProvider(log logging.Logger, idx int, engineStorage *Config) *Provid if engineStorage == nil { engineStorage = new(Config) } - return NewProvider(log, idx, engineStorage, system.DefaultProvider(), + p := NewProvider(log, idx, engineStorage, system.DefaultProvider(), NewScmForwarder(log), NewBdevForwarder(log), NewMetadataForwarder(log)) + p.getTopology = hwloc.NewProvider(log).GetTopology + return p } // FormatControlMetadata formats the storage used for control metadata. 
@@ -202,6 +205,15 @@ func (p *Provider) ControlMetadataIsMounted() (bool, error) { return p.Sys.IsMounted(p.engineStorage.ControlMetadata.Path) } +// AllowSpdkConfOverride returns true if override of SPDK JSON config file (daos_nvme.conf) has been +// explicitly enabled within the ControlMetadata section of the server config file. +func (p *Provider) AllowSpdkConfOverride() bool { + if !p.engineStorage.ControlMetadata.HasPath() { + return false + } + return p.engineStorage.ControlMetadata.AllowSpdkConfOverride +} + // PrepareScm calls into storage SCM provider to attempt to configure PMem devices to be usable by // DAOS. func (p *Provider) PrepareScm(req ScmPrepareRequest) (*ScmPrepareResponse, error) { @@ -272,10 +284,14 @@ func (p *Provider) MountScm() error { switch cfg.Class { case ClassRam: + disableHugepages := true + if cfg.Scm.DisableHugepages != nil { + disableHugepages = *cfg.Scm.DisableHugepages + } req.Ramdisk = &RamdiskParams{ Size: cfg.Scm.RamdiskSize, NUMANode: cfg.Scm.NumaNodeIndex, - DisableHugepages: cfg.Scm.DisableHugepages, + DisableHugepages: disableHugepages, } case ClassDcpm: if len(cfg.Scm.DeviceList) != 1 { @@ -334,10 +350,14 @@ func createScmFormatRequest(class Class, scmCfg ScmConfig, force bool) (*ScmForm switch class { case ClassRam: + disableHugepages := true + if scmCfg.DisableHugepages != nil { + disableHugepages = *scmCfg.DisableHugepages + } req.Ramdisk = &RamdiskParams{ Size: scmCfg.RamdiskSize, NUMANode: scmCfg.NumaNodeIndex, - DisableHugepages: scmCfg.DisableHugepages, + DisableHugepages: disableHugepages, } case ClassDcpm: if len(scmCfg.DeviceList) != 1 { @@ -580,6 +600,7 @@ func BdevWriteConfigRequestFromConfig(ctx context.Context, log logging.Logger, c AccelProps: cfg.AccelProps, SpdkRpcSrvProps: cfg.SpdkRpcSrvProps, AutoFaultyProps: cfg.AutoFaultyProps, + SpdkIobufProps: cfg.SpdkIobufProps, } for idx, tier := range cfg.Tiers.BdevConfigs() { @@ -606,10 +627,11 @@ func (p *Provider) WriteNvmeConfig(ctx context.Context, log 
logging.Logger, ctrl vmdEnabled := p.vmdEnabled engineIndex := p.engineIndex engineStorage := p.engineStorage + getTopology := p.getTopology p.RUnlock() req, err := BdevWriteConfigRequestFromConfig(ctx, log, engineStorage, - vmdEnabled, hwloc.NewProvider(log).GetTopology) + vmdEnabled, getTopology) if err != nil { return errors.Wrap(err, "creating write config request") } @@ -719,3 +741,9 @@ func NewProvider(log logging.Logger, idx int, engineStorage *Config, sys SystemP metadata: meta, } } + +// setTopologyGetter sets the topology getter function for the provider. This is +// used in tests to inject a mock topology. +func (p *Provider) setTopologyGetter(fn topologyGetter) { + p.getTopology = fn +} diff --git a/src/control/server/storage/provider_test.go b/src/control/server/storage/provider_test.go index 043d5ca6a0d..e60e6a1441f 100644 --- a/src/control/server/storage/provider_test.go +++ b/src/control/server/storage/provider_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -342,6 +343,7 @@ func TestStorage_ProviderUpgradeBdevConfig(t *testing.T) { ctx := test.MustLogContext(t, test.Context(t)) p := NewProvider(logging.FromContext(ctx), 0, tc.cfg, nil, nil, tc.bdevProv, nil) + p.setTopologyGetter(MockGetTopology) gotErr := p.UpgradeBdevConfig(ctx, tc.ctrlrs) test.CmpErr(t, tc.expErr, gotErr) if tc.expErr != nil { diff --git a/src/control/server/storage/scm.go b/src/control/server/storage/scm.go index 593f93b55d4..48c25e7e6da 100644 --- a/src/control/server/storage/scm.go +++ b/src/control/server/storage/scm.go @@ -52,7 +52,7 @@ const ( // Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine. 
const ( - DefaultSysMemRsvd = humanize.GiByte * 26 // per-system + DefaultSysMemRsvd = humanize.GiByte * 64 // per-system DefaultTgtMemRsvd = humanize.MiByte * 128 // per-engine-target DefaultEngineMemRsvd = humanize.GiByte * 1 // per-engine ) diff --git a/src/control/server/storage/scm_test.go b/src/control/server/storage/scm_test.go index b5061148e88..67651214432 100644 --- a/src/control/server/storage/scm_test.go +++ b/src/control/server/storage/scm_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2023-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -47,20 +48,20 @@ func Test_CalcRamdiskSize(t *testing.T) { expErr: errors.New("insufficient ram"), // 30 - (14+26+1) = -1 }, "default values; high mem": { - memTotal: humanize.GiByte * 70, + memTotal: humanize.GiByte * 108, memHuge: humanize.GiByte * 30, memSys: DefaultSysMemRsvd, tgtCount: 16, engCount: 2, - expSize: humanize.GiByte * 5, // (70 - (30+26+4)) / 2 + expSize: humanize.GiByte * 5, // (108 - (30+64+4)) / 2 }, "default values; low nr targets": { - memTotal: humanize.GiByte * 70, + memTotal: humanize.GiByte * 108, memHuge: humanize.GiByte * 30, memSys: DefaultSysMemRsvd, tgtCount: 1, engCount: 2, - expSize: humanize.GiByte * 6, // (70 - (30+26+2)) / 2 + expSize: humanize.GiByte * 6, // (108 - (30+64+2)) / 2 }, "custom values; low sys reservation": { memTotal: humanize.GiByte * 60, diff --git a/src/control/server/telemetry.go b/src/control/server/telemetry.go index 0a7403fc7e0..b5eb9f7cb54 100644 --- a/src/control/server/telemetry.go +++ b/src/control/server/telemetry.go @@ -1,6 +1,6 @@ // // (C) Copyright 2018-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -69,10 +69,11 @@ func regPromEngineSources(ctx context.Context, log logging.Logger, engines []Eng return nil } -func startPrometheusExporter(ctx context.Context, log logging.Logger, port int, engines []Engine) (func(), error) { +func startPrometheusExporter(ctx context.Context, log logging.Logger, port int, bindAddr string, engines []Engine) (func(), error) { expCfg := &promexp.ExporterConfig{ - Port: port, - Title: "DAOS Engine Telemetry", + Port: port, + BindAddress: bindAddr, + Title: "DAOS Engine Telemetry", Register: func(ctx context.Context, log logging.Logger) error { return regPromEngineSources(ctx, log, engines) }, diff --git a/src/control/system/checker/finding.go b/src/control/system/checker/finding.go index 770dfcd7132..2be0119987f 100644 --- a/src/control/system/checker/finding.go +++ b/src/control/system/checker/finding.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -73,9 +74,9 @@ func NewFinding(report *chkpb.CheckReport) *Finding { return f } -// descAction attempts to generate a human-readable description of the +// GetActionMsg attempts to generate a human-readable description of the // action that may be taken for the given finding. 
-func descAction(class chkpb.CheckInconsistClass, action chkpb.CheckInconsistAction, details ...string) string { +func GetActionMsg(class chkpb.CheckInconsistClass, action chkpb.CheckInconsistAction, details ...string) string { var ro reportObject switch { case class >= chkpb.CheckInconsistClass_CIC_POOL_LESS_SVC_WITH_QUORUM && class <= chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL: @@ -151,6 +152,8 @@ func descAction(class chkpb.CheckInconsistClass, action chkpb.CheckInconsistActi return fmt.Sprintf("Trust the parity of the %s results", ro) case chkpb.CheckInconsistAction_CIA_TRUST_EC_DATA: return fmt.Sprintf("Trust the data of the %s results", ro) + case chkpb.CheckInconsistAction_CIA_STALE: + return "Current checker instance cannot act on this finding. Restart checker against the pool to handle the inconsistency." default: return fmt.Sprintf("%s: %s (details: %+v)", ro, action, details) } @@ -174,6 +177,7 @@ func trimProtoSpaces(pm proto.Message) { }) } +// AnnotateFinding updates human-readable action messages. func AnnotateFinding(f *Finding) *Finding { if f == nil { return nil @@ -195,11 +199,11 @@ func AnnotateFinding(f *Finding) *Finding { if len(f.ActChoices) > 0 { f.ActMsgs = make([]string, len(f.ActChoices)) for i, act := range f.ActChoices { - f.ActMsgs[i] = descAction(f.Class, act, append([]string{f.PoolUuid, f.ContUuid}, f.ActDetails...)...) + f.ActMsgs[i] = GetActionMsg(f.Class, act, append([]string{f.PoolUuid, f.ContUuid}, f.ActDetails...)...) } } else { f.ActMsgs = make([]string, 1) - f.ActMsgs[0] = descAction(f.Class, f.Action, append([]string{f.PoolUuid, f.ContUuid}, f.ActDetails...)...) + f.ActMsgs[0] = GetActionMsg(f.Class, f.Action, append([]string{f.PoolUuid, f.ContUuid}, f.ActDetails...)...) 
} } if len(f.Msg) == 0 { diff --git a/src/control/system/checker/finding_test.go b/src/control/system/checker/finding_test.go index 3517ce33315..96c6a819619 100644 --- a/src/control/system/checker/finding_test.go +++ b/src/control/system/checker/finding_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -438,6 +439,30 @@ func TestChecker_AnnotateFinding(t *testing.T) { }, }), }, + "stale": { + rpt: &chkpb.CheckReport{ + Seq: 972775323717861377, + Class: chkpb.CheckInconsistClass_CIC_CONT_BAD_LABEL, + Action: chkpb.CheckInconsistAction_CIA_STALE, + PoolUuid: "9614ebfb-cbad-4250-a4e4-d24b7b70d85e", + ContUuid: "18b9b418-211c-455f-aa42-0cc13dedcff9", + Timestamp: "Mon Dec 5 16:27:56 2022\n", + Msg: "Check engine detects inconsistent container label: new-label (CS) vs foo (property).\n", + }, + expFinding: checker.NewFinding( + &chkpb.CheckReport{ + Seq: 972775323717861377, + Class: chkpb.CheckInconsistClass_CIC_CONT_BAD_LABEL, + Action: chkpb.CheckInconsistAction_CIA_STALE, + PoolUuid: "9614ebfb-cbad-4250-a4e4-d24b7b70d85e", + ContUuid: "18b9b418-211c-455f-aa42-0cc13dedcff9", + Timestamp: "Mon Dec 5 16:27:56 2022", + Msg: "Check engine detects inconsistent container label: new-label (CS) vs foo (property).", + ActMsgs: []string{ + "Current checker instance cannot act on this finding. Restart checker against the pool to handle the inconsistency.", + }, + }), + }, } { t.Run(name, func(t *testing.T) { f := checker.NewFinding(tc.rpt) diff --git a/src/control/system/errors.go b/src/control/system/errors.go index 509bee13906..335a255bf2f 100644 --- a/src/control/system/errors.go +++ b/src/control/system/errors.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -17,6 +17,8 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/lib/ranklist" ) @@ -39,8 +41,10 @@ func IsUnavailable(err error) bool { if err == nil { return false } - cause := errors.Cause(err).Error() - return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error()) + cause := errors.Cause(err) + return strings.Contains(cause.Error(), ErrRaftUnavail.Error()) || + strings.Contains(cause.Error(), ErrLeaderStepUpInProgress.Error()) || + fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted) } // IsEmptyGroupMap returns a boolean indicating whether or not the diff --git a/src/control/system/errors_test.go b/src/control/system/errors_test.go index d2ea4eda1ab..02c896a1b86 100644 --- a/src/control/system/errors_test.go +++ b/src/control/system/errors_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2024 Intel Corporation. 
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -12,6 +13,8 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" ) func TestSystem_Errors_IsNotReady(t *testing.T) { @@ -79,12 +82,32 @@ func TestSystem_Errors_IsUnavailable(t *testing.T) { err: ErrLeaderStepUpInProgress, expResult: true, }, + "data plane not started": { + err: &fault.Fault{Code: code.ServerDataPlaneNotStarted}, + expResult: true, + }, + "wrapped data plane not started": { + err: errors.Wrap(&fault.Fault{Code: code.ServerDataPlaneNotStarted}, "wrapped error"), + expResult: true, + }, "uninitialized not unavailable": { err: ErrUninitialized, }, "something else": { err: errors.New("something is wrong"), }, + "member exists not unavailable": { + err: ErrRankExists(1), + }, + "member not found not unavailable": { + err: ErrMemberRankNotFound(1), + }, + "pool not found not unavailable": { + err: ErrPoolRankNotFound(1), + }, + "different fault code not unavailable": { + err: &fault.Fault{Code: code.ClientUnknown}, + }, } { t.Run(name, func(t *testing.T) { test.AssertEqual(t, tc.expResult, IsUnavailable(tc.err), "") diff --git a/src/control/system/membership.go b/src/control/system/membership.go index eb4746c0769..691243c0682 100644 --- a/src/control/system/membership.go +++ b/src/control/system/membership.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. 
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -102,7 +102,7 @@ func (m *Membership) FindRankFromJoinRequest(req *JoinRequest) (Rank, error) { return NilRank, errors.New("unexpected rank in replace-rank request") } - currentMembers, err := m.Members(nil) + currentMembers, err := m.Members(nil, AllMemberFilter) if err != nil { return NilRank, errors.Wrap(err, "failed to get all system members") } diff --git a/src/control/system/membership_test.go b/src/control/system/membership_test.go index 238b296ebcf..7d3e2db8473 100644 --- a/src/control/system/membership_test.go +++ b/src/control/system/membership_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -779,7 +779,8 @@ func TestSystem_Membership_FindRankFromJoinRequest(t *testing.T) { FabricContexts: newMember.PrimaryFabricContexts, FaultDomain: newMember.FaultDomain, }, - expErr: FaultJoinReplaceRankNotFound(4), // Takes nr not matching fields + // Fault constructor takes the number of non-matching fields. + expErr: FaultJoinReplaceRankNotFound(4), }, "partially matching member": { req: &JoinRequest{ @@ -790,7 +791,9 @@ func TestSystem_Membership_FindRankFromJoinRequest(t *testing.T) { FabricContexts: curMember.PrimaryFabricContexts, FaultDomain: curMember.FaultDomain, }, - expErr: FaultJoinReplaceRankNotFound(1), // Diff resolution when nr == 1 + // A different fault resolution is printed when the number of non-matching + // fields is only one. 
+ expErr: FaultJoinReplaceRankNotFound(1), }, "matching member; identical UUID": { req: &JoinRequest{ @@ -825,6 +828,20 @@ func TestSystem_Membership_FindRankFromJoinRequest(t *testing.T) { }, expRank: curMember.Rank, }, + "admin excluded existing member": { + curMembers: []*Member{ + MockMember(t, 1, MemberStateAdminExcluded).WithFaultDomain(fd1), + }, + req: &JoinRequest{ + Rank: NilRank, + UUID: newUUID, + ControlAddr: curMember.Addr, + PrimaryFabricURI: curMember.Addr.String(), + FabricContexts: curMember.PrimaryFabricContexts, + FaultDomain: curMember.FaultDomain, + }, + expRank: curMember.Rank, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) diff --git a/src/control/system/raft/database_checker.go b/src/control/system/raft/database_checker.go index 6fa13acead5..4e02d36dc9a 100644 --- a/src/control/system/raft/database_checker.go +++ b/src/control/system/raft/database_checker.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2023 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -161,15 +162,36 @@ func (db *Database) SetCheckerFindingAction(seq uint64, action int32) error { return err } - for i, d := range f.ActChoices { - if d != chkAction { - continue - } + if chkAction == chk.CheckInconsistAction_CIA_STALE { f.Action = chkAction - if len(f.ActMsgs) > i { - f.ActMsgs = []string{f.ActMsgs[i]} - } + + // Clear old choices and re-annotate f.ActChoices = nil + f.ActDetails = nil + f.ActMsgs = nil + f = checker.AnnotateFinding(f) + } else { + found := false + + for i, d := range f.ActChoices { + if d != chkAction { + continue + } + f.Action = chkAction + if len(f.ActMsgs) > i { + f.ActMsgs = []string{f.ActMsgs[i]} + } + if len(f.ActDetails) > i { + f.ActDetails = []string{f.ActDetails[i]} + } + f.ActChoices = nil + found = true + break + } + + if !found { + return errors.Errorf("action not available for this finding: %s", 
chk.CheckInconsistAction_name[action]) + } } return db.submitCheckerUpdate(raftOpUpdateCheckerFinding, f) diff --git a/src/control/system/raft/database_checker_test.go b/src/control/system/raft/database_checker_test.go new file mode 100644 index 00000000000..44ecdfbe19c --- /dev/null +++ b/src/control/system/raft/database_checker_test.go @@ -0,0 +1,204 @@ +// +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package raft + +import ( + "testing" + + "github.com/daos-stack/daos/src/control/common/proto/chk" + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" + "github.com/daos-stack/daos/src/control/system/checker" + "github.com/pkg/errors" +) + +func TestRaft_Database_SetCheckerFindingAction(t *testing.T) { + createDBWithFindings := func(t *testing.T, log logging.Logger, findings ...*checker.Finding) *Database { + db := MockDatabase(t, log) + for _, f := range findings { + if err := db.data.Checker.addFinding(f); err != nil { + t.Fatal(err) + } + } + return db + } + + staleMsg := checker.GetActionMsg(0, chk.CheckInconsistAction_CIA_STALE) + + for name, tc := range map[string]struct { + startFindings []*checker.Finding + seq uint64 + action chk.CheckInconsistAction + expErr error + expActionChoices []chk.CheckInconsistAction + expActionMsg []string + expActionDetails []string + }{ + "invalid action": { + action: chk.CheckInconsistAction(4242), // arbitrary + expErr: errors.New("invalid action"), + }, + "empty db": { + seq: 123, + action: chk.CheckInconsistAction_CIA_IGNORE, + expErr: ErrFindingNotFound(123), + }, + "not found": { + startFindings: []*checker.Finding{ + {CheckReport: chk.CheckReport{Seq: 100}}, + {CheckReport: chk.CheckReport{Seq: 101}}, + {CheckReport: chk.CheckReport{Seq: 102}}, + }, + seq: 123, + action: chk.CheckInconsistAction_CIA_IGNORE, + expErr: ErrFindingNotFound(123), + }, + "stale, no action choices": { + 
startFindings: []*checker.Finding{ + {CheckReport: chk.CheckReport{Seq: 100}}, + {CheckReport: chk.CheckReport{Seq: 101}}, + {CheckReport: chk.CheckReport{Seq: 102}}, + }, + seq: 101, + action: chk.CheckInconsistAction_CIA_STALE, + expActionMsg: []string{ + staleMsg, + }, + }, + "stale ignores choices": { + startFindings: []*checker.Finding{ + { + CheckReport: chk.CheckReport{ + Seq: 101, + ActChoices: []chk.CheckInconsistAction{ + chk.CheckInconsistAction_CIA_IGNORE, + chk.CheckInconsistAction_CIA_TRUST_MS, + chk.CheckInconsistAction_CIA_TRUST_PS, + }, + ActMsgs: []string{ + "one", + "two", + "three", + }, + ActDetails: []string{ + "detail1", + "detail2", + "detail3", + }, + }, + }, + }, + seq: 101, + action: chk.CheckInconsistAction_CIA_STALE, + expActionChoices: nil, // cleared + expActionDetails: nil, // cleared + expActionMsg: []string{ + staleMsg, + }, + }, + "valid choice": { + startFindings: []*checker.Finding{ + { + CheckReport: chk.CheckReport{ + Seq: 101, + ActChoices: []chk.CheckInconsistAction{ + chk.CheckInconsistAction_CIA_IGNORE, + chk.CheckInconsistAction_CIA_TRUST_MS, + chk.CheckInconsistAction_CIA_TRUST_PS, + }, + ActMsgs: []string{ + "one", + "two", + "three", + }, + ActDetails: []string{ + "detail1", + "detail2", + "detail3", + }, + }, + }, + }, + seq: 101, + action: chk.CheckInconsistAction_CIA_TRUST_MS, + expActionChoices: nil, // cleared + expActionMsg: []string{ + "two", + }, + expActionDetails: []string{ + "detail2", + }, + }, + "no messages or details": { + startFindings: []*checker.Finding{ + { + CheckReport: chk.CheckReport{ + Seq: 101, + ActChoices: []chk.CheckInconsistAction{ + chk.CheckInconsistAction_CIA_IGNORE, + chk.CheckInconsistAction_CIA_TRUST_MS, + chk.CheckInconsistAction_CIA_TRUST_PS, + }, + }, + }, + }, + seq: 101, + action: chk.CheckInconsistAction_CIA_TRUST_MS, + expActionChoices: nil, // cleared + }, + "unavailable choice": { + startFindings: []*checker.Finding{ + { + CheckReport: chk.CheckReport{ + Seq: 101, + 
ActChoices: []chk.CheckInconsistAction{ + chk.CheckInconsistAction_CIA_IGNORE, + chk.CheckInconsistAction_CIA_TRUST_MS, + chk.CheckInconsistAction_CIA_TRUST_PS, + }, + ActMsgs: []string{ + "one", + "two", + "three", + }, + ActDetails: []string{ + "detail1", + "detail2", + "detail3", + }, + }, + }, + }, + seq: 101, + action: chk.CheckInconsistAction_CIA_TRUST_EC_DATA, + expErr: errors.New("action not available"), + }, + } { + t.Run(name, func(t *testing.T) { + ctx := test.MustLogContext(t) + + db := createDBWithFindings(t, logging.FromContext(ctx), tc.startFindings...) + + err := db.SetCheckerFindingAction(tc.seq, int32(tc.action)) + + test.CmpErr(t, tc.expErr, err) + + if tc.expErr == nil { + // Check that the action was actually updated + f, err := db.GetCheckerFinding(tc.seq) + if err != nil { + t.Fatal(err) + } + + test.AssertEqual(t, tc.action, f.Action, "verifying action was set") + test.CmpAny(t, "action choices", tc.expActionChoices, f.ActChoices) + test.CmpAny(t, "action messages", tc.expActionMsg, f.ActMsgs) + test.CmpAny(t, "action details", tc.expActionDetails, f.ActDetails) + } + }) + } +} diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 106205e0c5b..fa5b640cc5a 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -426,7 +426,7 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo coll_args.ca_tgt_bitmap_sz = bitmap_sz; coll_args.ca_tgt_bitmap = bitmap; - rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "Locally exec collective DTX PRC %u for "DF_DTI": "DF_RC"\n", opc, DP_DTI(xid), DP_RC(rc)); diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 175f440dc4e..ffb17279d34 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -883,15 +883,10 @@ dtx_handle_reinit(struct dtx_handle *dth) dth->dth_modify_shared = 0; dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_cos_done = 0; - - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - D_FREE(dth->dth_oid_array); - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_dkey_hash = 0; vos_dtx_rsrvd_fini(dth); return vos_dtx_rsrvd_init(dth); @@ -926,32 +921,29 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_coh = xoh; } - dth->dth_ver = pm_ver; - dth->dth_refs = 1; - dth->dth_mbs = mbs; - - dth->dth_pinned = 0; - dth->dth_cos_done = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; - dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; - dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; - dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ver = pm_ver; + dth->dth_refs = 1; + dth->dth_mbs = mbs; + dth->dth_pinned = 0; + dth->dth_cos_done = 0; + dth->dth_modify_shared = 0; + dth->dth_active = 0; + dth->dth_local_tx_started = 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; - dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; - dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; + dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 
1 : 0; + dth->dth_aborted = 0; + dth->dth_already = 0; + dth->dth_need_validation = 0; dth->dth_local = (flags & DTX_LOCAL) ? 1 : 0; - - dth->dth_dti_cos = dti_cos; - dth->dth_dti_cos_count = dti_cos_cnt; - dth->dth_ent = NULL; - dth->dth_flags = leader ? DTE_LEADER : 0; + dth->dth_dti_cos = dti_cos; + dth->dth_dti_cos_count = dti_cos_cnt; + dth->dth_ent = NULL; + dth->dth_flags = leader ? DTE_LEADER : 0; if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; @@ -960,12 +952,11 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_sync = 0; } - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_local_oid_cnt = 0; + dth->dth_local_oid_cap = 0; + dth->dth_local_oid_array = NULL; + dth->dth_dkey_hash = 0; if (!(flags & DTX_LOCAL)) { if (daos_is_zero_dti(dti)) @@ -1001,83 +992,6 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, return rc; } -static int -dtx_insert_oid(struct dtx_handle *dth, daos_unit_oid_t *oid, bool touch_leader) -{ - int start = 0; - int end = dth->dth_oid_cnt - 1; - int at; - int rc = 0; - - do { - at = (start + end) / 2; - rc = daos_unit_oid_compare(dth->dth_oid_array[at], *oid); - if (rc == 0) - return 0; - - if (rc > 0) - end = at - 1; - else - start = at + 1; - } while (start <= end); - - if (dth->dth_oid_cnt == dth->dth_oid_cap) { - daos_unit_oid_t *oid_array; - - D_ALLOC_ARRAY(oid_array, dth->dth_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - if (at > 0) - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * at); - oid_array[at] = *oid; - memcpy(&oid_array[at + 1], &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - } else { - /* Insert after dth->dth_oid_array[at]. 
*/ - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * (at + 1)); - oid_array[at + 1] = *oid; - if (at < dth->dth_oid_cnt - 1) - memcpy(&oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - } - - D_FREE(dth->dth_oid_array); - dth->dth_oid_array = oid_array; - dth->dth_oid_cap <<= 1; - - goto out; - } - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - memmove(&dth->dth_oid_array[at + 1], - &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - dth->dth_oid_array[at] = *oid; - } else { - /* Insert after dth->dth_oid_array[at]. */ - if (at < dth->dth_oid_cnt - 1) - memmove(&dth->dth_oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - dth->dth_oid_array[at + 1] = *oid; - } - -out: - if (touch_leader) - dth->dth_touched_leader_oid = 1; - - dth->dth_oid_cnt++; - - return 0; -} - void dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth) { @@ -1110,51 +1024,6 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) dth->dth_dkey_hash = dkey_hash; dth->dth_op_seq++; - rc = daos_unit_oid_compare(dth->dth_leader_oid, *oid); - if (rc == 0) { - if (dth->dth_oid_array == NULL) - dth->dth_touched_leader_oid = 1; - - if (dth->dth_touched_leader_oid) - goto out; - - rc = dtx_insert_oid(dth, oid, true); - - D_GOTO(out, rc); - } - - if (dth->dth_oid_array == NULL) { - D_ASSERT(dth->dth_oid_cnt == 0); - - /* 4 slots by default to hold rename case. 
*/ - dth->dth_oid_cap = 4; - D_ALLOC_ARRAY(dth->dth_oid_array, dth->dth_oid_cap); - if (dth->dth_oid_array == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - if (!dth->dth_touched_leader_oid) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_cnt = 1; - - D_GOTO(out, rc = 0); - } - - dth->dth_oid_cnt = 2; - - if (rc > 0) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_array[1] = dth->dth_leader_oid; - } else { - dth->dth_oid_array[0] = dth->dth_leader_oid; - dth->dth_oid_array[1] = *oid; - } - - D_GOTO(out, rc = 0); - } - - rc = dtx_insert_oid(dth, oid, false); - -out: D_DEBUG(DB_IO, "Sub init DTX "DF_DTI" for object "DF_UOID " dkey %lu, opc seq %d: "DF_RC"\n", DP_DTI(&dth->dth_xid), DP_UOID(*oid), @@ -1493,7 +1362,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re dth->dth_sync ? "sync" : "async", dth->dth_dti_cos_count, dth->dth_cos_done ? dth->dth_dti_cos_count : 0, DP_RC(result)); - D_FREE(dth->dth_oid_array); D_FREE(dlh); d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); @@ -1617,7 +1485,6 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result) vos_dtx_detach(dth); out: - D_FREE(dth->dth_oid_array); D_FREE(dth); return result; diff --git a/src/dtx/tests/SConscript b/src/dtx/tests/SConscript index 2ea2e93eec5..0367747d76d 100644 --- a/src/dtx/tests/SConscript +++ b/src/dtx/tests/SConscript @@ -7,7 +7,7 @@ def scons(): # build dtx_tests - libraries = ['abt', 'bio', 'dtx', 'vos', 'gurt', 'daos_common_pmem', 'cmocka', 'pthread', + libraries = ['abt', 'bio', 'dtx', 'vos', 'ssl', 'gurt', 'daos_common_pmem', 'cmocka', 'pthread', 'uuid', 'cart', 'daos_tests'] tenv = denv.Clone() @@ -36,8 +36,8 @@ def scons(): # build dtx_ut - libraries = ['abt', 'bio', 'cmocka', 'daos_common_pmem', 'gurt', 'uuid', 'vea', 'pthread', - 'pmemobj'] + libraries = ['abt', 'bio', 'ssl', 'cmocka', 'daos_common_pmem', 'gurt', 'uuid', 'vea', + 'pthread', 'pmemobj'] tenv = denv.Clone() tenv.Append(CPPPATH=[Dir('../../vos').srcnode()]) diff 
--git a/src/dtx/tests/dts_aggregate.c b/src/dtx/tests/dts_aggregate.c index 0f341cf9f5b..4ef3a1f653c 100644 --- a/src/dtx/tests/dts_aggregate.c +++ b/src/dtx/tests/dts_aggregate.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -191,7 +191,7 @@ prep_dtx_entries(void) } static void -check_rollback(void) +check_rollback(uint32_t count) { int i; uint64_t cmt_time; @@ -214,8 +214,8 @@ check_rollback(void) umem_ptr2off(&mock_pool.vp_umm, mock_dbds[0])); assert_int_equal(mock_cont_df.cd_dtx_committed_tail, umem_ptr2off(&mock_pool.vp_umm, mock_dbds[DBD_BLOBS_CAP - 1])); - assert_int_equal(mock_cont.vc_dtx_committed_count, DBD_BLOBS_CAP * DBD_BLOB_DF_CAP); - assert_int_equal(mock_pool.vp_dtx_committed_count, DBD_BLOBS_CAP * DBD_BLOB_DF_CAP); + assert_int_equal(mock_cont.vc_dtx_committed_count, DBD_BLOBS_CAP * DBD_BLOB_DF_CAP - count); + assert_int_equal(mock_pool.vp_dtx_committed_count, DBD_BLOBS_CAP * DBD_BLOB_DF_CAP - count); assert_int_equal(mock_cont.vc_cmt_dtx_reindex_pos, umem_ptr2off(&mock_pool.vp_umm, mock_dbds[0])); } @@ -348,7 +348,7 @@ test_tx_begin_error(void **unused) rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(0); } /* DAOS B-tree delete failure */ @@ -366,10 +366,12 @@ test_dbtree_delete_error(void **unused) will_return(__wrap_dbtree_delete, 0); will_return(__wrap_dbtree_delete, -DER_UNKNOWN); expect_value(tx_abort, error, -DER_UNKNOWN); + expect_value(__wrap_d_tm_dec_gauge, metric, mock_tls.vtl_committed); + expect_value(__wrap_d_tm_dec_gauge, value, 3); rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(3); } /* Update of newest aggregated epoch failure */ @@ -389,10 +391,12 @@ test_newest_aggregated_error(void **unused) expect_value(tx_add_ptr, ptr_size, 
sizeof(mock_cont_df.cd_newest_aggregated)); will_return(tx_add_ptr, -DER_UNKNOWN); expect_value(tx_abort, error, -DER_UNKNOWN); + expect_value(__wrap_d_tm_dec_gauge, metric, mock_tls.vtl_committed); + expect_value(__wrap_d_tm_dec_gauge, value, DBD_BLOB_DF_CAP); rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(DBD_BLOB_DF_CAP); } /* Update of DTX blob list failure */ @@ -415,10 +419,12 @@ test_committed_head_error(void **unused) expect_value(tx_add_ptr, ptr_size, sizeof(umem_off_t)); will_return(tx_add_ptr, -DER_UNKNOWN); expect_value(tx_abort, error, -DER_UNKNOWN); + expect_value(__wrap_d_tm_dec_gauge, metric, mock_tls.vtl_committed); + expect_value(__wrap_d_tm_dec_gauge, value, DBD_BLOB_DF_CAP); rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(DBD_BLOB_DF_CAP); } /* Update of DTX blob list failure */ @@ -444,10 +450,12 @@ test_committed_prev_error(void **unused) expect_value(tx_add_ptr, ptr_size, sizeof(umem_off_t)); will_return(tx_add_ptr, -DER_UNKNOWN); expect_value(tx_abort, error, -DER_UNKNOWN); + expect_value(__wrap_d_tm_dec_gauge, metric, mock_tls.vtl_committed); + expect_value(__wrap_d_tm_dec_gauge, value, DBD_BLOB_DF_CAP); rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(DBD_BLOB_DF_CAP); } /* Pmem free failure */ @@ -475,10 +483,12 @@ test_umm_free_error(void **unused) expect_value(tx_free, umoff, mock_dbds_off[0]); will_return(tx_free, -DER_UNKNOWN); expect_value(tx_abort, error, -DER_UNKNOWN); + expect_value(__wrap_d_tm_dec_gauge, metric, mock_tls.vtl_committed); + expect_value(__wrap_d_tm_dec_gauge, value, DBD_BLOB_DF_CAP); rc = vos_dtx_aggregate(mock_coh, NULL); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(DBD_BLOB_DF_CAP); } /* Update of committed DTX entries failure */ @@ -507,7 +517,7 @@ test_committed_data_error(void **unused) cmt_time 
= CMT_TIME_START + (dtx_count - 1) * CMT_TIME_STEP; rc = vos_dtx_aggregate(mock_coh, &cmt_time); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(0); } /* Update of committed DTX entries count failure */ @@ -539,7 +549,7 @@ test_dbd_count_error(void **unused) cmt_time = CMT_TIME_START + (dtx_count - 1) * CMT_TIME_STEP; rc = vos_dtx_aggregate(mock_coh, &cmt_time); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(0); } /* Pmem commit transaction failure */ @@ -571,7 +581,7 @@ test_umm_commit_error(void **unused) cmt_time = CMT_TIME_START + (dtx_count - 1) * CMT_TIME_STEP; rc = vos_dtx_aggregate(mock_coh, &cmt_time); assert_rc_equal(rc, -DER_UNKNOWN); - check_rollback(); + check_rollback(0); } /* Pool without DTX committed transaction */ diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c index bddfdf9816c..0e656e33f2c 100644 --- a/src/dtx/tests/dts_structs.c +++ b/src/dtx/tests/dts_structs.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -62,7 +62,6 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_drop_cmt); SET_BITFIELD_1(dummy, dth_modify_shared); SET_BITFIELD_1(dummy, dth_active); - SET_BITFIELD_1(dummy, dth_touched_leader_oid); SET_BITFIELD_1(dummy, dth_local_tx_started); SET_BITFIELD_1(dummy, dth_shares_inited); SET_BITFIELD_1(dummy, dth_dist); @@ -75,7 +74,7 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_local); SET_BITFIELD_1(dummy, dth_epoch_owner); SET_BITFIELD_1(dummy, dth_local_complete); - SET_BITFIELD(dummy, padding1, 12); + SET_BITFIELD(dummy, padding1, 13); SET_FIELD(dummy, dth_dti_cos_count); SET_FIELD(dummy, dth_dti_cos); @@ -87,10 +86,6 @@ struct_dtx_handle(void **state) SET_FIELD(dummy, dth_op_seq); SET_FIELD(dummy, dth_deferred_used_cnt); SET_FIELD(dummy, padding2); - SET_FIELD(dummy, dth_oid_cnt); - SET_FIELD(dummy, dth_oid_cap); - SET_FIELD(dummy, padding3); - SET_FIELD(dummy, dth_oid_array); SET_FIELD(dummy, dth_local_oid_cnt); SET_FIELD(dummy, dth_local_oid_cap); SET_FIELD(dummy, padding4); diff --git a/src/engine/SConscript b/src/engine/SConscript index 06c0e2bfef8..434ba4b1def 100644 --- a/src/engine/SConscript +++ b/src/engine/SConscript @@ -13,7 +13,7 @@ def scons(): denv.AppendUnique(CPPPATH=[Dir('..').srcnode()]) denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) libraries = ['daos_common_pmem', 'gurt', 'cart', 'vos_srv'] - libraries += ['bio', 'dl', 'uuid', 'pthread', 'abt'] + libraries += ['bio', 'ssl', 'dl', 'uuid', 'pthread', 'abt'] libraries += ['hwloc', 'pmemobj', 'protobuf-c', 'isal', 'numa'] denv.require('argobots', 'protobufc', 'pmdk', 'isal') diff --git a/src/engine/init.c b/src/engine/init.c index 9ef8828b01c..0072b87be33 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -1,8 +1,7 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -24,6 +23,7 @@ #include #include +#include #include #include #include "srv_internal.h" @@ -1134,8 +1134,9 @@ int main(int argc, char **argv) { sigset_t set; - int sig; - int rc; + bool exit_failure = false; + int sig; + int rc; /** parse command line arguments */ parse(argc, argv); @@ -1167,6 +1168,7 @@ main(int argc, char **argv) /** wait for shutdown signal */ sigemptyset(&set); + sigaddset(&set, SIGBUS); sigaddset(&set, SIGINT); sigaddset(&set, SIGTERM); sigaddset(&set, SIGUSR1); @@ -1179,7 +1181,6 @@ main(int argc, char **argv) D_ERROR("failed to wait for signals: %d\n", rc); break; } - /* open specific file to dump ABT infos and ULTs stacks */ if (sig == SIGUSR1 || sig == SIGUSR2) { struct timeval tv; @@ -1261,12 +1262,18 @@ main(int argc, char **argv) continue; } - /* SIGINT/SIGTERM cause server shutdown */ + /* Log error for SIGBUS occurrence */ + if (sig == SIGBUS) { + D_ERROR("SIGBUS signal received; proceeding to shutdown.\n"); + exit_failure = true; + } + + /* SIGINT/SIGTERM/SIGBUS cause server shutdown */ break; } /** shutdown */ server_fini(true); - exit(EXIT_SUCCESS); + exit(exit_failure ? EXIT_FAILURE : EXIT_SUCCESS); } diff --git a/src/engine/sched.c b/src/engine/sched.c index 76ee6478810..cea0d52ba4a 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -179,7 +179,7 @@ unsigned int sched_relax_intvl = SCHED_RELAX_INTVL_DEFAULT; unsigned int sched_relax_mode; unsigned int sched_unit_runtime_max = 32; /* ms */ bool sched_watchdog_all; -unsigned int sched_inactive_max = 40000; /* ms */ +unsigned int sched_inactive_max = 300000; /* ms, 5 mins */ bool sched_monitor_kill = true; enum { @@ -1928,7 +1928,13 @@ need_nvme_poll(struct dss_xstream *dx, struct sched_cycle *cycle) dmi = dss_get_module_info(); D_ASSERT(dmi != NULL); - return bio_need_nvme_poll(dmi->dmi_nvme_ctxt); + /* + * If SPDK I/O stalls indefinitely due to a hardware fault (or software bug), + * the resulting backlog of undrained I/Os will cause bio_need_nvme_poll() to + * consistently return true. To prevent starvation and ensure system progress, + * schedule the NVMe polling ULT and other ULTs intverleavingly. + */ + return !cycle->sc_age_nvme && bio_need_nvme_poll(dmi->dmi_nvme_ctxt); } static ABT_unit @@ -2109,6 +2115,7 @@ sched_xs_monitor(struct dss_xstream *cur_dx) struct sched_info *info, *cur_info; struct sched_hist_seq *hist; unsigned int gap; + char **strings = NULL; int rc, i, inactive_tgt, inactive_id = -1; D_ASSERT(is_monitor_xs(cur_dx)); @@ -2148,6 +2155,7 @@ sched_xs_monitor(struct dss_xstream *cur_dx) inactive_id = dx->dx_xs_id; inactive_tgt = dx->dx_tgt_id; gap = cur_info->si_cur_ts - hist->sm_last_ts; + strings = backtrace_symbols(&info->si_ult_func, 1); break; } } @@ -2155,8 +2163,10 @@ sched_xs_monitor(struct dss_xstream *cur_dx) dss_sched_monitor_exit(); if (inactive_id >= 0) { - D_WARN("SCHED_MONITOR: xs %d (tgt:%d) is inactive for more than %u ms!\n", - inactive_id, inactive_tgt, gap); + D_WARN("SCHED_MONITOR: xs %d (tgt:%d) is inactive for more than %u ms! symbol:%s\n", + inactive_id, inactive_tgt, gap, strings != NULL ? 
strings[0] : NULL); + free(strings); + if (sched_monitor_kill) { D_ERROR("SCHED_MONITOR: Killing engine...\n"); rc = kill(getpid(), SIGKILL); @@ -2256,6 +2266,16 @@ sched_exec_time(uint64_t *msecs, const char *ult_name) return 0; } +static inline bool +sched_monitor_enabled(struct dss_xstream *dx) +{ + if (sched_inactive_max == 0) + return false; + + /* Monitor SYS & VOS xstreams only */ + return dx->dx_xs_id == 0 || dx->dx_main_xs; +} + static void sched_watchdog_prep(struct dss_xstream *dx, ABT_unit unit) { @@ -2264,10 +2284,12 @@ sched_watchdog_prep(struct dss_xstream *dx, ABT_unit unit) void (*thread_func)(void *); int rc; - if (!watchdog_enabled(dx)) + if (!watchdog_enabled(dx) && !sched_monitor_enabled(dx)) return; - info->si_ult_start = daos_getmtime_coarse(); + if (watchdog_enabled(dx)) + info->si_ult_start = daos_getmtime_coarse(); + rc = ABT_unit_get_thread(unit, &thread); D_ASSERT(rc == ABT_SUCCESS); rc = ABT_thread_get_thread_func(thread, &thread_func); diff --git a/src/engine/server_iv.c b/src/engine/server_iv.c index 18ba81d5a84..a369e06538e 100644 --- a/src/engine/server_iv.c +++ b/src/engine/server_iv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -339,8 +339,8 @@ iv_entry_lookup_or_create(struct ds_iv_ns *ns, struct ds_iv_key *key, entry->iv_ref++; if (got != NULL) *got = entry; - D_DEBUG(DB_TRACE, "Get entry %p/%d key %d\n", - entry, entry->iv_ref, key->class_id); + D_DEBUG(DB_TRACE, "Get entry %p, ref %d valid %d key %d\n", entry, entry->iv_ref, + entry->iv_valid, key->class_id); return 0; } @@ -454,7 +454,7 @@ iv_on_update_internal(crt_iv_namespace_t ivns, crt_iv_key_t *iv_key, struct ds_iv_ns *ns = NULL; struct ds_iv_entry *entry; struct ds_iv_key key; - struct iv_priv_entry *priv_entry = priv; + struct iv_priv_entry *priv_entry = priv; int rc = 0; rc = iv_ns_lookup_by_ivns(ivns, &ns); @@ -473,17 +473,21 @@ iv_on_update_internal(crt_iv_namespace_t ivns, crt_iv_key_t *iv_key, } if (refresh) { + /* oid_iv_ent_refresh need to be called to unlock */ rc = refresh_iv_value(entry, &key, iv_value, ref_rc, priv_entry ? priv_entry->priv : NULL); + if (rc == 0) + rc = ref_rc; } else { D_ASSERT(iv_value != NULL); + D_ASSERT(ref_rc == 0); + D_ASSERT(!invalidate); if (ns->iv_master_rank != key.rank) { D_DEBUG(DB_MD, "key id %d master rank %u != %u: rc = %d\n", key.class_id, ns->iv_master_rank, key.rank, -DER_GRPVER); D_GOTO(output, rc = -DER_GRPVER); } - rc = update_iv_value(entry, &key, iv_value, - priv_entry ? priv_entry->priv : NULL); + rc = update_iv_value(entry, &key, iv_value, priv_entry ? priv_entry->priv : NULL); } if (rc != 0) { D_DEBUG(DB_MD, "key id %d update failed: rc = " DF_RC "\n", key.class_id, @@ -880,6 +884,56 @@ ds_iv_ns_cleanup(struct ds_iv_ns *ns) } } +/* To prepare for reintegrate, cleanup some IVs' cache. + * May add more types later when needed. 
+ */ +int +ds_iv_ns_reint_prep(struct ds_iv_ns *ns) +{ + struct ds_iv_entry *entry; + struct ds_iv_entry *tmp; + uint32_t msec = 100; + uint32_t total = 0; + int rc; + + /* iv_refcount is 1 after ns create, + * 2 after ds_iv_ns_start. + * > 2 if with any in-flight IV operation. + * here wait the in-flight IV operation for at most 30 seconds, if cannot finish within + * 30 seconds return EBUSY so user can redo the reintegration. Should be very rare case + * for 30 seconds IV timeout. + */ + while (ns->iv_refcount > 2) { + msec = min(5000, msec * 2); + dss_sleep(msec); + total += msec; + if (total > 30000) { + rc = -DER_BUSY; + DL_ERROR( + rc, DF_UUID " timed out for wait IV, iv_refcount %d, waited %d seconds", + DP_UUID(ns->iv_pool_uuid), ns->iv_refcount, min(1, total / 1000)); + return rc; + } else { + D_INFO(DF_UUID " wait IV operation, iv_refcount %d, waited %d seconds", + DP_UUID(ns->iv_pool_uuid), ns->iv_refcount, min(1, total / 1000)); + } + } + + /* no yield for the cleanup */ + d_list_for_each_entry_safe(entry, tmp, &ns->iv_entry_list, iv_link) { + if (entry->iv_key.class_id == IV_CONT_TRACK_EPOCH || + entry->iv_key.class_id == IV_CONT_PROP || + entry->iv_key.class_id == IV_CONT_SNAP) { + D_INFO(DF_UUID " delete IV class_id %d", DP_UUID(ns->iv_pool_uuid), + entry->iv_key.class_id); + d_list_del(&entry->iv_link); + iv_entry_free(entry); + } + } + + return 0; +} + void ds_iv_ns_stop(struct ds_iv_ns *ns) { @@ -1075,7 +1129,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, rc = iv_op_internal(ns, key, value, sync, shortcut, opc); if (retry && !ns->iv_stop && (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY)) { - if (rc == -DER_GRPVER && engine_in_check()) { + if ((rc == -DER_GRPVER || rc == -DER_OOG) && engine_in_check()) { /* * Under check mode, the pool shard on peer rank/target does * not exist, then it will reply "-DER_GRPVER" that is normal diff --git a/src/engine/ult.c b/src/engine/ult.c index 
56dd36009b6..bd21eae4908 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -9,6 +9,7 @@ #include #include +#include #include #include "srv_internal.h" @@ -92,18 +93,16 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, struct dss_coll_args *args, bool create_ult, unsigned int flags) { - struct collective_arg carg; - struct dss_coll_stream_args *stream_args; - struct dss_stream_arg_type *stream; - struct aggregator_arg_type aggregator; - struct dss_xstream *dx; - ABT_future future; - int xs_nr; - int rc; - int tid; - int tgt_id = dss_get_module_info()->dmi_tgt_id; - uint32_t bm_len; - bool self = false; + struct dss_coll_stream_args *stream_args; + struct dss_stream_arg_type *stream; + struct dss_xstream *dx; + struct collective_arg carg; + struct aggregator_arg_type aggregator; + ABT_future future; + uint32_t bm_len; + int xs_nr; + int rc; + int tid; if (ops == NULL || args == NULL || ops->co_func == NULL) { D_DEBUG(DB_MD, "mandatory args missing dss_collective_reduce"); @@ -171,11 +170,6 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, D_ASSERTF(rc == ABT_SUCCESS, "%d\n", rc); continue; } - - if (tgt_id == tid && flags & DSS_USE_CURRENT_ULT) { - self = true; - continue; - } } dx = dss_get_xstream(DSS_MAIN_XS_ID(tid)); @@ -216,12 +210,6 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, } } - if (self) { - stream = &stream_args->csa_streams[tgt_id]; - stream->st_coll_args = &carg; - collective_func(stream); - } - ABT_future_wait(future); rc = aggregator.at_rc; @@ -944,3 +932,80 @@ dss_chore_queue_fini(struct dss_xstream *dx) ABT_cond_free(&queue->chq_cond); ABT_mutex_free(&queue->chq_mutex); } + +struct dss_vos_pool_create_args { + const char *spc_path; + unsigned char *spc_uuid; + 
daos_size_t spc_scm_size; + daos_size_t spc_data_sz; + daos_size_t spc_meta_sz; + unsigned int spc_flags; + uint32_t spc_version; + daos_handle_t *spc_pool; +}; + +static int +dss_vos_pool_create_ult(void *varg) +{ + struct dss_vos_pool_create_args *arg = varg; + + return vos_pool_create(arg->spc_path, arg->spc_uuid, arg->spc_scm_size, arg->spc_data_sz, + arg->spc_meta_sz, arg->spc_flags, arg->spc_version, arg->spc_pool); +} + +/** + * Call vos_pool_create in a new deep-stack ULT on the same xstream. This is to + * avoid pmemobj_create or SPDK from overflowing the stack of the calling ULT. + */ +int +dss_vos_pool_create(const char *path, unsigned char *uuid, daos_size_t scm_size, + daos_size_t data_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *pool) +{ + struct dss_vos_pool_create_args args; + + args.spc_path = path; + args.spc_uuid = uuid; + args.spc_scm_size = scm_size; + args.spc_data_sz = data_sz; + args.spc_meta_sz = meta_sz; + args.spc_flags = flags; + args.spc_version = version; + args.spc_pool = pool; + + return dss_ult_execute(dss_vos_pool_create_ult, &args, NULL /* user_cb */, + NULL /* cb_args */, DSS_XS_SELF, 0 /* tgt_id */, DSS_DEEP_STACK_SZ); +} + +struct dss_vos_pool_open_args { + const char *spo_path; + unsigned char *spo_uuid; + unsigned int spo_flags; + daos_handle_t *spo_pool; +}; + +static int +dss_vos_pool_open_ult(void *varg) +{ + struct dss_vos_pool_open_args *arg = varg; + + return vos_pool_open(arg->spo_path, arg->spo_uuid, arg->spo_flags, arg->spo_pool); +} + +/** + * Call vos_pool_open in a new deep-stack ULT on the same xstream. This is to + * avoid pmemobj_open or SPDK from overflowing the stack of the calling ULT. 
+ */ +int +dss_vos_pool_open(const char *path, unsigned char *uuid, unsigned int flags, daos_handle_t *pool) +{ + struct dss_vos_pool_open_args args; + + args.spo_path = path; + args.spo_uuid = uuid; + args.spo_flags = flags; + args.spo_pool = pool; + + return dss_ult_execute(dss_vos_pool_open_ult, &args, NULL /* user_cb */, NULL /* cb_args */, + DSS_XS_SELF, 0 /* tgt_id */, DSS_DEEP_STACK_SZ); +} diff --git a/src/gurt/debug.c b/src/gurt/debug.c index e53ad61cb38..50f765d35bb 100644 --- a/src/gurt/debug.c +++ b/src/gurt/debug.c @@ -622,3 +622,48 @@ int d_register_alt_assert(void (*alt_assert)(const int, const char*, } return -DER_INVAL; } + +#define D_LOG_MEMORY_LINE_LENGTH (10 + 2 + 3 * 16 + 1) /** 0x12340000: 00 01 02... 0f */ + +void +d_log_memory(const uint8_t *ptr, size_t size) +{ + static char buf[D_LOG_MEMORY_LINE_LENGTH] = ""; + size_t i; + char *out = buf; + size_t out_space = D_LOG_MEMORY_LINE_LENGTH; + int rc; + + /** printed immediately in case reading the memory cause a crash */ + D_FATAL("ptr=%p, size=%zu\n", ptr, size); + + if (ptr == NULL || size == 0) { + return; + } + + for (i = 0; i < size; i++) { + /** start a new line */ + if (i % 16 == 0) { + rc = snprintf(out, out_space, "%p: ", &ptr[i]); /** append address */ + D_ASSERTF(rc > 0, "snprintf() failed: %d\n", rc); + out += rc; + out_space -= rc; + } + rc = snprintf(out, out_space, "%02x ", ptr[i]); /** append value */ + D_ASSERTF(rc > 0, "snprintf() failed: %d\n", rc); + out += rc; + out_space -= rc; + + /** print a complete line and reset the output buffer */ + if (i % 16 == 15) { + D_FATAL("%s\n", buf); + out = buf; + out_space = D_LOG_MEMORY_LINE_LENGTH; + } + } + + /** print an incomplete line */ + if (out_space < D_LOG_MEMORY_LINE_LENGTH) { + D_FATAL("%s\n", buf); + } +} diff --git a/src/gurt/fault_inject.c b/src/gurt/fault_inject.c index 95376044b5d..386e24d0172 100644 --- a/src/gurt/fault_inject.c +++ b/src/gurt/fault_inject.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2024 Intel 
Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -417,6 +418,8 @@ fault_attr_parse(yaml_parser_t *parser) yaml_event_delete(&event); if (event_type == YAML_SEQUENCE_END_EVENT) break; + if (event_type == YAML_DOCUMENT_END_EVENT) /** in case the list is actually empty */ + break; if (rc != DER_SUCCESS) break; } while (1); diff --git a/src/gurt/misc.c b/src/gurt/misc.c index 094b3cc1a56..afca34ab918 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -550,9 +550,24 @@ d_rank_list_shuffle(d_rank_list_t *rank_list) } /** - * Must be previously sorted or not modified at all in order to guarantee - * consistent indexes. - **/ + * Binary search \a rank in the sorted \a rank_list. 
+ */ + +bool +d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx) +{ + d_rank_t *pos = NULL; + + if (rank_list != NULL) { + pos = bsearch(&rank, rank_list->rl_ranks, rank_list->rl_nr, sizeof(rank), + rank_compare); + if (pos != NULL && idx != NULL) + *idx = ((void *)pos - (void *)rank_list->rl_ranks) / sizeof(rank); + } + + return pos != NULL; +} + bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx) { diff --git a/src/gurt/tests/SConscript b/src/gurt/tests/SConscript index a773b12812a..54d5d156689 100644 --- a/src/gurt/tests/SConscript +++ b/src/gurt/tests/SConscript @@ -6,6 +6,23 @@ import os + +def build_d_log_memory_ut(utenv): + """Build d_log_memory_ut""" + utenv.AppendUnique(LINKFLAGS=['-Wl,--wrap=d_vlog']) + libs = ['cmocka', 'uuid', 'yaml', 'm', 'pthread'] + srcs = [ + 'd_log_memory_ut.c', + '../debug.c', + '../dlog.c', + '../misc.c', + '../fault_inject.c', + '../hash.c', + '../errno.c', + ] + utenv.d_test_program('d_log_memory_ut', srcs, LIBS=libs) + + TEST_SRC = ['test_gurt.c', 'test_gurt_telem_producer.c'] @@ -35,6 +52,8 @@ def scons(): LIBS=test_env["LIBS"] + ['yaml']) tests.append(testprog) + build_d_log_memory_ut(env.Clone()) + Default(tests) diff --git a/src/gurt/tests/d_log_memory_ut.c b/src/gurt/tests/d_log_memory_ut.c new file mode 100644 index 00000000000..4cf3eead454 --- /dev/null +++ b/src/gurt/tests/d_log_memory_ut.c @@ -0,0 +1,258 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +#define D_LOGFAC DD_FAC(tests) + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/** helper */ +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +#define FULL_LINE_LEN 16 +#define MAP_ADDRESS 0x12340000 +#define LINE_01_ADDRESS 0x12340010 +#define LINE_02_ADDRESS 0x12340020 +#define LINE_00_ADDRESS_STR STRINGIFY(MAP_ADDRESS) +#define LINE_01_ADDRESS_STR STRINGIFY(LINE_01_ADDRESS) +#define 
LINE_02_ADDRESS_STR STRINGIFY(LINE_02_ADDRESS) + +#define HDR_STR(SIZE_STR) "ptr=" LINE_00_ADDRESS_STR ", size=" SIZE_STR "\n" +#define EXP_LINE_00_01B LINE_00_ADDRESS_STR ": ff \n" +#define EXP_LINE_00_15B LINE_00_ADDRESS_STR ": ff fe fd fc fb fa f9 f8 f7 f6 f5 f4 f3 f2 f1 \n" +#define EXP_LINE_00_FULL LINE_00_ADDRESS_STR ": ff fe fd fc fb fa f9 f8 f7 f6 f5 f4 f3 f2 f1 f0 \n" +#define EXP_LINE_01_01B LINE_01_ADDRESS_STR ": ef \n" +#define EXP_LINE_01_15B LINE_01_ADDRESS_STR ": ef ee ed ec eb ea e9 e8 e7 e6 e5 e4 e3 e2 e1 \n" +#define EXP_LINE_01_FULL LINE_01_ADDRESS_STR ": ef ee ed ec eb ea e9 e8 e7 e6 e5 e4 e3 e2 e1 e0 \n" +#define EXP_LINE_02_01B LINE_02_ADDRESS_STR ": df \n" +#define EXP_LINE_02_FULL LINE_02_ADDRESS_STR ": df de dd dc db da d9 d8 d7 d6 d5 d4 d3 d2 d1 d0 \n" + +static const char Exp_line_00_full[] = EXP_LINE_00_FULL; +static const char Exp_line_01_full[] = EXP_LINE_01_FULL; +static const char Exp_line_02_full[] = EXP_LINE_02_FULL; + +/** mocks */ + +#define BUF_SIZE 1024 + +void +__wrap_d_vlog(int flags, const char *fmt, va_list ap) +{ + static char buf[BUF_SIZE]; + const char *output; + int rc; + + /** generate the output string */ + rc = vsnprintf(buf, BUF_SIZE, fmt, ap); + assert(rc > 0); + + /** skip the "file:line_number func() " bit */ + output = strchr(buf, ' '); + assert_non_null(output); + output += 1; + output = strchr(output, ' '); + assert_non_null(output); + output += 1; + + check_expected(output); +} + +/** setup & teardown */ + +static int +setup(void **state) +{ + void *addr = (void *)MAP_ADDRESS; /** desired address */ + size_t size = 4096; /** one page */ + + void *ptr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + assert_int_not_equal(ptr, MAP_FAILED); + + uint8_t *mem = ptr; + + /** initialize the bit which is in use - three lines 16 bytes each */ + for (int line = 0; line < 3; ++line) { + for (int _byte = 0; _byte < 16; ++_byte) { + int index = line * 16 + _byte; + mem[index] 
= 0xff - index; + } + } + + *state = ptr; + + return 0; +} + +static int +teardown(void **state) +{ + void *ptr = *state; + int rc; + + rc = munmap(ptr, 4096); + assert_int_equal(rc, 0); + + return 0; +} + +/** tests */ + +static void +test_ptr_NULL(void **state) +{ + const char hdr[] = "ptr=(nil), size=0\n"; + + expect_string(__wrap_d_vlog, output, hdr); + d_log_memory(NULL, 0); +} + +static void +test_size_0(void **state) +{ + const char hdr[] = HDR_STR("0"); + const uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + d_log_memory(mem, 0); +} + +static void +test_very_short_line(void **state) +{ + const char hdr[] = HDR_STR("1"); + const char exp[] = EXP_LINE_00_01B; + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, exp); + d_log_memory(mem, 1); +} + +static void +test_short_line(void **state) +{ + const char hdr[] = HDR_STR("15"); + const char exp[] = EXP_LINE_00_15B; + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, exp); + d_log_memory(mem, FULL_LINE_LEN - 1); +} + +static void +test_full_line(void **state) +{ + const char hdr[] = HDR_STR("16"); + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + d_log_memory(mem, FULL_LINE_LEN); +} + +static void +test_full_line_plus(void **state) +{ + const char hdr[] = HDR_STR("17"); + const char exp1[] = EXP_LINE_01_01B; + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + expect_string(__wrap_d_vlog, output, exp1); + d_log_memory(mem, FULL_LINE_LEN + 1); +} + +static void +test_almost_two_lines(void **state) +{ + const char hdr[] = HDR_STR("31"); + const char exp1[] = EXP_LINE_01_15B; + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + 
expect_string(__wrap_d_vlog, output, exp1); + d_log_memory(mem, FULL_LINE_LEN * 2 - 1); +} + +static void +test_two_lines(void **state) +{ + const char hdr[] = HDR_STR("32"); + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + expect_string(__wrap_d_vlog, output, Exp_line_01_full); + d_log_memory(mem, FULL_LINE_LEN * 2); +} + +static void +test_two_lines_plus(void **state) +{ + const char hdr[] = HDR_STR("33"); + const char exp2[] = EXP_LINE_02_01B; + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + expect_string(__wrap_d_vlog, output, Exp_line_01_full); + expect_string(__wrap_d_vlog, output, exp2); + d_log_memory(mem, FULL_LINE_LEN * 2 + 1); +} + +static void +test_three_lines(void **state) +{ + const char hdr[] = HDR_STR("48"); + uint8_t *mem = *state; + + expect_string(__wrap_d_vlog, output, hdr); + expect_string(__wrap_d_vlog, output, Exp_line_00_full); + expect_string(__wrap_d_vlog, output, Exp_line_01_full); + expect_string(__wrap_d_vlog, output, Exp_line_02_full); + d_log_memory(mem, FULL_LINE_LEN * 3); +} + +static const struct CMUnitTest tests[] = { + {"DUMP001: ptr == NULL", test_ptr_NULL, NULL, NULL}, + {"DUMP002: size == 0", test_size_0, NULL, NULL}, + {"DUMP003: very short line (1 byte)", test_very_short_line, NULL, NULL}, + {"DUMP004: short line (15 bytes)", test_short_line, NULL, NULL}, + {"DUMP005: full line (16 bytes)", test_full_line, NULL, NULL}, + {"DUMP006: full line + 1 (17 bytes)", test_full_line_plus, NULL, NULL}, + {"DUMP007: almost two lines (31 bytes)", test_almost_two_lines, NULL, NULL}, + {"DUMP008: two lines (32 bytes)", test_two_lines, NULL, NULL}, + {"DUMP009: two lines + 1 (33 bytes)", test_two_lines_plus, NULL, NULL}, + {"DUMP010: three lines (48 bytes)", test_three_lines, NULL, NULL}, + {NULL, NULL, NULL, NULL}}; + +int +main(int argc, char **argv) +{ + int rc; + + 
d_log_init(); + + d_register_alt_assert(mock_assert); + + rc = cmocka_run_group_tests_name("d_log_memory() tests", tests, setup, teardown); + + d_log_fini(); + + return rc; +} diff --git a/src/include/cart/types.h b/src/include/cart/types.h index 81cfb2ef0b9..65786f57ede 100644 --- a/src/include/cart/types.h +++ b/src/include/cart/types.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -46,26 +46,20 @@ typedef struct crt_init_options { * evnironment variable. */ int cio_crt_timeout; - uint32_t cio_sep_override:1, /**< Deprecated */ - cio_use_sep:1, /**< Deprecated */ - /** whether or not to inject faults */ - cio_fault_inject:1, - /** - * whether or not to override credits. When set - * overrides CRT_CTX_EP_CREDITS envariable - */ - cio_use_credits:1, - /** whether or not to enable per-context sensors */ - cio_use_sensors:1, - - /** whether or not to use expected sizes */ - cio_use_expected_size:1, - cio_use_unexpected_size:1; + uint32_t cio_sep_override : 1, /**< Deprecated */ + cio_use_sep : 1, /**< Deprecated */ + /** whether or not to inject faults */ + cio_fault_inject : 1, + /** whether or not to enable per-context sensors */ + cio_use_sensors : 1, + + /** whether or not to use expected sizes */ + cio_use_expected_size : 1, cio_use_unexpected_size : 1; /** overrides the value of the environment variable CRT_CTX_NUM */ int cio_ctx_max_num; - /** Used with cio_use_credits to set credit limit */ + /** set credit limit */ int cio_ep_credits; /** @@ -99,6 +93,9 @@ typedef struct crt_init_options { /** force busy wait (testing only, not in production) */ bool cio_progress_busy; + + /** use memory device */ + bool cio_mem_device; } crt_init_options_t; typedef int crt_status_t; diff --git a/src/include/daos/btree.h b/src/include/daos/btree.h index 
ad0066f111d..9b9b243d9bb 100644 --- a/src/include/daos/btree.h +++ b/src/include/daos/btree.h @@ -543,6 +543,15 @@ int dbtree_open_inplace(struct btr_root *root, struct umem_attr *uma, daos_handle_t *toh); int dbtree_open_inplace_ex(struct btr_root *root, struct umem_attr *uma, daos_handle_t coh, void *priv, daos_handle_t *toh); +enum btr_report_type { + BTR_REPORT_ERROR, + BTR_REPORT_WARNING, + BTR_REPORT_MSG, +}; +typedef void (*btr_report_fn_t)(void *arg, enum btr_report_type type, const char *fmt, ...); +int + dbtree_check_inplace(struct btr_root *root, struct umem_attr *uma, btr_report_fn_t report_fn, + void *report_arg, bool error_on_non_zero_padding); int dbtree_close(daos_handle_t toh); int dbtree_destroy(daos_handle_t toh, void *args); int dbtree_drain(daos_handle_t toh, int *credits, void *args, bool *destroyed); diff --git a/src/include/daos/checksum.h b/src/include/daos/checksum.h index fe4771f9fbf..2cf68e4ab1f 100644 --- a/src/include/daos/checksum.h +++ b/src/include/daos/checksum.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -356,6 +357,21 @@ int daos_csummer_verify_key(struct daos_csummer *obj, daos_key_t *key, struct dcs_csum_info *csum); +/** + * Verify a value to a checksum + * + * @param obj The daos_csummer obj + * @param recx extent for array value (NULL for single value) + * @param rsize element/value size + * @param val The key to verify + * @param csum_info The dcs_csum_info that describes the checksum + * + * @return 0 for success, -DER_CSUM if corruption is detected + */ +int +daos_csummer_verify_value(struct daos_csummer *obj, daos_recx_t *recx, daos_size_t rsize, + d_iov_t *val, struct dcs_csum_info *csum_info); + /** * Calculate the needed memory for all the structures that will * store the checksums for the iods. 
diff --git a/src/include/daos/common.h b/src/include/daos/common.h index 649a8f7bfc6..a31cf34dfa7 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -927,12 +927,42 @@ enum { #define DAOS_CHK_FAIL_REPORT_POOL2 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xb8) #define DAOS_CHK_ENGINE_DEATH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xb9) #define DAOS_CHK_VERIFY_CONT_SHARDS (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xba) +#define DAOS_CHK_ORPHAN_POOL_SHARD (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xbb) +#define DAOS_CHK_REPORT_FAILURE (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xbc) + +#define DAOS_MGMT_FAIL_CREATE_QUERY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0xe0) /* WAL && checkpoint failure inject */ #define DAOS_WAL_NO_REPLAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x100) #define DAOS_WAL_FAIL_REPLAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x101) #define DAOS_MEM_FAIL_CHECKPOINT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x102) +/** DLCK fault injection */ +#define DLCK_MOCK_ROOT (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x100) +#define DLCK_FAULT_GETGRNAM (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x101) +#define DLCK_MOCK_NO_DAOS_SERVER_GROUP (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x102) +#define DLCK_FAULT_GETGROUPS (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x103) +#define DLCK_MOCK_NOT_IN_DAOS_SERVER_GROUP (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x104) +#define DLCK_FAULT_CREATE_LOG_DIR (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x105) +#define DLCK_FAULT_CREATE_POOL_DIR (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x106) +#define DLCK_FAULT_ENGINE_START (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x107) +#define DLCK_FAULT_ENGINE_EXEC (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x108) +#define DLCK_FAULT_ENGINE_JOIN (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x109) +#define DLCK_FAULT_ENGINE_STOP (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x10a) + +/** Pool open fault injection 
*/ +#define DAOS_FAULT_POOL_NVME_HEALTH (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x200) +#define DAOS_FAULT_POOL_OPEN_BIO (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x201) +#define DAOS_FAULT_POOL_OPEN_UMEM (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x202) +#define DAOS_FAULT_POOL_OPEN_MAGIC (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x203) +#define DAOS_FAULT_POOL_OPEN_VERSION (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x204) +#define DAOS_FAULT_POOL_OPEN_UUID (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x205) +#define DAOS_FAULT_BTREE_OPEN_INV_CLASS (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x206) +#define DAOS_FAULT_BTREE_OPEN_UNREG_CLASS (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x207) +#define DAOS_FAULT_BTREE_FEATURES (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x208) +#define DAOS_FAULT_POOL_EXT_PADDING (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x209) +#define DAOS_FAULT_POOL_EXT_RESERVED (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x20a) + #define DAOS_DTX_SKIP_PREPARE DAOS_DTX_SPEC_LEADER #define DAOS_FAIL_CHECK(id) daos_fail_check(id) diff --git a/src/include/daos/debug.h b/src/include/daos/debug.h index a9feeb022cd..f9e8c685084 100644 --- a/src/include/daos/debug.h +++ b/src/include/daos/debug.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -55,7 +55,8 @@ ACTION(il, il, arg) \ ACTION(csum, csum, arg) \ ACTION(pipeline, pipeline, arg) \ - ACTION(stack, stack, arg) + ACTION(stack, stack, arg) \ + ACTION(ddb, ddb, arg) #define DAOS_FOREACH_DB(ACTION, arg) \ /** metadata operation */ \ diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index de6c5a373b9..6b21d31a6f3 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -137,6 +137,15 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) d_hash_rec_evict_at(&lcache->dlc_htable, &llink->ll_link); } +/** + * Whether the item is evicted or not. + */ +static inline bool +daos_lru_is_evicted(struct daos_llink *llink) +{ + return llink->ll_evicted != 0; +} + /** * Evict the item from LRU before releasing the refcount on it, wait until * the caller is the last one holds refcount. diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index 18d8fe988a7..d451f26704d 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -38,6 +38,8 @@ umempobj_pgsz(int backend); /* umem persistent object property flags */ #define UMEMPOBJ_ENABLE_STATS 0x1 +#define UMEM_FILE_MODE_DEFAULT 0660 + #ifdef DAOS_PMEM_BUILD /* The backend type is stored in meta blob header, don't change the value */ @@ -284,6 +286,10 @@ struct umem_cache { uint32_t ca_reserve_waiters; /** Waitqueue for free page reserve: umem_cache_reserve() */ void *ca_reserve_wq; + /** Waiters for evictable pages to be unpinned */ + uint32_t ca_unpin_waiters; + /** Waitqueue for waiters for evictable pages to be unpinned */ + void *ca_unpin_wq; /** TODO: some other global status */ uint64_t *ptr2off; uintptr_t *off2ptr; diff --git a/src/include/daos/mgmt.h b/src/include/daos/mgmt.h index 5ea8f7cbd1b..fc63ab22036 100644 --- a/src/include/daos/mgmt.h +++ b/src/include/daos/mgmt.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -18,6 +18,8 @@ #include #include "svc.pb-c.h" +#define DAOS_DEFAULT_SYS_NAME "daos_server" + extern bool d_dynamic_ctx_g; int dc_mgmt_init(void); diff --git a/src/include/daos/object.h b/src/include/daos/object.h index d77f9f9c343..1aa93bce457 100644 --- a/src/include/daos/object.h +++ b/src/include/daos/object.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -79,7 +79,7 @@ enum { /* smallest cell size */ DAOS_EC_CELL_MIN = (4 << 10), /* default cell size */ - DAOS_EC_CELL_DEF = (64 << 10), + DAOS_EC_CELL_DEF = (128 << 10), /* largest cell size */ DAOS_EC_CELL_MAX = (1024 << 10), }; @@ -404,6 +404,17 @@ daos_oclass_is_ec(struct daos_oclass_attr *oca) return oca->ca_resil == DAOS_RES_EC; } +static inline bool +daos_cid_is_ec(daos_oclass_id_t cid) +{ + struct daos_oclass_attr *oca; + + oca = daos_oclass_id2attr(cid, NULL); + if (oca == NULL) + return false; + return daos_oclass_is_ec(oca); +} + static inline void daos_obj_set_oid(daos_obj_id_t *oid, enum daos_otype_t type, enum daos_obj_redun ord, uint32_t nr_grps, diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 8d82791d235..c3c094e3ea3 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -309,7 +309,7 @@ int pool_map_find_failed_tgts_by_rank(struct pool_map *map, unsigned int *tgt_cnt, d_rank_t rank); int update_dom_status_by_tgt_id(struct pool_map *map, uint32_t tgt_id, uint32_t status, - uint32_t version, bool *updated); + uint32_t version, bool *updated, bool for_revert); bool pool_map_node_status_match(struct pool_domain *dom, unsigned int status); diff --git a/src/include/daos/rpc.h b/src/include/daos/rpc.h index 26860fb6664..a92a6752c54 100644 --- a/src/include/daos/rpc.h +++ b/src/include/daos/rpc.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -72,8 +72,8 @@ enum daos_module_id { #define DAOS_CONT_VERSION 8 #define DAOS_OBJ_VERSION 10 #define DAOS_REBUILD_VERSION 4 -#define DAOS_RSVC_VERSION 4 -#define DAOS_RDB_VERSION 4 +#define DAOS_RSVC_VERSION 5 +#define DAOS_RDB_VERSION 5 #define DAOS_RDBT_VERSION 3 #define DAOS_SEC_VERSION 1 #define DAOS_DTX_VERSION 4 @@ -87,21 +87,39 @@ struct daos_protocol_table { }; static const struct daos_protocol_table daos_rpc_protocol_tables[] = { - { - /* Latest protocol */ - .protocol = DAOS_VERSION_PROTOCAL, - .versions = {DAOS_VOS_VERSION, DAOS_MGMT_VERSION, DAOS_POOL_VERSION, DAOS_CONT_VERSION, - DAOS_OBJ_VERSION, DAOS_REBUILD_VERSION, DAOS_RSVC_VERSION, DAOS_RDB_VERSION, - DAOS_RDBT_VERSION, DAOS_SEC_VERSION, DAOS_DTX_VERSION, DAOS_PIPELINE_VERSION, - DAOS_CHK_VERSION}, - }, + {/* Latest protocol */ + .protocol = DAOS_VERSION_PROTOCAL, + .versions = {[DAOS_VOS_MODULE] = DAOS_VOS_VERSION, + [DAOS_MGMT_MODULE] = DAOS_MGMT_VERSION, + [DAOS_POOL_MODULE] = DAOS_POOL_VERSION, + [DAOS_CONT_MODULE] = DAOS_CONT_VERSION, + 
[DAOS_OBJ_MODULE] = DAOS_OBJ_VERSION, + [DAOS_REBUILD_MODULE] = DAOS_REBUILD_VERSION, + [DAOS_RSVC_MODULE] = DAOS_RSVC_VERSION, + [DAOS_RDB_MODULE] = DAOS_RDB_VERSION, + [DAOS_RDBT_MODULE] = DAOS_RDBT_VERSION, + [DAOS_SEC_MODULE] = DAOS_SEC_VERSION, + [DAOS_DTX_MODULE] = DAOS_DTX_VERSION, + [DAOS_PIPELINE_MODULE] = DAOS_PIPELINE_VERSION, + [DAOS_CHK_MODULE] = DAOS_CHK_VERSION}}, /* Please update DAOS_VERSION_PROTOCOL - 1 table when rolling upgrade is supported. { - .protocol = DAOS_VERSION_PROTOCAL - 1; - .versions = {DAOS_VOS_VERSION, DAOS_MGMT_VERSION, DAOS_POOL_VERSION, DAOS_CONT_VERSION, - DAOS_OBJ_VERSION, DAOS_REBUILD_VERSION, DAOS_RSVC_VERSION, DAOS_RDB_VERSION, - DAOS_RDBT_VERSION, DAOS_SEC_VERSION, DAOS_DTX_VERSION, DAOS_PIPELINE_VERSION, - DAOS_CHK_VERSION}, + .protocol = DAOS_VERSION_PROTOCAL - 1; + .versions = { + [DAOS_VOS_MODULE] = DAOS_VOS_VERSION, + [DAOS_MGMT_MODULE] = DAOS_MGMT_VERSION, + [DAOS_POOL_MODULE] = DAOS_POOL_VERSION, + [DAOS_CONT_MODULE] = DAOS_CONT_VERSION, + [DAOS_OBJ_MODULE] = DAOS_OBJ_VERSION, + [DAOS_REBUILD_MODULE] = DAOS_REBUILD_VERSION, + [DAOS_RSVC_MODULE] = DAOS_RSVC_VERSION, + [DAOS_RDB_MODULE] = DAOS_RDB_VERSION, + [DAOS_RDBT_MODULE] = DAOS_RDBT_VERSION, + [DAOS_SEC_MODULE] = DAOS_SEC_VERSION, + [DAOS_DTX_MODULE] = DAOS_DTX_VERSION, + [DAOS_PIPELINE_MODULE] = DAOS_PIPELINE_VERSION, + [DAOS_CHK_MODULE] = DAOS_CHK_VERSION + } }, */ }; @@ -293,8 +311,6 @@ int daos_rpc_send(crt_rpc_t *rpc, tse_task_t *task); int daos_rpc_complete(crt_rpc_t *rpc, tse_task_t *task); int daos_rpc_send_wait(crt_rpc_t *rpc); -#define DAOS_DEFAULT_SYS_NAME "daos_server" - /* Currently, this is used on rcs in metadata RPC reply buffers. 
*/ static inline bool daos_rpc_retryable_rc(int rc) diff --git a/src/include/daos/tests_lib.h b/src/include/daos/tests_lib.h index 72438598297..9bb15883b8c 100644 --- a/src/include/daos/tests_lib.h +++ b/src/include/daos/tests_lib.h @@ -176,19 +176,20 @@ enum test_cr_class { }; enum test_cr_action { - TCA_DEFAULT = 0, - TCA_INTERACT = 1, - TCA_IGNORE = 2, - TCA_DISCARD = 3, - TCA_READD = 4, - TCA_TRUST_MS = 5, - TCA_TRUST_PS = 6, - TCA_TRUST_TARGET = 7, - TCA_TRUST_MAJORITY = 8, - TCA_TRUST_LATEST = 9, - TCA_TRUST_OLDEST = 10, - TCA_TRUST_EC_PARITY = 11, - TCA_TRUST_EC_DATA = 12, + TCA_STALE = 0xffff, + TCA_DEFAULT = 0, + TCA_INTERACT = 1, + TCA_IGNORE = 2, + TCA_DISCARD = 3, + TCA_READD = 4, + TCA_TRUST_MS = 5, + TCA_TRUST_PS = 6, + TCA_TRUST_TARGET = 7, + TCA_TRUST_MAJORITY = 8, + TCA_TRUST_LATEST = 9, + TCA_TRUST_OLDEST = 10, + TCA_TRUST_EC_PARITY = 11, + TCA_TRUST_EC_DATA = 12, }; struct daos_check_pool_info { diff --git a/src/include/daos_pool.h b/src/include/daos_pool.h index ed732686299..35c018c93c6 100644 --- a/src/include/daos_pool.h +++ b/src/include/daos_pool.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,19 +21,6 @@ extern "C" { #include -/** Type of storage target */ -typedef enum { - DAOS_TP_UNKNOWN, - /** Rotating disk */ - DAOS_TP_HDD, - /** Flash-based */ - DAOS_TP_SSD, - /** Persistent memory */ - DAOS_TP_PM, - /** Volatile memory */ - DAOS_TP_VM, -} daos_target_type_t; - /** Current state of the storage target */ typedef enum { DAOS_TS_UNKNOWN, @@ -75,8 +62,8 @@ struct daos_space { /** Target information */ typedef struct { - /** Target type */ - daos_target_type_t ta_type; + /** padding - not used */ + uint32_t ta_padding; /** Target state */ daos_target_state_t ta_state; /** Target performance */ @@ -107,6 +94,12 @@ enum daos_rebuild_state_t { DRS_COMPLETED = 2, }; +/** For daos_rebuild_status.rs_flags */ +enum daos_rebuild_status_flag { + /** Data redundancy degraded (the pool has one or more DOWN targets) */ + DAOS_RSF_DEGRADED = (1 << 0), +}; + /** Pool rebuild status */ struct daos_rebuild_status { /** pool map version in rebuilding or last completed rebuild */ @@ -124,8 +117,10 @@ struct daos_rebuild_status { }; /** Maximum supported layout version */ uint16_t rs_max_supported_layout_ver; - /** padding of rebuild status */ - int16_t rs_padding16; + /** See daos_rebuild_status_flag. */ + uint8_t rs_flags; + /** Do not access this field by name. 
*/ + uint8_t rs_reserved_; /** Failure on which rank */ int32_t rs_fail_rank; diff --git a/src/include/daos_prop.h b/src/include/daos_prop.h index 467a94b64fd..d41c59bc50b 100644 --- a/src/include/daos_prop.h +++ b/src/include/daos_prop.h @@ -149,7 +149,7 @@ enum daos_pool_props { #define DAOS_PROP_PO_EC_CELL_SZ_MAX (1UL << 30) #define DAOS_PROP_PO_REDUN_FAC_MAX 4 -#define DAOS_PROP_PO_REDUN_FAC_DEFAULT 0 +#define DAOS_PROP_PO_REDUN_FAC_DEFAULT 3 static inline bool daos_rf_is_valid(unsigned long long rf) @@ -291,7 +291,7 @@ enum daos_cont_props { DAOS_PROP_CO_LAYOUT_VER, /** * Checksum on/off + checksum type (CRC16, CRC32, SHA-1 & SHA-2). - * default = DAOS_PROP_CO_CSUM_OFF + * default = DAOS_PROP_CO_CSUM_CRC32 */ DAOS_PROP_CO_CSUM, /** @@ -301,7 +301,7 @@ enum daos_cont_props { DAOS_PROP_CO_CSUM_CHUNK_SIZE, /** * Checksum verification on server. Value = ON/OFF - * default = DAOS_PROP_CO_CSUM_SV_OFF + * default = DAOS_PROP_CO_CSUM_SV_ON */ DAOS_PROP_CO_CSUM_SERVER_VERIFY, /** diff --git a/src/include/daos_srv/bio.h b/src/include/daos_srv/bio.h index 6a2ddb0240b..f6ae1c4a177 100644 --- a/src/include/daos_srv/bio.h +++ b/src/include/daos_srv/bio.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2025 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/include/daos_srv/checker.h b/src/include/daos_srv/checker.h new file mode 100644 index 00000000000..c6e03de3726 --- /dev/null +++ b/src/include/daos_srv/checker.h @@ -0,0 +1,260 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DAOS_CHECKER_H__ +#define __DAOS_CHECKER_H__ + +#include +#include +#include +#include + +#define CHECKER_INDENT_MAX 10 + +/** + * @enum checker_event + * + * Checker event types. 
+ */ +enum checker_event { + CHECKER_EVENT_INVALID = -1, + CHECKER_EVENT_ERROR = 0, + CHECKER_EVENT_WARNING, +}; + +/** + * @struct checker_options + * + * Checker control options. + */ +struct checker_options { + enum checker_event cko_non_zero_padding; +}; + +/** + * @struct checker + * + * Checker state. + */ +struct checker { + /** input */ + void *ck_private; + struct checker_options ck_options; + /** state */ + int ck_level; + char *ck_prefix; + int (*ck_indent_set)(struct checker *ck); + /** output */ + int (*ck_vprintf)(struct checker *ck, const char *fmt, va_list ap); + unsigned ck_warnings_num; +}; + +#define CHECKER_ERROR_INFIX "error: " +#define CHECKER_WARNING_INFIX "warning: " +#define CHECKER_OK_INFIX "ok" + +/** helpers */ + +/** + * Simple argument translation ... -> va_list + * + * \param[in] ck Checker to call. + * \param[in] fmt Format. + * \param[in] ... Format's arguments. + * + * \retval DER_SUCCESS Success. + * \retval -DER_* Error. + */ +static inline int +ck_common_printf(struct checker *ck, const char *fmt, ...) +{ + va_list args; + int rc; + + va_start(args, fmt); + rc = ck->ck_vprintf(ck, fmt, args); + va_end(args); + + return rc; +} + +/** + * Print a btree report as a checker message. + * + * \param[in] arg Checker. + * \param[in] type Btree report type. + * \param[in] fmt Format. + * \param[in] ... Format's arguments. + */ +static inline void +ck_report(void *arg, enum btr_report_type type, const char *fmt, ...) 
+{ + struct checker *ck = arg; + va_list args; + + va_start(args, fmt); + + switch (type) { + case BTR_REPORT_ERROR: + ck_common_printf(ck, "%s%s", ck->ck_prefix, CHECKER_ERROR_INFIX); + ck->ck_vprintf(ck, fmt, args); + break; + case BTR_REPORT_WARNING: + ck_common_printf(ck, "%s%s", ck->ck_prefix, CHECKER_WARNING_INFIX); + ck_common_printf(ck, fmt, args); + ck->ck_warnings_num++; + break; + case BTR_REPORT_MSG: + ck_common_printf(ck, "%s", ck->ck_prefix); + ck_common_printf(ck, fmt, args); + break; + default: + D_ASSERTF(0, "Unknown report type: %x\n", type); + } + + va_end(args); +} + +/** basic helpers */ + +#define IS_CHECKER(ck) (unlikely((ck) != NULL)) + +#define IS_NOT_CHECKER(dp) (likely((ck) == NULL)) + +#define YES_NO_STR(cond) ((cond) ? "yes" : "no") + +/** direct print(f) macros with and without prefix */ + +#define CK_PRINT(ck, msg) \ + do { \ + if (IS_CHECKER(ck)) { \ + (void)ck_common_printf(ck, "%s" msg, (ck)->ck_prefix); \ + } \ + } while (0) + +#define CK_PRINTF(ck, fmt, ...) \ + do { \ + if (IS_CHECKER(ck)) { \ + (void)ck_common_printf(ck, "%s" fmt, (ck)->ck_prefix, __VA_ARGS__); \ + } \ + } while (0) + +#define CK_PRINT_WO_PREFIX(ck, msg) \ + do { \ + if (IS_CHECKER(ck)) { \ + (void)ck_common_printf(ck, msg); \ + } \ + } while (0) + +#define CK_PRINTF_WO_PREFIX(ck, fmt, ...) \ + do { \ + if (IS_CHECKER(ck)) { \ + (void)ck_common_printf(ck, fmt, __VA_ARGS__); \ + } \ + } while (0) + +/** append + new line shortcuts */ + +#define CK_APPENDL_OK(ck) CK_PRINT_WO_PREFIX(ck, CHECKER_OK_INFIX ".\n") + +#define CK_APPENDL_RC(ck, rc) \ + do { \ + if (rc == DER_SUCCESS) { \ + CK_APPENDL_OK(ck); \ + } else { \ + CK_PRINTF_WO_PREFIX(ck, CHECKER_ERROR_INFIX DF_RC "\n", DP_RC(rc)); \ + } \ + } while (0) + +#define CK_APPENDFL_ERR(ck, fmt, ...) \ + CK_PRINTF_WO_PREFIX(ck, CHECKER_ERROR_INFIX fmt "\n", __VA_ARGS__) + +#define CK_APPENDFL_WARN(ck, fmt, ...) 
\ + do { \ + CK_PRINTF_WO_PREFIX(ck, CHECKER_WARNING_INFIX fmt "\n", __VA_ARGS__); \ + ++(ck)->ck_warnings_num; \ + } while (0) + +/** print(f) + return code + new line shortcuts */ + +#define CK_PRINTL_RC(ck, rc, msg) \ + do { \ + if (rc == DER_SUCCESS) { \ + CK_PRINT(ck, msg ": " CHECKER_OK_INFIX ".\n"); \ + } else { \ + CK_PRINTF(ck, CHECKER_ERROR_INFIX msg ": " DF_RC "\n", DP_RC(rc)); \ + } \ + } while (0) + +#define CK_PRINTFL_RC(ck, rc, fmt, ...) \ + do { \ + if (rc == DER_SUCCESS) { \ + CK_PRINTF(ck, fmt ": " CHECKER_OK_INFIX ".\n", __VA_ARGS__); \ + } else { \ + CK_PRINTF(ck, CHECKER_ERROR_INFIX fmt ": " DF_RC "\n", __VA_ARGS__, \ + DP_RC(rc)); \ + } \ + } while (0) + +/** + * An assert while run without a checker. A checker message otherwise. + * + * \param[in] ck Checker's state. + * \param[in] msg Message to print. + * \param[in] cond Condition to assert (without a checker) or condition to check (with a + * checker). + */ +#define CK_ASSERT(ck, msg, cond) \ + do { \ + if (IS_CHECKER(ck)) { \ + CK_PRINTF(ck, msg "%s\n", YES_NO_STR(cond)); \ + } else { \ + D_ASSERT(cond); \ + } \ + } while (0) + +/** manage the checker print's indentation */ + +static inline void +checker_print_indent_inc(struct checker *ck) +{ + if (IS_NOT_CHECKER(ck)) { + return; + } + + if (ck->ck_level == CHECKER_INDENT_MAX) { + CK_PRINT(ck, "Max indent reached.\n"); + return; + } + + ck->ck_level++; + ck->ck_indent_set(ck); +} + +static inline void +checker_print_indent_dec(struct checker *ck) +{ + if (IS_NOT_CHECKER(ck)) { + return; + } + + if (ck->ck_level == 0) { + CK_PRINT(ck, "Min indent reached.\n"); + return; + } + + ck->ck_level--; + ck->ck_indent_set(ck); +} + +#define CK_INDENT(ck, exp) \ + do { \ + checker_print_indent_inc(ck); \ + exp; \ + checker_print_indent_dec(ck); \ + } while (0) + +#endif /** __DAOS_CHECKER_H__ */ diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 3651927c29e..510ffc70ac4 100644 --- 
a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -76,9 +76,11 @@ struct ds_cont_child { ABT_cond sc_scrub_cond; ABT_cond sc_rebuild_cond; ABT_cond sc_fini_cond; + ABT_cond sc_init_cond; uint32_t sc_dtx_resyncing : 1, sc_dtx_reindex : 1, sc_dtx_reindex_abort : 1, - sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_props_fetched : 1, sc_stopping : 1, - sc_destroying : 1, sc_vos_agg_active : 1, sc_ec_agg_active : 1, + sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_csummer_inited : 1, + sc_csummer_initing : 1, sc_stopping : 1, sc_destroying : 1, sc_vos_agg_active : 1, + sc_ec_agg_active : 1, /* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */ sc_rw_disabled : 1, sc_scrubbing : 1, sc_rebuilding : 1, /* flag of sc_ec_agg_eph_boundary valid */ diff --git a/src/include/daos_srv/control.h b/src/include/daos_srv/control.h index fa0d64cb623..b977fb69f7f 100644 --- a/src/include/daos_srv/control.h +++ b/src/include/daos_srv/control.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2020-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -17,13 +18,6 @@ #include #include -/** - * Space separated string of CLI options to pass to DPDK when started during - * spdk_env_init(). These options will override the DPDK defaults. 
- */ -extern const char * -dpdk_cli_override_opts; - #define NVME_PCI_DEV_TYPE_VMD "vmd" #define NVME_DETAIL_BUFLEN 1024 /** @@ -74,6 +68,17 @@ dpdk_cli_override_opts; #define NVME_ROLE_ALL (NVME_ROLE_DATA | NVME_ROLE_META | NVME_ROLE_WAL) +/* Default SPDK log level (one of ERROR,WARN,NOTICE,INFO,DEBUG) */ +#define DAOS_SPDK_LOG_DEFAULT SPDK_LOG_ERROR +/* Max SPDK log level */ +#define DAOS_SPDK_LOG_MAX SPDK_LOG_DEBUG +/* Default DPDK log level: RTE_LOG_ERR (dpdk/lib/eal/include/rte_log.h) */ +#define DAOS_DPDK_LOG_DEFAULT 4 +/* Min DPDK log level: RTE_LOG_EMERG */ +#define DAOS_DPDK_LOG_MIN 1 +/* Max DPDK log level: RTE_LOG_MAX */ +#define DAOS_DPDK_LOG_MAX 8 + /** * Current device health state (health statistics). Periodically updated in * bio_bs_monitor(). Used to determine faulty device status. @@ -169,4 +174,25 @@ struct nvme_ns_t { * \return Zero on success, negative value on error */ int copy_ascii(char *dst, size_t dst_sz, const void *src, size_t src_sz); + +/** + * Build DPDK CLI options string with per-facility log levels. + * Useful for debugging specific facilities while keeping others quiet. + * + * DPDK log level (1-8): 1=EMERG, 2=ALERT, 3=CRIT, 4=ERR, 5=WARNING, + * 6=NOTICE, 7=INFO, 8=DEBUG + * + * \param eal_level Log level for Environment Abstraction Layer facility (1-8) + * \param default_level Default log level for other facilities (1-8) + * + * \return Pointer to static buffer containing DPDK CLI options string, + * or NULL if log levels are out of range. + * + * Example: + * // DEBUG for EAL, ERROR for rest + * const char *opts = dpdk_cli_build_opts(8, 4); + */ +const char * +dpdk_cli_build_opts(int eal_level, int default_level); + #endif /** __CONTROL_H_ */ diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h index 5756c84232e..9c363c86c9c 100644 --- a/src/include/daos_srv/daos_chk.h +++ b/src/include/daos_srv/daos_chk.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -74,9 +74,9 @@ typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags); -int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - struct chk_policy *policies, int pool_nr, uuid_t pools[], - uint32_t api_flags, int phase); +int +chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, + int pool_nr, uuid_t pools[], uint32_t api_flags); int chk_leader_stop(int pool_nr, uuid_t pools[]); diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 5c18dfff639..c32b580db47 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -453,11 +453,9 @@ int dss_parameters_set(unsigned int key_id, uint64_t value); enum dss_ult_flags { /* Periodically created ULTs */ - DSS_ULT_FL_PERIODIC = (1 << 0), + DSS_ULT_FL_PERIODIC = (1 << 0), /* Use DSS_DEEP_STACK_SZ as the stack size */ - DSS_ULT_DEEP_STACK = (1 << 1), - /* Use current ULT (instead of creating new one) for the task. 
*/ - DSS_USE_CURRENT_ULT = (1 << 2), + DSS_ULT_DEEP_STACK = (1 << 1), }; int dss_ult_create(void (*func)(void *), void *arg, int xs_type, int tgt_id, @@ -739,10 +737,11 @@ enum dss_init_state { }; enum dss_media_error_type { - MET_WRITE = 0, /* write error */ - MET_READ, /* read error */ - MET_UNMAP, /* unmap error */ - MET_CSUM /* checksum error */ + MET_WRITE = 0, /* NVME write error */ + MET_READ, /* NVME read error */ + MET_UNMAP, /* NVME unmap error */ + MET_CSUM, /* Checksum error */ + MET_IO_STALLED, /* NVMe I/O stalled */ }; void dss_init_state_set(enum dss_init_state state); @@ -847,4 +846,11 @@ dss_select_module_version(int module_id, uint8_t *module_ver) return dss_select_module_version(module_id, version); \ } +int +dss_vos_pool_create(const char *path, unsigned char *uuid, daos_size_t scm_size, + daos_size_t data_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *pool); +int +dss_vos_pool_open(const char *path, unsigned char *uuid, unsigned int flags, daos_handle_t *pool); + #endif /* __DSS_API_H__ */ diff --git a/src/include/daos_srv/dlck.h b/src/include/daos_srv/dlck.h deleted file mode 100644 index 85745a9bccd..00000000000 --- a/src/include/daos_srv/dlck.h +++ /dev/null @@ -1,12 +0,0 @@ -/** - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP - * - * SPDX-License-Identifier: BSD-2-Clause-Patent - */ - -#ifndef __DAOS_DLCK_H__ -#define __DAOS_DLCK_H__ - -/** placeholder for the DLCK-dedicated APIs */ - -#endif /* __DAOS_DLCK_H__ */ diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 6143ed9b350..873d59ef1b2 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -47,6 +47,7 @@ struct dtx_local_oid_record { * the most optimal way (packed). Please make sure that all necessary padding * is explicit so it could be used in the future. */ +/* clang-format off */ struct dtx_handle { union { struct dtx_entry dth_dte; @@ -92,8 +93,6 @@ struct dtx_handle { dth_modify_shared : 1, /* The DTX entry is in active table. */ dth_active : 1, - /* Leader oid is touched. */ - dth_touched_leader_oid : 1, /* Local TX is started. */ dth_local_tx_started : 1, /* The DTX share lists are inited. */ @@ -117,7 +116,7 @@ struct dtx_handle { /* Locally generate the epoch. */ dth_epoch_owner : 1, /* Flag to commit the local transaction */ - dth_local_complete : 1, padding1 : 12; + dth_local_complete : 1, padding1 : 13; /* The count the DTXs in the dth_dti_cos array. */ uint32_t dth_dti_cos_count; @@ -138,25 +137,14 @@ struct dtx_handle { uint16_t dth_deferred_used_cnt; uint16_t padding2; - union { - struct { - /** The count of objects that are modified by this DTX. */ - uint16_t dth_oid_cnt; - /** The total slots in the dth_oid_array. */ - uint16_t dth_oid_cap; - uint32_t padding3; - /** If more than one objects are modified, the IDs are reocrded here. */ - daos_unit_oid_t *dth_oid_array; - }; - struct { - /** The count of objects stored in dth_local_oid_array. */ - uint16_t dth_local_oid_cnt; - /** The total slots in the dth_local_oid_array. */ - uint16_t dth_local_oid_cap; - uint32_t padding4; - /** The record of all objects touched by the local transaction. */ - struct dtx_local_oid_record *dth_local_oid_array; - }; + struct { + /** The count of objects stored in dth_local_oid_array. */ + uint16_t dth_local_oid_cnt; + /** The total slots in the dth_local_oid_array. 
*/ + uint16_t dth_local_oid_cap; + uint32_t padding4; + /** The record of all objects touched by the local transaction. */ + struct dtx_local_oid_record *dth_local_oid_array; }; /* Hash of the dkey to be modified if applicable. Per modification. */ @@ -179,6 +167,7 @@ struct dtx_handle { int dth_share_tbd_count; uint32_t padding5; }; +/* clang-format on */ /* Each sub transaction handle to manage each sub thandle */ struct dtx_sub_status { diff --git a/src/include/daos_srv/iv.h b/src/include/daos_srv/iv.h index 12f19e98383..46ec9d0f8e8 100644 --- a/src/include/daos_srv/iv.h +++ b/src/include/daos_srv/iv.h @@ -319,6 +319,8 @@ int ds_iv_ns_create(crt_context_t ctx, uuid_t pool_uuid, crt_group_t *grp, void ds_iv_ns_update(struct ds_iv_ns *ns, unsigned int master_rank, uint64_t term); void ds_iv_ns_cleanup(struct ds_iv_ns *ns); +int + ds_iv_ns_reint_prep(struct ds_iv_ns *ns); void ds_iv_ns_stop(struct ds_iv_ns *ns); void ds_iv_ns_leader_stop(struct ds_iv_ns *ns); void ds_iv_ns_start(struct ds_iv_ns *ns); diff --git a/src/include/daos_srv/mgmt_tgt_common.h b/src/include/daos_srv/mgmt_tgt_common.h index e3d616f0339..8268fc6bff3 100644 --- a/src/include/daos_srv/mgmt_tgt_common.h +++ b/src/include/daos_srv/mgmt_tgt_common.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Vdura Inc. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -57,13 +57,14 @@ typedef void (*bind_cpu_fn_t)(int tgt_id); * \param[in] pool_uuid Pool uuid * \param[in] scm_size Per vos file size * \param[in] tgt_nr Vos files number + * \param[in] tgts Target ID array * \param[in] rdb_blob_sz rdb file size (rdb file will not be recreated if size is zero) * \param[in] storage_path Base path to store vos and rdb files * \param[in] bind_cpu_fn Bind a separate cpu to each vos file allocation */ int -ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, daos_size_t rdb_blob_sz, - const char *storage_path, bind_cpu_fn_t bind_cpu_fn); +ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, int *tgts, + daos_size_t rdb_blob_sz, const char *storage_path, bind_cpu_fn_t bind_cpu_fn); /** * Parallel recreate vos files. @@ -74,11 +75,12 @@ ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, daos_si * \param[in] cancel_pending If true, preallocate will abort * \param[in] newborns_path Base path for store vos/rdb files * \param[in] bind_cpu_fn e.g. `dss_bind_to_xstream_cpuset` + * \param[in] tgts Target ID array */ int ds_mgmt_tgt_preallocate_parallel(uuid_t uuid, daos_size_t scm_size, int tgt_nr, bool *cancel_pending, const char *newborns_path, - bind_cpu_fn_t bind_cpu_fn); + bind_cpu_fn_t bind_cpu_fn, int *tgts); /** * Sequential recreate vos files. 
diff --git a/src/include/daos_srv/object.h b/src/include/daos_srv/object.h index bc816d98c6e..9fd82b76d0d 100644 --- a/src/include/daos_srv/object.h +++ b/src/include/daos_srv/object.h @@ -87,10 +87,10 @@ ds_object_migrate_send(struct ds_pool *pool, uuid_t pool_hdl_uuid, uuid_t cont_u uint32_t new_gl_ver, unsigned int migrate_opc, uint64_t *enqueue_id, uint32_t *max_delay); int -ds_migrate_object(struct ds_pool *pool, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_uuid, - uint32_t version, uint32_t generation, uint64_t max_eph, uint32_t opc, - daos_unit_oid_t *oids, daos_epoch_t *epochs, daos_epoch_t *punched_epochs, - unsigned int *shards, uint32_t count, unsigned int tgt_idx, uint32_t new_gl_ver); +ds_migrate_object(uuid_t pool_uuid, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_uuid, uint32_t version, + uint32_t generation, uint64_t max_eph, uint32_t opc, daos_unit_oid_t *oids, + daos_epoch_t *epochs, daos_epoch_t *punched_epochs, unsigned int *shards, + uint32_t count, unsigned int tgt_idx, uint32_t new_gl_ver); void ds_migrate_stop(struct ds_pool *pool, uint32_t ver, unsigned int generation); diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 852cadb71ea..147e4bb3fc1 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,7 @@ struct ds_pool { struct sched_request *sp_ec_ephs_req; uint32_t sp_dtx_resync_version; + uint32_t sp_gl_dtx_resync_version; /* global DTX resync version */ /* Special pool/container handle uuid, which are * created on the pool leader step up, and propagated * to all servers by IV. Then they will be used by server @@ -92,7 +94,7 @@ struct ds_pool { * rebuild job. 
*/ uint32_t sp_rebuild_gen; - int sp_rebuilding; + ATOMIC int sp_rebuilding; /** * someone has already messaged this pool to for rebuild scan, * NB: all xstreams can do lockless-write on it but it's OK @@ -217,7 +219,7 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (pool->sp_rebuilding > 0 || pool->sp_rebuild_scan > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || pool->sp_rebuild_scan > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. @@ -385,7 +387,8 @@ ds_pool_child_map_refresh_async(struct ds_pool_child *dpc); int map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *ranks); - +int +map_ranks_failed(const struct pool_map *map, d_rank_list_t *ranks); void map_ranks_fini(d_rank_list_t *ranks); @@ -564,18 +567,19 @@ int ds_pool_prop_recov_cont_reset(struct rdb_tx *tx, struct ds_rsvc *rsvc); static inline bool -is_pool_rebuild_allowed(struct ds_pool *pool, bool check_delayed_rebuild) +is_pool_rebuild_allowed(struct ds_pool *pool, uint64_t self_heal, bool auto_recovery) { - uint64_t flags = DAOS_SELF_HEAL_AUTO_REBUILD; - - if (check_delayed_rebuild) - flags |= DAOS_SELF_HEAL_DELAY_REBUILD; + bool auto_rebuild_enabled = self_heal & DAOS_SELF_HEAL_AUTO_REBUILD; + bool delay_rebuild_enabled = self_heal & DAOS_SELF_HEAL_DELAY_REBUILD; if (pool->sp_disable_rebuild) return false; - if (!(pool->sp_self_heal & flags)) + + /* If auto recovery is requested, only allow if self_heal enables auto or delay_rebuild */ + if (auto_recovery && !(auto_rebuild_enabled || delay_rebuild_enabled)) return false; + /* Otherwise, rebuild is allowed */ return true; } diff --git a/src/include/daos_srv/ras.h b/src/include/daos_srv/ras.h index 5df47372eb9..8fcf4ae8013 100644 --- a/src/include/daos_srv/ras.h +++ b/src/include/daos_srv/ras.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -65,7 +65,8 @@ X(RAS_SYSTEM_FABRIC_PROV_CHANGED, "system_fabric_provider_changed") \ X(RAS_ENGINE_JOIN_FAILED, "engine_join_failed") \ X(RAS_DEVICE_LINK_SPEED_CHANGED, "device_link_speed_changed") \ - X(RAS_DEVICE_LINK_WIDTH_CHANGED, "device_link_width_changed") + X(RAS_DEVICE_LINK_WIDTH_CHANGED, "device_link_width_changed") \ + X(RAS_DEVICE_LED_SET, "device_led_set") /** Define RAS event enum */ typedef enum { diff --git a/src/include/daos_srv/rdb.h b/src/include/daos_srv/rdb.h index 9ee58895332..6a828caa65f 100644 --- a/src/include/daos_srv/rdb.h +++ b/src/include/daos_srv/rdb.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -114,6 +115,50 @@ */ struct rdb_storage; +/** + * Replica ID + * + * This 64-bit ID type is designed to be passed around by value, rather than + * address, even through it is a struct. 
+ */ +typedef struct { + d_rank_t rri_rank; /**< rank */ + uint32_t rri_gen; /**< generation (see rdb_alloc_replica_gen) */ +} rdb_replica_id_t; + +#define RDB_F_RID "%u.%u" +#define RDB_P_RID(id) id.rri_rank, id.rri_gen + +static inline int +rdb_replica_id_compare(rdb_replica_id_t x, rdb_replica_id_t y) +{ + if (x.rri_rank < y.rri_rank) + return -1; + if (x.rri_rank > y.rri_rank) + return 1; + + if (x.rri_gen < y.rri_gen) + return -1; + if (x.rri_gen > y.rri_gen) + return 1; + + return 0; +} + +/* clang-format off */ +int crt_proc_rdb_replica_id_t(crt_proc_t proc, crt_proc_op_t proc_op, rdb_replica_id_t *p); +/* clang-format on */ + +/** Parameters for creating database storage */ +struct rdb_create_params { + size_t rcp_size; /**< VOS pool size in bytes */ + uint32_t rcp_vos_df_version; /**< VOS durable format version */ + uint32_t rcp_layout_version; /**< layout version (0 for default) */ + rdb_replica_id_t rcp_id; /**< self ID */ + rdb_replica_id_t *rcp_replicas; /**< replica IDs if bootstrapping */ + int rcp_replicas_len; /**< length of rcp_replicas[] */ +}; + struct rdb_cbs; /** @@ -138,15 +183,17 @@ struct rdb_clue { }; /** Database storage methods */ -int rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t size, - uint32_t vos_df_version, const d_rank_list_t *replicas, struct rdb_cbs *cbs, - void *arg, struct rdb_storage **storagep); +/* clang-format off */ +int rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, + struct rdb_create_params *params, struct rdb_cbs *cbs, void *arg, + struct rdb_storage **storagep); int rdb_open(const char *path, const uuid_t uuid, uint64_t caller_term, struct rdb_cbs *cbs, void *arg, struct rdb_storage **storagep); void rdb_close(struct rdb_storage *storage); int rdb_destroy(const char *path, const uuid_t uuid); int rdb_glance(struct rdb_storage *storage, struct rdb_clue *clue); int rdb_dictate(struct rdb_storage *storage); +/* clang-format on */ /** Database (opaque) */ struct rdb; 
@@ -179,7 +226,14 @@ struct rdb_cbs { void (*dc_stop)(struct rdb *db, int err, void *arg); }; +/** Operation for \a rdb_modify_replicas */ +enum rdb_replica_op { + RDB_REPLICA_ADD, /**< add voting replicas */ + RDB_REPLICA_REMOVE /**< remove voting replicas */ +}; + /** Database methods */ +/* clang-format off */ int rdb_start(struct rdb_storage *storage, struct rdb **dbp); void rdb_stop(struct rdb *db, struct rdb_storage **storagep); void rdb_stop_and_close(struct rdb *db); @@ -187,12 +241,17 @@ void rdb_resign(struct rdb *db, uint64_t term); int rdb_campaign(struct rdb *db); bool rdb_is_leader(struct rdb *db, uint64_t *term); int rdb_get_leader(struct rdb *db, uint64_t *term, d_rank_t *rank); +rdb_replica_id_t rdb_get_replica_id(struct rdb *db); +int rdb_get_replicas(struct rdb *db, rdb_replica_id_t **replicas, int *replicas_len); int rdb_get_ranks(struct rdb *db, d_rank_list_t **ranksp); int rdb_get_size(struct rdb *db, size_t *sizep); -int rdb_add_replicas(struct rdb *db, d_rank_list_t *replicas); -int rdb_remove_replicas(struct rdb *db, d_rank_list_t *replicas); +uint32_t rdb_get_version(struct rdb *db); +int rdb_alloc_replica_gen(struct rdb *db, uint64_t term, uint32_t *gen_out); +int rdb_modify_replicas(struct rdb *db, enum rdb_replica_op op, rdb_replica_id_t *replicas, + int *replica_len); int rdb_ping(struct rdb *db, uint64_t caller_term); int rdb_upgrade_vos_pool(struct rdb *db, uint32_t df_version); +/* clang-format on */ /** * Path (opaque) @@ -210,16 +269,19 @@ typedef d_iov_t rdb_path_t; extern d_iov_t rdb_path_root_key; /** Path methods */ +/* clang-format off */ int rdb_path_init(rdb_path_t *path); void rdb_path_fini(rdb_path_t *path); int rdb_path_clone(const rdb_path_t *path, rdb_path_t *new_path); int rdb_path_push(rdb_path_t *path, const d_iov_t *key); +/* clang-format on */ /** * Define a d_iov_t object, named \a prefix + \a name, that represents a * constant string key. See rdb_layout.[ch] for an example of the usage of this * helper macro. 
*/ +/* clang-format off */ #define RDB_STRING_KEY(prefix, name) \ static char prefix ## name ## _buf[] = #name; \ d_iov_t prefix ## name = { \ @@ -227,6 +289,7 @@ d_iov_t prefix ## name = { \ .iov_buf_len = sizeof(prefix ## name ## _buf), \ .iov_len = sizeof(prefix ## name ## _buf) \ } +/* clang-format on */ /** KVS classes */ enum rdb_kvs_class { @@ -261,13 +324,16 @@ struct rdb_tx { #define RDB_NIL_TERM UINT64_MAX /** TX methods */ +/* clang-format off */ int rdb_tx_begin(struct rdb *db, uint64_t term, struct rdb_tx *tx); int rdb_tx_begin_local(struct rdb_storage *storage, struct rdb_tx *tx); void rdb_tx_discard(struct rdb_tx *tx); int rdb_tx_commit(struct rdb_tx *tx); void rdb_tx_end(struct rdb_tx *tx); +/* clang-format on */ /** TX update methods */ +/* clang-format off */ int rdb_tx_create_root(struct rdb_tx *tx, const struct rdb_kvs_attr *attr); int rdb_tx_destroy_root(struct rdb_tx *tx); int rdb_tx_create_kvs(struct rdb_tx *tx, const rdb_path_t *parent, @@ -280,6 +346,7 @@ int rdb_tx_update_critical(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t *key, const d_iov_t *value); int rdb_tx_delete(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t *key); +/* clang-format on */ /** Probe operation codes */ enum rdb_probe_opc { @@ -306,6 +373,7 @@ typedef int (*rdb_iterate_cb_t)(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg); /** TX query methods */ +/* clang-format off */ int rdb_tx_lookup(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t *key, d_iov_t *value); int rdb_tx_fetch(struct rdb_tx *tx, const rdb_path_t *kvs, @@ -315,5 +383,6 @@ int rdb_tx_query_key_max(struct rdb_tx *tx, const rdb_path_t *kvs, d_iov_t *key) int rdb_tx_iterate(struct rdb_tx *tx, const rdb_path_t *kvs, bool backward, rdb_iterate_cb_t cb, void *arg); int rdb_tx_revalidate(struct rdb_tx *tx); +/* clang-format on */ #endif /* DAOS_SRV_RDB_H */ diff --git a/src/include/daos_srv/rebuild.h b/src/include/daos_srv/rebuild.h index b513d04c21a..a161ba8d5ad 100644 --- 
a/src/include/daos_srv/rebuild.h +++ b/src/include/daos_srv/rebuild.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -96,7 +96,7 @@ void ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *rebuild_ daos_epoch_t *current_eph, uint32_t *rebuild_gen); int ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys_self_heal, - uint64_t delay_sec); + bool auto_recovery, uint64_t delay_sec); void ds_rebuild_leader_stop_all(void); void ds_rebuild_abort(uuid_t pool_uuid, unsigned int version, uint32_t rebuild_gen, uint64_t term); diff --git a/src/include/daos_srv/rsvc.h b/src/include/daos_srv/rsvc.h index e7f7ac4c6d2..7f66d66b329 100644 --- a/src/include/daos_srv/rsvc.h +++ b/src/include/daos_srv/rsvc.h @@ -135,29 +135,40 @@ enum ds_rsvc_start_mode { DS_RSVC_DICTATE /**< DANGEROUSLY reset and start the service (see rdb_dictate) */ }; +/* clang-format off */ int ds_rsvc_start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t caller_term, - enum ds_rsvc_start_mode mode, size_t size, uint32_t vos_df_version, - d_rank_list_t *replicas, void *arg); + enum ds_rsvc_start_mode mode, struct rdb_create_params *create_params, void *arg); int ds_rsvc_stop(enum ds_rsvc_class_id class, d_iov_t *id, uint64_t caller_term, bool destroy); int ds_rsvc_stop_leader(enum ds_rsvc_class_id class, d_iov_t *id, struct rsvc_hint *hint); +/* clang-format on */ + +/** Parameters used for creating an rsvc */ +struct ds_rsvc_create_params { + bool scp_bootstrap; /**< create with an initial list of replicas */ + size_t scp_size; /**< size of each replica in bytes */ + uint32_t scp_vos_df_version; /**< version of VOS durable format */ + uint32_t scp_layout_version; /**< version of RDB layout */ + rdb_replica_id_t *scp_replicas; /**< replicas IDs */ 
+ int scp_replicas_len; /**< length of scp_replicas[] */ +}; + +/* clang-format off */ + int ds_rsvc_dist_start(enum ds_rsvc_class_id class, d_iov_t *id, const uuid_t dbid, const d_rank_list_t *ranks, uint64_t caller_term, - enum ds_rsvc_start_mode mode, bool bootstrap, size_t size, - uint32_t vos_df_version); + enum ds_rsvc_start_mode mode, struct ds_rsvc_create_params *create_params); int ds_rsvc_dist_stop(enum ds_rsvc_class_id class, d_iov_t *id, const d_rank_list_t *ranks, d_rank_list_t *excluded, uint64_t caller_term, bool destroy); enum ds_rsvc_state ds_rsvc_get_state(struct ds_rsvc *svc); void ds_rsvc_set_state(struct ds_rsvc *svc, enum ds_rsvc_state state); -void -ds_rsvc_begin_stepping_up(struct ds_rsvc *svc); -int - ds_rsvc_end_stepping_up(struct ds_rsvc *svc, int rc_in, enum ds_rsvc_state state); +void ds_rsvc_begin_stepping_up(struct ds_rsvc *svc); +int ds_rsvc_end_stepping_up(struct ds_rsvc *svc, int rc_in, enum ds_rsvc_state state); int ds_rsvc_add_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks, size_t size, uint32_t vos_df_version); int ds_rsvc_add_replicas(enum ds_rsvc_class_id class, d_iov_t *id, d_rank_list_t *ranks, size_t size, uint32_t vos_df_version, struct rsvc_hint *hint); -int ds_rsvc_remove_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks); +int ds_rsvc_remove_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks, bool destroy); int ds_rsvc_remove_replicas(enum ds_rsvc_class_id class, d_iov_t *id, d_rank_list_t *ranks, struct rsvc_hint *hint); int ds_rsvc_lookup(enum ds_rsvc_class_id class, d_iov_t *id, @@ -186,4 +197,6 @@ void ds_rsvc_request_map_dist(struct ds_rsvc *svc); void ds_rsvc_query_map_dist(struct ds_rsvc *svc, uint32_t *version, bool *idle); void ds_rsvc_wait_map_dist(struct ds_rsvc *svc); +/* clang-format on */ + #endif /* DAOS_SRV_RSVC_H */ diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 07530961cc4..730c2e88742 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ 
-18,6 +18,7 @@ #include #include #include +#include #include #include @@ -491,7 +492,7 @@ vos_pool_upgrade(daos_handle_t poh, uint32_t version); */ int vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *metrics, - daos_handle_t *poh); + struct checker *ck, daos_handle_t *poh); /** * Close a VOSP, all opened containers sharing this pool handle @@ -1053,6 +1054,17 @@ vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch); int vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch); +/** + * Save property for the given container. + * + * \param coh [IN] Container open handle + * \param props [IN] Pointer to container property to be saved. + * + * \return Zero on success, negative value if error. + */ +int +vos_cont_save_props(daos_handle_t coh, struct cont_props *props); + /** * Query the gap between the max allowed aggregation epoch and current HLC. * @@ -1578,7 +1590,7 @@ struct cont_scrub { void *scs_cont_src; daos_handle_t scs_cont_hdl; uuid_t scs_cont_uuid; - bool scs_props_fetched; + bool scs_csummer_inited; }; /* @@ -1798,7 +1810,8 @@ bool vos_oi_exist(daos_handle_t coh, daos_unit_oid_t oid); /* Timing statistic of DTX entries */ -#define DTX_TIME_STAT_COUNT 3 +enum { DTX_TIME_STAT_MIN = 0, DTX_TIME_STAT_MAX, DTX_TIME_STAT_MEAN, DTX_TIME_STAT_COUNT }; + struct dtx_time_stat { daos_epoch_t dts_epoch[DTX_TIME_STAT_COUNT]; uint64_t dts_cmt_time[DTX_TIME_STAT_COUNT]; diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index ca50a41658f..ee64bb92e1b 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2025 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -184,6 +184,8 @@ typedef struct { daos_size_t ci_used; /** Highest (Last) aggregated epoch */ daos_epoch_t ci_hae; + /** latest epoch for writes that require aggregation */ + daos_epoch_t ci_agg_write; /** TODO */ } vos_cont_info_t; diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 4f50f02e323..6963fa8f7f4 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -457,6 +457,7 @@ d_power2_nbits(unsigned int val) return val == LOWEST_BIT_SET(val) ? shift - 1 : shift; } +/* clang-format off */ int d_rank_list_dup(d_rank_list_t **dst, const d_rank_list_t *src); int d_rank_list_dup_sort_uniq(d_rank_list_t **dst, const d_rank_list_t *src); void d_rank_list_filter(d_rank_list_t *src_set, d_rank_list_t *dst_set, @@ -468,6 +469,7 @@ void d_rank_list_free(d_rank_list_t *rank_list); int d_rank_list_copy(d_rank_list_t *dst, d_rank_list_t *src); void d_rank_list_shuffle(d_rank_list_t *rank_list); void d_rank_list_sort(d_rank_list_t *rank_list); +bool d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx); bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx); void d_rank_list_del_at(d_rank_list_t *list, int idx); int d_rank_list_del(d_rank_list_t *rank_list, d_rank_t rank); @@ -479,15 +481,13 @@ int d_rank_list_append(d_rank_list_t *rank_list, d_rank_t rank); int d_rank_list_dump(d_rank_list_t *rank_list, d_string_t name, int name_len); d_rank_list_t *uint32_array_to_rank_list(uint32_t *ints, size_t len); int rank_list_to_uint32_array(d_rank_list_t *rl, uint32_t **ints, 
size_t *len); -int - d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); - +int d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); d_rank_range_list_t *d_rank_range_list_alloc(uint32_t size); d_rank_range_list_t *d_rank_range_list_realloc(d_rank_range_list_t *range_list, uint32_t size); d_rank_range_list_t *d_rank_range_list_create_from_ranks(d_rank_list_t *rank_list); -int - d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); +int d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); void d_rank_range_list_free(d_rank_range_list_t *range_list); +/* clang-format on */ #ifdef FAULT_INJECTION diff --git a/src/include/gurt/debug.h b/src/include/gurt/debug.h index df6e9f48e42..38c728085f5 100644 --- a/src/include/gurt/debug.h +++ b/src/include/gurt/debug.h @@ -322,6 +322,15 @@ int d_log_getdbgbit(d_dbug_t *dbgbit, char *bitname); int d_register_alt_assert(void (*alt_assert)(const int, const char*, const char*, const int)); +/** + * \brief D_FATAL the provided memory range in hex. + * + * \param[in] ptr Start of the memory range. + * \param[in] size Size of the memory range. + */ +void +d_log_memory(const uint8_t *ptr, size_t size); + /** * D_PRINT can be used for output to stdout with or without clog being enabled */ @@ -343,6 +352,17 @@ int d_register_alt_assert(void (*alt_assert)(const int, const char*, assert(0); \ } while (0) +#define D_ASSERTF_MEM(cond, ptr, size, fmt, ...) \ + do { \ + if (likely(cond)) \ + break; \ + D_FATAL("Assertion '%s' failed: " fmt, #cond, ##__VA_ARGS__); \ + d_log_memory((uint8_t *)ptr, size); \ + if (d_alt_assert != NULL) \ + d_alt_assert(0, #cond, __FILE__, __LINE__); \ + assert(0); \ + } while (0) + /* Assert cond is true with message to report on failure */ #define D_ASSERTF(cond, fmt, ...) 
\ do { \ diff --git a/src/mgmt/mgmt_common.c b/src/mgmt/mgmt_common.c index bba5392c107..70ffcc46776 100644 --- a/src/mgmt/mgmt_common.c +++ b/src/mgmt/mgmt_common.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Vdura Inc. * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -82,8 +82,8 @@ ds_mgmt_dir_fsync(const char *dir) } int -ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, daos_size_t rdb_blob_sz, - const char *storage_path, bind_cpu_fn_t bind_cpu_fn) +ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, int *tgts, + daos_size_t rdb_blob_sz, const char *storage_path, bind_cpu_fn_t bind_cpu_fn) { char *newborns_path = NULL; char *pool_newborns_path = NULL; @@ -134,7 +134,7 @@ ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, daos_si /** create VOS files */ rc = ds_mgmt_tgt_preallocate_parallel(pool_uuid, scm_size, tgt_nr, &dummy_cancel_state, - newborns_path, bind_cpu_fn); + newborns_path, bind_cpu_fn, tgts); if (rc) { D_ERROR(DF_UUID ": failed to create tgt vos files: " DF_RC "\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -149,7 +149,7 @@ ds_mgmt_tgt_recreate(uuid_t pool_uuid, daos_size_t scm_size, int tgt_nr, daos_si rc = -DER_NONEXIST; goto out; } - fd = open(rdb_path, O_RDWR | O_CREAT, 0600); + fd = open(rdb_path, O_RDWR | O_CREAT, UMEM_FILE_MODE_DEFAULT); if (fd < 0) { rc = daos_errno2der(errno); D_ERROR("failed to create/open the vos file %s:" DF_RC "\n", rdb_path, @@ -200,7 +200,7 @@ ds_mgmt_tgt_preallocate(uuid_t uuid, daos_size_t scm_size, int tgt_id, const cha D_DEBUG(DB_MGMT, DF_UUID ": creating vos file %s (%ld bytes)\n", DP_UUID(uuid), path, scm_size); - fd = open(path, O_CREAT | O_RDWR, 0600); + fd = open(path, O_CREAT | O_RDWR, UMEM_FILE_MODE_DEFAULT); if (fd < 0) { rc = daos_errno2der(errno); D_ERROR(DF_UUID ": failed to create vos file %s: " DF_RC "\n", 
DP_UUID(uuid), path, @@ -307,7 +307,7 @@ ds_mgmt_tgt_preallocate_sequential(uuid_t uuid, daos_size_t scm_size, int tgt_nr int ds_mgmt_tgt_preallocate_parallel(uuid_t uuid, daos_size_t scm_size, int tgt_nr, bool *cancel_pending, const char *newborns_path, - bind_cpu_fn_t bind_cpu_fn) + bind_cpu_fn_t bind_cpu_fn, int *tgts) { int i; int rc; @@ -329,7 +329,7 @@ ds_mgmt_tgt_preallocate_parallel(uuid_t uuid, daos_size_t scm_size, int tgt_nr, entry = &thrds_list[i]; uuid_copy(entry->tvt_args.tvpa_uuid, uuid); entry->tvt_args.tvpa_scm_size = scm_size; - entry->tvt_args.tvpa_tgt_id = i; + entry->tvt_args.tvpa_tgt_id = (tgts != NULL) ? tgts[i] : i; entry->tvt_args.tvpa_newborns_path = newborns_path; entry->tvt_args.tvpa_bind_cpu_fn = bind_cpu_fn; rc = pthread_create(&entry->tvt_tid, NULL, tgt_preallocate_thrd_func, diff --git a/src/mgmt/pool.pb-c.c b/src/mgmt/pool.pb-c.c index c2ecb62f393..301e074e0d8 100644 --- a/src/mgmt/pool.pb-c.c +++ b/src/mgmt/pool.pb-c.c @@ -3327,20 +3327,28 @@ const ProtobufCMessageDescriptor mgmt__storage_usage_stats__descriptor = (ProtobufCMessageInit) mgmt__storage_usage_stats__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCEnumValue mgmt__pool_rebuild_status__state__enum_values_by_number[3] = +static const ProtobufCEnumValue mgmt__pool_rebuild_status__state__enum_values_by_number[7] = { - { "IDLE", "MGMT__POOL_REBUILD_STATUS__STATE__IDLE", 0 }, - { "DONE", "MGMT__POOL_REBUILD_STATUS__STATE__DONE", 1 }, - { "BUSY", "MGMT__POOL_REBUILD_STATUS__STATE__BUSY", 2 }, + { "BUSY", "MGMT__POOL_REBUILD_STATUS__STATE__BUSY", 0 }, + { "IDLE", "MGMT__POOL_REBUILD_STATUS__STATE__IDLE", 1 }, + { "DONE", "MGMT__POOL_REBUILD_STATUS__STATE__DONE", 2 }, + { "STOPPING", "MGMT__POOL_REBUILD_STATUS__STATE__STOPPING", 3 }, + { "STOPPED", "MGMT__POOL_REBUILD_STATUS__STATE__STOPPED", 4 }, + { "FAILING", "MGMT__POOL_REBUILD_STATUS__STATE__FAILING", 5 }, + { "FAILED", "MGMT__POOL_REBUILD_STATUS__STATE__FAILED", 6 }, }; static const ProtobufCIntRange 
mgmt__pool_rebuild_status__state__value_ranges[] = { -{0, 0},{0, 3} +{0, 0},{0, 7} }; -static const ProtobufCEnumValueIndex mgmt__pool_rebuild_status__state__enum_values_by_name[3] = +static const ProtobufCEnumValueIndex mgmt__pool_rebuild_status__state__enum_values_by_name[7] = { - { "BUSY", 2 }, - { "DONE", 1 }, - { "IDLE", 0 }, + { "BUSY", 0 }, + { "DONE", 2 }, + { "FAILED", 6 }, + { "FAILING", 5 }, + { "IDLE", 1 }, + { "STOPPED", 4 }, + { "STOPPING", 3 }, }; const ProtobufCEnumDescriptor mgmt__pool_rebuild_status__state__descriptor = { @@ -3349,15 +3357,15 @@ const ProtobufCEnumDescriptor mgmt__pool_rebuild_status__state__descriptor = "State", "Mgmt__PoolRebuildStatus__State", "mgmt", - 3, + 7, mgmt__pool_rebuild_status__state__enum_values_by_number, - 3, + 7, mgmt__pool_rebuild_status__state__enum_values_by_name, 1, mgmt__pool_rebuild_status__state__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_rebuild_status__field_descriptors[4] = +static const ProtobufCFieldDescriptor mgmt__pool_rebuild_status__field_descriptors[6] = { { "status", @@ -3407,8 +3415,34 @@ static const ProtobufCFieldDescriptor mgmt__pool_rebuild_status__field_descripto 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "derived_state", + 5, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolRebuildStatus, derived_state), + &mgmt__pool_rebuild_status__state__descriptor, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "degraded", + 6, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_BOOL, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolRebuildStatus, degraded), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_rebuild_status__field_indices_by_name[] = { + 5, /* field[5] = degraded */ + 4, /* field[4] = derived_state */ 2, /* field[2] = objects */ 3, /* field[3] = records */ 
1, /* field[1] = state */ @@ -3417,7 +3451,7 @@ static const unsigned mgmt__pool_rebuild_status__field_indices_by_name[] = { static const ProtobufCIntRange mgmt__pool_rebuild_status__number_ranges[1 + 1] = { { 1, 0 }, - { 0, 4 } + { 0, 6 } }; const ProtobufCMessageDescriptor mgmt__pool_rebuild_status__descriptor = { @@ -3427,7 +3461,7 @@ const ProtobufCMessageDescriptor mgmt__pool_rebuild_status__descriptor = "Mgmt__PoolRebuildStatus", "mgmt", sizeof(Mgmt__PoolRebuildStatus), - 4, + 6, mgmt__pool_rebuild_status__field_descriptors, mgmt__pool_rebuild_status__field_indices_by_name, 1, mgmt__pool_rebuild_status__number_ranges, @@ -4297,40 +4331,6 @@ const ProtobufCMessageDescriptor mgmt__storage_target_usage__descriptor = (ProtobufCMessageInit) mgmt__storage_target_usage__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCEnumValue mgmt__pool_query_target_info__target_type__enum_values_by_number[5] = -{ - { "UNKNOWN", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN", 0 }, - { "HDD", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__HDD", 1 }, - { "SSD", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__SSD", 2 }, - { "PM", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__PM", 3 }, - { "VM", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__VM", 4 }, -}; -static const ProtobufCIntRange mgmt__pool_query_target_info__target_type__value_ranges[] = { -{0, 0},{0, 5} -}; -static const ProtobufCEnumValueIndex mgmt__pool_query_target_info__target_type__enum_values_by_name[5] = -{ - { "HDD", 1 }, - { "PM", 3 }, - { "SSD", 2 }, - { "UNKNOWN", 0 }, - { "VM", 4 }, -}; -const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_type__descriptor = -{ - PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, - "mgmt.PoolQueryTargetInfo.TargetType", - "TargetType", - "Mgmt__PoolQueryTargetInfo__TargetType", - "mgmt", - 5, - mgmt__pool_query_target_info__target_type__enum_values_by_number, - 5, - mgmt__pool_query_target_info__target_type__enum_values_by_name, - 1, - 
mgmt__pool_query_target_info__target_type__value_ranges, - NULL,NULL,NULL,NULL /* reserved[1234] */ -}; static const ProtobufCEnumValue mgmt__pool_query_target_info__target_state__enum_values_by_number[7] = { { "STATE_UNKNOWN", "MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN", 0 }, @@ -4369,20 +4369,8 @@ const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_state__descri mgmt__pool_query_target_info__target_state__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[5] = +static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[4] = { - { - "type", - 1, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryTargetInfo, type), - &mgmt__pool_query_target_info__target_type__descriptor, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, { "state", 2, @@ -4433,16 +4421,15 @@ static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descri }, }; static const unsigned mgmt__pool_query_target_info__field_indices_by_name[] = { - 4, /* field[4] = md_on_ssd_active */ - 3, /* field[3] = mem_file_bytes */ - 2, /* field[2] = space */ - 1, /* field[1] = state */ - 0, /* field[0] = type */ + 3, /* field[3] = md_on_ssd_active */ + 2, /* field[2] = mem_file_bytes */ + 1, /* field[1] = space */ + 0, /* field[0] = state */ }; static const ProtobufCIntRange mgmt__pool_query_target_info__number_ranges[1 + 1] = { - { 1, 0 }, - { 0, 5 } + { 2, 0 }, + { 0, 4 } }; const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = { @@ -4452,7 +4439,7 @@ const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = "Mgmt__PoolQueryTargetInfo", "mgmt", sizeof(Mgmt__PoolQueryTargetInfo), - 5, + 4, mgmt__pool_query_target_info__field_descriptors, mgmt__pool_query_target_info__field_indices_by_name, 1, 
mgmt__pool_query_target_info__number_ranges, diff --git a/src/mgmt/pool.pb-c.h b/src/mgmt/pool.pb-c.h index cdabee8b51e..a8596043bd9 100644 --- a/src/mgmt/pool.pb-c.h +++ b/src/mgmt/pool.pb-c.h @@ -57,31 +57,15 @@ typedef struct _Mgmt__PoolSelfHealEvalReq Mgmt__PoolSelfHealEvalReq; /* --- enums --- */ typedef enum _Mgmt__PoolRebuildStatus__State { - MGMT__POOL_REBUILD_STATUS__STATE__IDLE = 0, - MGMT__POOL_REBUILD_STATUS__STATE__DONE = 1, - MGMT__POOL_REBUILD_STATUS__STATE__BUSY = 2 + MGMT__POOL_REBUILD_STATUS__STATE__BUSY = 0, + MGMT__POOL_REBUILD_STATUS__STATE__IDLE = 1, + MGMT__POOL_REBUILD_STATUS__STATE__DONE = 2, + MGMT__POOL_REBUILD_STATUS__STATE__STOPPING = 3, + MGMT__POOL_REBUILD_STATUS__STATE__STOPPED = 4, + MGMT__POOL_REBUILD_STATUS__STATE__FAILING = 5, + MGMT__POOL_REBUILD_STATUS__STATE__FAILED = 6 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MGMT__POOL_REBUILD_STATUS__STATE) } Mgmt__PoolRebuildStatus__State; -typedef enum _Mgmt__PoolQueryTargetInfo__TargetType { - MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN = 0, - /* - * Rotating disk - */ - MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__HDD = 1, - /* - * Flash-based - */ - MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__SSD = 2, - /* - * Persistent memory - */ - MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__PM = 3, - /* - * Volatile memory - */ - MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__VM = 4 - PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE) -} Mgmt__PoolQueryTargetInfo__TargetType; typedef enum _Mgmt__PoolQueryTargetInfo__TargetState { MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN = 0, /* @@ -777,7 +761,8 @@ struct _Mgmt__StorageUsageStats /* - * PoolRebuildStatus represents a pool's rebuild status. + * PoolRebuildStatus represents a pool's rebuild status, translates to enum daos_rebuild_state_t + * IN_PROGRESS/NOT_STARTED/COMPLETED states. 
*/ struct _Mgmt__PoolRebuildStatus { @@ -789,10 +774,15 @@ struct _Mgmt__PoolRebuildStatus Mgmt__PoolRebuildStatus__State state; uint64_t objects; uint64_t records; + Mgmt__PoolRebuildStatus__State derived_state; + /* + * data redundancy degraded + */ + protobuf_c_boolean degraded; }; #define MGMT__POOL_REBUILD_STATUS__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_rebuild_status__descriptor) \ - , 0, MGMT__POOL_REBUILD_STATUS__STATE__IDLE, 0, 0 } + , 0, MGMT__POOL_REBUILD_STATUS__STATE__BUSY, 0, 0, MGMT__POOL_REBUILD_STATUS__STATE__BUSY, 0 } /* @@ -1112,10 +1102,6 @@ struct _Mgmt__StorageTargetUsage struct _Mgmt__PoolQueryTargetInfo { ProtobufCMessage base; - /* - * Target type jsee enum daos_target_type_t - */ - Mgmt__PoolQueryTargetInfo__TargetType type; /* * target state see enum daos_target_state_t */ @@ -1139,7 +1125,7 @@ struct _Mgmt__PoolQueryTargetInfo }; #define MGMT__POOL_QUERY_TARGET_INFO__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_target_info__descriptor) \ - , MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN, MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL, 0, 0 } + , MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL, 0, 0 } /* @@ -2075,7 +2061,6 @@ extern const ProtobufCMessageDescriptor mgmt__pool_upgrade_req__descriptor; extern const ProtobufCMessageDescriptor mgmt__pool_query_target_req__descriptor; extern const ProtobufCMessageDescriptor mgmt__storage_target_usage__descriptor; extern const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor; -extern const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_type__descriptor; extern const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_state__descriptor; extern const ProtobufCMessageDescriptor mgmt__pool_query_target_resp__descriptor; extern const ProtobufCMessageDescriptor mgmt__pool_rebuild_start_req__descriptor; diff --git a/src/mgmt/srv_chk.c b/src/mgmt/srv_chk.c index 705f4f0609e..3dd937d9bff 100644 --- 
a/src/mgmt/srv_chk.c +++ b/src/mgmt/srv_chk.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -48,7 +48,7 @@ ds_mgmt_chk_parse_uuid(int pool_nr, char **pools, uuid_t **p_uuids) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int32_t pool_nr, char **pools, - uint32_t flags, int32_t phase) + uint32_t flags) { uuid_t *uuids = NULL; struct chk_policy *ply = NULL; @@ -70,7 +70,7 @@ ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, } } - rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags, phase); + rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags); out: D_FREE(uuids); diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index 84424997cc8..6145dadc998 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -406,8 +406,11 @@ static int pool_create_fill_resp(Mgmt__PoolCreateResp *resp, uuid_t uuid, d_rank D_DEBUG(DB_MGMT, "%d service replicas\n", svc_ranks->rl_nr); - rc = ds_mgmt_pool_query(uuid, svc_ranks, &enabled_ranks, NULL, NULL, &pool_info, NULL, NULL, + rc = ds_mgmt_pool_query(uuid, svc_ranks, &enabled_ranks, NULL, NULL, + daos_getmtime_coarse() + 2 * 60 * 1000, &pool_info, NULL, NULL, &mem_file_bytes); + if (DAOS_FAIL_CHECK(DAOS_MGMT_FAIL_CREATE_QUERY)) + rc = -DER_TIMEDOUT; if (rc != 0) { D_ERROR("Failed to query created pool: rc=%d\n", rc); D_GOTO(out, rc); @@ -470,7 +473,7 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) return; } - D_INFO("Received request to create pool on %zu ranks.\n", req->n_ranks); + D_INFO("Received request to create pool %s on %zu ranks.\n", req->uuid, req->n_ranks); if (req->n_tier_bytes != DAOS_MEDIA_MAX) D_GOTO(out, rc = -DER_INVAL); @@ -534,6 +537,15 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_create_fill_resp(&resp, pool_uuid, svc); d_rank_list_free(svc); + if (rc != 0) { + int rc_tmp; + + DL_ERROR(rc, DF_UUID ": failed to fill pool create response", DP_UUID(pool_uuid)); + rc_tmp = ds_mgmt_destroy_pool(pool_uuid, targets); + if (rc_tmp != 0) + DL_ERROR(rc_tmp, DF_UUID ": failed to clean up pool", DP_UUID(pool_uuid)); + goto out; + } out: resp.status = rc; @@ -1744,14 +1756,16 @@ pool_rebuild_status_from_info(Mgmt__PoolRebuildStatus *rebuild, if (rebuild->status == 0) { rebuild->objects = info->rs_obj_nr; rebuild->records = info->rs_rec_nr; - - if (info->rs_version == 0) - rebuild->state = MGMT__POOL_REBUILD_STATUS__STATE__IDLE; - else if (info->rs_state == DRS_COMPLETED) - rebuild->state = MGMT__POOL_REBUILD_STATUS__STATE__DONE; - else - rebuild->state = 
MGMT__POOL_REBUILD_STATUS__STATE__BUSY; } + + if ((info->rs_version == 0) || (info->rs_state == DRS_NOT_STARTED)) + rebuild->state = MGMT__POOL_REBUILD_STATUS__STATE__IDLE; + else if (info->rs_state == DRS_COMPLETED) + rebuild->state = MGMT__POOL_REBUILD_STATUS__STATE__DONE; + else + rebuild->state = MGMT__POOL_REBUILD_STATUS__STATE__BUSY; + + rebuild->degraded = !!(info->rs_flags & DAOS_RSF_DEGRADED); } static void @@ -1807,8 +1821,8 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) pool_info.pi_bits = req->query_mask; rc = ds_mgmt_pool_query(uuid, svc_ranks, &enabled_ranks, &disabled_ranks, &dead_ranks, - &pool_info, &resp.pool_layout_ver, &resp.upgrade_layout_ver, - &resp.mem_file_bytes); + mgmt_ps_call_deadline(), &pool_info, &resp.pool_layout_ver, + &resp.upgrade_layout_ver, &resp.mem_file_bytes); if (rc != 0) { DL_ERROR(rc, DF_UUID ": Failed to query the pool", DP_UUID(uuid)); D_GOTO(error, rc); @@ -1972,7 +1986,6 @@ ds_mgmt_drpc_pool_query_targets(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) resp.infos[i] = &resp_infos[i]; mgmt__pool_query_target_info__init(resp.infos[i]); - resp.infos[i]->type = (Mgmt__PoolQueryTargetInfo__TargetType) infos[i].ta_type; resp.infos[i]->state = (Mgmt__PoolQueryTargetInfo__TargetState) infos[i].ta_state; D_ALLOC_ARRAY(resp.infos[i]->space, DAOS_MEDIA_MAX); if (resp.infos[i]->space == NULL) @@ -2778,7 +2791,7 @@ ds_mgmt_drpc_check_start(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_INFO("Received request to start check\n"); rc = ds_mgmt_check_start(req->n_ranks, req->ranks, req->n_policies, req->policies, - req->n_uuids, req->uuids, req->flags, -1 /* phase */); + req->n_uuids, req->uuids, req->flags); if (rc < 0) D_ERROR("Failed to start check: "DF_RC"\n", DP_RC(rc)); diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 5bc977eae12..5c18c47817c 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -88,7 +88,8 @@ int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, size_t meta_size, daos_prop_t *prop, d_rank_list_t **svcp, int domains_nr, uint32_t *domains); -int ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks); +int + ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *ranks); int ds_mgmt_evict_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uuid_t *handles, size_t n_handles, uint32_t destroy, uint32_t force_destroy, char *machine, uint32_t *count); @@ -124,10 +125,10 @@ int ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, struct daos_pool_cont_info **containers, uint64_t *ncontainers); int - ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, - daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, - uint32_t *upgrade_layout_ver, uint64_t *mem_file_bytes); +ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, uint64_t deadline, + daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, + uint32_t *upgrade_layout_ver, uint64_t *mem_file_bytes); int ds_mgmt_pool_query_targets(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_t rank, d_rank_list_t *tgts, daos_target_info_t **infos, @@ -138,9 +139,10 @@ int const char *user, const char *group); /** srv_chk.c */ -int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, +int + ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase); + uint32_t flags); int ds_mgmt_check_stop(int pool_nr, char 
**pools); int ds_mgmt_check_query(int pool_nr, char **pools, chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf); diff --git a/src/mgmt/srv_pool.c b/src/mgmt/srv_pool.c index 9d7ef2d81ae..0a491ec8a7e 100644 --- a/src/mgmt/srv_pool.c +++ b/src/mgmt/srv_pool.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -154,7 +154,7 @@ ds_mgmt_tgt_pool_create_ranks(uuid_t pool_uuid, d_rank_list_t *rank_list, size_t } static int -ds_mgmt_pool_svc_create(uuid_t pool_uuid, int ntargets, const char *group, d_rank_list_t *ranks, +ds_mgmt_pool_svc_create(uuid_t pool_uuid, const char *group, d_rank_list_t *ranks, daos_prop_t *prop, d_rank_list_t **svc_list, size_t domains_nr, uint32_t *domains) { @@ -170,10 +170,11 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, size_t nvme_size, size_t meta_size, daos_prop_t *prop, d_rank_list_t **svcp, int domains_nr, uint32_t *domains) { - d_rank_list_t *pg_ranks = NULL; - d_rank_list_t *pg_targets = NULL; - int rc; - int rc_cleanup; + d_rank_list_t *pg_ranks = NULL; + d_rank_list_t *pg_targets = NULL; + d_rank_list_t *dummy = NULL; + int rc; + int rc_cleanup; D_DEBUG(DB_MGMT, DF_UUID ": create scm/meta/nvme sizes %ld/%ld/%ld\n", DP_UUID(pool_uuid), scm_size, meta_size, nvme_size); @@ -213,16 +214,33 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, D_GOTO(out, rc = -DER_OOG); } - rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, targets, scm_size, nvme_size, meta_size); + /* Extend the targets list to simulate orphan pool shard. 
*/ + if (DAOS_FAIL_CHECK(DAOS_CHK_ORPHAN_POOL_SHARD)) { + d_rank_t rank; + int i; + + rank = daos_fail_value_get(); + if (!d_rank_in_rank_list(targets, rank)) { + dummy = d_rank_list_alloc(targets->rl_nr + 1); + D_ASSERT(dummy != NULL); + + for (i = 0; i < targets->rl_nr; i++) + dummy->rl_ranks[i] = targets->rl_ranks[i]; + dummy->rl_ranks[targets->rl_nr] = rank; + } + } + + rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, dummy != NULL ? dummy : targets, scm_size, + nvme_size, meta_size); if (rc != 0) { DL_ERROR(rc, DF_UUID ": creating pool on ranks failed", DP_UUID(pool_uuid)); goto out_ranks; } - D_INFO(DF_UUID": creating targets on ranks succeeded\n", DP_UUID(pool_uuid)); + D_INFO(DF_UUID ": creating targets on %d ranks succeeded\n", DP_UUID(pool_uuid), + dummy != NULL ? dummy->rl_nr : targets->rl_nr); - rc = ds_mgmt_pool_svc_create(pool_uuid, targets->rl_nr, group, targets, prop, svcp, - domains_nr, domains); + rc = ds_mgmt_pool_svc_create(pool_uuid, group, targets, prop, svcp, domains_nr, domains); if (rc) { D_ERROR("create pool "DF_UUID" svc failed: rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -233,7 +251,8 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, * round of RPCs. */ out_ranks: - rc_cleanup = ds_mgmt_tgt_pool_destroy_ranks(pool_uuid, targets); + rc_cleanup = + ds_mgmt_tgt_pool_destroy_ranks(pool_uuid, dummy != NULL ? dummy : targets); if (rc_cleanup) D_ERROR(DF_UUID": failed to clean up failed pool: "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc_cleanup)); @@ -247,6 +266,7 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, out: d_rank_list_free(pg_targets); d_rank_list_free(pg_ranks); + d_rank_list_free(dummy); D_DEBUG(DB_MGMT, "create pool "DF_UUID": "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); return rc; @@ -397,6 +417,7 @@ ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, * \param[out] enabled_ranks Optional, returned storage ranks with enabled targets. 
* \param[out] disabled_ranks Optional, returned storage ranks with disabled targets. * \param[out] dead_ranks Optional, returned storage ranks marked DEAD by SWIM. + * \param[in] deadline Unix time deadline in milliseconds * \param[in][out] pool_info Query results * \param[in][out] pool_layout_ver Pool global version * \param[in][out] upgrade_layout_ver Latest pool global version this pool might be upgraded @@ -407,7 +428,7 @@ ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, */ int ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, uint64_t deadline, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver, uint64_t *mem_file_bytes) { @@ -418,9 +439,9 @@ ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **e D_DEBUG(DB_MGMT, "Querying pool "DF_UUID"\n", DP_UUID(pool_uuid)); - return dsc_pool_svc_query(pool_uuid, svc_ranks, mgmt_ps_call_deadline(), enabled_ranks, - disabled_ranks, dead_ranks, pool_info, pool_layout_ver, - upgrade_layout_ver, mem_file_bytes); + return dsc_pool_svc_query(pool_uuid, svc_ranks, deadline, enabled_ranks, disabled_ranks, + dead_ranks, pool_info, pool_layout_ver, upgrade_layout_ver, + mem_file_bytes); } /** diff --git a/src/mgmt/srv_query.c b/src/mgmt/srv_query.c index 51e457ba278..5eab9777648 100644 --- a/src/mgmt/srv_query.c +++ b/src/mgmt/srv_query.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -248,11 +249,14 @@ bio_storage_dev_manage_led(void *arg) return -DER_INVAL; } - /* Set the LED of the VMD device to a FAULT state, tr_addr and state may be updated */ + /** + * Set/Get the LED state of the VMD device, tr_addr and state led_info fields may be + * updated. 
+ */ rc = bio_led_manage(bxc, led_info->tr_addr, led_info->dev_uuid, (unsigned int)led_info->action, (unsigned int *)led_info->state, led_info->duration); - if ((rc != 0) && (rc != -DER_NOTSUPPORTED)) + if (rc != 0) DL_ERROR(rc, "bio_led_manage failed on device:" DF_UUID " (action: %s, state %s)", DP_UUID(led_info->dev_uuid), ctl__led_action__descriptor.values[led_info->action].name, @@ -486,7 +490,7 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) break; resp->devices[i]->ctrlr_namespace_id = dev_info->bdi_ctrlr->nss->id; } else { - D_DEBUG(DB_MGMT, "ctrlr not initialized in bio_dev_info, unplugged?"); + D_DEBUG(DB_MGMT, "ctrlr not initialized in bio_dev_info, is it unplugged?"); } /* Populate NVMe device state */ @@ -513,13 +517,7 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) init_xs_type(), 0, 0); if (rc != 0) { - if (rc == -DER_NOTSUPPORTED) { - resp->devices[i]->ctrlr->led_state = CTL__LED_STATE__NA; - /* Reset rc for non-VMD case */ - rc = 0; - } else { - break; - } + break; } resp->devices[i]->ctrlr->led_state = led_state; @@ -753,14 +751,8 @@ ds_mgmt_dev_set_faulty(uuid_t dev_uuid, Ctl__DevManageResp *resp) /* Set the VMD LED to FAULTY state on init xstream */ rc = dss_ult_execute(bio_storage_dev_manage_led, &led_info, NULL, NULL, init_xs_type(), 0, 0); - if (rc != 0) { - if (rc == -DER_NOTSUPPORTED) - /* Reset rc for non-VMD case */ - rc = 0; - else - DL_ERROR(rc, "FAULT LED state not set on device:" DF_UUID, - DP_UUID(dev_uuid)); - } + if (rc != 0) + DL_ERROR(rc, "FAULT LED state not set on device:" DF_UUID, DP_UUID(dev_uuid)); out: smd_dev_free_info(dev_info); @@ -808,14 +800,13 @@ ds_mgmt_dev_manage_led(Ctl__LedManageReq *req, Ctl__DevManageResp *resp) led_info.state = &led_state; led_info.duration = req->led_duration_mins * 60 * (NSEC_PER_SEC / NSEC_PER_USEC); - /* Manage the VMD LED state on init xstream */ + /* Manage the LED state on init xstream */ rc = dss_ult_execute(bio_storage_dev_manage_led, &led_info, NULL, NULL, init_xs_type(), 0, 0); if 
(rc != 0) { + DL_ERROR(rc, "LED manage failed on device %s (%d)", led_info.tr_addr, rc); resp->device->ctrlr->led_state = CTL__LED_STATE__NA; - if (rc == -DER_NOTSUPPORTED) - /* Reset rc for non-VMD case */ - rc = 0; + resp->status = rc; } else { resp->device->ctrlr->led_state = (Ctl__LedState)led_state; } diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index 9f771cddc75..08db055e5ac 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -418,7 +418,8 @@ recreate_pooltgts() D_ASSERT(pool_info->spi_scm_sz > 0); rc = ds_mgmt_tgt_recreate(pool_info->spi_id, pool_info->spi_scm_sz, - pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], rdb_blob_sz, + pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], + pool_info->spi_tgts[SMD_DEV_TYPE_META], rdb_blob_sz, dss_storage_path, dss_bind_to_xstream_cpuset); if (rc) goto out; @@ -702,7 +703,7 @@ tgt_create_preallocate(void *arg) rc = ds_mgmt_tgt_preallocate_parallel( tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr, &tca->tca_ptrec->cancel_create, newborns_path, - dss_bind_to_xstream_cpuset); + dss_bind_to_xstream_cpuset, NULL); } if (rc) goto out; diff --git a/src/mgmt/tests/SConscript b/src/mgmt/tests/SConscript index b3418e20b5c..bbf5638355c 100644 --- a/src/mgmt/tests/SConscript +++ b/src/mgmt/tests/SConscript @@ -12,7 +12,8 @@ def scons(): denv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) denv.d_test_program('srv_drpc_tests', source=[pb_objs, mocks, 'srv_drpc_tests.c', '../srv_drpc.c'], - LIBS=['cmocka', 'protobuf-c', 'daos_common_pmem', 'gurt', 'uuid', 'bio']) + LIBS=['cmocka', 'protobuf-c', 'daos_common_pmem', 'gurt', 'uuid', 'bio', + 'ssl']) if __name__ == "SCons.Script": diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 
fa593c84f92..382616db41f 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -287,7 +287,7 @@ d_rank_list_t *ds_mgmt_pool_query_dead_ranks_out; int ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, uint64_t deadline, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver, uint64_t *mem_file_bytes) { @@ -374,7 +374,6 @@ mock_ds_mgmt_pool_query_targets_gen_infos(uint32_t n_infos) D_ALLOC_ARRAY(infos, n_infos); for (i = 0; i < n_infos; i++) { - infos[i].ta_type = DAOS_TP_UNKNOWN; infos[i].ta_state = (i == 0) ? DAOS_TS_DOWN_OUT : DAOS_TS_UP_IN; infos[i].ta_space.s_total[DAOS_MEDIA_SCM] = 1000000000; infos[i].ta_space.s_free[DAOS_MEDIA_SCM] = 800000000 + i; @@ -555,7 +554,7 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, d_rank_list_t *targets, } int -ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks) +ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *ranks) { return 0; } @@ -695,7 +694,7 @@ mock_ds_mgmt_dev_set_faulty_setup(void) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase) + uint32_t flags) { return 0; } diff --git a/src/mgmt/tests/srv_drpc_tests.c b/src/mgmt/tests/srv_drpc_tests.c index 36c6535b9a2..aac8c4351ed 100644 --- a/src/mgmt/tests/srv_drpc_tests.c +++ b/src/mgmt/tests/srv_drpc_tests.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1406,7 +1406,7 @@ expect_query_resp_with_info(daos_pool_info_t *exp_info, } static void -test_drpc_pool_query_success(void **state) +test_drpc_pool_query_rebuild_idle_success(void **state) { Drpc__Call call = DRPC__CALL__INIT; Drpc__Response resp = DRPC__RESPONSE__INIT; @@ -1443,7 +1443,7 @@ test_drpc_pool_query_success(void **state) } static void -test_drpc_pool_query_success_rebuild_busy(void **state) +test_drpc_pool_query_rebuild_done_success(void **state) { Drpc__Call call = DRPC__CALL__INIT; Drpc__Response resp = DRPC__RESPONSE__INIT; @@ -1452,6 +1452,7 @@ test_drpc_pool_query_success_rebuild_busy(void **state) init_test_pool_info(&exp_info); init_test_rebuild_status(&exp_info.pi_rebuild_st); exp_info.pi_rebuild_st.rs_version = 1; + exp_info.pi_rebuild_st.rs_state = DRS_COMPLETED; ds_mgmt_pool_query_info_out = exp_info; ds_mgmt_pool_query_mem_bytes = 11; @@ -1459,16 +1460,14 @@ test_drpc_pool_query_success_rebuild_busy(void **state) ds_mgmt_drpc_pool_query(&call, &resp); - expect_query_resp_with_info(&exp_info, - MGMT__POOL_REBUILD_STATUS__STATE__BUSY, - &resp); + expect_query_resp_with_info(&exp_info, MGMT__POOL_REBUILD_STATUS__STATE__DONE, &resp); D_FREE(call.body.data); D_FREE(resp.body.data); } static void -test_drpc_pool_query_success_rebuild_done(void **state) +test_drpc_pool_query_rebuild_busy_success(void **state) { Drpc__Call call = DRPC__CALL__INIT; Drpc__Response resp = DRPC__RESPONSE__INIT; @@ -1477,7 +1476,6 @@ test_drpc_pool_query_success_rebuild_done(void **state) init_test_pool_info(&exp_info); init_test_rebuild_status(&exp_info.pi_rebuild_st); exp_info.pi_rebuild_st.rs_version = 1; - exp_info.pi_rebuild_st.rs_state = DRS_COMPLETED; ds_mgmt_pool_query_info_out = exp_info; ds_mgmt_pool_query_mem_bytes = 11; @@ -1485,16 +1483,76 @@ 
test_drpc_pool_query_success_rebuild_done(void **state) ds_mgmt_drpc_pool_query(&call, &resp); - expect_query_resp_with_info(&exp_info, - MGMT__POOL_REBUILD_STATUS__STATE__DONE, - &resp); + expect_query_resp_with_info(&exp_info, MGMT__POOL_REBUILD_STATUS__STATE__BUSY, &resp); + + D_FREE(call.body.data); + D_FREE(resp.body.data); +} + +static void +test_drpc_pool_query_rebuild_idle_err(void **state) +{ + Drpc__Call call = DRPC__CALL__INIT; + Drpc__Response resp = DRPC__RESPONSE__INIT; + daos_pool_info_t exp_info = {0}; + + init_test_pool_info(&exp_info); + exp_info.pi_rebuild_st.rs_version = 1; + exp_info.pi_rebuild_st.rs_errno = -DER_MISC; + exp_info.pi_rebuild_st.rs_state = DRS_NOT_STARTED; + + ds_mgmt_pool_query_info_out = exp_info; + ds_mgmt_pool_query_mem_bytes = 11; + /* + * rebuild results returned to us shouldn't include the number of + * objects/records if there's an error. + */ + ds_mgmt_pool_query_info_out.pi_rebuild_st.rs_obj_nr = 42; + ds_mgmt_pool_query_info_out.pi_rebuild_st.rs_rec_nr = 999; + + setup_pool_query_drpc_call(&call, TEST_UUID, 0); + + ds_mgmt_drpc_pool_query(&call, &resp); + + expect_query_resp_with_info(&exp_info, MGMT__POOL_REBUILD_STATUS__STATE__IDLE, &resp); D_FREE(call.body.data); D_FREE(resp.body.data); } static void -test_drpc_pool_query_success_rebuild_err(void **state) +test_drpc_pool_query_rebuild_done_err(void **state) +{ + Drpc__Call call = DRPC__CALL__INIT; + Drpc__Response resp = DRPC__RESPONSE__INIT; + daos_pool_info_t exp_info = {0}; + + init_test_pool_info(&exp_info); + exp_info.pi_rebuild_st.rs_version = 1; + exp_info.pi_rebuild_st.rs_errno = -DER_MISC; + exp_info.pi_rebuild_st.rs_state = DRS_COMPLETED; + + ds_mgmt_pool_query_info_out = exp_info; + ds_mgmt_pool_query_mem_bytes = 11; + /* + * rebuild results returned to us shouldn't include the number of + * objects/records if there's an error. 
+ */ + ds_mgmt_pool_query_info_out.pi_rebuild_st.rs_obj_nr = 42; + ds_mgmt_pool_query_info_out.pi_rebuild_st.rs_rec_nr = 999; + + setup_pool_query_drpc_call(&call, TEST_UUID, 0); + + ds_mgmt_drpc_pool_query(&call, &resp); + + expect_query_resp_with_info(&exp_info, MGMT__POOL_REBUILD_STATUS__STATE__DONE, &resp); + + D_FREE(call.body.data); + D_FREE(resp.body.data); +} + +static void +test_drpc_pool_query_rebuild_busy_err(void **state) { Drpc__Call call = DRPC__CALL__INIT; Drpc__Response resp = DRPC__RESPONSE__INIT; @@ -1503,6 +1561,7 @@ test_drpc_pool_query_success_rebuild_err(void **state) init_test_pool_info(&exp_info); exp_info.pi_rebuild_st.rs_version = 1; exp_info.pi_rebuild_st.rs_errno = -DER_MISC; + exp_info.pi_rebuild_st.rs_state = DRS_IN_PROGRESS; ds_mgmt_pool_query_info_out = exp_info; ds_mgmt_pool_query_mem_bytes = 11; @@ -1517,9 +1576,7 @@ test_drpc_pool_query_success_rebuild_err(void **state) ds_mgmt_drpc_pool_query(&call, &resp); - expect_query_resp_with_info(&exp_info, - MGMT__POOL_REBUILD_STATUS__STATE__IDLE, - &resp); + expect_query_resp_with_info(&exp_info, MGMT__POOL_REBUILD_STATUS__STATE__BUSY, &resp); D_FREE(call.body.data); D_FREE(resp.body.data); @@ -1610,7 +1667,6 @@ expect_drpc_pool_query_targets_resp_with_targets(Drpc__Response *resp, for (i = 0; i < exp_infos_len; i++) { uint32_t j; - assert_int_equal(pqt_resp->infos[i]->type, infos[i].ta_type); assert_int_equal(pqt_resp->infos[i]->state, infos[i].ta_state); assert_int_equal(pqt_resp->infos[i]->n_space, DAOS_MEDIA_MAX); assert_int_equal(pqt_resp->infos[i]->mem_file_bytes, mem_file_bytes); @@ -3411,10 +3467,12 @@ main(void) REINT_TEST(test_drpc_reint_bad_uuid), QUERY_TEST(test_drpc_pool_query_bad_uuid), QUERY_TEST(test_drpc_pool_query_mgmt_svc_fails), - QUERY_TEST(test_drpc_pool_query_success), - QUERY_TEST(test_drpc_pool_query_success_rebuild_busy), - QUERY_TEST(test_drpc_pool_query_success_rebuild_done), - QUERY_TEST(test_drpc_pool_query_success_rebuild_err), + 
QUERY_TEST(test_drpc_pool_query_rebuild_idle_success), + QUERY_TEST(test_drpc_pool_query_rebuild_done_success), + QUERY_TEST(test_drpc_pool_query_rebuild_busy_success), + QUERY_TEST(test_drpc_pool_query_rebuild_idle_err), + QUERY_TEST(test_drpc_pool_query_rebuild_done_err), + QUERY_TEST(test_drpc_pool_query_rebuild_busy_err), QUERY_TARGETS_TEST(test_drpc_pool_query_targets_bad_uuid), QUERY_TARGETS_TEST(test_drpc_pool_query_targets_mgmt_svc_fails), QUERY_TARGETS_TEST(test_drpc_pool_query_targets_with_targets), diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index d517e3269d6..ef10ed439a5 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -724,8 +725,8 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc == 0) { if (!shard->do_rebuilding && !shard->do_reintegrating) { tmp_tgt.dct_rank = shard->do_target_rank; - dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, - sizeof(tmp_tgt), &dc_coll_sort_cmp); + dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, + sizeof(tmp_tgt), dc_coll_sort_cmp); D_ASSERT(dct != NULL); goto gen_mbs; diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 7ba74731f62..d4b474b5edf 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1,8 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -387,7 +386,7 @@ obj_layout_refresh(struct dc_object *obj) D_RWLOCK_WRLOCK(&obj->cob_lock); obj_layout_free(obj); - rc = obj_layout_create(obj, 0, true); + rc = obj_layout_create(obj, obj->cob_mode, true); D_RWLOCK_UNLOCK(&obj->cob_lock); return rc; @@ -1717,35 +1716,43 @@ dc_obj_layout_refresh(daos_handle_t oh) } uint32_t -dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint16_t *retry_cnt, - uint16_t *inprogress_cnt, uint32_t timeout_sec) +dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint32_t *retry_cnt, + uint32_t timeout_sec, bool long_delay) { uint32_t delay = 0; - if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) - ++(*inprogress_cnt); - - if (++(*retry_cnt) > 1) { + /* Randomly delay [1, max_delay - 5] for DER_OVERLOAD_RETRY case. */ + if (err == -DER_OVERLOAD_RETRY) { + delay = daos_rpc_rand_delay(timeout_sec) << 20; + } else if (++(*retry_cnt) > 1) { /* Randomly delay [31 ~ 1023] us if it is not the first retried object RPC. */ delay = (d_rand() | ((1 << 5) - 1)) & ((1 << 10) - 1); /* Rebuild is being established on the server side, wait a bit longer */ - if (err == -DER_UPDATE_AGAIN) + if (err == -DER_UPDATE_AGAIN || long_delay) { delay <<= 10; - else if (opc == DAOS_OBJ_RPC_COLL_PUNCH) - /* 128 times of the delay for collective object RPC. */ - delay <<= 7; - else if (opc == DAOS_OBJ_RPC_CPD) - /* 8 times of the delay for compounded RPC. */ - delay <<= 3; - D_DEBUG(DB_IO, "Try to re-sched task %p (%u) for %u/%u times with %u us delay\n", - task, opc, *inprogress_cnt, *retry_cnt, delay); + } else { + switch (opc) { + case DAOS_OBJ_RPC_COLL_PUNCH: + case DAOS_OBJ_RPC_COLL_QUERY: + /* 256 times of the delay for collective object RPC. 
*/ + delay <<= 8; + break; + case DAOS_OBJ_RPC_CPD: + /* 8 times of the delay for compounded RPC. */ + delay <<= 3; + break; + default: + break; + } + + /* Increase delay after multiple times retry. */ + if (*retry_cnt >= 5) + delay <<= 1; + } } - /* - * Randomly delay [1, max_delay - 5] for DER_OVERLOAD_RETRY case. - */ - if (err == -DER_OVERLOAD_RETRY) - delay = daos_rpc_rand_delay(timeout_sec) << 20; + D_DEBUG(DB_IO, "Try to re-sched task %p (%u) for %u times with %u us delay\n", task, opc, + *retry_cnt, delay); return delay; } @@ -1755,10 +1762,12 @@ obj_retry_cb(tse_task_t *task, struct dc_object *obj, struct obj_auxi_args *obj_auxi, bool pmap_stale, bool *io_task_reinited) { - tse_sched_t *sched = tse_task2sched(task); - tse_task_t *pool_task = NULL; - int result = task->dt_result; - int rc; + tse_sched_t *sched = tse_task2sched(task); + tse_task_t *pool_task = NULL; + uint32_t delay = 0; + uint32_t opc = obj_auxi->opc; + int result = task->dt_result; + int rc; if (pmap_stale) { rc = obj_pool_query_task(sched, obj, 0, &pool_task); @@ -1767,8 +1776,6 @@ obj_retry_cb(tse_task_t *task, struct dc_object *obj, } if (obj_auxi->io_retry) { - uint32_t delay = 0; - if (pool_task != NULL) { rc = dc_task_depend(task, 1, &pool_task); if (rc != 0) { @@ -1778,19 +1785,24 @@ obj_retry_cb(tse_task_t *task, struct dc_object *obj, } } + if (obj_is_modification_opc(opc) && result == -DER_TIMEDOUT) + obj_auxi->long_retry_delay = 1; + else if (result != -DER_INPROGRESS) + obj_auxi->long_retry_delay = 0; + if (!pmap_stale) { uint32_t now = daos_gettime_coarse(); - delay = - dc_obj_retry_delay(task, obj_auxi->opc, result, &obj_auxi->retry_cnt, - &obj_auxi->inprogress_cnt, obj_auxi->max_delay); + delay = dc_obj_retry_delay(task, opc, result, &obj_auxi->retry_cnt, + obj_auxi->max_delay, + obj_auxi->long_retry_delay == 1 ? 
true : false); if (result == -DER_INPROGRESS && - ((obj_auxi->retry_warn_ts == 0 && obj_auxi->inprogress_cnt >= 10) || + ((obj_auxi->retry_warn_ts == 0 && obj_auxi->retry_cnt >= 10) || (obj_auxi->retry_warn_ts > 0 && obj_auxi->retry_warn_ts + 10 < now))) { obj_auxi->retry_warn_ts = now; obj_auxi->flags |= ORF_MAYBE_STARVE; D_WARN("The task %p has been retried for %u times, maybe starve\n", - task, obj_auxi->inprogress_cnt); + task, obj_auxi->retry_cnt); } } @@ -4125,8 +4137,10 @@ anchor_update_check_eof(struct obj_auxi_args *obj_auxi, daos_anchor_t *anchor) obj_auxi_shards_iterate(obj_auxi, update_sub_anchor_cb, NULL); sub_anchors = (struct shard_anchors *)anchor->da_sub_anchors; - if (!d_list_empty(&sub_anchors->sa_merged_list)) + if (!d_list_empty(&sub_anchors->sa_merged_list)) { + D_ASSERT(obj_auxi->opc != DAOS_OBJ_RPC_ENUMERATE); return; + } if (sub_anchors_is_eof(sub_anchors)) { daos_obj_list_t *obj_args; @@ -4135,6 +4149,18 @@ anchor_update_check_eof(struct obj_auxi_args *obj_auxi, daos_anchor_t *anchor) obj_args = dc_task_get_args(obj_auxi->obj_task); sub_anchors_free(obj_args, obj_auxi->opc); + } else if (obj_auxi->opc == DAOS_OBJ_RPC_ENUMERATE) { + for (int i = 0; i < sub_anchors->sa_anchors_nr; i++) { + daos_anchor_t *sub_anchor; + + sub_anchor = &sub_anchors->sa_anchors[i].ssa_anchor; + if (!daos_anchor_is_eof(sub_anchor)) { + D_DEBUG(DB_REBUILD, "shard %d sub_anchor %d/%d non EOF", + sub_anchors->sa_anchors[i].ssa_shard, i, + sub_anchors->sa_anchors_nr); + break; + } + } } } @@ -6455,7 +6481,7 @@ shard_anchors_check_alloc_bufs(struct obj_auxi_args *obj_auxi, struct shard_anch } if (obj_args->recxs != NULL) { - if (sub_anchor->ssa_recxs != NULL && sub_anchors->sa_nr == nr) + if (sub_anchor->ssa_recxs != NULL && sub_anchors->sa_nr != nr) D_FREE(sub_anchor->ssa_recxs); if (sub_anchor->ssa_recxs == NULL) { diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 871474f10b5..3708a11ef06 100644 --- a/src/object/cli_shard.c +++ 
b/src/object/cli_shard.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -847,7 +847,8 @@ dc_rw_cb(tse_task_t *task, void *arg) * rec2big errors which can be expected. */ if (rc == -DER_REC2BIG || rc == -DER_NONEXIST || rc == -DER_NO_PERM || - rc == -DER_EXIST || rc == -DER_RF) + rc == -DER_EXIST || rc == -DER_RF || rc == -DER_UPDATE_AGAIN || + rc == -DER_FETCH_AGAIN) D_DEBUG(DB_IO, DF_UOID" rpc %p opc %d to rank %d tag %d: "DF_RC"\n", DP_UOID(orw->orw_oid), rw_args->rpc, opc, rw_args->rpc->cr_ep.ep_rank, rw_args->rpc->cr_ep.ep_tag, DP_RC(rc)); @@ -1598,62 +1599,6 @@ struct obj_enum_args { uint32_t *max_delay; }; -/** - * use iod/iod_csum as vehicle to verify data - */ -static int -csum_enum_verify_recx(struct daos_csummer *csummer, struct obj_enum_rec *rec, - d_iov_t *enum_type_val, struct dcs_csum_info *csum_info) -{ - daos_iod_t tmp_iod = {0}; - d_sg_list_t tmp_sgl = {0}; - struct dcs_iod_csums tmp_iod_csum = {0}; - int rc; - - tmp_iod.iod_size = rec->rec_size; - tmp_iod.iod_type = DAOS_IOD_ARRAY; - tmp_iod.iod_recxs = &rec->rec_recx; - tmp_iod.iod_nr = 1; - - tmp_sgl.sg_nr = tmp_sgl.sg_nr_out = 1; - tmp_sgl.sg_iovs = enum_type_val; - - tmp_iod_csum.ic_nr = 1; - tmp_iod_csum.ic_data = csum_info; - - rc = daos_csummer_verify_iod(csummer, &tmp_iod, &tmp_sgl, - &tmp_iod_csum, NULL, 0, NULL); - - return rc; -} - -/** - * use iod/iod_csum as vehicle to verify data - */ -static int -csum_enum_verify_sv(struct daos_csummer *csummer, struct obj_enum_rec *rec, - d_iov_t *enum_type_val, struct dcs_csum_info *csum_info) -{ - daos_iod_t tmp_iod = {0}; - d_sg_list_t tmp_sgl = {0}; - struct dcs_iod_csums tmp_iod_csum = {0}; - int rc; - - tmp_iod.iod_size = rec->rec_size; - tmp_iod.iod_type = DAOS_IOD_SINGLE; - tmp_iod.iod_nr = 1; - - tmp_sgl.sg_nr = tmp_sgl.sg_nr_out = 1; - 
tmp_sgl.sg_iovs = enum_type_val; - - tmp_iod_csum.ic_nr = 1; - tmp_iod_csum.ic_data = csum_info; - rc = daos_csummer_verify_iod(csummer, &tmp_iod, &tmp_sgl, - &tmp_iod_csum, NULL, 0, NULL); - - return rc; -} - struct csum_enum_args { d_iov_t *csum_iov; struct daos_csummer *csummer; @@ -1663,7 +1608,7 @@ static int verify_csum_cb(daos_key_desc_t *kd, void *buf, unsigned int size, void *arg) { struct dcs_csum_info *ci_to_compare = NULL; - struct csum_enum_args *args = arg; + struct csum_enum_args *args = arg; d_iov_t enum_type_val; int rc; @@ -1671,6 +1616,7 @@ verify_csum_cb(daos_key_desc_t *kd, void *buf, unsigned int size, void *arg) case OBJ_ITER_SINGLE: case OBJ_ITER_RECX: { struct obj_enum_rec *rec; + daos_recx_t *recx; uint64_t rec_data_len; rec = buf; @@ -1690,13 +1636,8 @@ verify_csum_cb(daos_key_desc_t *kd, void *buf, unsigned int size, void *arg) d_iov_set(&enum_type_val, buf, rec_data_len); - if (kd->kd_val_type == OBJ_ITER_RECX) - rc = csum_enum_verify_recx(args->csummer, rec, - &enum_type_val, - ci_to_compare); - else - rc = csum_enum_verify_sv(args->csummer, rec, - &enum_type_val, + recx = (kd->kd_val_type == OBJ_ITER_RECX) ? &rec->rec_recx : NULL; + rc = daos_csummer_verify_value(args->csummer, recx, rec->rec_size, &enum_type_val, ci_to_compare); if (rc != 0) return rc; @@ -1716,9 +1657,7 @@ verify_csum_cb(daos_key_desc_t *kd, void *buf, unsigned int size, void *arg) ci_cast(&ci_to_compare, args->csum_iov); ci_move_next_iov(ci_to_compare, args->csum_iov); - rc = daos_csummer_verify_key(args->csummer, - &enum_type_val, ci_to_compare); - + rc = daos_csummer_verify_key(args->csummer, &enum_type_val, ci_to_compare); if (rc != 0) { D_ERROR("daos_csummer_verify_key error for %s: %d\n", kd->kd_val_type == OBJ_ITER_AKEY ? "AKEY" : "DKEY", rc); diff --git a/src/object/obj_enum.c b/src/object/obj_enum.c index 84669771669..4175d7de907 100644 --- a/src/object/obj_enum.c +++ b/src/object/obj_enum.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -689,9 +690,8 @@ obj_enum_iterate(daos_key_desc_t *kdss, d_sg_list_t *sgl, int nr, ptr = sgl_indexed_byte(sgl, &sgl_idx); D_ASSERTF(ptr != NULL, "kds and sgl don't line up"); - D_DEBUG(DB_REBUILD, "process %d, type %d, ptr %p, len "DF_U64 - ", total %zd\n", i, kds->kd_val_type, ptr, - kds->kd_key_len, sgl->sg_iovs[0].iov_len); + D_DEBUG(DB_REBUILD, "process %d/%d, type %d, ptr %p, len " DF_U64 ", total %zd\n", + i, nr, kds->kd_val_type, ptr, kds->kd_key_len, sgl->sg_iovs[0].iov_len); if (kds->kd_val_type == 0 || (kds->kd_val_type != type && type != -1)) { sgl_move_forward(sgl, &sgl_idx, kds->kd_key_len); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 0710c3ad59d..ba3191e761b 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -464,40 +464,17 @@ struct obj_auxi_args { * ec_wait_recov -- obj fetch wait another EC recovery task, * ec_in_recov -- a EC recovery task */ - uint32_t io_retry:1, - args_initialized:1, - to_leader:1, - spec_shard:1, - spec_group:1, - req_reasbed:1, - is_ec_obj:1, - csum_retry:1, - csum_report:1, - tx_uncertain:1, - nvme_io_err:1, - no_retry:1, - ec_wait_recov:1, - ec_in_recov:1, - new_shard_tasks:1, - reset_param:1, - force_degraded:1, - shards_scheded:1, - sub_anchors:1, - ec_degrade_fetch:1, - tx_convert:1, - cond_modify:1, - /* conf_fetch split to multiple sub-tasks */ - cond_fetch_split:1, - reintegrating:1, - tx_renew:1, - rebuilding:1, - for_migrate:1, - req_dup_sgl:1; + uint32_t new_shard_tasks : 1, reset_param : 1, force_degraded : 1, shards_scheded : 1, + io_retry : 1, args_initialized : 1, to_leader : 1, spec_shard : 1, 
spec_group : 1, + req_reasbed : 1, is_ec_obj : 1, csum_retry : 1, csum_report : 1, tx_uncertain : 1, + nvme_io_err : 1, no_retry : 1, ec_wait_recov : 1, ec_in_recov : 1, rebuilding : 1, + sub_anchors : 1, ec_degrade_fetch : 1, long_retry_delay : 1, cond_fetch_split : 1, + cond_modify : 1, reintegrating : 1, tx_renew : 1, tx_convert : 1, req_dup_sgl : 1, + for_migrate : 1; /* request flags. currently only: ORF_RESEND */ - uint32_t specified_shard; - uint32_t flags; - uint16_t retry_cnt; - uint16_t inprogress_cnt; + uint32_t specified_shard; + uint32_t flags; + uint32_t retry_cnt; /* Last timestamp (in second) when report retry warning message. */ uint32_t retry_warn_ts; struct obj_req_tgts req_tgts; @@ -925,8 +902,8 @@ void obj_decref(struct dc_object *obj); int obj_get_grp_size(struct dc_object *obj); struct dc_object *obj_hdl2ptr(daos_handle_t oh); uint32_t -dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint16_t *retry_cnt, - uint16_t *inprogress_cnt, uint32_t timeout_secs); +dc_obj_retry_delay(tse_task_t *task, uint32_t opc, int err, uint32_t *retry_cnt, + uint32_t timeout_secs, bool long_delay); /* handles, pointers for handling I/O */ struct obj_io_context { @@ -1204,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info); /* obj_layout.c */ int obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr); +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard); int obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md, diff --git a/src/object/obj_layout.c b/src/object/obj_layout.c index 189261ad31e..87958b70a11 100644 --- a/src/object/obj_layout.c +++ b/src/object/obj_layout.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3 return rc; } + +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard) +{ + struct dc_object *obj; + struct dc_obj_shard *obj_shard; + uint32_t grp_idx, i, nr; + + obj = obj_hdl2ptr(oh); + if (obj == NULL) { + D_INFO("invalid oh"); + return; + } + if (shard >= obj->cob_shards_nr) { + D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr); + goto out; + } + + grp_idx = shard / obj->cob_grp_size; + D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard, + grp_idx, obj->cob_grp_size); + for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) { + obj_shard = &obj->cob_shards->do_shards[i]; + D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, " + "rebuilding %d, reintegrating %d, fseq %d", + i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id, + obj_shard->do_target_rank, obj_shard->do_target_idx, + obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq); + } + +out: + obj_decref(obj); +} diff --git a/src/object/obj_tx.c b/src/object/obj_tx.c index 7d37a091f4d..dfe3461eff0 100644 --- a/src/object/obj_tx.c +++ b/src/object/obj_tx.c @@ -101,8 +101,7 @@ struct dc_tx { /** The read requests count */ uint32_t tx_read_cnt; - uint16_t tx_retry_cnt; - uint16_t tx_inprogress_cnt; + uint32_t tx_retry_cnt; /* Last timestamp (in second) when report retry warning message. */ uint32_t tx_retry_warn_ts; /** Pool map version when trigger first IO. 
*/ @@ -1083,15 +1082,14 @@ dc_tx_commit_cb(tse_task_t *task, void *data) if (rc != -DER_TX_RESTART) { uint32_t now = daos_gettime_coarse(); - delay = dc_obj_retry_delay(task, DAOS_OBJ_RPC_CPD, rc, &tx->tx_retry_cnt, - &tx->tx_inprogress_cnt, 0); + delay = dc_obj_retry_delay(task, DAOS_OBJ_RPC_CPD, rc, &tx->tx_retry_cnt, 0, false); if (rc == -DER_INPROGRESS && - ((tx->tx_retry_warn_ts == 0 && tx->tx_inprogress_cnt >= 10) || + ((tx->tx_retry_warn_ts == 0 && tx->tx_retry_cnt >= 10) || (tx->tx_retry_warn_ts > 0 && tx->tx_retry_warn_ts + 10 < now))) { tx->tx_retry_warn_ts = now; tx->tx_maybe_starve = 1; D_WARN("The dist TX task %p has been retried for %u times, maybe starve\n", - task, tx->tx_inprogress_cnt); + task, tx->tx_retry_cnt); } rc1 = tse_task_reinit_with_delay(task, delay); @@ -2588,7 +2586,6 @@ dc_tx_restart_begin(struct dc_tx *tx, uint32_t *backoff) */ tx->tx_status = TX_RESTARTING; tx->tx_retry_cnt = 0; - tx->tx_inprogress_cnt = 0; *backoff = d_backoff_seq_next(&tx->tx_backoff_seq); } diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index bc72894dcaa..28e34dd619b 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -70,7 +70,7 @@ obj_coll_local(crt_rpc_t *rpc, struct daos_coll_shard *shards, struct dtx_coll_e coll_args.ca_tgt_bitmap = dce->dce_bitmap; coll_args.ca_tgt_bitmap_sz = dce->dce_bitmap_sz; - rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_ULT_DEEP_STACK); out: if (octa.octa_versions != NULL) { diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 1390eb656b4..1baef1660a1 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -115,7 +115,7 @@ struct ec_agg_entry { struct ec_agg_par_extent ae_par_extent; /* Parity extent */ daos_handle_t ae_obj_hdl; /* Object handle for cur obj */ struct pl_obj_layout *ae_obj_layout; - struct daos_shard_loc ae_peer_pshards[OBJ_EC_MAX_P]; + struct daos_shard_loc ae_peer_pshards[OBJ_EC_MAX_P]; uint32_t ae_grp_idx; uint32_t ae_is_leader:1, ae_process_partial:1; @@ -1278,6 +1278,42 @@ agg_process_partial_stripe(struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc) +{ + struct pool_target *targets = NULL; + uint32_t failed_tgts_cnt = 0; + int i; + int rc; + + rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets, + &failed_tgts_cnt); + if (rc) { + DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid)); + return false; + } + + if (targets == NULL || failed_tgts_cnt == 0) + return false; + + for (i = 0; i < failed_tgts_cnt; i++) { + if (targets[i].ta_comp.co_rank == peer_loc->sd_rank && + targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) { + D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid), + peer_loc->sd_rank, peer_loc->sd_tgt_idx); + D_FREE(targets); + return true; + } + } + + D_FREE(targets); + return false; +} + int agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) { @@ -1334,6 +1370,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_retryable_err(int err) +{ + return err == -DER_STALE || err == -DER_TIMEDOUT || 
daos_crt_network_error(err); +} + /* Sends the generated parity and the stripe number to the peer * parity target. Handler writes the parity and deletes the replicas * for the stripe. @@ -1382,7 +1424,7 @@ agg_peer_update_ult(void *arg) obj = obj_hdl2ptr(entry->ae_obj_hdl); for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; - bool overloaded; + bool peer_retry; if (peer == pidx) continue; @@ -1390,7 +1432,7 @@ agg_peer_update_ult(void *arg) tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; retry: - overloaded = false; + peer_retry = false; rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_EC_AGGREGATE, &rpc); if (rc) { @@ -1470,13 +1512,20 @@ agg_peer_update_ult(void *arg) rc = ec_agg_out->ea_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, "update parity[%d] to %d:%d, status = " DF_RC "\n", peer, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); peer_updated += rc == 0; } + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + next: if (bulk_hdl) crt_bulk_free(bulk_hdl); @@ -1487,7 +1536,7 @@ agg_peer_update_ult(void *arg) rpc = NULL; bulk_hdl = NULL; iod_csums = NULL; - if (overloaded) { + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; } @@ -1665,13 +1714,13 @@ agg_process_holes_ult(void *arg) for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; uint32_t peer_shard; - bool overloaded; + bool peer_retry; if (pidx == peer) continue; retry: - overloaded = false; + peer_retry = false; D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE); tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag 
= entry->ae_peer_pshards[peer].sd_tgt_idx; @@ -1719,7 +1768,7 @@ agg_process_holes_ult(void *arg) rc = ec_rep_out->er_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = ec_rep_out->er_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, DF_UOID " parity[%d] er_status = " DF_RC "\n", @@ -1728,7 +1777,13 @@ agg_process_holes_ult(void *arg) } crt_req_decref(rpc); rpc = NULL; - if (overloaded) { + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; } @@ -2219,7 +2274,7 @@ agg_shard_is_parity(struct ds_pool *pool, struct ec_agg_entry *agg_entry) /* Initializes the struct holding the iteration state (ec_agg_entry). */ static void -agg_reset_dkey_entry(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry) +agg_reset_dkey_entry(struct ec_agg_entry *agg_entry) { agg_clear_extents(agg_entry); agg_reset_pos(VOS_ITER_AKEY, agg_entry); @@ -2257,7 +2312,7 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry, D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting leader %s\n", DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey), agg_entry->ae_is_leader ? 
"yes" : "no"); - agg_reset_dkey_entry(&agg_param->ap_agg_entry, entry); + agg_reset_dkey_entry(&agg_param->ap_agg_entry); rc = agg_get_obj_handle(agg_entry, true); } else { *acts |= VOS_ITER_CB_SKIP; @@ -2288,9 +2343,9 @@ ec_aggregate_yield(struct ec_agg_param *agg_param) int rc; if (ds_pool_is_rebuilding(agg_param->ap_pool_info.api_pool)) { - D_INFO(DF_UUID": abort ec aggregation, sp_rebuilding %d\n", + D_INFO(DF_UUID ": abort ec aggregation, sp_rebuilding %d\n", DP_UUID(agg_param->ap_pool_info.api_pool->sp_uuid), - agg_param->ap_pool_info.api_pool->sp_rebuilding); + atomic_load(&agg_param->ap_pool_info.api_pool->sp_rebuilding)); return true; } @@ -2346,8 +2401,8 @@ agg_reset_entry(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry, agg_entry->ae_rsize = 0UL; if (entry) { - agg_entry->ae_oid = entry->ie_oid; - agg_entry->ae_codec = obj_id2ec_codec(entry->ie_oid.id_pub); + agg_entry->ae_oid = entry->ie_oid; + agg_entry->ae_codec = obj_id2ec_codec(entry->ie_oid.id_pub); D_ASSERT(agg_entry->ae_codec); } else { agg_entry->ae_codec = NULL; @@ -2366,12 +2421,12 @@ agg_reset_entry(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry, } for (i = 0; i < OBJ_EC_MAX_P; i++) { - agg_entry->ae_peer_pshards[i].sd_rank = DAOS_TGT_IGNORE; + agg_entry->ae_peer_pshards[i].sd_rank = DAOS_TGT_IGNORE; agg_entry->ae_peer_pshards[i].sd_tgt_idx = DAOS_TGT_IGNORE; } agg_reset_pos(VOS_ITER_DKEY, agg_entry); - agg_reset_dkey_entry(agg_entry, entry); + agg_reset_dkey_entry(agg_entry); } static int @@ -2473,7 +2528,7 @@ ec_agg_object(daos_handle_t ih, vos_iter_entry_t *entry, struct ec_agg_param *ag md.omd_pdom_lvl = props.dcp_perf_domain; md.omd_pda = props.dcp_ec_pda; shard_nr = daos_oclass_grp_size(&oca) * daos_obj_id2grp_nr(md.omd_id); - agg_param->ap_credits += roundup(shard_nr, 128) / 128; + agg_param->ap_credits += min(512, roundup(shard_nr, 32) / 32); rc = pl_obj_place(map, agg_entry->ae_oid.id_layout_ver, &md, DAOS_OO_RO, NULL, &agg_entry->ae_obj_layout); @@ -2501,10 +2556,10 
@@ agg_iterate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, * (see obj_inflight_io_check()). */ if (ds_pool_is_rebuilding(agg_param->ap_pool_info.api_pool)) { - D_INFO(DF_CONT" abort as rebuild started, sp_rebuilding %d\n", - DP_CONT(agg_param->ap_pool_info.api_pool_uuid, - agg_param->ap_pool_info.api_cont_uuid), - agg_param->ap_pool_info.api_pool->sp_rebuilding); + D_INFO(DF_CONT " abort as rebuild started, sp_rebuilding %d\n", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid), + atomic_load(&agg_param->ap_pool_info.api_pool->sp_rebuilding)); return -1; } @@ -2529,9 +2584,9 @@ agg_iterate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, } if (rc < 0) { - D_ERROR(DF_UUID" EC aggregation (rebuilding %d) failed: "DF_RC"\n", + D_ERROR(DF_UUID " EC aggregation (rebuilding %d) failed: " DF_RC "\n", DP_UUID(agg_param->ap_pool_info.api_pool->sp_uuid), - agg_param->ap_pool_info.api_pool->sp_rebuilding, DP_RC(rc)); + atomic_load(&agg_param->ap_pool_info.api_pool->sp_rebuilding), DP_RC(rc)); return rc; } @@ -2632,7 +2687,8 @@ ec_agg_param_init(struct ds_cont_child *cont, struct agg_param *param) agg_param->ap_credits_max = EC_AGG_ITERATION_MAX; D_INIT_LIST_HEAD(&agg_param->ap_agg_entry.ae_cur_stripe.as_dextents); - rc = dss_ult_execute(ec_agg_init_ult, agg_param, NULL, NULL, DSS_XS_SYS, 0, 0); + rc = dss_ult_execute(ec_agg_init_ult, agg_param, NULL, NULL, DSS_XS_SYS, 0, + DSS_DEEP_STACK_SZ); if (rc != 0) D_GOTO(out, rc); @@ -2683,6 +2739,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct dtx_id dti = { 0 }; struct dtx_epoch epoch = { 0 }; daos_unit_oid_t oid = { 0 }; + uint64_t ec_agg_eph; int blocks = 0; int rc = 0; @@ -2700,6 +2757,28 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, return rc; } + if (!cont->sc_ec_agg_eph_valid) { + D_DEBUG(DB_EPC, DF_CONT ": pause EC aggregation for sc_ec_agg_eph_boundary.\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid)); + 
return 0; + } + + if (cont->sc_ec_agg_eph == 0) { + D_INFO(DF_CONT ": update cont->sc_ec_agg_eph to " DF_X64, + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + cont->sc_ec_agg_eph_boundary); + cont->sc_ec_agg_eph = cont->sc_ec_agg_eph_boundary; + } + + if (cont->sc_ec_update_timestamp == 0) { + vos_cont_info_t info; + + /* load the timestamp of the last write that can be aggregated from VOS */ + vos_cont_query(ec_agg_param->ap_cont_handle, &info); + cont->sc_ec_update_timestamp = info.ci_agg_write; + } + + ec_agg_eph = cont->sc_ec_agg_eph; ec_agg_param->ap_min_unagg_eph = DAOS_EPOCH_MAX; if (flags & VOS_AGG_FL_FORCE_SCAN) { /** We don't want to use the latest container aggregation epoch for the filter @@ -2712,7 +2791,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, if (ec_agg_param->ap_filter_eph != 0 && ec_agg_param->ap_filter_eph >= cont->sc_ec_update_timestamp) { - D_DEBUG(DB_EPC, DF_CONT" skip EC agg "DF_U64">= "DF_U64"\n", + D_DEBUG(DB_EPC, DF_CONT " skip EC agg " DF_U64 ">= " DF_U64 "\n", DP_CONT(cont->sc_pool_uuid, cont->sc_uuid), ec_agg_param->ap_filter_eph, cont->sc_ec_update_timestamp); goto update_hae; @@ -2785,20 +2864,33 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, cont->sc_ec_agg_active = 0; if (rc == 0) { + /* If pool map updated during this round of aggregation, the sc_ec_agg_eph + * possibly be reset by ds_cont_child_reset_ec_agg_eph_all(). + * For that case should not bump local sc_ec_agg_eph and rescan from the reset + * value (sc_ec_agg_eph_boundary). 
+ */ + if (cont->sc_ec_agg_eph != ec_agg_eph) { + D_INFO(DF_CONT " sc_ec_agg_eph changed from " DF_X64 " to " DF_X64 + " don't bump EC aggregation epoch", + DP_CONT(cont->sc_pool_uuid, cont->sc_uuid), ec_agg_eph, + cont->sc_ec_agg_eph); + return rc; + } + cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi); if (!cont->sc_stopping && cont->sc_query_ec_agg_eph) { - uint64_t orig, cur; + uint64_t orig, cur, cur_eph; + cur_eph = min(ec_agg_param->ap_min_unagg_eph, cont->sc_ec_agg_eph); orig = d_hlc2sec(*cont->sc_query_ec_agg_eph); - cur = d_hlc2sec(cont->sc_ec_agg_eph); + cur = d_hlc2sec(cur_eph); if (orig && cur > orig && (cur - orig) >= 600) D_WARN(DF_CONT" Sluggish EC boundary bumping: " ""DF_U64" -> "DF_U64", gap:"DF_U64"\n", DP_CONT(cont->sc_pool_uuid, cont->sc_uuid), orig, cur, cur - orig); - *cont->sc_query_ec_agg_eph = min(ec_agg_param->ap_min_unagg_eph, - cont->sc_ec_agg_eph); + *cont->sc_query_ec_agg_eph = cur_eph; } } diff --git a/src/object/srv_enum.c b/src/object/srv_enum.c index e1513f02f7f..4f8a4eb7531 100644 --- a/src/object/srv_enum.c +++ b/src/object/srv_enum.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -320,37 +321,6 @@ entry_is_partial_extent(const vos_iter_entry_t *key_ent) return !recx_eq(&key_ent->ie_orig_recx, &key_ent->ie_recx); } -static int -csummer_verify_recx(struct daos_csummer *csummer, d_iov_t *data_to_verify, - daos_recx_t *recx, daos_size_t rsize, - struct dcs_csum_info *csum_info) -{ - int rc; - struct dcs_iod_csums iod_csum = {0}; - daos_iod_t iod = {0}; - d_sg_list_t sgl = {0}; - - iod.iod_type = DAOS_IOD_ARRAY; - iod.iod_recxs = recx; - iod.iod_nr = 1; - iod.iod_size = rsize; - - sgl.sg_iovs = data_to_verify; - sgl.sg_nr = 1; - sgl.sg_nr_out = 1; - - iod_csum.ic_nr = 1; - iod_csum.ic_data = csum_info; - - rc = daos_csummer_verify_iod(csummer, &iod, &sgl, - &iod_csum, NULL, 0, NULL); - if (rc != 0) - D_ERROR("Corruption found for recx "DF_RECX"\n", - DP_RECX(*recx)); - - return rc; -} - static int csummer_alloc_csum_info(struct daos_csummer *csummer, daos_recx_t *recx, daos_size_t rsize, @@ -467,15 +437,13 @@ csum_copy_inline(int type, vos_iter_entry_t *ent, struct ds_obj_enum_arg *arg, return rc; } - rc = csummer_verify_recx(csummer, - &data_to_verify, - &ent_to_verify.ie_orig_recx, - ent_to_verify.ie_rsize, - &ent_to_verify.ie_csum); - + rc = daos_csummer_verify_value(csummer, &ent_to_verify.ie_orig_recx, + ent_to_verify.ie_rsize, &data_to_verify, + &ent_to_verify.ie_csum); D_FREE(data_to_verify.iov_buf); if (rc != 0) { - D_ERROR("Found corruption!\n"); + D_ERROR("Found corrupted recx " DF_RECX "\n", + DP_RECX(ent_to_verify.ie_orig_recx)); return rc; } diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 8c0d92940ed..3d3fbc15447 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -26,6 +26,8 @@ extern struct dss_module_key obj_module_key; +struct migr_res_manager; + /* Per pool attached to the migrate tls(per xstream) */ struct migrate_pool_tls { /* POOL UUID and pool to be migrated 
*/ @@ -73,38 +75,24 @@ struct migrate_pool_tls { /* The ULT number on each target xstream, which actually refer * back to the item within mpt_obj/dkey_ult_cnts array. */ - ATOMIC uint32_t *mpt_tgt_obj_ult_cnt; - ATOMIC uint32_t *mpt_tgt_dkey_ult_cnt; + uint32_t mpt_tgt_obj_ult_cnt; + uint32_t mpt_tgt_dkey_ult_cnt; + /* The current in-flight data size */ + uint64_t mpt_inflight_size; - /* ULT count array from all targets, obj: enumeration, dkey:fetch/update */ - ATOMIC uint32_t *mpt_obj_ult_cnts; - ATOMIC uint32_t *mpt_dkey_ult_cnts; + struct migr_res_manager *mpt_rmg; /* reference count for the structure */ - uint64_t mpt_refcount; - - /* The current in-flight iod, mainly used for controlling - * rebuild in-flight rate to avoid the DMA buffer overflow. - */ - uint64_t mpt_inflight_size; - uint64_t mpt_inflight_max_size; - ABT_cond mpt_inflight_cond; - ABT_mutex mpt_inflight_mutex; - uint32_t mpt_inflight_max_ult; + uint64_t mpt_refcount; uint32_t mpt_opc; - ABT_cond mpt_init_cond; - ABT_mutex mpt_init_mutex; - /* The new layout version for upgrade job */ uint32_t mpt_new_layout_ver; /* migrate leader ULT */ - unsigned int mpt_ult_running:1, - mpt_init_tls:1, - mpt_fini:1, - mpt_reintegrating:1, /* incremental reint flag */ - mpt_post_process_started:1; /* reint post process started flag */ + unsigned int mpt_ult_running : 1, mpt_fini : 1, + mpt_reintegrating : 1, /* incremental reint flag */ + mpt_post_process_started : 1; /* reint post process started flag */ /* migration init error */ int mpt_init_err; @@ -156,6 +144,10 @@ struct obj_tgt_punch_args { void migrate_pool_tls_destroy(struct migrate_pool_tls *tls); +int +obj_migrate_init(void); +void +obj_migrate_fini(void); struct obj_tls { d_sg_list_t ot_echo_sgl; diff --git a/src/object/srv_mod.c b/src/object/srv_mod.c index 822658e7c69..a668089a71d 100644 --- a/src/object/srv_mod.c +++ b/src/object/srv_mod.c @@ -40,9 +40,16 @@ obj_mod_init(void) D_ERROR("failed to obj_ec_codec_init\n"); goto out_class; } + rc = 
obj_migrate_init(); + if (rc) { + D_ERROR("failed to init migration resource managers\n"); + goto out_ec; + } return 0; +out_ec: + obj_ec_codec_fini(); out_class: obj_class_fini(); out_utils: @@ -55,6 +62,7 @@ obj_mod_init(void) static int obj_mod_fini(void) { + obj_migrate_fini(); obj_ec_codec_fini(); obj_class_fini(); obj_utils_fini(); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 806e904cab4..2ecd52f1a0f 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -701,6 +701,22 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips sizes[i] = iods[idx].iod_size; D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid), i, iods[idx].iod_size); + if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) { + D_DEBUG(DB_REBUILD, + DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, " + "i:%d/idx:%d, iod_nr %d, orw_epoch " DF_X64 + ", orw_epoch_first " DF_X64 " may cause DER_DATA_LOSS", + DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), + DP_UOID(orw->orw_oid), i, idx, iods[idx].iod_nr, orw->orw_epoch, + orw->orw_epoch_first); + if (iods[idx].iod_type == DAOS_IOD_ARRAY) { + int j; + + for (j = 0; j < min(8, iods[idx].iod_nr); j++) + D_DEBUG(DB_REBUILD, "recx[%d] - " DF_RECX, j, + DP_RECX(iods[idx].iod_recxs[j])); + } + } idx++; } @@ -1368,7 +1384,7 @@ struct ec_agg_boundary_arg { }; static int -obj_fetch_ec_agg_boundary(void *data) +obj_fetch_ec_agg_boundary_ult(void *data) { struct ec_agg_boundary_arg *arg = data; int rc; @@ -1381,6 +1397,34 @@ obj_fetch_ec_agg_boundary(void *data) return rc; } +static int +obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid) +{ + struct ec_agg_boundary_arg arg; + int rc; + + 
arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; + uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); + rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, + DSS_DEEP_STACK_SZ); + if (rc) { + DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), + DP_UOID(*uoid)); + return rc; + } + if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = -DER_FETCH_AGAIN; + DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid)); + return rc; + } + D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid), + ioc->ioc_coc->sc_ec_agg_eph_boundary); + return 0; +} + static int obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods, struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips, @@ -1503,29 +1547,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io } if ((ec_deg_fetch || (ec_recov && get_parity_list)) && ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - struct ec_agg_boundary_arg arg; - - arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; - uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); - rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL, - DSS_XS_SYS, 0, 0); + rc = obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid); if (rc) { DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(orw->orw_oid)); goto out; } - if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - rc = -DER_FETCH_AGAIN; - DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", - DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid)); - goto out; - } - D_DEBUG(DB_IO, - DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", - 
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary); + D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid); } if (get_parity_list) { D_ASSERT(!ec_deg_fetch); @@ -2424,9 +2453,9 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { /* EC aggregation is still inflight, rebuild should wait until it's paused */ if (ds_cont_child_ec_aggregating(child)) { - D_ERROR(DF_CONT" ec aggregate still active, rebuilding %d\n", + D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n", DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), - child->sc_pool->spc_pool->sp_rebuilding); + atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)); return -DER_UPDATE_AGAIN; } } @@ -2434,7 +2463,7 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY)) return 0; - if (child->sc_pool->spc_pool->sp_rebuilding) { + if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) { uint32_t version; ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD, @@ -2458,19 +2487,6 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, return -DER_UPDATE_AGAIN; } - /* All I/O during rebuilding, needs to wait for the rebuild fence to - * be generated (see rebuild_prepare_one()), which will create a boundary - * for rebuild, so the data after boundary(epoch) should not be rebuilt, - * which otherwise might be written duplicately, which might cause - * the failure in VOS. 
- */ - if ((flags & ORF_REBUILDING_IO) && - (is_pool_rebuild_allowed(child->sc_pool->spc_pool, false) && - child->sc_pool->spc_rebuild_fence == 0)) { - D_ERROR("rebuilding "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); - return -DER_UPDATE_AGAIN; - } - return 0; } @@ -3043,6 +3059,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) { uint64_t rebuild_epoch; + if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid); + if (rc) { + DL_ERROR(rc, + DF_CONT ", " DF_UOID " fetch ec_agg_boundary " + "failed.", + DP_CONT(ioc.ioc_coc->sc_pool_uuid, + ioc.ioc_coc->sc_uuid), + DP_UOID(orw->orw_oid)); + goto out; + } + D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid); + } + D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first, "bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n", orw->orw_epoch, orw->orw_epoch_first); @@ -3259,6 +3289,27 @@ obj_enum_complete(crt_rpc_t *rpc, int status, int map_version, D_FREE(oeo->oeo_csum_iov.iov_buf); } +static void +dump_enum_anchor(daos_unit_oid_t uoid, daos_anchor_t *anchor, char *str) +{ + int nr = DAOS_ANCHOR_BUF_MAX / 8; + int i; + uint64_t data[nr]; + + D_DEBUG(DB_REBUILD, DF_UOID "%s anchor -", DP_UOID(uoid), str); + D_DEBUG(DB_REBUILD, "type %d, shard %d, flags 0x%x\n", anchor->da_type, anchor->da_shard, + anchor->da_flags); + for (i = 0; i < nr; i++) + data[i] = *(uint64_t *)((char *)anchor->da_buf + i * 8); + if (nr >= 13) + D_DEBUG(DB_REBUILD, + "da_buf " DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 + "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 + "," DF_X64, + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + data[8], data[9], data[10], data[11], data[12]); +} + static int obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, struct vos_iter_anchors *anchors, struct ds_obj_enum_arg *enum_arg, @@ -3327,6 +3378,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, 
D_ASSERT(opc == DAOS_OBJ_RPC_ENUMERATE); type = VOS_ITER_DKEY; param.ip_flags |= VOS_IT_RECX_VISIBLE; + dump_enum_anchor(oei->oei_oid, &anchors->ia_dkey, "dkey"); + dump_enum_anchor(oei->oei_oid, &anchors->ia_akey, "akey"); if (daos_anchor_get_flags(&anchors->ia_dkey) & DIOF_WITH_SPEC_EPOCH) { /* For obj verification case. */ @@ -3344,7 +3397,12 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, enum_arg->chk_key2big = 1; enum_arg->need_punch = 1; enum_arg->copy_data_cb = vos_iter_copy; - fill_oid(oei->oei_oid, enum_arg); + rc = fill_oid(oei->oei_oid, enum_arg); + if (rc != 0) { + rc = -DER_KEY2BIG; + DL_ERROR(rc, DF_UOID "fill oid failed", DP_UOID(oei->oei_oid)); + goto failed; + } } /* @@ -5996,13 +6054,14 @@ ds_obj_coll_query_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coc, rc); out: - D_DEBUG(DB_IO, "Handled collective query RPC %p %s forwarding for obj "DF_UOID - " on rank %u XS %u/%u epc "DF_X64" pmv %u, with dti "DF_DTI", dct_nr %u, " - "forward width %u, forward depth %u\n: "DF_RC"\n", rpc, - ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with", DP_UOID(ocqi->ocqi_oid), - myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch, ocqi->ocqi_map_ver, - DP_DTI(&ocqi->ocqi_xid), (unsigned int)ocqi->ocqi_tgts.ca_count, - ocqi->ocqi_disp_width, ocqi->ocqi_disp_depth, DP_RC(rc)); + DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS, DLOG_ERR, DB_IO, rc, + "Handled collective query RPC %p %s forwarding for obj " DF_UOID " on rank %u XS " + "%u/%u epc " DF_X64 " pmv %u, with dti " DF_DTI ", dct_nr %u, forward width %u, " + "forward depth %u", + rpc, ocqi->ocqi_tgts.ca_count <= 1 ? 
"without" : "with", DP_UOID(ocqi->ocqi_oid), + myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch, ocqi->ocqi_map_ver, + DP_DTI(&ocqi->ocqi_xid), (unsigned int)ocqi->ocqi_tgts.ca_count, + ocqi->ocqi_disp_width, ocqi->ocqi_disp_depth); obj_reply_set_status(rpc, rc); obj_reply_map_version_set(rpc, version); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 1d936ad6557..2ccda79e4c2 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -31,15 +31,77 @@ #pragma GCC diagnostic ignored "-Wframe-larger-than=" #endif -/* Max in-flight data size per xstream */ -/* Set the total in-flight size to be 25% of MAX DMA size for +/* Max in-flight transfer size per xstream */ +/* Set the total in-flight size to be 50% of MAX DMA size for * the moment, will adjust it later if needed. */ -#define MIGRATE_MAX_SIZE (1 << 28) -/* Max migrate ULT number on the server */ -#define MIGRATE_DEFAULT_MAX_ULT 4096 +#define MIGR_TGT_INF_DATA (1 << 29) + +/* Threshold for very large transfers. + * This may exceed the MIGR_TGT_INF_DATA limit to prevent starvation. + * Only one such transfer is allowed at a time. + */ +#define MIGR_INF_DATA_HULK (1 << 28) + +/* Low water mark for DMA buffer usage, hulk transfer is allowed in this case. 
+ */ +#define MIGR_INF_DATA_LWM (1 << 28) + #define ENV_MIGRATE_ULT_CNT "D_MIGRATE_ULT_CNT" + +/* Number of migration ULTs per target */ +#define MIGR_TGT_ULTS_MIN 100 +#define MIGR_TGT_ULTS_DEF 500 +#define MIGR_TGT_ULTS_MAX 2000 + +/* 1/3 object ults, 2/3 key ULTs */ +#define MIGR_OBJ_ULT_PERCENT 33 + +#define MIGR_TGT_OBJ_ULTS(ults) ((ults * MIGR_OBJ_ULT_PERCENT) / 100) +#define MIGR_TGT_KEY_ULTS(ults) (ults - MIGR_TGT_OBJ_ULTS(ults)) + +enum { + MIGR_OBJ = 0, + MIGR_KEY, + MIGR_DATA, + MIGR_MAX, +}; + +/* resource consumed by migration */ +struct migr_resource { + const char *res_name; + /* upper limit of the resource */ + long res_limit; + /* resource amount in "unit" */ + long res_units; + /* number of waiters on this resource */ + int res_waiters; + /* Only used by MIGR_DATA, it always allows exactly one ULT to use unbounded + * buffer for super large value (rare). + */ + int res_hulk; + /* ABT_cond for waiters */ + ABT_cond res_cond; +}; + +/* migration resources manager */ +struct migr_res_manager { + ABT_mutex rmg_mutex; + struct migr_resource rmg_resources[MIGR_MAX]; +}; + +struct migr_engine_res { + /* total ULTs per target, it a tunable which can be set by admin */ + unsigned int er_max_ults; + /* dss_tgt_nr resource managers */ + struct migr_res_manager *er_rmgs; +}; + +static struct migr_engine_res migr_eng_res; + struct migrate_one { + struct migrate_pool_tls *mo_tls; + struct iter_obj_arg *mo_obj_arg; daos_key_t mo_dkey; uint64_t mo_dkey_hash; uuid_t mo_pool_uuid; @@ -114,9 +176,13 @@ struct iter_cont_arg { /* Argument for object iteration and migrate */ struct iter_obj_arg { + struct migrate_pool_tls *pool_tls; uuid_t pool_uuid; uuid_t cont_uuid; daos_unit_oid_t oid; + daos_handle_t ioa_oh; + int ioa_obj_ref; + struct daos_oclass_attr ioa_oca; daos_epoch_t epoch; daos_epoch_t punched_epoch; unsigned int shard; @@ -127,6 +193,40 @@ struct iter_obj_arg { uint32_t generation; }; +void +migrate_pool_tls_put(struct migrate_pool_tls *tls); + +static 
void +migrate_obj_get(struct iter_obj_arg *arg) +{ + arg->ioa_obj_ref++; +} + +static void +obj_iter_arg_free(struct iter_obj_arg *arg) +{ + if (arg->pool_tls) + migrate_pool_tls_put(arg->pool_tls); + if (arg->snaps) + D_FREE(arg->snaps); + D_FREE(arg); +} + +static void +migrate_obj_put(struct iter_obj_arg *arg) +{ + D_ASSERTF(arg->ioa_obj_ref > 0, DF_CONT " obj " DF_UOID " bad ioa_obj_ref %d\n", + DP_CONT(arg->pool_uuid, arg->cont_uuid), DP_UOID(arg->oid), arg->ioa_obj_ref); + arg->ioa_obj_ref--; + if (arg->ioa_obj_ref == 0) { + if (daos_handle_is_valid(arg->ioa_oh)) { + dsc_obj_close(arg->ioa_oh); + arg->ioa_oh = DAOS_HDL_INVAL; + } + obj_iter_arg_free(arg); + } +} + static int obj_tree_destory_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void *data) @@ -383,10 +483,6 @@ migrate_pool_tls_destroy(struct migrate_pool_tls *tls) if (daos_handle_is_valid(tls->mpt_pool_hdl)) dsc_pool_close(tls->mpt_pool_hdl); - if (tls->mpt_obj_ult_cnts) - D_FREE(tls->mpt_obj_ult_cnts); - if (tls->mpt_dkey_ult_cnts) - D_FREE(tls->mpt_dkey_ult_cnts); d_list_del(&tls->mpt_list); D_DEBUG(DB_REBUILD, DF_RB ": TLS destroy\n", DP_RB_MPT(tls)); if (tls->mpt_pool) @@ -395,14 +491,6 @@ migrate_pool_tls_destroy(struct migrate_pool_tls *tls) D_FREE(tls->mpt_svc_list.rl_ranks); if (tls->mpt_done_eventual) ABT_eventual_free(&tls->mpt_done_eventual); - if (tls->mpt_inflight_cond) - ABT_cond_free(&tls->mpt_inflight_cond); - if (tls->mpt_inflight_mutex) - ABT_mutex_free(&tls->mpt_inflight_mutex); - if (tls->mpt_init_cond) - ABT_cond_free(&tls->mpt_init_cond); - if (tls->mpt_init_mutex) - ABT_mutex_free(&tls->mpt_init_mutex); if (daos_handle_is_valid(tls->mpt_root_hdl)) obj_tree_destroy(tls->mpt_root_hdl); if (daos_handle_is_valid(tls->mpt_migrated_root_hdl)) @@ -452,56 +540,30 @@ migrate_pool_tls_lookup(uuid_t pool_uuid, unsigned int ver, uint32_t gen) return found; } -#define MPT_CREATE_TGT_INLINE (32) -struct migrate_pool_tls_create_arg { - uuid_t pool_uuid; - uuid_t pool_hdl_uuid; - 
uuid_t co_hdl_uuid; - d_rank_list_t *svc_list; - uint8_t *tgt_status; - uint8_t tgt_status_inline[MPT_CREATE_TGT_INLINE]; - uint32_t *tgt_in_ver; - uint32_t tgt_in_ver_inline[MPT_CREATE_TGT_INLINE]; - ATOMIC uint32_t *obj_ult_cnts; - ATOMIC uint32_t *dkey_ult_cnts; - uint64_t max_eph; - unsigned int version; - unsigned int generation; - uint32_t opc; - uint32_t new_layout_ver; - uint32_t max_ult_cnt; -}; - -int -migrate_pool_tls_create_one(void *data) +static int +migrate_pool_tls_create(uuid_t pool_uuid, unsigned int version, unsigned int generation, + uuid_t pool_hdl_uuid, uuid_t co_hdl_uuid, uint64_t max_eph, + uint32_t new_layout_ver, uint32_t opc, struct migrate_pool_tls **p_tls, + d_rank_list_t *svc_list, uint8_t tgt_status, uint32_t tgt_in_ver) { - struct migrate_pool_tls_create_arg *arg = data; - struct obj_tls *tls = obj_tls_get(); - uint32_t tgt_id; - struct migrate_pool_tls *pool_tls; + struct obj_tls *obj_tls = obj_tls_get(); + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + struct migrate_pool_tls *pool_tls = NULL; struct ds_pool_child *pool_child = NULL; int rc = 0; - pool_tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); - if (pool_tls != NULL) { - /* Some one else already created, because collective function - * might yield xstream. 
- */ - migrate_pool_tls_put(pool_tls); - return 0; - } + D_ASSERT(generation != (unsigned int)(-1)); - pool_child = ds_pool_child_lookup(arg->pool_uuid); + pool_child = ds_pool_child_lookup(pool_uuid); if (pool_child == NULL) { /* Local ds_pool_child isn't started yet, return a retry-able error */ if (dss_get_module_info()->dmi_xs_id != 0) { - D_INFO(DF_UUID ": Local VOS pool isn't ready yet.\n", - DP_UUID(arg->pool_uuid)); + D_INFO(DF_UUID ": Local VOS pool isn't ready yet.\n", DP_UUID(pool_uuid)); return -DER_STALE; } } else if (unlikely(pool_child->spc_no_storage)) { - D_DEBUG(DB_REBUILD, DF_UUID" "DF_UUID" lost pool shard, ver %d, skip.\n", - DP_UUID(arg->pool_uuid), DP_UUID(arg->pool_hdl_uuid), arg->version); + D_DEBUG(DB_REBUILD, DF_UUID " " DF_UUID " lost pool shard, ver %d, skip.\n", + DP_UUID(pool_uuid), DP_UUID(pool_hdl_uuid), version); D_GOTO(out, rc = 0); } @@ -517,215 +579,54 @@ migrate_pool_tls_create_one(void *data) if (rc != ABT_SUCCESS) D_GOTO(out, rc = dss_abterr2der(rc)); - rc = ABT_cond_create(&pool_tls->mpt_inflight_cond); - if (rc != ABT_SUCCESS) - D_GOTO(out, rc = dss_abterr2der(rc)); - - rc = ABT_mutex_create(&pool_tls->mpt_inflight_mutex); - if (rc != ABT_SUCCESS) - D_GOTO(out, rc = dss_abterr2der(rc)); - - uuid_copy(pool_tls->mpt_pool_uuid, arg->pool_uuid); - uuid_copy(pool_tls->mpt_poh_uuid, arg->pool_hdl_uuid); - uuid_copy(pool_tls->mpt_coh_uuid, arg->co_hdl_uuid); - pool_tls->mpt_version = arg->version; - pool_tls->mpt_generation = arg->generation; + uuid_copy(pool_tls->mpt_pool_uuid, pool_uuid); + uuid_copy(pool_tls->mpt_poh_uuid, pool_hdl_uuid); + uuid_copy(pool_tls->mpt_coh_uuid, co_hdl_uuid); + pool_tls->mpt_version = version; + pool_tls->mpt_generation = generation; pool_tls->mpt_rec_count = 0; pool_tls->mpt_obj_count = 0; pool_tls->mpt_size = 0; pool_tls->mpt_root_hdl = DAOS_HDL_INVAL; - pool_tls->mpt_max_eph = arg->max_eph; - pool_tls->mpt_new_layout_ver = arg->new_layout_ver; - pool_tls->mpt_opc = arg->opc; - if 
(dss_get_module_info()->dmi_xs_id == 0) { - int i; - - pool_tls->mpt_inflight_max_size = MIGRATE_MAX_SIZE; - pool_tls->mpt_inflight_max_ult = arg->max_ult_cnt; - D_ALLOC_ARRAY(pool_tls->mpt_obj_ult_cnts, dss_tgt_nr); - D_ALLOC_ARRAY(pool_tls->mpt_dkey_ult_cnts, dss_tgt_nr); - if (pool_tls->mpt_obj_ult_cnts == NULL || pool_tls->mpt_dkey_ult_cnts == NULL) - D_GOTO(out, rc = -DER_NOMEM); - for (i = 0; i < dss_tgt_nr; i++) { - atomic_init(&pool_tls->mpt_obj_ult_cnts[i], 0); - atomic_init(&pool_tls->mpt_dkey_ult_cnts[i], 0); - } - } else { - tgt_id = dss_get_module_info()->dmi_tgt_id; - - pool_tls->mpt_pool = ds_pool_child_lookup(arg->pool_uuid); - if (pool_tls->mpt_pool == NULL) - D_GOTO(out, rc = -DER_NO_HDL); - pool_tls->mpt_inflight_max_size = MIGRATE_MAX_SIZE / dss_tgt_nr; - pool_tls->mpt_inflight_max_ult = arg->max_ult_cnt / dss_tgt_nr; - pool_tls->mpt_tgt_obj_ult_cnt = &arg->obj_ult_cnts[tgt_id]; - pool_tls->mpt_tgt_dkey_ult_cnt = &arg->dkey_ult_cnts[tgt_id]; - - if (pool_child->spc_pool->sp_incr_reint && arg->opc == RB_OP_REBUILD && - arg->tgt_status[tgt_id] == PO_COMP_ST_UP && - arg->tgt_in_ver[tgt_id] <= pool_tls->mpt_version) - pool_tls->mpt_reintegrating = 1; - D_DEBUG(DB_REBUILD, DF_RB" tgt %d status %u in version %u, mpt_reintegrating %d\n", - DP_RB_MPT(pool_tls), tgt_id, arg->tgt_status[tgt_id], - arg->tgt_in_ver[tgt_id], pool_tls->mpt_reintegrating); - } + pool_tls->mpt_max_eph = max_eph; + pool_tls->mpt_new_layout_ver = new_layout_ver; + pool_tls->mpt_opc = opc; + pool_tls->mpt_pool = ds_pool_child_lookup(pool_uuid); + if (pool_tls->mpt_pool == NULL) + D_GOTO(out, rc = -DER_NO_HDL); + pool_tls->mpt_tgt_obj_ult_cnt = 0; + pool_tls->mpt_tgt_dkey_ult_cnt = 0; + + if (pool_child->spc_pool->sp_incr_reint && opc == RB_OP_REBUILD && + tgt_status == PO_COMP_ST_UP && tgt_in_ver <= pool_tls->mpt_version) + pool_tls->mpt_reintegrating = 1; + D_DEBUG(DB_REBUILD, DF_RB " tgt %d status %u in version %u, mpt_reintegrating %d\n", + DP_RB_MPT(pool_tls), tgt_id, 
tgt_status, tgt_in_ver, pool_tls->mpt_reintegrating); pool_tls->mpt_inflight_size = 0; pool_tls->mpt_refcount = 1; - if (arg->svc_list) { - rc = daos_rank_list_copy(&pool_tls->mpt_svc_list, arg->svc_list); + if (svc_list) { + rc = daos_rank_list_copy(&pool_tls->mpt_svc_list, svc_list); if (rc) D_GOTO(out, rc); } D_DEBUG(DB_REBUILD, DF_RB ": TLS %p create for hdls " DF_UUID "/" DF_UUID " " DF_RC "\n", - DP_RB_MPT(pool_tls), pool_tls, DP_UUID(arg->pool_hdl_uuid), - DP_UUID(arg->co_hdl_uuid), DP_RC(rc)); - d_list_add(&pool_tls->mpt_list, &tls->ot_pool_list); + DP_RB_MPT(pool_tls), pool_tls, DP_UUID(pool_hdl_uuid), DP_UUID(co_hdl_uuid), + DP_RC(rc)); + d_list_add(&pool_tls->mpt_list, &obj_tls->ot_pool_list); + migrate_pool_tls_get(pool_tls); out: - if (rc && pool_tls) - migrate_pool_tls_destroy(pool_tls); - if (pool_child != NULL) ds_pool_child_put(pool_child); - return rc; -} - -static int -migrate_pool_tls_lookup_create(struct ds_pool *pool, unsigned int version, unsigned int generation, - uuid_t pool_hdl_uuid, uuid_t co_hdl_uuid, uint64_t max_eph, - uint32_t new_layout_ver, uint32_t opc, struct migrate_pool_tls **p_tls) -{ - struct migrate_pool_tls *tls = NULL; - struct migrate_pool_tls_create_arg arg = { 0 }; - daos_prop_t *prop = NULL; - struct daos_prop_entry *entry; - struct pool_target *tgts; - uint32_t max_migrate_ult = MIGRATE_DEFAULT_MAX_ULT; - d_rank_t rank; - int i, rc = 0; - - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - tls = migrate_pool_tls_lookup(pool->sp_uuid, version, generation); - if (tls) { - if (tls->mpt_init_tls) { - ABT_mutex_lock(tls->mpt_init_mutex); - ABT_cond_wait(tls->mpt_init_cond, tls->mpt_init_mutex); - ABT_mutex_unlock(tls->mpt_init_mutex); - if (tls->mpt_init_err) { - migrate_pool_tls_put(tls); - rc = tls->mpt_init_err; - } - } - - if (rc == 0) - *p_tls = tls; - - return rc; - } - - d_getenv_uint(ENV_MIGRATE_ULT_CNT, &max_migrate_ult); - D_ASSERT(generation != (unsigned int)(-1)); - uuid_copy(arg.pool_uuid, pool->sp_uuid); - 
uuid_copy(arg.pool_hdl_uuid, pool_hdl_uuid); - uuid_copy(arg.co_hdl_uuid, co_hdl_uuid); - arg.version = version; - arg.opc = opc; - arg.max_eph = max_eph; - arg.new_layout_ver = new_layout_ver; - arg.generation = generation; - arg.max_ult_cnt = max_migrate_ult; - - /* - * dss_task_collective does not do collective on sys xstrem, - * sys xstream need some information to track rebuild status. - */ - rc = migrate_pool_tls_create_one(&arg); - if (rc) - D_GOTO(out, rc); - - tls = migrate_pool_tls_lookup(pool->sp_uuid, version, generation); - D_ASSERT(tls != NULL); - pool->sp_rebuilding++; - - rc = ABT_cond_create(&tls->mpt_init_cond); - if (rc != ABT_SUCCESS) - D_GOTO(out, rc = dss_abterr2der(rc)); - - rc = ABT_mutex_create(&tls->mpt_init_mutex); - if (rc != ABT_SUCCESS) - D_GOTO(out, rc = dss_abterr2der(rc)); - - tls->mpt_init_tls = 1; - D_ALLOC_PTR(prop); - if (prop == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - if (likely(dss_tgt_nr <= MPT_CREATE_TGT_INLINE)) { - arg.tgt_status = arg.tgt_status_inline; - arg.tgt_in_ver = arg.tgt_in_ver_inline; - } else { - D_ALLOC_ARRAY(arg.tgt_status, dss_tgt_nr); - if (arg.tgt_status == NULL) - D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(arg.tgt_in_ver, dss_tgt_nr); - if (arg.tgt_in_ver == NULL) - D_GOTO(out, rc = -DER_NOMEM); - } - - rank = dss_self_rank(); - rc = pool_map_find_target_by_rank_idx(pool->sp_map, rank, -1, &tgts); - D_ASSERT(rc == dss_tgt_nr); - for (i = 0; i < dss_tgt_nr; i++) { - arg.tgt_status[i] = tgts[i].ta_comp.co_status; - arg.tgt_in_ver[i] = tgts[i].ta_comp.co_in_ver; - } - - rc = ds_pool_iv_prop_fetch(pool, prop); - if (rc) - D_GOTO(out, rc); - - entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_LIST); - D_ASSERT(entry != NULL); - arg.svc_list = (d_rank_list_t *)entry->dpe_val_ptr; - arg.obj_ult_cnts = tls->mpt_obj_ult_cnts; - arg.dkey_ult_cnts = tls->mpt_dkey_ult_cnts; - rc = ds_pool_task_collective(pool->sp_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, - migrate_pool_tls_create_one, &arg, 
0); - if (rc != 0) { - DL_ERROR(rc, DF_RB ": failed to create migrate tls on tgt xstreams", - DP_RB_MPT(tls)); - D_GOTO(out, rc); - } - -out: - if (tls != NULL && tls->mpt_init_tls) { - tls->mpt_init_tls = 0; - /* Set init failed, so the waiting lookup(above) can be notified */ - if (rc != 0) - tls->mpt_init_err = rc; - ABT_mutex_lock(tls->mpt_init_mutex); - ABT_cond_broadcast(tls->mpt_init_cond); - ABT_mutex_unlock(tls->mpt_init_mutex); - } - D_DEBUG(DB_TRACE, "create tls " DF_UUID ": " DF_RC "\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - + D_DEBUG(DB_TRACE, "create tls " DF_UUID ": " DF_RC "\n", DP_UUID(pool_uuid), DP_RC(rc)); if (rc != 0) { - if (tls != NULL) - migrate_pool_tls_put(tls); + migrate_pool_tls_put(pool_tls); } else { - *p_tls = tls; + *p_tls = pool_tls; } - if (prop != NULL) - daos_prop_free(prop); - if (arg.tgt_status != NULL && arg.tgt_status != arg.tgt_status_inline) - D_FREE(arg.tgt_status); - if (arg.tgt_in_ver != NULL && arg.tgt_in_ver != arg.tgt_in_ver_inline) - D_FREE(arg.tgt_in_ver); - return rc; } @@ -790,6 +691,7 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_ d_iov_t *csum_iov_fetch, struct migrate_pool_tls *tls) { uint32_t *extra_arg = NULL; + int waited = 0; int rc; /* pass rebuild epoch by extra_arg */ @@ -798,11 +700,10 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_ mrone->mo_epoch); extra_arg = (uint32_t *)mrone->mo_epoch; } - retry: rc = dsc_obj_fetch(oh, eph, &mrone->mo_dkey, iod_num, iods, sgls, NULL, flags, extra_arg, csum_iov_fetch); - if ((rc == -DER_TIMEDOUT || rc == -DER_FETCH_AGAIN) && + if ((rc == -DER_TIMEDOUT || rc == -DER_FETCH_AGAIN || rc == -DER_NOMEM) && tls->mpt_version + 1 >= tls->mpt_pool->spc_map_version) { if (tls->mpt_fini) { DL_ERROR(rc, DF_RB ": dsc_obj_fetch " DF_UOID "failed when mpt_fini", @@ -813,25 +714,45 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_ * fail out. 
*/ DL_WARN(rc, DF_RB ": retry " DF_UOID, DP_RB_MPT(tls), DP_UOID(mrone->mo_oid)); + if (rc == -DER_NOMEM) { + /* sleep 10 seconds before retry, give other layers a chance to + * release resources. + */ + dss_sleep(10 * 1000); + if (waited != 0 && waited % 3600 == 0) { + DL_ERROR(rc, DF_RB ": waited memory for %d hour(s)", + DP_RB_MRO(mrone), waited / 3600); + } + } + waited += 10; D_GOTO(retry, rc); } return rc; } +static inline int +migrate_pool_tls_get_status(struct migrate_pool_tls *tls) +{ + if (tls && tls->mpt_status) + return tls->mpt_status; + if (tls == NULL || tls->mpt_fini) + return -DER_SHUTDOWN; + + return 0; +} + static int mrone_obj_fetch(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_t *sgls, daos_iod_t *iods, int iod_num, daos_epoch_t eph, uint32_t flags, d_iov_t *csum_iov_fetch) { - struct migrate_pool_tls *tls; + struct migrate_pool_tls *tls = mrone->mo_tls; int rc = 0; - tls = migrate_pool_tls_lookup(mrone->mo_pool_uuid, - mrone->mo_pool_tls_version, mrone->mo_generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(mrone->mo_pool_uuid)); - D_GOTO(out, rc = -DER_SHUTDOWN); + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MRO(mrone)); + D_GOTO(out, rc = migrate_pool_tls_get_status(tls)); } if (daos_oclass_grp_size(&mrone->mo_oca) > 1) @@ -862,7 +783,6 @@ mrone_obj_fetch(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_t *sgls, } out: - migrate_pool_tls_put(tls); return rc; } @@ -919,7 +839,7 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, struct dcs_iod_csums *iod_csums = NULL; int iod_cnt = 0; int start; - char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; + char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; bool fetch = false; int i; int rc = 0; @@ -1285,6 +1205,28 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, return rc; } +static void +mrone_dump_info(struct migrate_one *mrone, 
daos_handle_t oh, daos_iod_t *iod) +{ + int i; + + if (daos_is_dkey_uint64(mrone->mo_oid.id_pub) && mrone->mo_dkey.iov_len == 8) + D_INFO(DF_RB ": " DF_UOID " int dkey " DF_U64 ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), + *(uint64_t *)mrone->mo_dkey.iov_buf, DP_KEY(&iod->iod_name), iod->iod_type, + iod->iod_nr, iod->iod_size); + else + D_INFO(DF_RB ": " DF_UOID " dkey " DF_KEY ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iod->iod_name), iod->iod_type, iod->iod_nr, iod->iod_size); + if (iod->iod_type == DAOS_IOD_ARRAY) + for (i = 0; i < min(8, iod->iod_nr); i++) + D_INFO("recxs[%d] - " DF_RECX, i, DP_RECX(iod->iod_recxs[i])); + obj_dump_grp_layout(oh, mrone->mo_oid.id_shard); +} + static int migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, struct ds_cont_child *ds_cont) @@ -1353,6 +1295,8 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod = &mrone->mo_iods[i]; if (mrone->mo_iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. * So the object is being destroyed between @@ -1364,12 +1308,17 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. 
*/ rc = -DER_DATA_LOSS; - D_DEBUG(DB_REBUILD, - DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), - mrone->mo_iod_num, i, mrone->mo_epoch, DP_RC(rc)); + DL_INFO(rc, + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d eph " DF_X64, + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i, + mrone->mo_epoch); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); + log_nr++; + } D_GOTO(out, rc); } @@ -1536,6 +1485,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, for (i = 0; rc == 0 && i < iod_num; i++) { if (iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. * So the object is being destroyed between @@ -1547,11 +1498,16 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. 
*/ rc = -DER_DATA_LOSS; - D_INFO(DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&iods[i].iod_name), iod_num, i, - mrone->mo_epoch, DP_RC(rc)); + DL_INFO(rc, + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d mo_epoch " DF_X64 " fetch_eph " DF_X64, + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iods[i].iod_name), iod_num, i, mrone->mo_epoch, fetch_eph); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); + log_nr++; + } D_GOTO(end, rc); } } @@ -1735,11 +1691,9 @@ migrate_punch(struct migrate_pool_tls *tls, struct migrate_one *mrone, mrone->mo_oid.id_shard)) mrone_recx_daos2_vos(mrone, mrone->mo_punch_iods, mrone->mo_punch_iod_num); - rc = vos_obj_update(cont->sc_hdl, mrone->mo_oid, - mrone->mo_rec_punch_eph, - mrone->mo_version, 0, &mrone->mo_dkey, - mrone->mo_punch_iod_num, - mrone->mo_punch_iods, NULL, NULL); + rc = vos_obj_update(cont->sc_hdl, mrone->mo_oid, mrone->mo_rec_punch_eph, + mrone->mo_version, VOS_OF_REBUILD, &mrone->mo_dkey, + mrone->mo_punch_iod_num, mrone->mo_punch_iods, NULL, NULL); D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " mrone %p punch %d eph " DF_U64 "records: " DF_RC "\n", DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, mrone->mo_punch_iod_num, @@ -1805,47 +1759,28 @@ static int migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, daos_size_t data_size) { - struct ds_cont_child *cont = NULL; - struct cont_props props; - daos_handle_t coh = DAOS_HDL_INVAL; + struct ds_cont_child *cont = NULL; daos_handle_t oh = DAOS_HDL_INVAL; int rc; D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); rc = migrate_get_cont_child(tls, mrone->mo_cont_uuid, &cont, true); if (rc || cont == NULL) - D_GOTO(cont_put, rc); - - rc = dsc_pool_open(tls->mpt_pool_uuid, tls->mpt_poh_uuid, 0, - NULL, 
tls->mpt_pool->spc_pool->sp_map, - &tls->mpt_svc_list, &tls->mpt_pool_hdl); - if (rc) - D_GOTO(cont_put, rc); - - /* Open client dc handle used to read the remote object data */ - rc = migrate_cont_open(tls, mrone->mo_cont_uuid, 0, &coh); - if (rc) - D_GOTO(cont_put, rc); + D_GOTO(out, rc); - /* Open the remote object */ - rc = dsc_obj_open(coh, mrone->mo_oid.id_pub, DAOS_OO_RO, &oh); - if (rc) - D_GOTO(cont_put, rc); + D_ASSERTF(mrone->mo_obj_arg->ioa_obj_ref > 0, + DF_RB ": oid " DF_UOID ", bad ioa_obj_ref %d\n", DP_RB_MPT(tls), + DP_UOID(mrone->mo_oid), mrone->mo_obj_arg->ioa_obj_ref); + D_ASSERT(daos_handle_is_valid(mrone->mo_obj_arg->ioa_oh)); + oh = mrone->mo_obj_arg->ioa_oh; + mrone->mo_oca = mrone->mo_obj_arg->ioa_oca; if (DAOS_FAIL_CHECK(DAOS_REBUILD_TGT_NOSPACE)) - D_GOTO(obj_close, rc = -DER_NOSPACE); + D_GOTO(out, rc = -DER_NOSPACE); if (DAOS_FAIL_CHECK(DAOS_REBUILD_NO_REBUILD)) { D_DEBUG(DB_REBUILD, DF_RB ": fault injected, disable rebuild\n", DP_RB_MPT(tls)); - D_GOTO(obj_close, rc); - } - - dsc_cont_get_props(coh, &props); - rc = dsc_obj_id2oc_attr(mrone->mo_oid.id_pub, &props, &mrone->mo_oca); - if (rc) { - D_ERROR(DF_RB ": unknown object class: %u\n", DP_RB_MPT(tls), - daos_obj_id2class(mrone->mo_oid.id_pub)); - D_GOTO(obj_close, rc); + D_GOTO(out, rc); } /* punch the object */ @@ -1857,21 +1792,21 @@ migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, if (rc) { DL_ERROR(rc, DF_RB ": " DF_UOID " punch obj failed", DP_RB_MPT(tls), DP_UOID(mrone->mo_oid)); - D_GOTO(obj_close, rc); + D_GOTO(out, rc); } } rc = migrate_punch(tls, mrone, cont); if (rc) - D_GOTO(obj_close, rc); + D_GOTO(out, rc); if (data_size == 0) { D_DEBUG(DB_REBUILD, DF_RB ": empty mrone %p\n", DP_RB_MPT(tls), mrone); - D_GOTO(obj_close, rc); + D_GOTO(out, rc); } if (DAOS_FAIL_CHECK(DAOS_REBUILD_UPDATE_FAIL)) - D_GOTO(obj_close, rc = -DER_INVAL); + D_GOTO(out, rc = -DER_INVAL); if (mrone->mo_iods[0].iod_type == DAOS_IOD_SINGLE) rc = migrate_fetch_update_single(mrone, 
oh, cont); @@ -1887,9 +1822,8 @@ migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, tls->mpt_rec_count += mrone->mo_rec_num; tls->mpt_size += mrone->mo_size; -obj_close: - dsc_obj_close(oh); -cont_put: + +out: if (cont != NULL) ds_cont_child_put(cont); return rc; @@ -1902,6 +1836,7 @@ migrate_one_destroy(struct migrate_one *mrone) D_ASSERT(d_list_empty(&mrone->mo_list)); daos_iov_free(&mrone->mo_dkey); + daos_iov_free(&mrone->mo_csum_iov); if (mrone->mo_iods_update_ephs) { for (i = 0; i < mrone->mo_iod_alloc_num; i++) { @@ -1939,113 +1874,139 @@ migrate_one_destroy(struct migrate_one *mrone) if (mrone->mo_iods_csums) D_FREE(mrone->mo_iods_csums); + if (mrone->mo_obj_arg) + migrate_obj_put(mrone->mo_obj_arg); + if (mrone->mo_tls) + migrate_pool_tls_put(mrone->mo_tls); D_FREE(mrone); } -enum { - OBJ_ULT = 1, - DKEY_ULT = 2, -}; - -/* Check if there are enough resource for the migration to proceed. */ -static int -migrate_system_enter(struct migrate_pool_tls *tls, int tgt_idx, bool *yielded) +static bool +migr_res_is_hulk(int res_type, long units) { - uint32_t tgt_cnt = 0; - int rc = 0; - - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - D_ASSERTF(tgt_idx < dss_tgt_nr, "tgt idx %d tgt nr %u\n", tgt_idx, dss_tgt_nr); - - tgt_cnt = atomic_load(&tls->mpt_obj_ult_cnts[tgt_idx]) + - atomic_load(&tls->mpt_dkey_ult_cnts[tgt_idx]); - - while ((tls->mpt_inflight_max_ult / dss_tgt_nr) <= tgt_cnt) { - D_DEBUG(DB_REBUILD, DF_RB ": tgt%d:%u max %u\n", DP_RB_MPT(tls), tgt_idx, tgt_cnt, - tls->mpt_inflight_max_ult / dss_tgt_nr); - *yielded = true; - dss_sleep(0); - if (tls->mpt_fini) - D_GOTO(out, rc = -DER_SHUTDOWN); - - tgt_cnt = atomic_load(&tls->mpt_obj_ult_cnts[tgt_idx]) + - atomic_load(&tls->mpt_dkey_ult_cnts[tgt_idx]); - } - - atomic_fetch_add(&tls->mpt_obj_ult_cnts[tgt_idx], 1); -out: - return rc; + return res_type == MIGR_DATA && units >= MIGR_INF_DATA_HULK; } static int -migrate_tgt_enter(struct migrate_pool_tls *tls) +migrate_res_hold(struct 
migrate_pool_tls *tls, int res_type, long units, bool *yielded) { - uint32_t dkey_cnt = 0; - int rc = 0; + struct dss_module_info *dmi = dss_get_module_info(); + struct migr_res_manager *rmg; + struct migr_resource *res; + bool is_hulk; + bool waited = false; + int rc = 0; + + D_ASSERT(dmi->dmi_xs_id != 0); + + rmg = &migr_eng_res.er_rmgs[dmi->dmi_tgt_id]; + if (tls->mpt_rmg == NULL) { + tls->mpt_rmg = rmg; + } else { + D_ASSERTF(tls->mpt_rmg == rmg, "target=%d, rmg_off=%d\n", dmi->dmi_tgt_id, + (int)(tls->mpt_rmg - &migr_eng_res.er_rmgs[0])); + } - D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); + res = &rmg->rmg_resources[res_type]; + is_hulk = migr_res_is_hulk(res_type, units); + while (1) { + if (tls->mpt_fini) { + rc = migrate_pool_tls_get_status(tls); + D_GOTO(out, rc); + } - dkey_cnt = atomic_load(tls->mpt_tgt_dkey_ult_cnt); - while (tls->mpt_inflight_max_ult / 2 <= dkey_cnt) { - D_DEBUG(DB_REBUILD, DF_RB ": tgt %u max %u\n", DP_RB_MPT(tls), dkey_cnt, - tls->mpt_inflight_max_ult); + if (is_hulk && res->res_hulk == 0 && res->res_units < MIGR_INF_DATA_LWM) { + /* skip the limit check and allow (only) one hulk transfer at a time */ + res->res_units += units; + res->res_hulk = 1; + break; - ABT_mutex_lock(tls->mpt_inflight_mutex); - ABT_cond_wait(tls->mpt_inflight_cond, tls->mpt_inflight_mutex); - ABT_mutex_unlock(tls->mpt_inflight_mutex); - if (tls->mpt_fini) - D_GOTO(out, rc = -DER_SHUTDOWN); + } else if (!is_hulk && res->res_units + units <= res->res_limit) { + res->res_units += units; + break; + } + + ABT_mutex_lock(rmg->rmg_mutex); + res->res_waiters++; + if (res->res_waiters >= 100 && res->res_waiters % 100 == 0) { + D_DEBUG(DB_REBUILD, + "%d waiters are waiting on res=%s (target=%d, unit=%lu)\n", + res->res_waiters, res->res_name, dmi->dmi_tgt_id, units); + } - dkey_cnt = atomic_load(tls->mpt_tgt_dkey_ult_cnt); + ABT_cond_wait(res->res_cond, rmg->rmg_mutex); + res->res_waiters--; + ABT_mutex_unlock(rmg->rmg_mutex); + waited = true; } + if (yielded) + 
*yielded = waited; + + /* per-pool counters for rebuild status tracking */ + if (res_type == MIGR_OBJ) + tls->mpt_tgt_obj_ult_cnt++; + else if (res_type == MIGR_KEY) + tls->mpt_tgt_dkey_ult_cnt++; + else + tls->mpt_inflight_size += units; - atomic_fetch_add(tls->mpt_tgt_dkey_ult_cnt, 1); + D_DEBUG(DB_REBUILD, + "res=%s, hold=%lu, used=%lu, limit=%lu, waited=%d)\n" DF_RB + " obj_ults=%u, key_ults=%u, inf_data=" DF_U64 ")\n", + res->res_name, units, res->res_units, res->res_limit, waited, DP_RB_MPT(tls), + tls->mpt_tgt_obj_ult_cnt, tls->mpt_tgt_dkey_ult_cnt, tls->mpt_inflight_size); out: return rc; } static void -migrate_system_exit(struct migrate_pool_tls *tls, unsigned int tgt_idx) +migrate_res_release(struct migrate_pool_tls *tls, int res_type, long units) { - /* NB: this will only be called during errr handling. In normal case - * the migrate ULT created by system will be exit on each target XS. - */ - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - atomic_fetch_sub(&tls->mpt_obj_ult_cnts[tgt_idx], 1); -} + struct migr_res_manager *rmg; + struct migr_resource *res; -static void -migrate_tgt_try_wakeup(struct migrate_pool_tls *tls) -{ - uint32_t dkey_cnt = 0; + rmg = tls->mpt_rmg; + D_ASSERT(rmg != NULL); - D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - dkey_cnt = atomic_load(tls->mpt_tgt_dkey_ult_cnt); - if (tls->mpt_inflight_max_ult / 2 > dkey_cnt) { - ABT_mutex_lock(tls->mpt_inflight_mutex); - ABT_cond_broadcast(tls->mpt_inflight_cond); - ABT_mutex_unlock(tls->mpt_inflight_mutex); + res = &rmg->rmg_resources[res_type]; + + D_DEBUG(DB_REBUILD, + "%s: release=%lu, used=%lu, limit=%lu\n" DF_RB + " obj_ults=%u, key_ults=%u, inf_data=" DF_U64 ")\n", + res->res_name, units, res->res_units, res->res_limit, DP_RB_MPT(tls), + tls->mpt_tgt_obj_ult_cnt, tls->mpt_tgt_dkey_ult_cnt, tls->mpt_inflight_size); + + if (res_type == MIGR_OBJ) { + D_ASSERT(tls->mpt_tgt_obj_ult_cnt > 0); + tls->mpt_tgt_obj_ult_cnt--; + } else if (res_type == MIGR_KEY) { + 
D_ASSERT(tls->mpt_tgt_dkey_ult_cnt > 0); + tls->mpt_tgt_dkey_ult_cnt--; + } else { + D_ASSERT(tls->mpt_inflight_size >= units); + tls->mpt_inflight_size -= units; } -} -static void -migrate_tgt_exit(struct migrate_pool_tls *tls, int ult_type) -{ - D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - if (ult_type == OBJ_ULT) { - atomic_fetch_sub(tls->mpt_tgt_obj_ult_cnt, 1); - return; + D_ASSERT(res->res_units >= units); + res->res_units -= units; + + if (migr_res_is_hulk(res_type, units)) { + D_ASSERT(res->res_hulk == 1); + res->res_hulk = 0; } - atomic_fetch_sub(tls->mpt_tgt_dkey_ult_cnt, 1); - migrate_tgt_try_wakeup(tls); + if (res->res_waiters > 0) { + ABT_mutex_lock(rmg->rmg_mutex); + ABT_cond_signal(res->res_cond); + ABT_mutex_unlock(rmg->rmg_mutex); + } } static void migrate_one_ult(void *arg) { - struct migrate_one *mrone = arg; + struct migrate_one *mrone = arg; struct migrate_pool_tls *tls; daos_size_t data_size; int rc = 0; @@ -2053,10 +2014,9 @@ migrate_one_ult(void *arg) while (daos_fail_check(DAOS_REBUILD_TGT_REBUILD_HANG)) dss_sleep(0); - tls = migrate_pool_tls_lookup(mrone->mo_pool_uuid, - mrone->mo_pool_tls_version, mrone->mo_generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(mrone->mo_pool_uuid)); + tls = mrone->mo_tls; + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MRO(mrone)); goto out; } @@ -2068,26 +2028,14 @@ migrate_one_ult(void *arg) data_size, mrone->mo_iod_num, mrone->mo_iods_num_from_parity); D_ASSERT(data_size != (daos_size_t)-1); - D_DEBUG(DB_REBUILD, DF_RB ": mrone %p inflight_size " DF_U64 " max " DF_U64 "\n", - DP_RB_MPT(tls), mrone, tls->mpt_inflight_size, tls->mpt_inflight_max_size); - while (tls->mpt_inflight_size + data_size >= tls->mpt_inflight_max_size && - tls->mpt_inflight_max_size != 0 && tls->mpt_inflight_size != 0 && - !tls->mpt_fini) { - D_DEBUG(DB_REBUILD, DF_RB ": mrone %p wait " DF_U64 "/" DF_U64 "/" DF_U64 "\n", - DP_RB_MPT(tls), 
mrone, tls->mpt_inflight_size, tls->mpt_inflight_max_size, - data_size); - ABT_mutex_lock(tls->mpt_inflight_mutex); - ABT_cond_wait(tls->mpt_inflight_cond, tls->mpt_inflight_mutex); - ABT_mutex_unlock(tls->mpt_inflight_mutex); - } - - if (tls->mpt_fini) + rc = migrate_res_hold(tls, MIGR_DATA, data_size, NULL); + if (rc) D_GOTO(out, rc); - tls->mpt_inflight_size += data_size; rc = migrate_dkey(tls, mrone, data_size); - tls->mpt_inflight_size -= data_size; + + migrate_res_release(tls, MIGR_DATA, data_size); D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " layout %u migrate dkey " DF_KEY " inflight_size " DF_U64 @@ -2111,11 +2059,8 @@ migrate_one_ult(void *arg) tls->mpt_fini = 1; } out: + migrate_res_release(tls, MIGR_KEY, 1); migrate_one_destroy(mrone); - if (tls != NULL) { - migrate_tgt_exit(tls, DKEY_ULT); - migrate_pool_tls_put(tls); - } } /* If src_iod is NULL, it will try to merge the recxs inside dst_iod */ @@ -2520,29 +2465,28 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io d_sg_list_t *sgls = io->ui_sgls; uint32_t version = io->ui_version; struct dc_object *obj = NULL; - struct migrate_pool_tls *tls; + struct migrate_pool_tls *tls = iter_arg->pool_tls; struct migrate_one *mrone = NULL; bool inline_copy = true; int i; int rc = 0; - tls = migrate_pool_tls_lookup(iter_arg->pool_uuid, iter_arg->version, iter_arg->generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "dkey " DF_KEY "iod_nr %d\n", - DP_UUID(iter_arg->pool_uuid), DP_KEY(dkey), iod_eph_total); - D_GOTO(put, rc = 0); + if (tls->mpt_fini) { + D_WARN("someone aborted the rebuild " DF_UUID " ver %d, dkey " DF_KEY "iod_nr %d\n", + DP_UUID(iter_arg->pool_uuid), version, DP_KEY(dkey), iod_eph_total); + D_GOTO(out, rc = 0); } D_DEBUG(DB_REBUILD, DF_RB ": migrate dkey " DF_KEY " iod nr %d\n", DP_RB_MPT(tls), DP_KEY(dkey), iod_eph_total); if (iod_eph_total == 0 || tls->mpt_fini) { D_DEBUG(DB_REBUILD, DF_RB ": no need eph_total %d version 
%u fini %d\n", DP_RB_MPT(tls), iod_eph_total, version, tls->mpt_fini); - D_GOTO(put, rc = 0); + D_GOTO(out, rc = 0); } D_ALLOC_PTR(mrone); if (mrone == NULL) - D_GOTO(put, rc = -DER_NOMEM); + D_GOTO(out, rc = -DER_NOMEM); D_INIT_LIST_HEAD(&mrone->mo_list); D_ALLOC_ARRAY(mrone->mo_iods, iod_eph_total); @@ -2651,9 +2595,7 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io d_list_del_init(&mrone->mo_list); migrate_one_destroy(mrone); } -put: - if (tls) - migrate_pool_tls_put(tls); +out: return rc; } @@ -2661,14 +2603,14 @@ static int migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) { struct enum_unpack_arg *arg = data; + struct migrate_pool_tls *tls = arg->arg->pool_tls; uint32_t shard = arg->arg->shard; struct migrate_one *mo; uint32_t unpack_tgt_off; uint32_t migrate_tgt_off; bool merged = false; bool create_migrate_one = false; - int rc = 0; - struct migrate_pool_tls *tls; + int rc = 0; struct dc_object *obj = NULL; uint32_t parity_shard = -1; uint32_t layout_ver; @@ -2690,10 +2632,8 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) if (rc < 0) return rc; - tls = migrate_pool_tls_lookup(arg->arg->pool_uuid, arg->arg->version, - arg->arg->generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->arg->pool_uuid)); + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MPT(tls)); D_GOTO(put, rc = 0); } @@ -2709,9 +2649,18 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) if (rc == 1 && (is_ec_data_shard_by_tgt_off(unpack_tgt_off, &arg->oc_attr) || (io->ui_oid.id_layout_ver > 0 && io->ui_oid.id_shard != parity_shard))) { - D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " ignore shard " DF_KEY "/%u/%d/%u/%d.\n", - DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, - (int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc); + if (daos_is_dkey_uint64(io->ui_oid.id_pub) && 
io->ui_dkey.iov_len == 8) + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " ignore shard, int dkey " DF_U64 + "/%u/%d/%u/%d.\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), + *(uint64_t *)io->ui_dkey.iov_buf, shard, + (int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc); + else + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " ignore shard " DF_KEY "/%u/%d/%u/%d.\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, + (int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc); D_GOTO(put, rc = 0); } rc = 0; @@ -2727,11 +2676,19 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) continue; } - D_DEBUG(DB_REBUILD, - DF_RB ": " DF_UOID " unpack " DF_KEY " for shard " - "%u/%u/%u/" DF_X64 "/%u\n", - DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, - unpack_tgt_off, migrate_tgt_off, io->ui_dkey_hash, parity_shard); + if (daos_is_dkey_uint64(io->ui_oid.id_pub) && io->ui_dkey.iov_len == 8) + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " unpack int dkey " DF_U64 " for shard " + "%u/%u/%u/" DF_X64 "/%u\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), + *(uint64_t *)io->ui_dkey.iov_buf, shard, unpack_tgt_off, + migrate_tgt_off, io->ui_dkey_hash, parity_shard); + else + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " unpack " DF_KEY " for shard " + "%u/%u/%u/" DF_X64 "/%u\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, + unpack_tgt_off, migrate_tgt_off, io->ui_dkey_hash, parity_shard); /** * Since we do not need split the rebuild into parity rebuild @@ -2769,8 +2726,14 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) if (!create_migrate_one) { struct ds_cont_child *cont = NULL; - D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID "/" DF_KEY " does not need rebuild.\n", - DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey)); + if (daos_is_dkey_uint64(io->ui_oid.id_pub) && io->ui_dkey.iov_len == 8) + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID "/int dkey: " DF_U64 " does not need rebuild.", + 
DP_RB_MPT(tls), DP_UOID(io->ui_oid), + *(uint64_t *)io->ui_dkey.iov_buf); + else + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID "/" DF_KEY " does not need rebuild.", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey)); /* Create the vos container when no record need to be rebuilt for this shard, * for the case of reintegrate the container was discarded ahead. @@ -2809,8 +2772,6 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) put: if (obj) obj_decref(obj); - if (tls != NULL) - migrate_pool_tls_put(tls); return rc; } @@ -2822,10 +2783,10 @@ migrate_obj_punch_one(void *data) struct ds_cont_child *cont; int rc; - tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); - D_GOTO(put, rc = 0); + tls = arg->pool_tls; + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MPT(tls)); + D_GOTO(out, rc = 0); } D_DEBUG(DB_REBUILD, DF_RB ": tls %p version %d punch " DF_U64 " " DF_UOID "\n", @@ -2833,39 +2794,47 @@ migrate_obj_punch_one(void *data) rc = migrate_get_cont_child(tls, arg->cont_uuid, &cont, true); if (rc != 0 || cont == NULL) - D_GOTO(put, rc); + D_GOTO(out, rc); D_ASSERT(arg->punched_epoch != 0); rc = vos_obj_punch(cont->sc_hdl, arg->oid, arg->punched_epoch, tls->mpt_version, VOS_OF_REPLAY_PC, NULL, 0, NULL, NULL); ds_cont_child_put(cont); -put: +out: if (rc) DL_ERROR(rc, DF_RB ": " DF_UOID " migrate punch failed", DP_RB_MPT(tls), DP_UOID(arg->oid)); - if (tls) { - if (tls->mpt_status == 0 && rc != 0) - tls->mpt_status = rc; - migrate_pool_tls_put(tls); - } + + if (tls->mpt_status == 0 && rc != 0) + tls->mpt_status = rc; return rc; } +static inline void +free_mrones(struct enum_unpack_arg *unpack_arg) +{ + struct migrate_one *mrone, *tmp; + + d_list_for_each_entry_safe(mrone, tmp, &unpack_arg->merge_list, mo_list) { + d_list_del_init(&mrone->mo_list); + migrate_one_destroy(mrone); 
+ } +} + static int migrate_start_ult(struct enum_unpack_arg *unpack_arg) { - struct migrate_pool_tls *tls; struct iter_obj_arg *arg = unpack_arg->arg; + struct migrate_pool_tls *tls = arg->pool_tls; struct migrate_one *mrone; struct migrate_one *tmp; int rc = 0; - tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); - D_GOTO(put, rc = 0); + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MPT(tls)); + D_GOTO(out, rc = 0); } d_list_for_each_entry_safe(mrone, tmp, &unpack_arg->merge_list, mo_list) { @@ -2883,22 +2852,26 @@ migrate_start_ult(struct enum_unpack_arg *unpack_arg) continue; } - rc = migrate_tgt_enter(tls); + rc = migrate_res_hold(tls, MIGR_KEY, 1, NULL); if (rc) break; d_list_del_init(&mrone->mo_list); - rc = dss_ult_create(migrate_one_ult, mrone, DSS_XS_VOS, - arg->tgt_idx, MIGRATE_STACK_SIZE, NULL); + + migrate_obj_get(arg); + mrone->mo_obj_arg = arg; + migrate_pool_tls_get(tls); + mrone->mo_tls = tls; + + D_ASSERT(arg->tgt_idx == dss_get_module_info()->dmi_tgt_id); + rc = dss_ult_create(migrate_one_ult, mrone, DSS_XS_SELF, 0, MIGRATE_STACK_SIZE, + NULL); if (rc) { - migrate_tgt_exit(tls, DKEY_ULT); + migrate_res_release(tls, MIGR_KEY, 1); migrate_one_destroy(mrone); break; } } - -put: - if (tls) - migrate_pool_tls_put(tls); +out: return rc; } @@ -2921,13 +2894,10 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, daos_key_desc_t kds[KDS_NUM] = {0}; d_iov_t csum = {0}; d_iov_t *p_csum; - uint8_t stack_csum_buf[CSUM_BUF_SIZE] = {0}; - struct cont_props props; + uint8_t stack_csum_buf[CSUM_BUF_SIZE] = {0}; struct enum_unpack_arg unpack_arg = { 0 }; d_iov_t iov = { 0 }; - d_sg_list_t sgl = { 0 }; - daos_handle_t coh = DAOS_HDL_INVAL; - daos_handle_t oh = DAOS_HDL_INVAL; + d_sg_list_t sgl = {0}; uint32_t minimum_nr; uint32_t enum_flags; uint32_t num; @@ 
-2944,44 +2914,15 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - rc = dsc_pool_open(tls->mpt_pool_uuid, tls->mpt_poh_uuid, 0, - NULL, tls->mpt_pool->spc_pool->sp_map, - &tls->mpt_svc_list, &tls->mpt_pool_hdl); - if (rc) { - DL_ERROR(rc, DF_RB ": dsc_pool_open failed", DP_RB_MPT(tls)); - D_GOTO(out, rc); - } - - rc = migrate_cont_open(tls, arg->cont_uuid, 0, &coh); - if (rc) { - DL_ERROR(rc, DF_RB ": migrate_cont_open failed", DP_RB_MPT(tls)); - D_GOTO(out, rc); - } - - /* Only open with RW flag, reintegrating flag will be set, which is needed - * during unpack_cb to check if parity shard alive. - */ - rc = dsc_obj_open(coh, arg->oid.id_pub, DAOS_OO_RO, &oh); - if (rc) { - DL_ERROR(rc, DF_RB ": dsc_obj_open failed", DP_RB_MPT(tls)); - D_GOTO(out, rc); - } - + D_ASSERT(daos_handle_is_valid(arg->ioa_oh)); unpack_arg.arg = arg; unpack_arg.epr = *epr; - unpack_arg.oh = oh; + unpack_arg.oh = arg->ioa_oh; unpack_arg.version = tls->mpt_version; D_INIT_LIST_HEAD(&unpack_arg.merge_list); buf = stack_buf; buf_len = ITER_BUF_SIZE; - - dsc_cont_get_props(coh, &props); - rc = dsc_obj_id2oc_attr(arg->oid.id_pub, &props, &unpack_arg.oc_attr); - if (rc) { - DL_ERROR(rc, DF_RB ": unknown object class: %u", DP_RB_MPT(tls), - daos_obj_id2class(arg->oid.id_pub)); - D_GOTO(out_obj, rc); - } + unpack_arg.oc_attr = arg->ioa_oca; memset(&anchor, 0, sizeof(anchor)); memset(&akey_anchor, 0, sizeof(akey_anchor)); @@ -3003,9 +2944,11 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, if (daos_oclass_is_ec(&unpack_arg.oc_attr)) { p_csum = NULL; - /* EC rotate needs to fetch from all shards */ + /* EC rotate needs to fetch from all shards, at least with data_tgt_nr alive, + * at least one shard should get 2 KDs. 
+ */ if (obj_ec_parity_rotate_enabled_by_version(arg->oid.id_layout_ver)) - minimum_nr = obj_ec_tgt_nr(&unpack_arg.oc_attr); + minimum_nr = obj_ec_data_tgt_nr(&unpack_arg.oc_attr) + 1; else minimum_nr = 2; enum_flags |= DIOF_RECX_REVERSE; @@ -3031,9 +2974,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, daos_anchor_set_flags(&dkey_anchor, enum_flags); num = KDS_NUM; - rc = dsc_obj_list_obj(oh, epr, NULL, NULL, NULL, - &num, kds, &sgl, &anchor, - &dkey_anchor, &akey_anchor, p_csum); + rc = dsc_obj_list_obj(arg->ioa_oh, epr, NULL, NULL, NULL, &num, kds, &sgl, &anchor, + &dkey_anchor, &akey_anchor, p_csum); if (rc == -DER_KEY2BIG) { D_DEBUG(DB_REBUILD, @@ -3137,9 +3079,9 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, } /* Each object enumeration RPC will at least one OID */ - if (num <= minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) { - D_DEBUG(DB_REBUILD, DF_RB ": enumeration buffer %u empty" DF_UOID "\n", - DP_RB_MPT(tls), num, DP_UOID(arg->oid)); + if (num < minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) { + D_INFO(DF_RB ": enumeration buffer %u empty" DF_UOID, DP_RB_MPT(tls), num, + DP_UOID(arg->oid)); break; } @@ -3167,14 +3109,14 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, enum_flags |= DIOF_TO_LEADER; } + free_mrones(&unpack_arg); + if (buf != NULL && buf != stack_buf) D_FREE(buf); if (csum.iov_buf != NULL && csum.iov_buf != stack_csum_buf) D_FREE(csum.iov_buf); -out_obj: - dsc_obj_close(oh); -out: + D_DEBUG(DB_REBUILD, DF_RB ": obj " DF_UOID " shard %u eph " DF_U64 "-" DF_U64 ": " DF_RC "\n", DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, epr->epr_lo, epr->epr_hi, DP_RC(rc)); @@ -3183,9 +3125,11 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, } struct migrate_stop_arg { - uuid_t pool_uuid; + uuid_t pool_uuid; unsigned int version; unsigned int generation; + unsigned int stop_count; + ABT_mutex stop_lock; }; 
static int @@ -3202,9 +3146,20 @@ migrate_fini_one_ult(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); tls->mpt_fini = 1; - ABT_mutex_lock(tls->mpt_inflight_mutex); - ABT_cond_broadcast(tls->mpt_inflight_cond); - ABT_mutex_unlock(tls->mpt_inflight_mutex); + ABT_mutex_lock(arg->stop_lock); + arg->stop_count++; + ABT_mutex_unlock(arg->stop_lock); + + if (tls->mpt_rmg) { + struct migr_res_manager *rmg = tls->mpt_rmg; + int i; + + /* NB: no big deal but ULTs of all pools will be waken up */ + ABT_mutex_lock(rmg->rmg_mutex); + for (i = 0; i < MIGR_MAX; i++) + ABT_cond_broadcast(rmg->rmg_resources[i].res_cond); + ABT_mutex_unlock(rmg->rmg_mutex); + } migrate_pool_tls_put(tls); /* lookup */ rc = ABT_eventual_wait(tls->mpt_done_eventual, NULL); @@ -3226,44 +3181,27 @@ migrate_fini_one_ult(void *data) void ds_migrate_stop(struct ds_pool *pool, unsigned int version, unsigned int generation) { - struct migrate_pool_tls *tls; struct migrate_stop_arg arg; int rc; - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - tls = migrate_pool_tls_lookup(pool->sp_uuid, version, generation); - if (tls == NULL || tls->mpt_fini) { - if (tls != NULL) - migrate_pool_tls_put(tls); - D_INFO(DF_UUID" migrate stopped\n", DP_UUID(pool->sp_uuid)); - return; - } - - tls->mpt_fini = 1; uuid_copy(arg.pool_uuid, pool->sp_uuid); arg.version = version; arg.generation = generation; + arg.stop_count = 0; + rc = ABT_mutex_create(&arg.stop_lock); + if (rc != ABT_SUCCESS) { + D_ERROR(DF_UUID " migrate stop: %d\n", DP_UUID(pool->sp_uuid), rc); + return; + } rc = ds_pool_thread_collective(pool->sp_uuid, 0, migrate_fini_one_ult, &arg, 0); if (rc) D_ERROR(DF_UUID" migrate stop: %d\n", DP_UUID(pool->sp_uuid), rc); - migrate_pool_tls_put(tls); - /* Wait for xstream 0 migrate ULT(migrate_ult) stop */ - if (tls->mpt_ult_running) { - ABT_mutex_lock(tls->mpt_inflight_mutex); - ABT_cond_broadcast(tls->mpt_inflight_cond); - ABT_mutex_unlock(tls->mpt_inflight_mutex); - rc = 
ABT_eventual_wait(tls->mpt_done_eventual, NULL); - if (rc != ABT_SUCCESS) { - rc = dss_abterr2der(rc); - D_WARN("failed to migrate wait "DF_UUID": "DF_RC"\n", - DP_UUID(pool->sp_uuid), DP_RC(rc)); - } - } + D_ASSERT(atomic_load(&pool->sp_rebuilding) >= arg.stop_count); + atomic_fetch_sub(&pool->sp_rebuilding, arg.stop_count); + ABT_mutex_free(&arg.stop_lock); - migrate_pool_tls_put(tls); - pool->sp_rebuilding--; D_INFO(DF_UUID" migrate stopped\n", DP_UUID(pool->sp_uuid)); } @@ -3294,12 +3232,15 @@ migrate_obj_ult(void *data) struct migrate_pool_tls *tls = NULL; daos_epoch_range_t epr; daos_epoch_t stable_epoch = 0; + daos_handle_t coh = DAOS_HDL_INVAL; + struct cont_props props; int i; int rc = 0; - tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); - if (tls == NULL || tls->mpt_fini) { - D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); + migrate_obj_get(arg); + tls = arg->pool_tls; + if (tls->mpt_fini) { + D_WARN(DF_RB " someone aborted the rebuild", DP_RB_MPT(tls)); D_GOTO(free_notls, rc); } @@ -3348,6 +3289,33 @@ migrate_obj_ult(void *data) ds_cont_child_put(cont_child); } + rc = dsc_pool_open(tls->mpt_pool_uuid, tls->mpt_poh_uuid, 0, NULL, + tls->mpt_pool->spc_pool->sp_map, &tls->mpt_svc_list, &tls->mpt_pool_hdl); + if (rc) { + DL_ERROR(rc, DF_RB ": dsc_pool_open failed", DP_RB_MPT(tls)); + D_GOTO(out, rc); + } + + rc = migrate_cont_open(tls, arg->cont_uuid, 0, &coh); + if (rc) { + DL_ERROR(rc, DF_RB ": migrate_cont_open failed", DP_RB_MPT(tls)); + D_GOTO(out, rc); + } + + rc = dsc_obj_open(coh, arg->oid.id_pub, DAOS_OO_RO, &arg->ioa_oh); + if (rc) { + DL_ERROR(rc, DF_RB ": dsc_obj_open failed", DP_RB_MPT(tls)); + D_GOTO(out, rc); + } + + dsc_cont_get_props(coh, &props); + rc = dsc_obj_id2oc_attr(arg->oid.id_pub, &props, &arg->ioa_oca); + if (rc) { + DL_ERROR(rc, DF_RB ": unknown object class: %u", DP_RB_MPT(tls), + daos_obj_id2class(arg->oid.id_pub)); + D_GOTO(out, rc); + } + for (i = 0; i < 
arg->snap_cnt; i++) { daos_epoch_t lower_epoch = 0; @@ -3416,18 +3384,14 @@ migrate_obj_ult(void *data) if (tls->mpt_status == 0 && rc < 0) tls->mpt_status = rc; - D_DEBUG( - DB_REBUILD, - DF_RB ": stop migrate obj " DF_UOID "for shard %u ult %u/%u " DF_U64 " : " DF_RC "\n", - DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, atomic_load(tls->mpt_tgt_obj_ult_cnt), - atomic_load(tls->mpt_tgt_dkey_ult_cnt), tls->mpt_obj_count, DP_RC(rc)); + D_DEBUG(DB_REBUILD, + DF_RB ": stop migrate obj " DF_UOID "for shard %u ult %u/%u " DF_U64 " : " DF_RC + "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, tls->mpt_tgt_obj_ult_cnt, + tls->mpt_tgt_dkey_ult_cnt, tls->mpt_obj_count, DP_RC(rc)); free_notls: - if (tls != NULL) - migrate_tgt_exit(tls, OBJ_ULT); - - D_FREE(arg->snaps); - D_FREE(arg); - migrate_pool_tls_put(tls); + migrate_res_release(tls, MIGR_OBJ, 1); + migrate_obj_put(arg); } struct migrate_obj_val { @@ -3456,7 +3420,11 @@ migrate_one_object(daos_unit_oid_t oid, daos_epoch_t eph, daos_epoch_t punched_e if (obj_arg == NULL) return -DER_NOMEM; + migrate_pool_tls_get(tls); + obj_arg->pool_tls = tls; obj_arg->oid = oid; + obj_arg->ioa_oh = DAOS_HDL_INVAL; + obj_arg->ioa_obj_ref = 0; obj_arg->epoch = eph; obj_arg->shard = shard; obj_arg->punched_epoch = punched_eph; @@ -3476,9 +3444,8 @@ migrate_one_object(daos_unit_oid_t oid, daos_epoch_t eph, daos_epoch_t punched_e sizeof(*obj_arg->snaps) * cont_arg->snap_cnt); } - /* Let's iterate the object on different xstream */ - rc = dss_ult_create(migrate_obj_ult, obj_arg, DSS_XS_VOS, - tgt_idx, MIGRATE_STACK_SIZE, NULL); + D_ASSERT(tgt_idx == dss_get_module_info()->dmi_tgt_id); + rc = dss_ult_create(migrate_obj_ult, obj_arg, DSS_XS_SELF, 0, MIGRATE_STACK_SIZE, NULL); if (rc) goto free; @@ -3490,13 +3457,11 @@ migrate_one_object(daos_unit_oid_t oid, daos_epoch_t eph, daos_epoch_t punched_e rc = obj_tree_insert(toh, cont_arg->cont_uuid, -1, oid, &val_iov); D_DEBUG(DB_REBUILD, DF_RB ": insert " DF_UUID "/" DF_UOID ": ult %u/%u " 
DF_RC "\n", DP_RB_MPT(tls), DP_UUID(cont_arg->cont_uuid), DP_UOID(oid), - atomic_load(&tls->mpt_obj_ult_cnts[tgt_idx]), - atomic_load(&tls->mpt_dkey_ult_cnts[tgt_idx]), DP_RC(rc)); + tls->mpt_tgt_obj_ult_cnt, tls->mpt_tgt_dkey_ult_cnt, DP_RC(rc)); return 0; free: - D_FREE(obj_arg->snaps); - D_FREE(obj_arg); + obj_iter_arg_free(obj_arg); return rc; } @@ -3523,8 +3488,8 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * DF_RB ": obj migrate " DF_UUID "/" DF_UOID " %" PRIx64 " eph " DF_U64 " start\n", DP_RB_MPT(arg->pool_tls), DP_UUID(arg->cont_uuid), DP_UOID(*oid), ih.cookie, epoch); - rc = migrate_system_enter(arg->pool_tls, tgt_idx, &yielded); - if (rc != 0) { + rc = migrate_res_hold(arg->pool_tls, MIGR_OBJ, 1, &yielded); + if (rc) { DL_ERROR(rc, DF_RB ": " DF_UUID " enter migrate failed.", DP_RB_MPT(arg->pool_tls), DP_UUID(arg->cont_uuid)); return rc; @@ -3534,11 +3499,11 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * if (rc != 0) { DL_ERROR(rc, DF_RB ": obj " DF_UOID " migration failed", DP_RB_MPT(arg->pool_tls), DP_UOID(*oid)); - migrate_system_exit(arg->pool_tls, tgt_idx); + migrate_res_release(arg->pool_tls, MIGR_OBJ, 1); return rc; } - /* migrate_system_enter possibly yielded the ULT, let's re-probe before delete */ + /* migrate_res_hold possibly yielded the ULT, let's re-probe before delete */ if (yielded) { d_iov_set(&tmp_iov, oid, sizeof(*oid)); rc = dbtree_iter_probe(ih, BTR_PROBE_EQ, DAOS_INTENT_MIGRATION, &tmp_iov, NULL); @@ -3572,6 +3537,59 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * return rc; } +struct cont_fetch_arg { + uuid_t pool_uuid; + uuid_t cont_uuid; + uint64_t *snapshots; + int snap_cnt; + struct ds_pool *pool; +}; + +static int +cont_fetch_start_ult(void *arg) +{ + int rc; + struct cont_fetch_arg *fetch_arg = (struct cont_fetch_arg *)arg; + + rc = ds_pool_lookup(fetch_arg->pool_uuid, &fetch_arg->pool); + if (rc) { + D_ERROR(DF_UUID 
" ds_pool_lookup failed: " DF_RC "\n", + DP_UUID(fetch_arg->pool_uuid), DP_RC(rc)); + return rc; + } + + rc = ds_cont_fetch_snaps(fetch_arg->pool->sp_iv_ns, fetch_arg->cont_uuid, + &fetch_arg->snapshots, &fetch_arg->snap_cnt); + if (rc) { + D_ERROR("ds_cont_fetch_snaps failed: " DF_RC "\n", DP_RC(rc)); + return rc; + } + + rc = ds_cont_fetch_ec_agg_boundary(fetch_arg->pool->sp_iv_ns, fetch_arg->cont_uuid); + if (rc) { + /* Sometime it may too early to fetch the EC boundary, + * since EC boundary does not start yet, which is forbidden + * during rebuild anyway, so let's continue. + */ + D_DEBUG(DB_REBUILD, DF_UUID " fetch agg_boundary failed: " DF_RC "\n", + DP_UUID(fetch_arg->cont_uuid), DP_RC(rc)); + } + + return rc; +} + +static int +cont_fetch_end_ult(void *arg) +{ + struct cont_fetch_arg *fetch_arg = (struct cont_fetch_arg *)arg; + + if (fetch_arg->pool) + ds_pool_put(fetch_arg->pool); + + D_FREE(fetch_arg->snapshots); + return 0; +} + /* This iterates the migration database "container", which is different than the * similarly identified by container UUID as the actual container in VOS. 
* However, this container only contains object IDs that were specified to be @@ -3581,48 +3599,33 @@ static int migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void *data) { - struct ds_pool *dp; struct iter_cont_arg arg = { 0 }; struct tree_cache_root *root = val_iov->iov_buf; - struct migrate_pool_tls *tls = data; - uint64_t *snapshots = NULL; - uuid_t cont_uuid; - int snap_cnt; + struct migrate_pool_tls *tls = data; + uuid_t cont_uuid; d_iov_t tmp_iov; int rc; + struct cont_fetch_arg fetch_arg = {0}; uuid_copy(cont_uuid, *(uuid_t *)key_iov->iov_buf); D_DEBUG(DB_REBUILD, DF_RB ": iter cont " DF_UUID "/%" PRIx64 " %" PRIx64 " start\n", DP_RB_MPT(tls), DP_UUID(cont_uuid), ih.cookie, root->tcr_root_hdl.cookie); - rc = ds_pool_lookup(tls->mpt_pool_uuid, &dp); + uuid_copy(fetch_arg.cont_uuid, cont_uuid); + uuid_copy(fetch_arg.pool_uuid, tls->mpt_pool_uuid); + rc = dss_ult_execute(cont_fetch_start_ult, &fetch_arg, NULL, NULL, DSS_XS_SYS, 0, + MIGRATE_STACK_SIZE); if (rc) { DL_ERROR(rc, DF_RB ": ds_pool_lookup failed", DP_RB_MPT(tls)); - rc = 0; - D_GOTO(out_put, rc); - } - - rc = ds_cont_fetch_snaps(dp->sp_iv_ns, cont_uuid, &snapshots, - &snap_cnt); - if (rc) { - DL_ERROR(rc, DF_RB ": ds_cont_fetch_snaps failed", DP_RB_MPT(tls)); - D_GOTO(out_put, rc); - } - - rc = ds_cont_fetch_ec_agg_boundary(dp->sp_iv_ns, cont_uuid); - if (rc) { - /* Sometime it may too early to fetch the EC boundary, - * since EC boundary does not start yet, which is forbidden - * during rebuild anyway, so let's continue. 
- */ - D_DEBUG(DB_REBUILD, DF_RB ": " DF_UUID " fetch agg_boundary failed: " DF_RC "\n", - DP_RB_MPT(tls), DP_UUID(cont_uuid), DP_RC(rc)); + if (rc == -DER_SHUTDOWN) + rc = 0; + D_GOTO(free, rc); } arg.yield_freq = DEFAULT_YIELD_FREQ; arg.cont_root = root; - arg.snaps = snapshots; - arg.snap_cnt = snap_cnt; + arg.snaps = fetch_arg.snapshots; + arg.snap_cnt = fetch_arg.snap_cnt; arg.pool_tls = tls; uuid_copy(arg.cont_uuid, cont_uuid); while (!dbtree_is_empty(root->tcr_root_hdl)) { @@ -3670,14 +3673,10 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, D_GOTO(free, rc); } free: - if (snapshots) - D_FREE(snapshots); - -out_put: + D_ASSERT(dss_ult_execute(cont_fetch_end_ult, &fetch_arg, NULL, NULL, DSS_XS_SYS, 0, 0) == + 0); if (tls->mpt_status == 0 && rc < 0) tls->mpt_status = rc; - if (dp != NULL) - ds_pool_put(dp); return rc; } @@ -3799,26 +3798,129 @@ migrate_try_obj_insert(struct migrate_pool_tls *tls, uuid_t co_uuid, return rc; } +struct ds_pool_migrate_arg { + uuid_t pma_pool_uuid; + struct ds_pool *pma_pool; + uint32_t pma_rebuild_ver; + uint32_t pma_generation; + daos_prop_t *pma_prop; + int pma_tgt_id; + uint8_t pma_tgt_status; + uint32_t pma_tgt_in_ver; + bool pma_no_iv; +}; + +static int +ds_migrate_end_ult(void *arg) +{ + struct ds_pool_migrate_arg *pool_arg = (struct ds_pool_migrate_arg *)arg; + + ds_pool_put(pool_arg->pma_pool); + if (pool_arg->pma_prop) + daos_prop_free(pool_arg->pma_prop); + return 0; +} + +static int +ds_migrate_prepare_ult(void *arg) +{ + int rc; + uint32_t rebuild_ver; + struct ds_pool_migrate_arg *pool_arg = (struct ds_pool_migrate_arg *)arg; + struct pool_target *tgts; + + rc = ds_pool_lookup(pool_arg->pma_pool_uuid, &pool_arg->pma_pool); + if (rc != 0) { + if (rc == -DER_SHUTDOWN) { + D_DEBUG(DB_REBUILD, DF_UUID " pool service is stopping.\n", + DP_UUID(pool_arg->pma_pool_uuid)); + rc = 0; + } else { + D_DEBUG(DB_REBUILD, DF_UUID " pool service is not started yet. 
" DF_RC "\n", + DP_UUID(pool_arg->pma_pool_uuid), DP_RC(rc)); + rc = -DER_AGAIN; + } + return rc; + } + + ds_rebuild_running_query(pool_arg->pma_pool_uuid, -1, &rebuild_ver, NULL, NULL); + if (rebuild_ver == 0 || rebuild_ver != pool_arg->pma_rebuild_ver) { + rc = -DER_SHUTDOWN; + D_GOTO(out, rc); + } + + if (pool_arg->pma_no_iv) + D_GOTO(out, rc = 0); + + D_ALLOC_PTR(pool_arg->pma_prop); + if (pool_arg->pma_prop == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + rc = ds_pool_iv_prop_fetch(pool_arg->pma_pool, pool_arg->pma_prop); + if (rc) + D_GOTO(out, rc); + + rc = pool_map_find_target_by_rank_idx(pool_arg->pma_pool->sp_map, dss_self_rank(), -1, + &tgts); + D_ASSERT(rc == dss_tgt_nr); + pool_arg->pma_tgt_status = tgts[pool_arg->pma_tgt_id].ta_comp.co_status; + pool_arg->pma_tgt_in_ver = tgts[pool_arg->pma_tgt_id].ta_comp.co_in_ver; + rc = 0; +out: + return rc; +} + int -ds_migrate_object(struct ds_pool *pool, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_uuid, - uint32_t version, unsigned int generation, uint64_t max_eph, uint32_t opc, - daos_unit_oid_t *oids, daos_epoch_t *epochs, daos_epoch_t *punched_epochs, - unsigned int *shards, uint32_t count, unsigned int tgt_idx, - uint32_t new_layout_ver) +ds_migrate_object(uuid_t pool_uuid, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_uuid, uint32_t version, + unsigned int generation, uint64_t max_eph, uint32_t opc, daos_unit_oid_t *oids, + daos_epoch_t *epochs, daos_epoch_t *punched_epochs, unsigned int *shards, + uint32_t count, unsigned int tgt_idx, uint32_t new_layout_ver) { - struct migrate_pool_tls *tls = NULL; - int i; - int rc; + struct migrate_pool_tls *tls = NULL; + int i; + int rc; + d_rank_list_t *svc_list = NULL; + struct daos_prop_entry *entry; + struct ds_pool_migrate_arg arg = {0}; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + + tls = migrate_pool_tls_lookup(pool_uuid, version, generation); + if (tls) + arg.pma_no_iv = true; + + uuid_copy(arg.pma_pool_uuid, pool_uuid); + arg.pma_rebuild_ver = version; + 
arg.pma_tgt_id = tgt_id; + arg.pma_generation = generation; + rc = dss_ult_execute(ds_migrate_prepare_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, + MIGRATE_STACK_SIZE); + if (rc || arg.pma_pool == NULL) + D_GOTO(out, rc); + + if (tls) + goto skip_create; + + entry = daos_prop_entry_get(arg.pma_prop, DAOS_PROP_PO_SVC_LIST); + D_ASSERT(entry != NULL); + svc_list = (d_rank_list_t *)entry->dpe_val_ptr; + + /* prepare might yield */ + tls = migrate_pool_tls_lookup(pool_uuid, version, generation); + if (tls) { + goto skip_create; + } - /* Check if the pool tls exists */ - rc = migrate_pool_tls_lookup_create(pool, version, generation, po_hdl, co_hdl, max_eph, - new_layout_ver, opc, &tls); + atomic_fetch_add(&arg.pma_pool->sp_rebuilding, 1); + + rc = migrate_pool_tls_create(pool_uuid, version, generation, po_hdl, co_hdl, max_eph, + new_layout_ver, opc, &tls, svc_list, arg.pma_tgt_status, + arg.pma_tgt_in_ver); if (rc != 0) D_GOTO(out, rc); +skip_create: if (tls->mpt_fini) - D_GOTO(out, rc = -DER_SHUTDOWN); + D_GOTO(out, rc = migrate_pool_tls_get_status(tls)); - /* NB: only create this tree on xstream 0 */ rc = migrate_try_create_object_tree(tls); if (rc) D_GOTO(out, rc); @@ -3857,8 +3959,10 @@ ds_migrate_object(struct ds_pool *pool, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_ } out: - if (tls) - migrate_pool_tls_put(tls); + migrate_pool_tls_put(tls); + if (arg.pma_pool) + D_ASSERT(dss_ult_execute(ds_migrate_end_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0) == + 0); return rc; } @@ -3880,9 +3984,7 @@ ds_obj_migrate_handler(crt_rpc_t *rpc) uuid_t po_uuid; uuid_t po_hdl_uuid; uuid_t co_uuid; - uuid_t co_hdl_uuid; - struct ds_pool *pool = NULL; - uint32_t rebuild_ver; + uuid_t co_hdl_uuid; int rc; migrate_in = crt_req_get(rpc); @@ -3911,35 +4013,11 @@ ds_obj_migrate_handler(crt_rpc_t *rpc) uuid_copy(po_uuid, migrate_in->om_pool_uuid); uuid_copy(po_hdl_uuid, migrate_in->om_poh_uuid); - rc = ds_pool_lookup(po_uuid, &pool); - if (rc != 0) { - if (rc == -DER_SHUTDOWN) { - D_DEBUG(DB_REBUILD, 
DF_RB " pool service is stopping.\n", - DP_RB_OMI(migrate_in)); - rc = 0; - } else { - D_DEBUG(DB_REBUILD, DF_RB " pool service is not started yet. " DF_RC "\n", - DP_RB_OMI(migrate_in), DP_RC(rc)); - rc = -DER_AGAIN; - } - D_GOTO(out, rc); - } - - ds_rebuild_running_query(migrate_in->om_pool_uuid, -1, &rebuild_ver, NULL, NULL); - if (rebuild_ver == 0 || rebuild_ver != migrate_in->om_version) { - rc = -DER_SHUTDOWN; - DL_ERROR(rc, DF_RB " rebuild ver %u", DP_RB_OMI(migrate_in), rebuild_ver); - D_GOTO(out, rc); - } - - rc = ds_migrate_object(pool, po_hdl_uuid, co_hdl_uuid, co_uuid, migrate_in->om_version, + rc = ds_migrate_object(po_uuid, po_hdl_uuid, co_hdl_uuid, co_uuid, migrate_in->om_version, migrate_in->om_generation, migrate_in->om_max_eph, migrate_in->om_opc, oids, ephs, punched_ephs, shards, oids_count, migrate_in->om_tgt_idx, migrate_in->om_new_layout_ver); out: - if (pool) - ds_pool_put(pool); - migrate_out = crt_reply_get(rpc); migrate_out->om_status = rc; dss_rpc_reply(rpc, DAOS_REBUILD_DROP_OBJ); @@ -4185,12 +4263,12 @@ reint_post_process_ult(void *data) struct migrate_query_arg { uuid_t pool_uuid; - ABT_mutex status_lock; - struct btr_root *mpt_migrated_root; + ABT_mutex status_lock; struct ds_migrate_status dms; uint32_t version; uint32_t total_ult_cnt; uint32_t generation; + uint32_t ult_running; daos_rebuild_opc_t rebuild_op; uint32_t mpt_reintegrating:1, reint_post_start:1, @@ -4211,8 +4289,7 @@ migrate_check_one(void *data) if (tls == NULL) return 0; - ult_cnt = atomic_load(tls->mpt_tgt_obj_ult_cnt) + atomic_load(tls->mpt_tgt_dkey_ult_cnt); - + ult_cnt = tls->mpt_tgt_obj_ult_cnt + tls->mpt_tgt_dkey_ult_cnt; ABT_mutex_lock(arg->status_lock); arg->dms.dm_rec_count += tls->mpt_rec_count; arg->dms.dm_obj_count += tls->mpt_obj_count; @@ -4220,6 +4297,7 @@ migrate_check_one(void *data) if (arg->dms.dm_status == 0) arg->dms.dm_status = tls->mpt_status; arg->total_ult_cnt += ult_cnt; + arg->ult_running += tls->mpt_ult_running; if (tls->mpt_reintegrating) 
{ arg->mpt_reintegrating = 1; if (arg->reint_post_start) { @@ -4240,9 +4318,8 @@ migrate_check_one(void *data) ABT_mutex_unlock(arg->status_lock); D_DEBUG(DB_REBUILD, DF_RB " status %d/%d/ ult %u/%u rec/obj/size " DF_U64 "/" DF_U64 "/" DF_U64 "\n", - DP_RB_MQA(arg), tls->mpt_status, arg->dms.dm_status, - atomic_load(tls->mpt_tgt_obj_ult_cnt), atomic_load(tls->mpt_tgt_dkey_ult_cnt), - tls->mpt_rec_count, tls->mpt_obj_count, tls->mpt_size); + DP_RB_MQA(arg), tls->mpt_status, arg->dms.dm_status, tls->mpt_tgt_obj_ult_cnt, + tls->mpt_tgt_dkey_ult_cnt, tls->mpt_rec_count, tls->mpt_obj_count, tls->mpt_size); if (reint_post_start && !tls->mpt_post_process_started) { migrate_pool_tls_get(tls); @@ -4252,7 +4329,7 @@ migrate_check_one(void *data) D_GOTO(out, rc = -DER_NOMEM); ult_arg->rpa_tls = tls; - ult_arg->rpa_migrated_root = arg->mpt_migrated_root; + ult_arg->rpa_migrated_root = &tls->mpt_migrated_root; rc = dss_ult_create(reint_post_process_ult, ult_arg, DSS_XS_SELF, 0, MIGRATE_STACK_SIZE, NULL); if (rc) { @@ -4276,19 +4353,13 @@ int ds_migrate_query_status(uuid_t pool_uuid, uint32_t ver, unsigned int generation, int op, bool gl_scan_done, struct ds_migrate_status *dms) { - struct migrate_query_arg arg = { 0 }; - struct migrate_pool_tls *tls; + struct migrate_query_arg arg = {0}; int rc; - tls = migrate_pool_tls_lookup(pool_uuid, ver, generation); - if (tls == NULL) - return 0; - uuid_copy(arg.pool_uuid, pool_uuid); arg.version = ver; arg.generation = generation; arg.rebuild_op = op; - arg.mpt_migrated_root = &tls->mpt_migrated_root; rc = ABT_mutex_create(&arg.status_lock); if (rc != ABT_SUCCESS) D_GOTO(out, rc); @@ -4301,8 +4372,8 @@ ds_migrate_query_status(uuid_t pool_uuid, uint32_t ver, unsigned int generation, /* when globally scan done, and locally pull done, for reintegration need to do some post * processing, cannot report riv_pull_done before the post processing complete. 
*/ - if (gl_scan_done && arg.total_ult_cnt == 0 && !tls->mpt_ult_running && - arg.mpt_reintegrating && !arg.reint_post_processing) { + if (gl_scan_done && arg.total_ult_cnt == 0 && !arg.ult_running && arg.mpt_reintegrating && + !arg.reint_post_processing) { arg.reint_post_start = 1; rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, migrate_check_one, &arg, 0); @@ -4310,8 +4381,7 @@ ds_migrate_query_status(uuid_t pool_uuid, uint32_t ver, unsigned int generation, D_GOTO(out, rc); } - if (!gl_scan_done || arg.total_ult_cnt > 0 || tls->mpt_ult_running || - arg.reint_post_processing) + if (!gl_scan_done || arg.total_ult_cnt > 0 || arg.ult_running || arg.reint_post_processing) arg.dms.dm_migrating = 1; else arg.dms.dm_migrating = 0; @@ -4323,12 +4393,11 @@ ds_migrate_query_status(uuid_t pool_uuid, uint32_t ver, unsigned int generation, DF_RB " migrating=%s, obj_count=" DF_U64 ", rec_count=" DF_U64 ", size=" DF_U64 " ult_cnt %u, mpt_ult_running %d, reint_post_processing %d, status %d\n", DP_RB_MQA(&arg), arg.dms.dm_migrating ? 
"yes" : "no", arg.dms.dm_obj_count, - arg.dms.dm_rec_count, arg.dms.dm_total_size, arg.total_ult_cnt, - tls->mpt_ult_running, arg.reint_post_processing, arg.dms.dm_status); + arg.dms.dm_rec_count, arg.dms.dm_total_size, arg.total_ult_cnt, arg.ult_running, + arg.reint_post_processing, arg.dms.dm_status); out: ABT_mutex_free(&arg.status_lock); - migrate_pool_tls_put(tls); return rc; } @@ -4402,7 +4471,7 @@ ds_object_migrate_send(struct ds_pool *pool, uuid_t pool_hdl_uuid, uuid_t cont_h tgt_ep.ep_rank = target->ta_comp.co_rank; index = target->ta_comp.co_index; ABT_rwlock_unlock(pool->sp_lock); - tgt_ep.ep_tag = 0; + tgt_ep.ep_tag = daos_rpc_tag(DAOS_REQ_TGT, index); opcode = DAOS_RPC_OPCODE(DAOS_OBJ_RPC_MIGRATE, DAOS_OBJ_MODULE, rpc_ver); rc = crt_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, opcode, &rpc); @@ -4455,3 +4524,92 @@ ds_object_migrate_send(struct ds_pool *pool, uuid_t pool_hdl_uuid, uuid_t cont_h return rc; } + +static int +migr_res_init(struct migr_resource *res, const char *name, long limit) +{ + int rc; + + memset(res, 0, sizeof(*res)); + res->res_name = name; + res->res_limit = limit; + rc = ABT_cond_create(&res->res_cond); + + return (rc != ABT_SUCCESS) ? 
dss_abterr2der(rc) : 0; +} + +static void +migr_res_fini(struct migr_resource *res) +{ + if (res->res_cond) + ABT_cond_free(&res->res_cond); +} + +int +obj_migrate_init(void) +{ + unsigned int ults = MIGR_TGT_ULTS_DEF; + int i; + int rc = 0; + + D_CASSERT(MIGR_TGT_INF_DATA > MIGR_INF_DATA_LWM); + D_CASSERT(MIGR_TGT_INF_DATA > MIGR_INF_DATA_HULK); + + d_getenv_uint(ENV_MIGRATE_ULT_CNT, &ults); + if (ults < MIGR_TGT_ULTS_MIN) + ults = MIGR_TGT_ULTS_MIN; + if (ults > MIGR_TGT_ULTS_MAX) + ults = MIGR_TGT_ULTS_MAX; + + memset(&migr_eng_res, 0, sizeof(migr_eng_res)); + migr_eng_res.er_max_ults = ults; + + D_ASSERT(dss_tgt_nr > 0); + D_ALLOC(migr_eng_res.er_rmgs, sizeof(struct migr_res_manager) * dss_tgt_nr); + if (!migr_eng_res.er_rmgs) + return -DER_NOMEM; + + for (i = 0; i < dss_tgt_nr; i++) { + struct migr_res_manager *rmg = &migr_eng_res.er_rmgs[i]; + + rc = ABT_mutex_create(&rmg->rmg_mutex); + if (rc != ABT_SUCCESS) + D_GOTO(out, rc = dss_abterr2der(rc)); + + rc = migr_res_init(&rmg->rmg_resources[MIGR_OBJ], "OBJ", MIGR_TGT_OBJ_ULTS(ults)); + if (rc) + D_GOTO(out, rc); + + rc = migr_res_init(&rmg->rmg_resources[MIGR_KEY], "KEY", MIGR_TGT_KEY_ULTS(ults)); + if (rc) + D_GOTO(out, rc); + + rc = migr_res_init(&rmg->rmg_resources[MIGR_DATA], "DATA", MIGR_TGT_INF_DATA); + if (rc) + D_GOTO(out, rc); + } + return 0; +out: + obj_migrate_fini(); + return rc; +} + +void +obj_migrate_fini(void) +{ + int i; + int j; + + if (migr_eng_res.er_rmgs) { + for (i = 0; i < dss_tgt_nr; i++) { + struct migr_res_manager *rmg = &migr_eng_res.er_rmgs[i]; + + for (j = 0; j < MIGR_MAX; j++) + migr_res_fini(&rmg->rmg_resources[j]); + if (rmg->rmg_mutex) + ABT_mutex_free(&rmg->rmg_mutex); + } + D_FREE(migr_eng_res.er_rmgs); + } + memset(&migr_eng_res, 0, sizeof(migr_eng_res)); +} diff --git a/src/object/tests/SConscript b/src/object/tests/SConscript index 74c0f41da23..56fc8a015d0 100644 --- a/src/object/tests/SConscript +++ b/src/object/tests/SConscript @@ -14,7 +14,7 @@ def scons(): 
unit_env.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) unit_env.d_test_program(['srv_checksum_tests.c', '../srv_csum.c'], LIBS=['daos_common_pmem', 'gurt', 'cmocka', - 'vos', 'bio', 'abt']) + 'vos', 'bio', 'ssl', 'abt']) unit_env.d_test_program(['cli_checksum_tests.c', '../cli_csum.c', diff --git a/src/placement/tests/place_obj_common.h b/src/placement/tests/place_obj_common.h index 537fbe963a2..34abd403b77 100644 --- a/src/placement/tests/place_obj_common.h +++ b/src/placement/tests/place_obj_common.h @@ -1,11 +1,9 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ -#define D_LOGFAC DD_FAC(tests) - #ifndef __PL_MAP_COMMON_H__ #define __PL_MAP_COMMON_H__ diff --git a/src/placement/tests/placement_test.c b/src/placement/tests/placement_test.c index 1649fdd7fa1..c86d0e10ec9 100644 --- a/src/placement/tests/placement_test.c +++ b/src/placement/tests/placement_test.c @@ -1,9 +1,11 @@ /** * (C) Copyright 2021-2023 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent * */ +#define D_LOGFAC DD_FAC(tests) #include #include "place_obj_common.h" diff --git a/src/pool/srv_cli.c b/src/pool/srv_cli.c index 857b03bb8c7..0391b8bbc49 100644 --- a/src/pool/srv_cli.c +++ b/src/pool/srv_cli.c @@ -644,7 +644,6 @@ pool_query_target_consume(uuid_t pool_uuid, crt_rpc_t *rpc, void *varg) D_DEBUG(DB_MGMT, DF_UUID ": Successfully queried pool rank %u target %u\n", DP_UUID(pool_uuid), arg->pqta_rank, arg->pqta_tgt_idx); - arg->pqta_info->ta_type = DAOS_TP_UNKNOWN; arg->pqta_info->ta_state = out->pqio_state; for (i = 0; i < DAOS_MEDIA_MAX; i++) { arg->pqta_info->ta_space.s_total[i] = out->pqio_space.s_total[i]; diff --git a/src/pool/srv_iv.c b/src/pool/srv_iv.c index 15dd40f78a7..86f29111310 100644 --- a/src/pool/srv_iv.c +++ b/src/pool/srv_iv.c @@ -774,10 +774,11 @@ pool_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key, if (dss_self_rank() == entry->ns->iv_master_rank) { if (!entry->iv_valid) { - D_INFO(DF_UUID" master %u is still stepping up: %d.\n", - DP_UUID(entry->ns->iv_pool_uuid), entry->ns->iv_master_rank, - -DER_NOTLEADER); - return -DER_NOTLEADER; + rc = -DER_NOTLEADER; + DL_INFO(rc, DF_UUID " iv class id %d, master %u is still stepping up.", + DP_UUID(entry->ns->iv_pool_uuid), key->class_id, + entry->ns->iv_master_rank); + return rc; } } @@ -961,6 +962,13 @@ pool_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, struct ds_pool *pool = 0; int rc; + if (ref_rc != 0) { + rc = ref_rc; + DL_WARN(rc, DF_UUID "bypass refresh, IV class id %d.", + DP_UUID(entry->ns->iv_pool_uuid), key->class_id); + goto out_put; + } + if (src == NULL) rc = ds_pool_lookup_internal(entry->ns->iv_pool_uuid, &pool); else diff --git a/src/pool/srv_layout.c b/src/pool/srv_layout.c index 12db4f7bd35..fefb56d5720 100644 --- a/src/pool/srv_layout.c +++ b/src/pool/srv_layout.c @@ -66,7 +66,7 @@ struct daos_prop_entry 
pool_prop_entries_default[DAOS_PROP_PO_NUM] = { }, { .dpe_type = DAOS_PROP_PO_SPACE_RB, - .dpe_val = 0, + .dpe_val = 5, }, { .dpe_type = DAOS_PROP_PO_SELF_HEAL, diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index bbf553ca740..b7ada57c251 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -763,7 +763,7 @@ pool_prop_write(struct rdb_tx *tx, const rdb_path_t *kvs, daos_prop_t *prop) static int init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, const char *group, - const d_rank_list_t *ranks, daos_prop_t *prop, uint32_t ndomains, + const d_rank_list_t *ranks, daos_prop_t *prop_orig, uint32_t ndomains, const uint32_t *domains) { struct pool_buf *map_buf; @@ -780,24 +780,59 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co uint32_t svc_ops_max; uint32_t svc_ops_num; uint64_t rdb_size; + daos_prop_t *prop = NULL; int rc; struct daos_prop_entry *entry; uuid_t uuid; + /* duplicate the default properties, overwrite it with pool create + * parameter and then write to pool meta data. 
+ */ + prop = daos_prop_dup(&pool_prop_default, true /* pool */, false /* input */); + if (prop == NULL) { + D_ERROR("daos_prop_dup failed.\n"); + D_GOTO(out, rc = -DER_NOMEM); + } + + if (DAOS_FAIL_CHECK(DAOS_FAIL_POOL_CREATE_VERSION)) { + uint64_t fail_val = daos_fail_value_get(); + + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_OBJ_VERSION); + D_ASSERT(entry != NULL); + entry->dpe_val = (uint32_t)fail_val; + } + + rc = pool_prop_default_copy(prop, prop_orig); + if (rc) { + DL_ERROR(rc, "daos_prop_default_copy() failed"); + D_GOTO(out_prop, rc); + } + rc = gen_pool_buf(NULL /* map */, &map_buf, map_version, ndomains, nnodes, ntargets, domains, dss_tgt_nr); if (rc != 0) { D_ERROR("failed to generate pool buf, "DF_RC"\n", DP_RC(rc)); - goto out; + goto out_prop; } - entry = daos_prop_entry_get(prop, DAOS_PROP_PO_REDUN_FAC); + entry = daos_prop_entry_get(prop_orig, DAOS_PROP_PO_REDUN_FAC); if (entry) { + /** if the user provided an explicit incompatible rd_fac, then fail gracefully */ if (entry->dpe_val + 1 > map_buf->pb_domain_nr) { - D_ERROR("ndomains(%u) could not meet redunc factor(%lu)\n", + D_ERROR("ndomains(%u) could not meet specified redunc factor(%lu)\n", map_buf->pb_domain_nr, entry->dpe_val); D_GOTO(out_map_buf, rc = -DER_INVAL); } + } else { + /** if the default rd_fac cannot be satisfied, adjust it on the fly */ + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_REDUN_FAC); + if (entry) { + if (entry->dpe_val + 1 > map_buf->pb_domain_nr) { + D_DEBUG(DB_MD, "ndomains(%u) could not meet default redunc factor(%lu)\n", + map_buf->pb_domain_nr, entry->dpe_val); + entry->dpe_val = (uint64_t) map_buf->pb_domain_nr - 1; + } + } } /* Initialize the pool map properties. 
*/ @@ -930,6 +965,8 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co out_map_buf: pool_buf_free(map_buf); +out_prop: + daos_prop_free(prop); out: return rc; } @@ -1017,23 +1054,26 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, d_rank_list_t *target_addrs, int ndomains, uint32_t *domains, daos_prop_t *prop, d_rank_list_t **svc_addrs) { - struct daos_prop_entry *svc_rf_entry; - struct pool_buf *map_buf; - uint32_t map_version = 1; - d_rank_list_t *ranks; - d_iov_t psid; - struct rsvc_client client; - struct dss_module_info *info = dss_get_module_info(); - crt_endpoint_t ep; - crt_rpc_t *rpc; - struct daos_prop_entry *lbl_ent; - struct daos_prop_entry *def_lbl_ent; - struct pool_create_out *out; - struct d_backoff_seq backoff_seq; - uuid_t pi_hdl_uuid; - uint64_t req_time = 0; - int n_attempts = 0; - int rc; + struct daos_prop_entry *svc_rf_entry; + struct pool_buf *map_buf; + uint32_t map_version = 1; + d_rank_list_t *ranks; + rdb_replica_id_t *replicas; + int i; + struct ds_rsvc_create_params create_params; + d_iov_t psid; + struct rsvc_client client; + struct dss_module_info *info = dss_get_module_info(); + crt_endpoint_t ep; + crt_rpc_t *rpc; + struct daos_prop_entry *lbl_ent; + struct daos_prop_entry *def_lbl_ent; + struct pool_create_out *out; + struct d_backoff_seq backoff_seq; + uuid_t pi_hdl_uuid; + uint64_t req_time = 0; + int n_attempts = 0; + int rc; /* Check for default label supplied via property. */ def_lbl_ent = daos_prop_entry_get(&pool_prop_default, DAOS_PROP_PO_LABEL); @@ -1063,20 +1103,37 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, D_DEBUG(DB_MD, DF_UUID": creating PS: ntargets=%d ndomains=%d svc_rf="DF_U64"\n", DP_UUID(pool_uuid), ntargets, ndomains, svc_rf_entry->dpe_val); + /* Determine the ranks and IDs of the PS replicas. 
*/ rc = select_svc_ranks(svc_rf_entry->dpe_val, map_buf, map_version, &ranks); if (rc != 0) goto out_map_buf; + D_ALLOC_ARRAY(replicas, ranks->rl_nr); + if (replicas == NULL) { + rc = -DER_NOMEM; + goto out_ranks; + } + for (i = 0; i < ranks->rl_nr; i++) { + replicas[i].rri_rank = ranks->rl_ranks[i]; + /* Allocate replica generations from 1. See rdb_raft_init. */ + replicas[i].rri_gen = i + 1; + } + + create_params.scp_bootstrap = true; + create_params.scp_size = ds_rsvc_get_md_cap(); + create_params.scp_vos_df_version = ds_pool_get_vos_df_version_default(); + create_params.scp_layout_version = 0 /* default */; + create_params.scp_replicas = replicas; + create_params.scp_replicas_len = ranks->rl_nr; d_iov_set(&psid, (void *)pool_uuid, sizeof(uuid_t)); rc = ds_rsvc_dist_start(DS_RSVC_CLASS_POOL, &psid, pool_uuid, ranks, RDB_NIL_TERM, - DS_RSVC_CREATE, true /* bootstrap */, ds_rsvc_get_md_cap(), - ds_pool_get_vos_df_version_default()); + DS_RSVC_CREATE, &create_params); if (rc != 0) - D_GOTO(out_ranks, rc); + goto out_replicas; rc = rsvc_client_init(&client, ranks); if (rc != 0) - D_GOTO(out_ranks, rc); + goto out_replicas; rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, 1 << 10 /* max (ms) */); @@ -1141,6 +1198,8 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, * Intentionally skip cleaning up the PS replicas. See the function * documentation above. 
*/ +out_replicas: + D_FREE(replicas); out_ranks: d_rank_list_free(ranks); out_map_buf: @@ -1180,8 +1239,8 @@ ds_pool_svc_start(uuid_t uuid) } d_iov_set(&id, uuid, sizeof(uuid_t)); - rc = ds_rsvc_start(DS_RSVC_CLASS_POOL, &id, uuid, RDB_NIL_TERM, DS_RSVC_START, 0 /* size */, - 0 /* vos_df_version */, NULL /* replicas */, NULL /* arg */); + rc = ds_rsvc_start(DS_RSVC_CLASS_POOL, &id, uuid, RDB_NIL_TERM, DS_RSVC_START, + NULL /* create_params */, NULL /* arg */); if (rc == -DER_ALREADY) { D_DEBUG(DB_MD, DF_UUID": pool service already started\n", DP_UUID(uuid)); return 0; @@ -2555,7 +2614,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) if (rc != 0) goto out; - rc = ds_rebuild_regenerate_task(svc->ps_pool, prop, sys_self_heal, 0); + rc = ds_rebuild_regenerate_task(svc->ps_pool, prop, sys_self_heal, true /* auto_recovery */, + 0 /* delay_sec */); if (rc != 0) goto out; @@ -3947,7 +4007,6 @@ ds_pool_create_handler(crt_rpc_t *rpc) struct rdb_tx tx; d_iov_t value; struct rdb_kvs_attr attr; - daos_prop_t *prop_dup = NULL; daos_prop_t *prop = NULL; d_rank_list_t *tgt_ranks = NULL; uint32_t ndomains; @@ -4025,38 +4084,13 @@ ds_pool_create_handler(crt_rpc_t *rpc) D_GOTO(out_tx, rc); } - /* duplicate the default properties, overwrite it with pool create - * parameter and then write to pool meta data. - */ - prop_dup = daos_prop_dup(&pool_prop_default, true /* pool */, - false /* input */); - if (prop_dup == NULL) { - D_ERROR("daos_prop_dup failed.\n"); - D_GOTO(out_tx, rc = -DER_NOMEM); - } - - if (DAOS_FAIL_CHECK(DAOS_FAIL_POOL_CREATE_VERSION)) { - uint64_t fail_val = daos_fail_value_get(); - struct daos_prop_entry *entry; - - entry = daos_prop_entry_get(prop_dup, DAOS_PROP_PO_OBJ_VERSION); - D_ASSERT(entry != NULL); - entry->dpe_val = (uint32_t)fail_val; - } - - rc = pool_prop_default_copy(prop_dup, prop); - if (rc) { - DL_ERROR(rc, "daos_prop_default_copy() failed"); - D_GOTO(out_tx, rc); - } - /* Initialize the DB and the metadata for this pool. 
*/ attr.dsa_class = RDB_KVS_GENERIC; attr.dsa_order = 8; rc = rdb_tx_create_root(&tx, &attr); if (rc != 0) D_GOTO(out_tx, rc); - rc = init_pool_metadata(&tx, &svc->ps_root, ntgts, NULL /* group */, tgt_ranks, prop_dup, + rc = init_pool_metadata(&tx, &svc->ps_root, ntgts, NULL /* group */, tgt_ranks, prop, ndomains, domains); if (rc != 0) D_GOTO(out_tx, rc); @@ -4069,7 +4103,6 @@ ds_pool_create_handler(crt_rpc_t *rpc) D_GOTO(out_tx, rc); out_tx: - daos_prop_free(prop_dup); ds_cont_unlock_metadata(svc->ps_cont_svc); ABT_rwlock_unlock(svc->ps_lock); rdb_tx_end(&tx); @@ -4133,6 +4166,26 @@ bulk_cb(const struct crt_bulk_cb_info *cb_info) return 0; } +static int +pool_query_set_rebuild_status_degraded(struct pool_svc *svc, struct daos_rebuild_status *rebuild_st) +{ + unsigned int down_tgts = 0; + int rc; + + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + rc = pool_map_find_down_tgts(svc->ps_pool->sp_map, NULL /* tgt_pp */, &down_tgts); + ABT_rwlock_unlock(svc->ps_pool->sp_lock); + if (rc != 0) + return rc; + + if (down_tgts > 0) + rebuild_st->rs_flags |= DAOS_RSF_DEGRADED; + else + rebuild_st->rs_flags &= ~DAOS_RSF_DEGRADED; + + return 0; +} + /* Currently we only maintain compatibility between 2 metadata layout versions */ #define NUM_POOL_VERSIONS 2 @@ -4364,6 +4417,12 @@ pool_connect_handler(crt_rpc_t *rpc, int handler_version) goto out_map_version; } + if (query_bits & DAOS_PO_QUERY_REBUILD_STATUS) { + rc = pool_query_set_rebuild_status_degraded(svc, &out->pco_rebuild_st); + if (rc != 0) + goto out_map_version; + } + transfer_map = true; if (skip_update) D_GOTO(out_map_version, rc = 0); @@ -4738,10 +4797,7 @@ pool_disconnect_handler(crt_rpc_t *rpc, int handler_version) void ds_pool_disconnect_handler(crt_rpc_t *rpc) { - uint8_t rpc_ver = opc_get_rpc_ver(rpc->cr_opc); - - D_ASSERT(rpc_ver == DAOS_POOL_VERSION); - pool_disconnect_handler(rpc, rpc_ver); + pool_disconnect_handler(rpc, opc_get_rpc_ver(rpc->cr_opc)); } static int @@ -5429,6 +5485,12 @@ 
pool_query_handler(crt_rpc_t *rpc, int handler_version) } } + if (query_bits & DAOS_PO_QUERY_REBUILD_STATUS) { + rc = pool_query_set_rebuild_status_degraded(svc, &out->pqo_rebuild_st); + if (rc != 0) + goto out_lock; + } + out_lock: ABT_rwlock_unlock(svc->ps_lock); rdb_tx_end(&tx); @@ -6966,7 +7028,7 @@ pool_svc_reconf_ult(void *varg) DP_UUID(svc->ps_uuid), DP_RC(rc)); goto out_to_add_remove; } - rc = rdb_remove_replicas(svc->ps_rsvc.s_db, tmp); + rc = ds_rsvc_remove_replicas_s(&svc->ps_rsvc, to_remove, false /* destroy */); if (rc != 0) D_ERROR(DF_UUID": failed to remove replicas: "DF_RC"\n", DP_UUID(svc->ps_uuid), DP_RC(rc)); @@ -7234,7 +7296,7 @@ log_unavailable_targets(struct pool_svc *svc, struct pool_map *map) if (doms[i].do_comp.co_status & PO_COMP_ST_DOWN) { D_ERROR(DF_UUID ": rank %u\n", DP_UUID(svc->ps_uuid), doms[i].do_comp.co_rank); - } else if (doms[i].do_comp.co_status & PO_COMP_ST_UPIN) { // XXX: ask Xuezhao + } else if (doms[i].do_comp.co_status & PO_COMP_ST_UPIN) { int j; for (j = 0; j < doms[i].do_target_nr; j++) @@ -7727,23 +7789,24 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, struct pool_target_addr_list *inval_list_out, uint32_t *map_version, struct rsvc_hint *hint, enum map_update_source src, uint32_t flags) { - struct pool_target_id_list target_list = {0}; - uint32_t tgt_map_ver = 0; - bool updated; - int rc; - char *env; - daos_epoch_t rebuild_eph = d_hlc_get(); - uint64_t delay = 2; - bool sys_self_heal_applicable; - uint64_t sys_self_heal = 0; + struct pool_target_id_list target_list = {0}; + uint32_t tgt_map_ver = 0; + bool updated; + int rc; + char *env; + daos_epoch_t rebuild_eph = d_hlc_get(); + uint64_t delay = 2; + bool auto_recovery; + uint64_t sys_self_heal = 0; /* - * The system self-heal policy only applies to automatic pool exclude + * The pool and system self-heal policies only apply to automatic pool exclude * and rebuild operations. 
*/ - sys_self_heal_applicable = (opc == MAP_EXCLUDE && src == MUS_SWIM); + auto_recovery = (opc == MAP_EXCLUDE && src == MUS_SWIM); - if (sys_self_heal_applicable) { + /* If applicable, check system self-heal policy. */ + if (auto_recovery) { rc = ds_mgmt_get_self_heal_policy(pool_svc_abort_gshp, svc, &sys_self_heal); if (rc != 0) { DL_ERROR(rc, DF_UUID ": failed to get self-heal policy", @@ -7765,6 +7828,7 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, } } + /* Pool self-heal policy is checked in this call. */ rc = pool_svc_update_map_internal(svc, opc, exclude_rank, extend_rank_list, extend_domains_nr, extend_domains, &target_list, list, hint, &updated, map_version, &tgt_map_ver, inval_list_out, @@ -7785,14 +7849,14 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, } d_freeenv_str(&env); - if (sys_self_heal_applicable && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) { + if (auto_recovery && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) { D_DEBUG(DB_MD, DF_UUID ": pool_rebuild disabled in system property self_heal\n", DP_UUID(svc->ps_uuid)); rc = 0; goto out; } - if (!is_pool_rebuild_allowed(svc->ps_pool, true)) { + if (!is_pool_rebuild_allowed(svc->ps_pool, svc->ps_pool->sp_self_heal, auto_recovery)) { D_DEBUG(DB_MD, DF_UUID ": rebuild disabled for pool\n", DP_UUID(svc->ps_pool->sp_uuid)); D_GOTO(out, rc); diff --git a/src/pool/srv_pool_map.c b/src/pool/srv_pool_map.c index 32f0710102c..7361c146e8c 100644 --- a/src/pool/srv_pool_map.c +++ b/src/pool/srv_pool_map.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2021-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent * @@ -318,12 +318,12 @@ update_one_dom(struct pool_map *map, struct pool_domain *dom, struct pool_target if (dom->do_comp.co_status == PO_COMP_ST_DOWNOUT || dom->do_comp.co_status == PO_COMP_ST_DOWN) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_UP, - *version, &updated); + *version, &updated, false); break; case MAP_EXTEND: if (dom->do_comp.co_status == PO_COMP_ST_NEW) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_UP, - *version, &updated); + *version, &updated, false); break; case MAP_EXCLUDE: /* Only change the dom status if it is from SWIM eviction */ @@ -331,27 +331,29 @@ update_one_dom(struct pool_map *map, struct pool_domain *dom, struct pool_target !(dom->do_comp.co_status & (PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT)) && pool_map_node_status_match(dom, PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT)) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_DOWN, - *version, &updated); + *version, &updated, false); break; case MAP_FINISH_REBUILD: if (dom->do_comp.co_status == PO_COMP_ST_UP) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_UPIN, - *version, &updated); + *version, &updated, false); else if (dom->do_comp.co_status == PO_COMP_ST_DOWN && exclude_rank) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_DOWNOUT, - *version, &updated); + *version, &updated, false); break; case MAP_REVERT_REBUILD: if (dom->do_comp.co_status == PO_COMP_ST_UP) { if (dom->do_comp.co_fseq == 1) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, PO_COMP_ST_NEW, - *version, &updated); + *version, &updated, true); else if (dom->do_comp.co_flags == PO_COMPF_DOWN2UP) update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, - PO_COMP_ST_DOWN, *version, &updated); + PO_COMP_ST_DOWN, *version, &updated, + true); 
else update_dom_status_by_tgt_id(map, tgt->ta_comp.co_id, - PO_COMP_ST_DOWNOUT, *version, &updated); + PO_COMP_ST_DOWNOUT, *version, &updated, + true); } break; default: diff --git a/src/pool/srv_pool_scrub_ult.c b/src/pool/srv_pool_scrub_ult.c index 437b860ead7..fef7efa25e0 100644 --- a/src/pool/srv_pool_scrub_ult.c +++ b/src/pool/srv_pool_scrub_ult.c @@ -82,7 +82,7 @@ cont_lookup_cb(uuid_t pool_uuid, uuid_t cont_uuid, void *arg, cont->scs_cont_hdl = cont_child->sc_hdl; uuid_copy(cont->scs_cont_uuid, cont_uuid); cont->scs_cont_src = cont_child; - cont->scs_props_fetched = cont_child->sc_props_fetched; + cont->scs_csummer_inited = cont_child->sc_csummer_inited; ABT_mutex_lock(cont_child->sc_mutex); cont_child->sc_scrubbing = 1; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 756df65e7aa..b7fe7dfe2c8 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -242,9 +242,9 @@ flush_ult(void *arg) } else if (rc) { /* This pool doesn't have NVMe partition */ sleep_ms = 60000; } else if (sched_req_space_check(child->spc_flush_req) == SCHED_SPACE_PRESS_NONE) { - sleep_ms = 500; + sleep_ms = 5000; } else { - sleep_ms = (nr_flushed < nr_flush) ? 50 : 0; + sleep_ms = (nr_flushed < nr_flush) ? 
1000 : 0; } if (dss_ult_exiting(child->spc_flush_req)) @@ -539,7 +539,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate) D_ASSERT(child->spc_metrics[DAOS_VOS_MODULE] != NULL); rc = vos_pool_open_metrics(path, child->spc_uuid, VOS_POF_EXCL | VOS_POF_EXTERNAL_FLUSH | VOS_POF_EXTERNAL_CHKPT, - child->spc_metrics[DAOS_VOS_MODULE], &child->spc_hdl); + child->spc_metrics[DAOS_VOS_MODULE], NULL, &child->spc_hdl); D_FREE(path); @@ -909,6 +909,13 @@ pool_alloc_ref(void *key, unsigned int ksize, void *varg, pool->sp_map_version = arg->pca_map_version; pool->sp_reclaim = DAOS_RECLAIM_LAZY; /* default reclaim strategy */ pool->sp_data_thresh = DAOS_PROP_PO_DATA_THRESH_DEFAULT; + /* + * Set proper default chkpt parameters to ensure the checkpoint working + * before the pool property being propagated. + */ + pool->sp_checkpoint_mode = DAOS_PROP_PO_CHECKPOINT_MODE_DEFAULT; + pool->sp_checkpoint_freq = DAOS_PROP_PO_CHECKPOINT_FREQ_DEFAULT; + pool->sp_checkpoint_thresh = DAOS_PROP_PO_CHECKPOINT_THRESH_DEFAULT; /** set up ds_pool metrics */ rc = ds_pool_metrics_start(pool); @@ -2797,6 +2804,8 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) pool->sp_need_discard = 1; pool->sp_discard_status = 0; rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); + if (rc == 0) + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ ds_pool_put(pool); out: @@ -3117,6 +3126,9 @@ ds_pool_recov_cont_handler(crt_rpc_t *rpc) rc = ds_pool_thread_collective(prci->prci_uuid, ex_status, pool_tgt_recov_cont, &prca, 0); ABT_rwlock_unlock(pool->sp_recov_lock); + if (rc == 0) + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + out: DL_CDEBUG(rc != 0, DLOG_ERR, DB_REBUILD, rc, "Recovered ( " DF_U64 ") containers for the pool " DF_UUID, prci->prci_cont_nr, diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index 9b4e771dc20..d67afe16050 100644 --- a/src/pool/srv_util.c +++ b/src/pool/srv_util.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 
Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -65,6 +65,69 @@ map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *r return 0; } +static bool +all_tgts_match(struct pool_domain *rank_dom, unsigned int status) +{ + int i; + + for (i = 0; i < rank_dom->do_target_nr; i++) { + if ((status & rank_dom->do_targets[i].ta_comp.co_status) == 0) + return false; + } + + return true; +} + +/* Build failed rank list, treats the rank as DOWN if all its targets are DOWN . */ +int +map_ranks_failed(const struct pool_map *map, d_rank_list_t *ranks) +{ + struct pool_domain *domains = NULL; + unsigned int status = PO_COMP_ST_DOWNOUT | PO_COMP_ST_DOWN; + int nranks; + int n = 0; + int i; + d_rank_t *rs; + + nranks = pool_map_find_ranks((struct pool_map *)map, PO_COMP_ID_ALL, &domains); + if (nranks == 0) { + D_ERROR("no nodes in pool map\n"); + return -DER_IO; + } + + for (i = 0; i < nranks; i++) { + if ((status & domains[i].do_comp.co_status) || all_tgts_match(&domains[i], status)) + n++; + } + + if (n == 0) { + ranks->rl_nr = 0; + ranks->rl_ranks = NULL; + return 0; + } + + D_ALLOC_ARRAY(rs, n); + if (rs == NULL) + return -DER_NOMEM; + + ranks->rl_nr = n; + ranks->rl_ranks = rs; + + n = 0; + for (i = 0; i < nranks; i++) { + if ((status & domains[i].do_comp.co_status) || + all_tgts_match(&domains[i], status)) { + D_ASSERT(n < ranks->rl_nr); + ranks->rl_ranks[n] = domains[i].do_comp.co_rank; + n++; + continue; + } + } + D_ASSERTF(n == ranks->rl_nr, "%d != %u\n", n, ranks->rl_nr); + + return 0; +} + void map_ranks_fini(d_rank_list_t *ranks) { @@ -1449,7 +1512,7 @@ check_pool_targets(uuid_t pool_id, int *tgt_ids, int tgt_cnt, bool reint, int i, nr, rc = 0; /* Get pool map to check the target status */ - pool_child = ds_pool_child_lookup(pool_id); + pool_child = ds_pool_child_find(pool_id); if 
(pool_child == NULL) { D_ERROR(DF_UUID": Pool child not found\n", DP_UUID(pool_id)); /* @@ -1470,6 +1533,13 @@ check_pool_targets(uuid_t pool_id, int *tgt_ids, int tgt_cnt, bool reint, nr_downout = nr_down = nr_upin = nr_up = 0; ABT_rwlock_rdlock(pool->sp_lock); + + if (pool->sp_map == NULL) { + D_ERROR(DF_UUID ": Pool map not populated\n", DP_UUID(pool_id)); + rc = -DER_UNINIT; + goto done; + } + for (i = 0; i < tgt_cnt; i++) { nr = pool_map_find_target_by_rank_idx(pool->sp_map, rank, tgt_ids[i], &target); @@ -1498,7 +1568,7 @@ check_pool_targets(uuid_t pool_id, int *tgt_ids, int tgt_cnt, bool reint, break; } } - +done: if (pool->sp_iv_ns != NULL) { *pl_rank = pool->sp_iv_ns->iv_master_rank; } else { diff --git a/src/proto/chk/chk.proto b/src/proto/chk/chk.proto index 869243528b3..338f2d5efb5 100644 --- a/src/proto/chk/chk.proto +++ b/src/proto/chk/chk.proto @@ -119,6 +119,9 @@ enum CheckInconsistAction { CIA_TRUST_EC_PARITY = 11; // Trust EC data shard. CIA_TRUST_EC_DATA = 12; + + // Stale unresolved interaction. The checker can no longer address this report without re-running on affected pool. + CIA_STALE = 0xffff; } // The flags to control DAOS check general behavior, not related with any detailed inconsistency. diff --git a/src/proto/mgmt/pool.proto b/src/proto/mgmt/pool.proto index 8a67ef1f825..5fc7c8762c5 100644 --- a/src/proto/mgmt/pool.proto +++ b/src/proto/mgmt/pool.proto @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -199,17 +199,24 @@ message StorageUsageStats { StorageMediaType media_type = 6; } -// PoolRebuildStatus represents a pool's rebuild status. +// PoolRebuildStatus represents a pool's rebuild status, translates to enum daos_rebuild_state_t +// IN_PROGRESS/NOT_STARTED/COMPLETED states. 
message PoolRebuildStatus { int32 status = 1; // DAOS error code enum State { - IDLE = 0; - DONE = 1; - BUSY = 2; + BUSY = 0; + IDLE = 1; + DONE = 2; + STOPPING = 3; + STOPPED = 4; + FAILING = 5; + FAILED = 6; } State state = 2; uint64 objects = 3; uint64 records = 4; + State derived_state = 5; + bool degraded = 6; // data redundancy degraded } enum PoolServiceState { @@ -314,14 +321,7 @@ message StorageTargetUsage { // PoolQueryTargetInfo represents pool target query info for a single target. // The RPC response type (PoolQueryTargetResponse) contains a sequence of these. message PoolQueryTargetInfo { - enum TargetType { // See enum daos_target_type_t - UNKNOWN = 0; - HDD = 1; // Rotating disk - SSD = 2; // Flash-based - PM = 3; // Persistent memory - VM = 4; // Volatile memory - } - TargetType type = 1; // Target type jsee enum daos_target_type_t + reserved 1; enum TargetState { // See enum daos_target_state_t STATE_UNKNOWN = 0; diff --git a/src/rdb/raft b/src/rdb/raft index 12dbc1595fa..27d05255720 160000 --- a/src/rdb/raft +++ b/src/rdb/raft @@ -1 +1 @@ -Subproject commit 12dbc1595fad8b570de1e336205f994f2b0e22f5 +Subproject commit 27d0525572026d66177005506a5a22703a8fd8cf diff --git a/src/rdb/rdb.c b/src/rdb/rdb.c index bdb0872f7e6..d40fb39d758 100644 --- a/src/rdb/rdb.c +++ b/src/rdb/rdb.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2023 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -18,53 +18,51 @@ #include "rdb_internal.h" #include "rdb_layout.h" -static int rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, - uint64_t caller_term, struct rdb_cbs *cbs, void *arg, - struct rdb **dbp); +static int +rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint32_t layout_version, + uint64_t caller_term, struct rdb_cbs *cbs, void *arg, struct rdb **dbp); /** - * Create an RDB replica at \a path with \a uuid, \a caller_term, \a size, - * \a vos_df_version, and \a replicas, and open it with \a cbs and \a arg. + * Create an RDB replica at \a path with \a uuid, \a caller_term, and \a params, + * and open it with \a cbs and \a arg. * * \param[in] path replica path * \param[in] uuid database UUID * \param[in] caller_term caller term if not RDB_NIL_TERM (see rdb_open) - * \param[in] size replica size in bytes - * \param[in] vos_df_version version of VOS durable format - * \param[in] replicas list of replica ranks + * \param[in] params parameters for creating the replica * \param[in] cbs callbacks (not copied) * \param[in] arg argument for cbs * \param[out] storagep database storage */ int -rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t size, - uint32_t vos_df_version, const d_rank_list_t *replicas, struct rdb_cbs *cbs, void *arg, +rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, + struct rdb_create_params *params, struct rdb_cbs *cbs, void *arg, struct rdb_storage **storagep) { daos_handle_t pool; daos_handle_t mc; d_iov_t value; - uint32_t version = RDB_LAYOUT_VERSION; + uint32_t version; struct rdb *db; int rc; D_DEBUG(DB_MD, - DF_UUID ": creating db %s with %u replicas: caller_term=" DF_X64 " size=" DF_U64 - " vos_df_version=%u\n", - DP_UUID(uuid), path, replicas == NULL 
? 0 : replicas->rl_nr, caller_term, size, - vos_df_version); + DF_UUID ": creating db %s with %d replicas: caller_term=" DF_X64 " size=" DF_U64 + " vos_df_version=%u layout_version=%u self=" RDB_F_RID "\n", + DP_UUID(uuid), path, params->rcp_replicas_len, caller_term, params->rcp_size, + params->rcp_vos_df_version, params->rcp_layout_version, RDB_P_RID(params->rcp_id)); /* * Create and open a VOS pool. RDB pools specify VOS_POF_SMALL for * basic system memory reservation and VOS_POF_EXCL for concurrent * access protection. */ - rc = vos_pool_create(path, (unsigned char *)uuid, size, 0 /* data_sz */, 0 /* meta_sz */, - VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB | VOS_POF_EXTERNAL_CHKPT, - vos_df_version, &pool); + rc = dss_vos_pool_create( + path, (unsigned char *)uuid, params->rcp_size, 0 /* data_sz */, 0 /* meta_sz */, + VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB | VOS_POF_EXTERNAL_CHKPT, + params->rcp_vos_df_version, &pool); if (rc != 0) goto out; - ABT_thread_yield(); /* Create and open the metadata container. */ rc = vos_cont_create(pool, (unsigned char *)uuid); @@ -75,15 +73,32 @@ rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t siz goto out_pool_hdl; /* Initialize the layout version. */ + version = params->rcp_layout_version; + if (version == 0) + version = RDB_LAYOUT_VERSION; d_iov_set(&value, &version, sizeof(version)); rc = rdb_mc_update(mc, RDB_MC_ATTRS, 1 /* n */, &rdb_mc_version, &value, NULL /* vtx */); if (rc != 0) goto out_mc_hdl; + /* Initialize the replica ID. */ + if (version >= RDB_LAYOUT_VERSION_REPLICA_ID) { + d_iov_set(&value, ¶ms->rcp_id, sizeof(params->rcp_id)); + rc = rdb_mc_update(mc, RDB_MC_ATTRS, 1 /* n */, &rdb_mc_replica_id, &value, + NULL /* vtx */); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to initialize replica ID", DP_UUID(uuid)); + goto out_mc_hdl; + } + } + /* Initialize Raft. 
*/ - rc = rdb_raft_init(pool, mc, replicas); - if (rc != 0) + rc = rdb_raft_init((unsigned char *)uuid, pool, mc, params->rcp_replicas, + params->rcp_replicas_len, version); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to initialize Raft", DP_UUID(uuid)); goto out_mc_hdl; + } /* * Mark this replica as fully initialized by storing its UUID. @@ -94,7 +109,7 @@ rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t siz if (rc != 0) goto out_mc_hdl; - rc = rdb_open_internal(pool, mc, uuid, caller_term, cbs, arg, &db); + rc = rdb_open_internal(pool, mc, uuid, version, caller_term, cbs, arg, &db); if (rc != 0) goto out_mc_hdl; @@ -129,6 +144,7 @@ rdb_destroy(const char *path, const uuid_t uuid) { int rc; + D_INFO(DF_UUID ": destroying db %s\n", DP_UUID(uuid), path); rc = vos_pool_destroy_ex(path, (unsigned char *)uuid, VOS_POF_RDB); if (rc != 0) D_ERROR(DF_UUID": failed to destroy %s: "DF_RC"\n", @@ -237,11 +253,12 @@ static void rdb_chkptd_stop(struct rdb *db); * the caller shall not close in this case. 
*/ static int -rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint64_t caller_term, - struct rdb_cbs *cbs, void *arg, struct rdb **dbp) +rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint32_t layout_version, + uint64_t caller_term, struct rdb_cbs *cbs, void *arg, struct rdb **dbp) { struct rdb *db; int rc; + d_iov_t value; struct vos_pool_space vps; uint64_t rdb_extra_sys[DAOS_MEDIA_MAX]; @@ -260,6 +277,7 @@ rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint6 db->d_cbs = cbs; db->d_arg = arg; db->d_pool = pool; + db->d_version = layout_version; db->d_mc = mc; rc = ABT_mutex_create(&db->d_mutex); @@ -284,9 +302,28 @@ rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint6 goto err_raft_mutex; } + rc = ABT_rwlock_create(&db->d_gen_lock); + if (rc != ABT_SUCCESS) { + D_ERROR(DF_DB ": failed to create gen rwlock: %d\n", DP_DB(db), rc); + rc = dss_abterr2der(rc); + goto err_ref_cv; + } + + if (db->d_version >= RDB_LAYOUT_VERSION_REPLICA_ID) { + d_iov_set(&value, &db->d_replica_id, sizeof(db->d_replica_id)); + rc = rdb_mc_lookup(mc, RDB_MC_ATTRS, &rdb_mc_replica_id, &value); + if (rc != 0) { + DL_ERROR(rc, DF_DB ": failed to look up replica ID", DP_DB(db)); + goto err_gen_lock; + } + } else { + db->d_replica_id.rri_rank = dss_self_rank(); + db->d_replica_id.rri_gen = 0; + } + rc = rdb_chkptd_start(db); if (rc != 0) - goto err_ref_cv; + goto err_gen_lock; rc = rdb_kvs_cache_create(&db->d_kvss); if (rc != 0) @@ -339,6 +376,8 @@ rdb_open_internal(daos_handle_t pool, daos_handle_t mc, const uuid_t uuid, uint6 rdb_kvs_cache_destroy(db->d_kvss); err_chkptd: rdb_chkptd_stop(db); +err_gen_lock: + ABT_rwlock_free(&db->d_gen_lock); err_ref_cv: ABT_cond_free(&db->d_ref_cv); err_raft_mutex: @@ -387,9 +426,9 @@ rdb_open(const char *path, const uuid_t uuid, uint64_t caller_term, struct rdb_c * RDB pools specify VOS_POF_SMALL for basic system memory reservation * and 
VOS_POF_EXCL for concurrent access protection. */ - rc = vos_pool_open(path, (unsigned char *)uuid, - VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB | VOS_POF_EXTERNAL_CHKPT, - &pool); + rc = dss_vos_pool_open(path, (unsigned char *)uuid, + VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB | VOS_POF_EXTERNAL_CHKPT, + &pool); if (rc == -DER_ID_MISMATCH) { ds_notify_ras_eventf(RAS_RDB_DF_INCOMPAT, RAS_TYPE_INFO, RAS_SEV_ERROR, NULL /* hwid */, NULL /* rank */, NULL /* inc */, @@ -402,7 +441,6 @@ rdb_open(const char *path, const uuid_t uuid, uint64_t caller_term, struct rdb_c path, DP_RC(rc)); goto err; } - ABT_thread_yield(); rc = vos_cont_open(pool, (unsigned char *)uuid, &mc); if (rc != 0) { @@ -453,7 +491,7 @@ rdb_open(const char *path, const uuid_t uuid, uint64_t caller_term, struct rdb_c goto err_mc; } - rc = rdb_open_internal(pool, mc, uuid, caller_term, cbs, arg, &db); + rc = rdb_open_internal(pool, mc, uuid, version, caller_term, cbs, arg, &db); if (rc != 0) goto err_mc; @@ -485,6 +523,7 @@ rdb_close(struct rdb_storage *storage) vos_cont_close(db->d_mc); vos_pool_close(db->d_pool); rdb_kvs_cache_destroy(db->d_kvss); + ABT_rwlock_free(&db->d_gen_lock); ABT_cond_free(&db->d_ref_cv); ABT_mutex_free(&db->d_raft_mutex); ABT_mutex_free(&db->d_mutex); @@ -512,15 +551,18 @@ rdb_get_use_leases(void) int rdb_glance(struct rdb_storage *storage, struct rdb_clue *clue) { - struct rdb *db = rdb_from_storage(storage); - d_iov_t value; - uint64_t term; - int vote; - uint64_t last_index = db->d_lc_record.dlr_tail - 1; - uint64_t last_term; - d_rank_list_t *replicas; - uint64_t oid_next; - int rc; + struct rdb *db = rdb_from_storage(storage); + d_iov_t value; + uint64_t term; + rdb_replica_id_t vote; + uint64_t last_index = db->d_lc_record.dlr_tail - 1; + uint64_t last_term; + struct rdb_replica_record *replicas; + int replicas_len; + d_rank_list_t *ranks; + int i; + uint64_t oid_next; + int rc; d_iov_set(&value, &term, sizeof(term)); rc = rdb_mc_lookup(db->d_mc, RDB_MC_ATTRS, 
&rdb_mc_term, &value); @@ -531,10 +573,11 @@ rdb_glance(struct rdb_storage *storage, struct rdb_clue *clue) goto err; } - d_iov_set(&value, &vote, sizeof(vote)); + rdb_set_mc_vote_lookup_buf(db, &vote, &value); rc = rdb_mc_lookup(db->d_mc, RDB_MC_ATTRS, &rdb_mc_vote, &value); if (rc == -DER_NONEXIST) { - vote = -1; + vote.rri_rank = -1; + vote.rri_gen = -1; } else if (rc != 0) { D_ERROR(DF_DB": failed to look up vote: "DF_RC"\n", DP_DB(db), DP_RC(rc)); goto err; @@ -556,12 +599,23 @@ rdb_glance(struct rdb_storage *storage, struct rdb_clue *clue) last_term = header.dre_term; } - rc = rdb_raft_load_replicas(db->d_lc, last_index, &replicas); + rc = rdb_raft_load_replicas(db->d_uuid, db->d_lc, last_index, db->d_version, &replicas, + &replicas_len); if (rc != 0) { D_ERROR(DF_DB": failed to load replicas at "DF_U64": "DF_RC"\n", DP_DB(db), last_index, DP_RC(rc)); goto err; } + ranks = d_rank_list_alloc(replicas_len); + if (ranks == NULL) { + D_ERROR(DF_DB ": failed to convert replicas to ranks\n", DP_DB(db)); + rc = -DER_NOMEM; + D_FREE(replicas); + goto err; + } + for (i = 0; i < replicas_len; i++) + ranks->rl_ranks[i] = replicas[i].drr_id.rri_rank; + D_FREE(replicas); d_iov_set(&value, &oid_next, sizeof(oid_next)); rc = rdb_lc_lookup(db->d_lc, last_index, RDB_LC_ATTRS, &rdb_lc_oid_next, &value); @@ -569,26 +623,22 @@ rdb_glance(struct rdb_storage *storage, struct rdb_clue *clue) oid_next = RDB_LC_OID_NEXT_INIT; } else if (rc != 0) { D_ERROR(DF_DB": failed to look up next object number: %d\n", DP_DB(db), rc); - goto err_replicas; + goto err_ranks; } - clue->bcl_term = term; - clue->bcl_vote = vote; - /* - * In the future, the self node ID might differ from the rank and need - * to be stored persistently. 
- */ - clue->bcl_self = dss_self_rank(); + clue->bcl_term = term; + clue->bcl_vote = vote.rri_rank; + clue->bcl_self = db->d_replica_id.rri_rank; clue->bcl_last_index = last_index; - clue->bcl_last_term = last_term; + clue->bcl_last_term = last_term; clue->bcl_base_index = db->d_lc_record.dlr_base; - clue->bcl_base_term = db->d_lc_record.dlr_base_term; - clue->bcl_replicas = replicas; - clue->bcl_oid_next = oid_next; + clue->bcl_base_term = db->d_lc_record.dlr_base_term; + clue->bcl_replicas = ranks; + clue->bcl_oid_next = oid_next; return 0; -err_replicas: - d_rank_list_free(replicas); +err_ranks: + d_rank_list_free(ranks); err: return rc; } @@ -624,7 +674,13 @@ rdb_start(struct rdb_storage *storage, struct rdb **dbp) db->d_use_leases = rdb_get_use_leases(); - D_DEBUG(DB_MD, DF_DB": started db %p: use_leases=%d\n", DP_DB(db), db, db->d_use_leases); + D_INFO(DF_DB ": started: db=%p version=%u use_leases=%d election_timeout=%d " + "request_timeout=%d lease_maintenance_grace=%d compact_thres=" DF_U64 + " ae_max_entries=%u ae_max_size=" DF_U64 "\n", + DP_DB(db), db, db->d_version, db->d_use_leases, + raft_get_election_timeout(db->d_raft), raft_get_request_timeout(db->d_raft), + raft_get_lease_maintenance_grace(db->d_raft), db->d_compact_thres, + db->d_ae_max_entries, db->d_ae_max_size); *dbp = db; return 0; } @@ -641,7 +697,7 @@ rdb_stop(struct rdb *db, struct rdb_storage **storagep) { bool deleted; - D_DEBUG(DB_MD, DF_DB": stopping db %p\n", DP_DB(db), db); + D_INFO(DF_DB ": stopping: db=%p\n", DP_DB(db), db); ABT_mutex_lock(rdb_hash_lock); deleted = d_hash_rec_delete(&rdb_hash, db->d_uuid, sizeof(uuid_t)); @@ -650,7 +706,7 @@ rdb_stop(struct rdb *db, struct rdb_storage **storagep) rdb_raft_stop(db); - D_DEBUG(DB_MD, DF_DB": stopped db %p\n", DP_DB(db), db); + D_INFO(DF_DB ": stopped: db=%p\n", DP_DB(db), db); *storagep = rdb_to_storage(db); } @@ -691,66 +747,75 @@ rdb_dictate(struct rdb_storage *storage) } /** - * Add \a replicas. + * Allocate a replica generation. 
* * \param[in] db database - * \param[in,out] - * replicas [in] list of replica ranks; - * [out] list of replica ranks that could not be added + * \param[in] term if not RDB_NIL_TERM, term to allocate in + * \param[out] gen_out replica generation */ int -rdb_add_replicas(struct rdb *db, d_rank_list_t *replicas) +rdb_alloc_replica_gen(struct rdb *db, uint64_t term, uint32_t *gen_out) { - int i; - int rc; + struct rdb_tx tx; + d_iov_t value; + uint32_t next; + int rc; + + if (db->d_version < RDB_LAYOUT_VERSION_REPLICA_ID) { + D_DEBUG(DB_MD, DF_DB ": zero for old layout\n", DP_DB(db)); + *gen_out = 0; + rc = 0; + goto out; + } - D_DEBUG(DB_MD, DF_DB": Adding %d replicas\n", - DP_DB(db), replicas->rl_nr); + rc = rdb_tx_begin(db, term, &tx); + if (rc != 0) + goto out; + ABT_rwlock_wrlock(db->d_gen_lock); - ABT_mutex_lock(db->d_raft_mutex); + d_iov_set(&value, &next, sizeof(next)); + rc = rdb_tx_lookup(&tx, &rdb_path_attrs, &rdb_lc_replica_gen_next, &value); + if (rc != 0) + goto out_lock; - rc = rdb_raft_wait_applied(db, db->d_debut, raft_get_current_term(db->d_raft)); - if (rc != 0) { - ABT_mutex_unlock(db->d_raft_mutex); - return rc; - } + next++; - rc = -DER_INVAL; - for (i = 0; i < replicas->rl_nr; ++i) { - rc = rdb_raft_add_replica(db, replicas->rl_ranks[i]); - if (rc != 0) { - D_ERROR(DF_DB": failed to add rank %u: "DF_RC"\n", DP_DB(db), - replicas->rl_ranks[i], DP_RC(rc)); - break; - } - } + rc = rdb_tx_update_critical(&tx, &rdb_path_attrs, &rdb_lc_replica_gen_next, &value); + if (rc != 0) + goto out_lock; - ABT_mutex_unlock(db->d_raft_mutex); + rc = rdb_tx_commit(&tx); - /* Update list to only contain ranks which could not be added. 
*/ - replicas->rl_nr -= i; - if (replicas->rl_nr > 0 && i > 0) - memmove(&replicas->rl_ranks[0], &replicas->rl_ranks[i], - replicas->rl_nr * sizeof(d_rank_t)); +out_lock: + ABT_rwlock_unlock(db->d_gen_lock); + rdb_tx_end(&tx); + if (rc != 0) + goto out; + + D_INFO(DF_DB ": updated next replica generation to %u\n", DP_DB(db), next); + *gen_out = next - 1; +out: return rc; } /** - * Remove \a replicas. + * Modify \a replicas. * - * \param[in] db database - * \param[in,out] - * replicas [in] list of replica ranks; - * [out] list of replica ranks that could not be removed + * \param[in] db database + * \param[in] op operation to perform + * \param[in,out] replicas [in] list of replica ranks; + * [out] list of replica ranks that could not be modified + * \param[in,out] replicas_len length of \a replicas; */ int -rdb_remove_replicas(struct rdb *db, d_rank_list_t *replicas) +rdb_modify_replicas(struct rdb *db, enum rdb_replica_op op, rdb_replica_id_t *replicas, + int *replicas_len) { - int i; - int rc; + raft_logtype_e type; + int i; + int rc; - D_DEBUG(DB_MD, DF_DB": Removing %d replicas\n", - DP_DB(db), replicas->rl_nr); + D_DEBUG(DB_MD, DF_DB ": op=%d replicas=%d\n", DP_DB(db), op, *replicas_len); ABT_mutex_lock(db->d_raft_mutex); @@ -761,22 +826,33 @@ rdb_remove_replicas(struct rdb *db, d_rank_list_t *replicas) } rc = -DER_INVAL; - for (i = 0; i < replicas->rl_nr; ++i) { - rc = rdb_raft_remove_replica(db, replicas->rl_ranks[i]); + switch (op) { + case RDB_REPLICA_ADD: + type = RAFT_LOGTYPE_ADD_NODE; + break; + case RDB_REPLICA_REMOVE: + type = RAFT_LOGTYPE_REMOVE_NODE; + break; + default: + D_ASSERTF(0, "invalid op %d\n", op); + } + for (i = 0; i < *replicas_len; ++i) { + rc = rdb_raft_append_apply_cfg(db, type, replicas[i]); if (rc != 0) { - D_ERROR(DF_DB": failed to remove rank %u: "DF_RC"\n", DP_DB(db), - replicas->rl_ranks[i], DP_RC(rc)); + DL_ERROR(rc, DF_DB ": failed to do op %d on replica " RDB_F_RID, DP_DB(db), + op, RDB_P_RID(replicas[i])); break; } } 
ABT_mutex_unlock(db->d_raft_mutex); - /* Update list to only contain ranks which could not be removed. */ - replicas->rl_nr -= i; - if (replicas->rl_nr > 0 && i > 0) - memmove(&replicas->rl_ranks[0], &replicas->rl_ranks[i], - replicas->rl_nr * sizeof(d_rank_t)); + /* Update list to only contain replicas which could not be modified. */ + if (i > 0) { + *replicas_len -= i; + if (*replicas_len > 0) + memmove(&replicas[0], &replicas[i], *replicas_len * sizeof(replicas[0])); + } return rc; } @@ -856,8 +932,7 @@ rdb_is_leader(struct rdb *db, uint64_t *term) int rdb_get_leader(struct rdb *db, uint64_t *term, d_rank_t *rank) { - raft_node_t *node; - struct rdb_raft_node *dnode; + raft_node_t *node; ABT_mutex_lock(db->d_raft_mutex); node = raft_get_current_leader_node(db->d_raft); @@ -865,15 +940,41 @@ rdb_get_leader(struct rdb *db, uint64_t *term, d_rank_t *rank) ABT_mutex_unlock(db->d_raft_mutex); return -DER_NONEXIST; } - dnode = raft_node_get_udata(node); - D_ASSERT(dnode != NULL); *term = raft_get_current_term(db->d_raft); - *rank = dnode->dn_rank; + *rank = rdb_replica_id_decode(raft_node_get_id(node)).rri_rank; ABT_mutex_unlock(db->d_raft_mutex); return 0; } +rdb_replica_id_t +rdb_get_replica_id(struct rdb *db) +{ + return db->d_replica_id; +} + +int +rdb_get_replicas(struct rdb *db, rdb_replica_id_t **replicas, int *replicas_len) +{ + return rdb_raft_get_replicas(db, replicas, replicas_len); +} + +static d_rank_list_t * +rdb_replica_id_to_rank_list(rdb_replica_id_t *replicas, int replicas_len) +{ + d_rank_list_t *ranks; + int i; + + ranks = d_rank_list_alloc(replicas_len); + if (ranks == NULL) + return NULL; + + for (i = 0; i < replicas_len; i++) + ranks->rl_ranks[i] = replicas[i].rri_rank; + + return ranks; +} + /** * Get the list of replica ranks. Callers are responsible for * d_rank_list_free(*ranksp). 
@@ -884,7 +985,22 @@ rdb_get_leader(struct rdb *db, uint64_t *term, d_rank_t *rank) int rdb_get_ranks(struct rdb *db, d_rank_list_t **ranksp) { - return rdb_raft_get_ranks(db, ranksp); + rdb_replica_id_t *replicas; + int replicas_len; + d_rank_list_t *ranks; + int rc; + + rc = rdb_get_replicas(db, &replicas, &replicas_len); + if (rc != 0) + return rc; + + ranks = rdb_replica_id_to_rank_list(replicas, replicas_len); + D_FREE(replicas); + if (ranks == NULL) + return -DER_NOMEM; + + *ranksp = ranks; + return 0; } int @@ -905,6 +1021,12 @@ rdb_get_size(struct rdb *db, uint64_t *sizep) return rc; } +uint32_t +rdb_get_version(struct rdb *db) +{ + return db->d_version; +} + /** Implementation of the RDB pool checkpoint ULT. The ULT * is only active if DAOS is using MD on SSD. */ diff --git a/src/rdb/rdb_internal.h b/src/rdb/rdb_internal.h index 57f9b4ee3a0..cb33675128b 100644 --- a/src/rdb/rdb_internal.h +++ b/src/rdb/rdb_internal.h @@ -78,6 +78,7 @@ struct rdb { /* General fields */ d_list_t d_entry; /* in rdb_hash */ uuid_t d_uuid; /* of database */ + rdb_replica_id_t d_replica_id; /* of this replica */ ABT_mutex d_mutex; /* d_replies, d_replies_cv */ int d_ref; /* of callers and RPCs */ ABT_cond d_ref_cv; /* for d_ref decrements */ @@ -85,6 +86,7 @@ struct rdb { void *d_arg; /* for d_cbs callbacks */ struct daos_lru_cache *d_kvss; /* rdb_kvs cache */ daos_handle_t d_pool; /* VOS pool */ + uint32_t d_version; /* of DB layout */ struct rdb_chkpt_record d_chkpt_record; /* pool checkpoint information */ ABT_thread d_chkptd; /* thread handle for pool checkpoint daemon */ ABT_mutex d_chkpt_mutex; /* mutex for checkpoint synchronization */ @@ -94,6 +96,7 @@ struct rdb { uint64_t d_nospc_ts; /* last time commit observed low/no space (usec) */ bool d_new; /* for skipping lease recovery */ bool d_use_leases; /* when verifying leadership */ + ABT_rwlock d_gen_lock; /* for rdb_lc_replica_gen_next */ /* rdb_raft fields */ raft_server_t *d_raft; @@ -131,21 +134,8 @@ struct rdb { 
#define RDB_NOAPPEND_FREE_SPACE (1ULL << 22) #define RDB_CRITICAL_FREE_SPACE (1ULL << 14) -/* Current rank */ -#define DF_RANK "%u" -static inline d_rank_t -DP_RANK(void) -{ - d_rank_t rank; - int rc; - - rc = crt_group_rank(NULL, &rank); - D_ASSERTF(rc == 0, "%d\n", rc); - return rank; -} - -#define DF_DB DF_UUID"["DF_RANK"]" -#define DP_DB(db) DP_UUID((db)->d_uuid), DP_RANK() +#define DF_DB DF_UUID "[" RDB_F_RID "]" +#define DP_DB(db) DP_UUID((db)->d_uuid), RDB_P_RID((db)->d_replica_id) /* Number of "base" references that the rdb_stop() path expects to remain */ #define RDB_BASE_REFS 1 @@ -158,6 +148,24 @@ struct rdb *rdb_lookup(const uuid_t uuid); /* rdb_raft.c *****************************************************************/ +D_CASSERT(sizeof(raft_node_id_t) == sizeof(uint64_t)); + +static inline rdb_replica_id_t +rdb_replica_id_decode(raft_node_id_t raft_id) +{ + rdb_replica_id_t id; + + id.rri_rank = (uint64_t)raft_id >> 32; + id.rri_gen = raft_id & 0xffffffff; + return id; +} + +static inline raft_node_id_t +rdb_replica_id_encode(rdb_replica_id_t id) +{ + return (uint64_t)id.rri_rank << 32 | id.rri_gen; +} + /* * Per-raft_node_t INSTALLSNAPSHOT state * @@ -179,9 +187,11 @@ struct rdb_raft_node { struct rdb_raft_is dn_is; }; +/* clang-format off */ void rdb_raft_module_init(void); void rdb_raft_module_fini(void); -int rdb_raft_init(daos_handle_t pool, daos_handle_t mc, const d_rank_list_t *replicas); +int rdb_raft_init(uuid_t db_uuid, daos_handle_t pool, daos_handle_t mc, rdb_replica_id_t *replicas, + int replicas_len, uint32_t layout_version); int rdb_raft_open(struct rdb *db, uint64_t caller_term); int rdb_raft_start(struct rdb *db); void rdb_raft_stop(struct rdb *db); @@ -191,19 +201,21 @@ void rdb_raft_resign(struct rdb *db, uint64_t term); int rdb_raft_campaign(struct rdb *db); int rdb_raft_ping(struct rdb *db, uint64_t caller_term); int rdb_raft_verify_leadership(struct rdb *db); -int rdb_raft_load_replicas(daos_handle_t lc, uint64_t index, 
d_rank_list_t **replicas); -int rdb_raft_add_replica(struct rdb *db, d_rank_t rank); -int rdb_raft_remove_replica(struct rdb *db, d_rank_t rank); +int rdb_raft_load_replicas(uuid_t db_uuid, daos_handle_t lc, uint64_t index, + uint32_t layout_version, struct rdb_replica_record **replicas_out, + int *replicas_len_out); +int rdb_raft_append_apply_cfg(struct rdb *db, raft_logtype_e type, rdb_replica_id_t id); int rdb_raft_append_apply(struct rdb *db, void *entry, size_t size, void *result); int rdb_raft_wait_applied(struct rdb *db, uint64_t index, uint64_t term); -int rdb_raft_get_ranks(struct rdb *db, d_rank_list_t **ranksp); +int rdb_raft_get_replicas(struct rdb *db, rdb_replica_id_t **replicas_out, int *replicas_len_out); void rdb_requestvote_handler(crt_rpc_t *rpc); void rdb_appendentries_handler(crt_rpc_t *rpc); void rdb_installsnapshot_handler(crt_rpc_t *rpc); void rdb_raft_process_reply(struct rdb *db, crt_rpc_t *rpc); void rdb_raft_free_request(struct rdb *db, crt_rpc_t *rpc); int rdb_raft_trigger_compaction(struct rdb *db, bool compact_all, uint64_t *idx); +/* clang-format on */ /* rdb_rpc.c ******************************************************************/ @@ -231,15 +243,22 @@ enum rdb_operation { RDB_PROTO_SRV_RPC_LIST }; extern struct crt_proto_format rdb_proto_fmt; +/* clang-format off */ #define DAOS_ISEQ_RDB_OP /* input fields */ \ - ((uuid_t) (ri_uuid) CRT_VAR) + ((uuid_t) (ri_uuid) CRT_VAR) \ + ((rdb_replica_id_t) (ri_from) CRT_VAR) \ + ((rdb_replica_id_t) (ri_to) CRT_VAR) #define DAOS_OSEQ_RDB_OP /* output fields */ \ ((int32_t) (ro_rc) CRT_VAR) \ - ((uint32_t) (ro_padding) CRT_VAR) + ((uint32_t) (ro_padding) CRT_VAR) \ + ((rdb_replica_id_t) (ro_from) CRT_VAR) \ + ((rdb_replica_id_t) (ro_to) CRT_VAR) +/* clang-format on */ CRT_RPC_DECLARE(rdb_op, DAOS_ISEQ_RDB_OP, DAOS_OSEQ_RDB_OP) +/* clang-format off */ #define DAOS_ISEQ_RDB_REQUESTVOTE /* input fields */ \ ((struct rdb_op_in) (rvi_op) CRT_VAR) \ ((msg_requestvote_t) (rvi_msg) CRT_RAW) @@ -247,10 
+266,12 @@ CRT_RPC_DECLARE(rdb_op, DAOS_ISEQ_RDB_OP, DAOS_OSEQ_RDB_OP) #define DAOS_OSEQ_RDB_REQUESTVOTE /* output fields */ \ ((struct rdb_op_out) (rvo_op) CRT_VAR) \ ((msg_requestvote_response_t) (rvo_msg) CRT_VAR) +/* clang-format on */ CRT_RPC_DECLARE(rdb_requestvote, DAOS_ISEQ_RDB_REQUESTVOTE, DAOS_OSEQ_RDB_REQUESTVOTE) +/* clang-format off */ #define DAOS_ISEQ_RDB_APPENDENTRIES /* input fields */ \ ((struct rdb_op_in) (aei_op) CRT_VAR) \ ((msg_appendentries_t) (aei_msg) CRT_VAR) @@ -258,6 +279,7 @@ CRT_RPC_DECLARE(rdb_requestvote, DAOS_ISEQ_RDB_REQUESTVOTE, #define DAOS_OSEQ_RDB_APPENDENTRIES /* output fields */ \ ((struct rdb_op_out) (aeo_op) CRT_VAR) \ ((msg_appendentries_response_t) (aeo_msg) CRT_RAW) +/* clang-format on */ CRT_RPC_DECLARE(rdb_appendentries, DAOS_ISEQ_RDB_APPENDENTRIES, DAOS_OSEQ_RDB_APPENDENTRIES) @@ -267,6 +289,7 @@ struct rdb_local { d_iov_t rl_data_iov; /* isi_data buffer */ }; +/* clang-format off */ #define DAOS_ISEQ_RDB_INSTALLSNAPSHOT /* input fields */ \ ((struct rdb_op_in) (isi_op) CRT_VAR) \ ((msg_installsnapshot_t) (isi_msg) CRT_VAR) \ @@ -290,23 +313,26 @@ struct rdb_local { ((uint64_t) (iso_seq) CRT_VAR) \ /* last anchor */ \ ((struct rdb_anchor) (iso_anchor) CRT_RAW) +/* clang-format on */ CRT_RPC_DECLARE(rdb_installsnapshot, DAOS_ISEQ_RDB_INSTALLSNAPSHOT, DAOS_OSEQ_RDB_INSTALLSNAPSHOT) -int rdb_create_raft_rpc(crt_opcode_t opc, raft_node_t *node, crt_rpc_t **rpc); +/* clang-format off */ +int rdb_create_raft_rpc(struct rdb *db, crt_opcode_t opc, raft_node_t *node, crt_rpc_t **rpc); int rdb_send_raft_rpc(crt_rpc_t *rpc, struct rdb *db); int rdb_abort_raft_rpcs(struct rdb *db); void rdb_recvd(void *arg); +/* clang-format on */ /* rdb_kvs.c ******************************************************************/ /* KVS cache entry */ struct rdb_kvs { - struct daos_llink de_entry; /* in LRU */ - rdb_path_t de_path; - rdb_oid_t de_object; - uint8_t de_buf[]; /* for de_path */ + struct daos_llink de_entry; /* in LRU (private) */ + 
rdb_path_t de_path; + rdb_oid_t de_object; + uint8_t de_buf[]; /* for de_path */ }; int rdb_kvs_cache_create(struct daos_lru_cache **cache); @@ -319,6 +345,14 @@ void rdb_kvs_evict(struct rdb *db, struct rdb_kvs *kvs); /* rdb_path.c *****************************************************************/ +extern rdb_path_t rdb_path_attrs; + +static inline bool +rdb_path_is_attrs(const rdb_path_t *path) +{ + return path->iov_len == 0; +} + int rdb_path_clone(const rdb_path_t *path, rdb_path_t *new_path); typedef int (*rdb_path_iterate_cb_t)(d_iov_t *key, void *arg); int rdb_path_iterate(const rdb_path_t *path, rdb_path_iterate_cb_t cb, @@ -489,6 +523,26 @@ rdb_lc_iterate(daos_handle_t lc, uint64_t index, rdb_oid_t oid, bool backward, return rdb_vos_iterate(lc, index, oid, backward, cb, arg); } +static inline void +rdb_set_mc_vote_lookup_buf(struct rdb *db, rdb_replica_id_t *vote, d_iov_t *value) +{ + if (db->d_version < RDB_LAYOUT_VERSION_REPLICA_ID) { + d_iov_set(value, &vote->rri_rank, sizeof(vote->rri_rank)); + vote->rri_gen = 0; + } else { + d_iov_set(value, vote, sizeof(*vote)); + } +} + +static inline void +rdb_set_mc_vote_update_buf(struct rdb *db, rdb_replica_id_t *vote, d_iov_t *value) +{ + if (db->d_version < RDB_LAYOUT_VERSION_REPLICA_ID) + d_iov_set(value, &vote->rri_rank, sizeof(vote->rri_rank)); + else + d_iov_set(value, vote, sizeof(*vote)); +} + int rdb_scm_left(struct rdb *db, daos_size_t *scm_left_outp); int diff --git a/src/rdb/rdb_kvs.c b/src/rdb/rdb_kvs.c index f07cd4de877..c4619ab1179 100644 --- a/src/rdb/rdb_kvs.c +++ b/src/rdb/rdb_kvs.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2022 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -18,6 +19,14 @@ #include "rdb_internal.h" #include "rdb_layout.h" +/* + * Special static entry for RDB_LC_ATTRS + * + * Because rdb_path_attrs is a special, empty path, we can't store it in the + * LRU cache. 
Thankfully, it always maps to RDB_LC_ATTRS, which always exists. + */ +static struct rdb_kvs rdb_kvs_attrs = {.de_object = RDB_LC_ATTRS}; + struct rdb_kvs_open_arg { struct rdb *deo_db; rdb_oid_t deo_parent; @@ -28,9 +37,9 @@ struct rdb_kvs_open_arg { static int rdb_kvs_open_path_cb(d_iov_t *key, void *varg) { - struct rdb_kvs_open_arg *arg = varg; - rdb_oid_t parent = arg->deo_parent; - d_iov_t value; + struct rdb_kvs_open_arg *arg = varg; + rdb_oid_t parent = arg->deo_parent; + d_iov_t value; if (key->iov_len == 0) { D_ASSERTF(parent == RDB_LC_ATTRS, DF_X64"\n", parent); @@ -202,6 +211,11 @@ rdb_kvs_lookup(struct rdb *db, const rdb_path_t *path, uint64_t index, D_DEBUG(DB_TRACE, DF_DB": looking up "DF_IOV": alloc=%d\n", DP_DB(db), DP_IOV(path), alloc); + if (rdb_path_is_attrs(path)) { + *kvs = &rdb_kvs_attrs; + return 0; + } + arg.dea_db = db; arg.dea_index = index; arg.dea_alloc = alloc; @@ -217,11 +231,13 @@ rdb_kvs_lookup(struct rdb *db, const rdb_path_t *path, uint64_t index, void rdb_kvs_put(struct rdb *db, struct rdb_kvs *kvs) { - daos_lru_ref_release(db->d_kvss, &kvs->de_entry); + if (kvs != &rdb_kvs_attrs) + daos_lru_ref_release(db->d_kvss, &kvs->de_entry); } void rdb_kvs_evict(struct rdb *db, struct rdb_kvs *kvs) { - daos_lru_ref_evict(db->d_kvss, &kvs->de_entry); + if (kvs != &rdb_kvs_attrs) + daos_lru_ref_evict(db->d_kvss, &kvs->de_entry); } diff --git a/src/rdb/rdb_layout.c b/src/rdb/rdb_layout.c index 22092735609..dcabd64cd35 100644 --- a/src/rdb/rdb_layout.c +++ b/src/rdb/rdb_layout.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2021 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -17,6 +18,7 @@ RDB_STRING_KEY(rdb_, dkey); RDB_STRING_KEY(rdb_mc_, uuid); RDB_STRING_KEY(rdb_mc_, version); +RDB_STRING_KEY(rdb_mc_, replica_id); RDB_STRING_KEY(rdb_mc_, term); RDB_STRING_KEY(rdb_mc_, vote); RDB_STRING_KEY(rdb_mc_, lc); @@ -27,4 +29,5 @@ RDB_STRING_KEY(rdb_lc_, entry_header); RDB_STRING_KEY(rdb_lc_, entry_data); RDB_STRING_KEY(rdb_lc_, nreplicas); RDB_STRING_KEY(rdb_lc_, replicas); +RDB_STRING_KEY(rdb_lc_, replica_gen_next); RDB_STRING_KEY(rdb_lc_, root); diff --git a/src/rdb/rdb_layout.h b/src/rdb/rdb_layout.h index 66fb9b5788c..f84d2e805a3 100644 --- a/src/rdb/rdb_layout.h +++ b/src/rdb/rdb_layout.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2017-2021 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -17,6 +18,7 @@ * D-key rdb_dkey * A-key rdb_mc_uuid // (see rdb_create()) * A-key rdb_mc_version // layout version + * A-key rdb_mc_replica_id // replica ID * A-key rdb_mc_term // term * A-key rdb_mc_vote // vote for term * A-key rdb_mc_lc // log container record @@ -26,7 +28,9 @@ * A-key rdb_lc_entry_header // log entry header * A-key rdb_lc_entry_data // log entry data * A-key rdb_lc_nreplicas // number of replicas - * A-key rdb_lc_replicas // replica ranks + * A-key rdb_lc_replicas // replicas + * A-key rdb_lc_replica_gen_next + * // result for next replica generation allocation * A-key rdb_lc_oid_next // result for next object ID allocation * A-key rdb_lc_root // * Object // root KVS @@ -72,11 +76,16 @@ #define RDB_LAYOUT_H /* Default layout version */ -#define RDB_LAYOUT_VERSION 1 +#define RDB_LAYOUT_VERSION 2 /* Lowest compatible layout version */ #define RDB_LAYOUT_VERSION_LOW 1 +/* Layout version that introduces replica IDs with generations */ +#define RDB_LAYOUT_VERSION_REPLICA_ID 2 + +D_CASSERT(sizeof(rdb_replica_id_t) == sizeof(uint64_t)); + 
/* * Object ID * @@ -122,8 +131,9 @@ struct rdb_anchor { */ extern d_iov_t rdb_mc_uuid; /* uuid_t */ extern d_iov_t rdb_mc_version; /* uint32_t */ +extern d_iov_t rdb_mc_replica_id; /* rdb_replica_id_t or absent (< v2) */ extern d_iov_t rdb_mc_term; /* uint64_t */ -extern d_iov_t rdb_mc_vote; /* int */ +extern d_iov_t rdb_mc_vote; /* rdb_replica_id_t or int (< v2) */ extern d_iov_t rdb_mc_lc; /* rdb_lc_record */ extern d_iov_t rdb_mc_slc; /* rdb_lc_record */ @@ -154,15 +164,34 @@ struct rdb_lc_record { extern d_iov_t rdb_lc_entry_header; /* rdb_entry */ extern d_iov_t rdb_lc_entry_data; /* uint8_t[] */ extern d_iov_t rdb_lc_nreplicas; /* uint8_t */ -extern d_iov_t rdb_lc_replicas; /* uint32_t[] */ +extern d_iov_t rdb_lc_replicas; /* rdb_replica_record[] or uint32_t[] (< v2) */ +extern d_iov_t rdb_lc_replica_gen_next; /* uint32_t or absent (< v2)*/ extern d_iov_t rdb_lc_oid_next; /* rdb_oid_t (classless) */ extern d_iov_t rdb_lc_root; /* rdb_oid_t */ -/* Log entry */ +/* Log entry header */ struct rdb_entry { uint64_t dre_term; uint32_t dre_type; uint32_t dre_size; /* of entry data */ }; +/* + * Log normal entry data + * + * See rdb_tx_append. + */ + +/* + * Log cfg entry data + * + * rdb_replica_id or d_rank_t (< v2). + */ + +/* Replica record in rdb_lc_replicas */ +struct rdb_replica_record { + rdb_replica_id_t drr_id; + uint64_t drr_reserved; /* for future non-voting support, etc. */ +}; + #endif /* RDB_LAYOUT_H */ diff --git a/src/rdb/rdb_path.c b/src/rdb/rdb_path.c index f2afe62b501..8d8b00189cf 100644 --- a/src/rdb/rdb_path.c +++ b/src/rdb/rdb_path.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2021 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -13,6 +14,9 @@ #include "rdb_internal.h" +/* Special path for RDB_LC_ATTRS (available only for internal use within rdb) */ +rdb_path_t rdb_path_attrs; + /* Key for the root KVS */ d_iov_t rdb_path_root_key; diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index e7420e5bc51..811b3a2207c 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -87,6 +87,7 @@ rdb_raft_cb_send_requestvote(raft_server_t *raft, void *arg, raft_node_t *node, msg_requestvote_t *msg) { struct rdb *db = arg; + rdb_replica_id_t rdb_node_id; struct rdb_raft_node *rdb_node = raft_node_get_udata(node); char *s = msg->prevote ? " (prevote)" : ""; crt_rpc_t *rpc; @@ -94,24 +95,25 @@ rdb_raft_cb_send_requestvote(raft_server_t *raft, void *arg, raft_node_t *node, int rc; D_ASSERT(db->d_raft == raft); - D_DEBUG(DB_TRACE, DF_DB": sending rv%s to node %d rank %u: term=%ld\n", - DP_DB(db), s, raft_node_get_id(node), rdb_node->dn_rank, - msg->term); + D_ASSERT(node != NULL); + D_ASSERT(rdb_node != NULL); + rdb_node_id = rdb_replica_id_decode(raft_node_get_id(node)); + D_DEBUG(DB_TRACE, DF_DB ": sending rv%s to node " RDB_F_RID ": term=%ld\n", DP_DB(db), s, + RDB_P_RID(rdb_node_id), msg->term); - rc = rdb_create_raft_rpc(RDB_REQUESTVOTE, node, &rpc); + rc = rdb_create_raft_rpc(db, RDB_REQUESTVOTE, node, &rpc); if (rc != 0) { - D_ERROR(DF_DB": failed to create RV%s RPC to node %d: %d\n", - DP_DB(db), s, raft_node_get_id(node), rc); + DL_ERROR(rc, DF_DB ": failed to create RV%s RPC to node " RDB_F_RID, DP_DB(db), s, + RDB_P_RID(rdb_node_id)); return rc; } - in = crt_req_get(rpc); - uuid_copy(in->rvi_op.ri_uuid, db->d_uuid); + in = crt_req_get(rpc); in->rvi_msg = *msg; rc = rdb_send_raft_rpc(rpc, db); if (rc != 0) { - D_ERROR(DF_DB": failed to send RV%s RPC to node %d: %d\n", - DP_DB(db), s, raft_node_get_id(node), rc); + DL_ERROR(rc, DF_DB ": failed to send RV%s RPC to node " 
RDB_F_RID, DP_DB(db), s, + RDB_P_RID(rdb_node_id)); crt_req_decref(rpc); } return rc; @@ -184,27 +186,29 @@ rdb_raft_cb_send_appendentries(raft_server_t *raft, void *arg, raft_node_t *node, msg_appendentries_t *msg) { struct rdb *db = arg; + rdb_replica_id_t rdb_node_id; struct rdb_raft_node *rdb_node = raft_node_get_udata(node); crt_rpc_t *rpc; struct rdb_appendentries_in *in; int rc; D_ASSERT(db->d_raft == raft); - D_DEBUG(DB_TRACE, DF_DB": sending ae to node %u rank %u: term=%ld\n", - DP_DB(db), raft_node_get_id(node), rdb_node->dn_rank, - msg->term); + D_ASSERT(node != NULL); + D_ASSERT(rdb_node != NULL); + rdb_node_id = rdb_replica_id_decode(raft_node_get_id(node)); + D_DEBUG(DB_TRACE, DF_DB ": sending ae to node " RDB_F_RID ": term=%ld\n", DP_DB(db), + RDB_P_RID(rdb_node_id), msg->term); if (DAOS_FAIL_CHECK(DAOS_RDB_SKIP_APPENDENTRIES_FAIL)) D_GOTO(err, rc = 0); - rc = rdb_create_raft_rpc(RDB_APPENDENTRIES, node, &rpc); + rc = rdb_create_raft_rpc(db, RDB_APPENDENTRIES, node, &rpc); if (rc != 0) { - D_ERROR(DF_DB": failed to create AE RPC to node %d: %d\n", - DP_DB(db), raft_node_get_id(node), rc); + DL_ERROR(rc, DF_DB ": failed to create AE RPC to node " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_node_id)); D_GOTO(err, rc); } in = crt_req_get(rpc); - uuid_copy(in->aei_op.ri_uuid, db->d_uuid); rc = rdb_raft_clone_ae(db, msg, &in->aei_msg); if (rc != 0) { D_ERROR(DF_DB": failed to allocate entry array\n", DP_DB(db)); @@ -213,8 +217,8 @@ rdb_raft_cb_send_appendentries(raft_server_t *raft, void *arg, rc = rdb_send_raft_rpc(rpc, db); if (rc != 0) { - D_ERROR(DF_DB": failed to send AE RPC to node %d: %d\n", - DP_DB(db), raft_node_get_id(node), rc); + DL_ERROR(rc, DF_DB ": failed to send AE RPC to node " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_node_id)); D_GOTO(err_in, rc); } return 0; @@ -228,60 +232,186 @@ rdb_raft_cb_send_appendentries(raft_server_t *raft, void *arg, } static int -rdb_raft_store_replicas(daos_handle_t lc, uint64_t index, const d_rank_list_t *replicas, - 
rdb_vos_tx_t vtx) +rdb_replica_record_compare_void(const void *vx, const void *vy) +{ + const struct rdb_replica_record *x = vx; + const struct rdb_replica_record *y = vy; + + return rdb_replica_id_compare(x->drr_id, y->drr_id); +} + +/* Just some defensive sanity checks. */ +static int +rdb_raft_check_replicas(uuid_t db_uuid, uint32_t layout_version, + struct rdb_replica_record *replicas, int replicas_len) +{ + struct rdb_replica_record *rs; + int rs_len; + int i; + int rc; + + if (replicas_len <= 0 || replicas_len > UINT8_MAX) { + D_ERROR(DF_UUID ": invalid replicas_len: %d\n", DP_UUID(db_uuid), replicas_len); + rc = -DER_INVAL; + goto out; + } + + rs_len = replicas_len; + D_ALLOC_ARRAY(rs, rs_len); + if (rs == NULL) { + rc = -DER_NOMEM; + goto out; + } + memcpy(rs, replicas, sizeof(*rs) * replicas_len); + qsort(rs, rs_len, sizeof(*rs), rdb_replica_record_compare_void); + + for (i = 0; i < replicas_len; i++) { + if (i > 0 && rs[i].drr_id.rri_rank == rs[i - 1].drr_id.rri_rank) { + D_ERROR(DF_UUID ": duplicate replica rank: %u\n", DP_UUID(db_uuid), + rs[i].drr_id.rri_rank); + rc = -DER_INVAL; + goto out_rs; + } + if (layout_version < RDB_LAYOUT_VERSION_REPLICA_ID && + replicas[i].drr_id.rri_gen != 0) { + D_ERROR(DF_UUID ": unexpected replica gen: " RDB_F_RID "\n", + DP_UUID(db_uuid), RDB_P_RID(replicas[i].drr_id)); + rc = -DER_INVAL; + goto out_rs; + } + } + + rc = 0; +out_rs: + D_FREE(rs); +out: + return rc; +} + +static int +rdb_raft_store_replicas(uuid_t db_uuid, daos_handle_t lc, uint64_t index, uint32_t layout_version, + struct rdb_replica_record *replicas, int replicas_len, rdb_vos_tx_t vtx) { - d_iov_t keys[2]; - d_iov_t vals[2]; - uint8_t nreplicas; + d_iov_t keys[2]; + d_iov_t vals[2]; + uint8_t nreplicas; + d_rank_t *ranks = NULL; + int i; + int rc; + + rc = rdb_raft_check_replicas(db_uuid, layout_version, replicas, replicas_len); + if (rc != 0) + return rc; - D_ASSERTF(replicas->rl_nr <= UINT8_MAX, "nreplicas = %u", - replicas->rl_nr); - nreplicas = 
replicas->rl_nr; + D_ASSERTF(0 < replicas_len && replicas_len <= UINT8_MAX, "replicas_len = %u", replicas_len); + nreplicas = replicas_len; keys[0] = rdb_lc_nreplicas; d_iov_set(&vals[0], &nreplicas, sizeof(nreplicas)); + keys[1] = rdb_lc_replicas; - d_iov_set(&vals[1], replicas->rl_ranks, sizeof(*replicas->rl_ranks) * nreplicas); - return rdb_lc_update(lc, index, RDB_LC_ATTRS, true /* crit */, 2 /* n */, keys, vals, vtx); + if (layout_version < RDB_LAYOUT_VERSION_REPLICA_ID) { + D_ALLOC_ARRAY(ranks, replicas_len); + if (ranks == NULL) + return -DER_NOMEM; + for (i = 0; i < replicas_len; i++) + ranks[i] = replicas[i].drr_id.rri_rank; + d_iov_set(&vals[1], ranks, sizeof(*ranks) * replicas_len); + } else { + d_iov_set(&vals[1], replicas, sizeof(*replicas) * replicas_len); + } + + rc = rdb_lc_update(lc, index, RDB_LC_ATTRS, true /* crit */, 2 /* n */, keys, vals, vtx); + if (rc == 0) { + D_DEBUG(DB_MD, DF_UUID ": stored nreplicas and replicas at " DF_U64 ":\n", + DP_UUID(db_uuid), index); + for (i = 0; i < replicas_len; i++) + D_DEBUG(DB_MD, DF_UUID ": [%d]: id=" RDB_F_RID " reserved=" DF_X64 "\n", + DP_UUID(db_uuid), i, RDB_P_RID(replicas[i].drr_id), + replicas[i].drr_reserved); + } else { + DL_ERROR(rc, DF_UUID ": failed to update nreplicas and replicas", DP_UUID(db_uuid)); + } + + D_FREE(ranks); + return rc; } +/* The caller must free *replicas_out with D_FREE. 
*/ int -rdb_raft_load_replicas(daos_handle_t lc, uint64_t index, d_rank_list_t **replicas) +rdb_raft_load_replicas(uuid_t db_uuid, daos_handle_t lc, uint64_t index, uint32_t layout_version, + struct rdb_replica_record **replicas_out, int *replicas_len_out) { - d_iov_t value; - uint8_t nreplicas; - d_rank_list_t *r; - int rc; + d_iov_t value; + uint8_t nreplicas; + struct rdb_replica_record *replicas = NULL; + d_rank_t *ranks = NULL; + int i; + int rc; d_iov_set(&value, &nreplicas, sizeof(nreplicas)); rc = rdb_lc_lookup(lc, index, RDB_LC_ATTRS, &rdb_lc_nreplicas, &value); if (rc == -DER_NONEXIST) { - D_DEBUG(DB_MD, "no replicas in "DF_U64"\n", index); + D_DEBUG(DB_MD, DF_UUID ": no replicas at " DF_U64 "\n", DP_UUID(db_uuid), index); nreplicas = 0; + rc = 0; } else if (rc != 0) { - return rc; + DL_ERROR(rc, DF_UUID ": failed to look up nreplicas", DP_UUID(db_uuid)); + goto out; } - r = daos_rank_list_alloc(nreplicas); - if (r == NULL) - return -DER_NOMEM; - if (nreplicas > 0) { - d_iov_set(&value, r->rl_ranks, sizeof(*r->rl_ranks) * nreplicas); + D_ALLOC_ARRAY(replicas, nreplicas); + if (replicas == NULL) { + rc = -DER_NOMEM; + goto out; + } + + if (layout_version < RDB_LAYOUT_VERSION_REPLICA_ID) { + D_ALLOC_ARRAY(ranks, nreplicas); + if (ranks == NULL) { + rc = -DER_NOMEM; + goto out; + } + d_iov_set(&value, ranks, sizeof(*ranks) * nreplicas); + } else { + d_iov_set(&value, replicas, sizeof(*replicas) * nreplicas); + } + rc = rdb_lc_lookup(lc, index, RDB_LC_ATTRS, &rdb_lc_replicas, &value); if (rc != 0) { - d_rank_list_free(r); - return rc; + DL_ERROR(rc, DF_UUID ": failed to look up replicas", DP_UUID(db_uuid)); + goto out; } + + if (layout_version < RDB_LAYOUT_VERSION_REPLICA_ID) + for (i = 0; i < nreplicas; i++) + replicas[i].drr_id.rri_rank = ranks[i]; + + rc = rdb_raft_check_replicas(db_uuid, layout_version, replicas, nreplicas); + if (rc != 0) + goto out; } - *replicas = r; - return 0; +out: + D_FREE(ranks); + if (rc == 0) { + D_DEBUG(DB_MD, DF_UUID ": loaded 
nreplicas and replicas at " DF_U64 ":\n", + DP_UUID(db_uuid), index); + for (i = 0; i < nreplicas; i++) + D_DEBUG(DB_MD, DF_UUID ": [%d]: id=" RDB_F_RID " reserved=" DF_X64 "\n", + DP_UUID(db_uuid), i, RDB_P_RID(replicas[i].drr_id), + replicas[i].drr_reserved); + *replicas_out = replicas; + *replicas_len_out = nreplicas; + } else { + D_FREE(replicas); + } + return rc; } /* Caller must hold d_raft_mutex. */ static int -rdb_raft_add_node(struct rdb *db, d_rank_t rank) +rdb_raft_add_node(struct rdb *db, struct rdb_replica_record record) { struct rdb_raft_node *dnode; raft_node_t *node; @@ -295,13 +425,17 @@ rdb_raft_add_node(struct rdb *db, d_rank_t rank) dnode = calloc(1, sizeof(*dnode)); if (dnode == NULL) D_GOTO(out, rc = -DER_NOMEM); - dnode->dn_rank = rank; - node = raft_add_node(db->d_raft, dnode, rank, rank == dss_self_rank()); + dnode->dn_rank = record.drr_id.rri_rank; + + node = raft_add_node(db->d_raft, dnode, rdb_replica_id_encode(record.drr_id), + rdb_replica_id_compare(record.drr_id, db->d_replica_id) == 0); if (node == NULL) { - D_ERROR(DF_DB": failed to add node %u\n", DP_DB(db), rank); + D_ERROR(DF_DB ": failed to add node " RDB_F_RID "\n", DP_DB(db), + RDB_P_RID(record.drr_id)); free(dnode); D_GOTO(out, rc = -DER_NOMEM); } + out: return rc; } @@ -310,9 +444,10 @@ rdb_raft_add_node(struct rdb *db, d_rank_t rank) static int rdb_raft_load_snapshot(struct rdb *db) { - d_rank_list_t *replicas; - int i; - int rc; + struct rdb_replica_record *replicas; + int replicas_len; + int i; + int rc; D_DEBUG(DB_MD, DF_DB": loading snapshot: base="DF_U64" term="DF_U64"\n", DP_DB(db), db->d_lc_record.dlr_base, @@ -323,7 +458,8 @@ rdb_raft_load_snapshot(struct rdb *db) * after the raft_begin_load_snapshot call, which removes all nodes in * raft. 
*/ - rc = rdb_raft_load_replicas(db->d_lc, db->d_lc_record.dlr_base, &replicas); + rc = rdb_raft_load_replicas(db->d_uuid, db->d_lc, db->d_lc_record.dlr_base, db->d_version, + &replicas, &replicas_len); if (rc != 0) { D_ERROR(DF_DB": failed to load replicas in snapshot "DF_U64" (term="DF_U64"): " DF_RC"\n", DP_DB(db), db->d_lc_record.dlr_base, @@ -354,8 +490,8 @@ rdb_raft_load_snapshot(struct rdb *db) } /* Add the corresponding nodes to raft. */ - for (i = 0; i < replicas->rl_nr; i++) { - rc = rdb_raft_add_node(db, replicas->rl_ranks[i]); + for (i = 0; i < replicas_len; i++) { + rc = rdb_raft_add_node(db, replicas[i]); /* TODO: Freeze and shut down db. */ D_ASSERTF(rc == 0, "failed to add node: "DF_RC"\n", DP_RC(rc)); } @@ -364,7 +500,7 @@ rdb_raft_load_snapshot(struct rdb *db) D_ASSERTF(rc == 0, ""DF_RC"\n", DP_RC(rc)); out_replicas: - d_rank_list_free(replicas); + D_FREE(replicas); out: return rc; } @@ -439,6 +575,7 @@ rdb_raft_cb_send_installsnapshot(raft_server_t *raft, void *arg, raft_node_t *node, msg_installsnapshot_t *msg) { struct rdb *db = arg; + rdb_replica_id_t rdb_node_id; struct rdb_raft_node *rdb_node = raft_node_get_udata(node); struct rdb_raft_is *is = &rdb_node->dn_is; crt_rpc_t *rpc; @@ -449,16 +586,20 @@ rdb_raft_cb_send_installsnapshot(raft_server_t *raft, void *arg, struct dss_module_info *info = dss_get_module_info(); int rc; - rc = rdb_create_raft_rpc(RDB_INSTALLSNAPSHOT, node, &rpc); + D_ASSERT(db->d_raft == raft); + D_ASSERT(node != NULL); + D_ASSERT(rdb_node != NULL); + rdb_node_id = rdb_replica_id_decode(raft_node_get_id(node)); + + rc = rdb_create_raft_rpc(db, RDB_INSTALLSNAPSHOT, node, &rpc); if (rc != 0) { - D_ERROR(DF_DB": failed to create IS RPC to rank %u: %d\n", - DP_DB(db), rdb_node->dn_rank, rc); + DL_ERROR(rc, DF_DB ": failed to create IS RPC to node " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_node_id)); goto err; } /* Start filling the request. 
*/ - in = crt_req_get(rpc); - uuid_copy(in->isi_op.ri_uuid, db->d_uuid); + in = crt_req_get(rpc); in->isi_msg = *msg; /* @@ -505,8 +646,8 @@ rdb_raft_cb_send_installsnapshot(raft_server_t *raft, void *arg, rc = crt_bulk_create(info->dmi_ctx, &sgl, CRT_BULK_RO, &in->isi_kds); if (rc != 0) { - D_ERROR(DF_DB": failed to create key descriptor bulk for rank " - "%u: %d\n", DP_DB(db), rdb_node->dn_rank, rc); + DL_ERROR(rc, DF_DB ": failed to create key descriptor bulk for node " RDB_F_RID, + DP_DB(db), RDB_P_RID(rdb_node_id)); goto err_data; } data.iov_buf_len = data.iov_len; @@ -515,24 +656,23 @@ rdb_raft_cb_send_installsnapshot(raft_server_t *raft, void *arg, sgl.sg_iovs = &data; rc = crt_bulk_create(info->dmi_ctx, &sgl, CRT_BULK_RO, &in->isi_data); if (rc != 0) { - D_ERROR(DF_DB": failed to create key bulk for rank %u: %d\n", - DP_DB(db), rdb_node->dn_rank, rc); + DL_ERROR(rc, DF_DB ": failed to create key bulk for node " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_node_id)); goto err_kds_bulk; } rc = rdb_send_raft_rpc(rpc, db); if (rc != 0) { - D_ERROR(DF_DB": failed to send IS RPC to rank %u: %d\n", - DP_DB(db), rdb_node->dn_rank, rc); + DL_ERROR(rc, DF_DB ": failed to send IS RPC to node " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_node_id)); goto err_data_bulk; } D_DEBUG(DB_TRACE, - DF_DB": sent is to node %u rank %u: term=%ld last_idx=%ld seq=" - DF_U64" kds.len="DF_U64" data.len="DF_U64"\n", - DP_DB(db), raft_node_get_id(node), rdb_node->dn_rank, - in->isi_msg.term, in->isi_msg.last_idx, in->isi_seq, - kds.iov_len, data.iov_len); + DF_DB ": sent is to node " RDB_F_RID ": term=%ld last_idx=%ld seq=" DF_U64 + " kds.len=" DF_U64 " data.len=" DF_U64 "\n", + DP_DB(db), RDB_P_RID(rdb_node_id), in->isi_msg.term, in->isi_msg.last_idx, + in->isi_seq, kds.iov_len, data.iov_len); return 0; err_data_bulk: @@ -950,19 +1090,24 @@ rdb_raft_cb_recv_installsnapshot_resp(raft_server_t *raft, void *arg, { struct rdb *db = arg; struct rdb_raft_node *rdb_node = raft_node_get_udata(node); + 
rdb_replica_id_t rdb_node_id; struct rdb_raft_is *is = &rdb_node->dn_is; struct rdb_installsnapshot_out *out; + D_ASSERT(db->d_raft == raft); + D_ASSERT(node != NULL); + D_ASSERT(rdb_node != NULL); + rdb_node_id = rdb_replica_id_decode(raft_node_get_id(node)); out = container_of(resp, struct rdb_installsnapshot_out, iso_msg); /* If no longer transferring this snapshot, ignore this response. */ if (rdb_node->dn_term != raft_get_current_term(raft) || is->dis_index != resp->last_idx) { D_DEBUG(DB_TRACE, - DF_DB": rank %u: stale term "DF_U64" != %ld or index " - DF_U64" != %ld\n", DP_DB(db), rdb_node->dn_rank, - rdb_node->dn_term, raft_get_current_term(raft), - is->dis_index, resp->last_idx); + DF_DB ": node " RDB_F_RID ": stale term " DF_U64 " != %ld or index " DF_U64 + " != %ld\n", + DP_DB(db), RDB_P_RID(rdb_node_id), rdb_node->dn_term, + raft_get_current_term(raft), is->dis_index, resp->last_idx); return 0; } @@ -974,8 +1119,8 @@ rdb_raft_cb_recv_installsnapshot_resp(raft_server_t *raft, void *arg, * snapshot. */ if (resp->complete) { - D_DEBUG(DB_TRACE, DF_DB": rank %u: completed snapshot %ld\n", DP_DB(db), - rdb_node->dn_rank, resp->last_idx); + D_DEBUG(DB_TRACE, DF_DB ": node " RDB_F_RID ": completed snapshot %ld\n", + DP_DB(db), RDB_P_RID(rdb_node_id), resp->last_idx); return 0; } @@ -983,26 +1128,25 @@ rdb_raft_cb_recv_installsnapshot_resp(raft_server_t *raft, void *arg, * ... and the snapshot is not complete, return a generic error so * that raft will not retry too eagerly. */ - D_DEBUG(DB_TRACE, - DF_DB": rank %u: unsuccessful chunk %ld/"DF_U64"(" - DF_U64")\n", DP_DB(db), rdb_node->dn_rank, - resp->last_idx, out->iso_seq, is->dis_seq); + D_DEBUG( + DB_TRACE, + DF_DB ": node " RDB_F_RID ": unsuccessful chunk %ld/" DF_U64 "(" DF_U64 ")\n", + DP_DB(db), RDB_P_RID(rdb_node_id), resp->last_idx, out->iso_seq, is->dis_seq); return -DER_MISC; } /* Ignore this stale response. 
*/ if (out->iso_seq <= is->dis_seq) { D_DEBUG(DB_TRACE, - DF_DB": rank %u: stale chunk %ld/"DF_U64"("DF_U64")\n", - DP_DB(db), rdb_node->dn_rank, resp->last_idx, - out->iso_seq, is->dis_seq); + DF_DB ": node " RDB_F_RID ": stale chunk %ld/" DF_U64 "(" DF_U64 ")\n", + DP_DB(db), RDB_P_RID(rdb_node_id), resp->last_idx, out->iso_seq, + is->dis_seq); return 0; } D_DEBUG(DB_TRACE, - DF_DB": rank %u: completed chunk %ld/"DF_U64"("DF_U64")\n", - DP_DB(db), rdb_node->dn_rank, resp->last_idx, out->iso_seq, - is->dis_seq); + DF_DB ": node " RDB_F_RID ": completed chunk %ld/" DF_U64 "(" DF_U64 ")\n", + DP_DB(db), RDB_P_RID(rdb_node_id), resp->last_idx, out->iso_seq, is->dis_seq); /* Update the last sequence number and anchor. */ is->dis_seq = out->iso_seq; @@ -1014,18 +1158,19 @@ rdb_raft_cb_recv_installsnapshot_resp(raft_server_t *raft, void *arg, static int rdb_raft_cb_persist_vote(raft_server_t *raft, void *arg, raft_node_id_t vote) { - struct rdb *db = arg; - d_iov_t value; - int rc; + struct rdb *db = arg; + rdb_replica_id_t rdb_vote = rdb_replica_id_decode(vote); + d_iov_t value; + int rc; if (!db->d_raft_loaded) return 0; - d_iov_set(&value, &vote, sizeof(vote)); + rdb_set_mc_vote_update_buf(db, &rdb_vote, &value); rc = rdb_mc_update(db->d_mc, RDB_MC_ATTRS, 1 /* n */, &rdb_mc_vote, &value, NULL /* vtx */); if (rc != 0) - D_ERROR(DF_DB": failed to persist vote %d: %d\n", DP_DB(db), - vote, rc); + DL_ERROR(rc, DF_DB ": failed to persist vote " RDB_F_RID, DP_DB(db), + RDB_P_RID(rdb_vote)); return rc; } @@ -1034,10 +1179,11 @@ static int rdb_raft_cb_persist_term(raft_server_t *raft, void *arg, raft_term_t term, raft_node_id_t vote) { - struct rdb *db = arg; - d_iov_t keys[2]; - d_iov_t values[2]; - int rc; + struct rdb *db = arg; + rdb_replica_id_t rdb_vote = rdb_replica_id_decode(vote); + d_iov_t keys[2]; + d_iov_t values[2]; + int rc; if (!db->d_raft_loaded) return 0; @@ -1046,21 +1192,30 @@ rdb_raft_cb_persist_term(raft_server_t *raft, void *arg, raft_term_t term, 
keys[0] = rdb_mc_term; d_iov_set(&values[0], &term, sizeof(term)); keys[1] = rdb_mc_vote; - d_iov_set(&values[1], &vote, sizeof(vote)); + rdb_set_mc_vote_update_buf(db, &rdb_vote, &values[1]); rc = rdb_mc_update(db->d_mc, RDB_MC_ATTRS, 2 /* n */, keys, values, NULL /* vtx */); if (rc != 0) - D_ERROR(DF_DB ": failed to update term %ld and vote %d: " DF_RC "\n", DP_DB(db), - term, vote, DP_RC(rc)); + DL_ERROR(rc, DF_DB ": failed to update term %ld and vote " RDB_F_RID, DP_DB(db), + term, RDB_P_RID(rdb_vote)); return rc; } -static d_rank_t -rdb_raft_cfg_entry_rank(raft_entry_t *entry) +static rdb_replica_id_t +rdb_raft_cfg_entry_node_id(raft_entry_t *entry, uint32_t layout_version) { + rdb_replica_id_t id; + D_ASSERT(entry->data.buf != NULL); - D_ASSERTF(entry->data.len == sizeof(d_rank_t), "%u\n", entry->data.len); - return *((d_rank_t *)entry->data.buf); + if (layout_version < RDB_LAYOUT_VERSION_REPLICA_ID) { + D_ASSERTF(entry->data.len == sizeof(id.rri_rank), "%u\n", entry->data.len); + id.rri_rank = *(d_rank_t *)entry->data.buf; + id.rri_gen = 0; + } else { + D_ASSERTF(entry->data.len == sizeof(id), "%u\n", entry->data.len); + id = *(rdb_replica_id_t *)entry->data.buf; + } + return id; } /* See rdb_raft_update_node. 
*/ @@ -1070,50 +1225,92 @@ rdb_raft_cfg_entry_rank(raft_entry_t *entry) static int rdb_raft_update_node(struct rdb *db, uint64_t index, raft_entry_t *entry, rdb_vos_tx_t vtx) { - d_rank_list_t *replicas; - d_rank_t rank = rdb_raft_cfg_entry_rank(entry); - bool found; - void *result; - int rc; + struct rdb_replica_record *replicas; + int replicas_len; + rdb_replica_id_t id = rdb_raft_cfg_entry_node_id(entry, db->d_version); + int i; + struct rdb_replica_record *tmp; + int tmp_len; + void *result; + int rc; - D_DEBUG(DB_MD, DF_DB": cfg entry "DF_U64": term=%ld type=%s rank=%u\n", DP_DB(db), index, - entry->term, rdb_raft_entry_type_str(entry->type), rank); + D_DEBUG(DB_MD, DF_DB ": cfg entry " DF_U64 ": term=%ld type=%s node=" RDB_F_RID "\n", + DP_DB(db), index, entry->term, rdb_raft_entry_type_str(entry->type), RDB_P_RID(id)); - rc = rdb_raft_load_replicas(db->d_lc, index, &replicas); + rc = rdb_raft_load_replicas(db->d_uuid, db->d_lc, index, db->d_version, &replicas, + &replicas_len); if (rc != 0) goto out; - found = d_rank_list_find(replicas, rank, NULL); - if (found && entry->type == RAFT_LOGTYPE_ADD_NODE) { - D_WARN(DF_DB": %s: rank %u already exists\n", DP_DB(db), - rdb_raft_entry_type_str(entry->type), rank); - rc = 0; - goto out_replicas; - } else if (!found && entry->type == RAFT_LOGTYPE_REMOVE_NODE) { - D_WARN(DF_DB": %s: rank %u does not exist\n", DP_DB(db), - rdb_raft_entry_type_str(entry->type), rank); - rc = 0; - goto out_replicas; - } + switch (entry->type) { + case RAFT_LOGTYPE_ADD_NODE: + /* + * Ensure that no existing replica ID uses id.rri_rank or + * id.rri_gen (if nonzero). Note that nonzero generations + * are unique even for different ranks, because of how we + * produce them. 
+ */ + for (i = 0; i < replicas_len; i++) { + if (replicas[i].drr_id.rri_rank == id.rri_rank || + (id.rri_gen != 0 && replicas[i].drr_id.rri_gen == id.rri_gen)) { + D_ERROR(DF_DB ": %s: replica " RDB_F_RID + " already exists: " RDB_F_RID "\n", + DP_DB(db), rdb_raft_entry_type_str(entry->type), + RDB_P_RID(id), RDB_P_RID(replicas[i].drr_id)); + rc = -DER_INVAL; + goto out_replicas; + } + } - if (entry->type == RAFT_LOGTYPE_ADD_NODE) - rc = d_rank_list_append(replicas, rank); - else if (entry->type == RAFT_LOGTYPE_REMOVE_NODE) - rc = d_rank_list_del(replicas, rank); - if (rc != 0) + /* Append id to replicas. */ + tmp_len = replicas_len + 1; + D_REALLOC_ARRAY(tmp, replicas, replicas_len, tmp_len); + if (tmp == NULL) { + rc = -DER_NOMEM; + goto out_replicas; + } + replicas = tmp; + replicas_len = tmp_len; + replicas[replicas_len - 1].drr_id = id; + replicas[replicas_len - 1].drr_reserved = 0; + break; + case RAFT_LOGTYPE_REMOVE_NODE: + /* Find id in replicas. */ + for (i = 0; i < replicas_len; i++) + if (rdb_replica_id_compare(replicas[i].drr_id, id) == 0) + break; + if (i == replicas_len) { + D_ERROR(DF_DB ": %s: replica " RDB_F_RID " does not exist\n", DP_DB(db), + rdb_raft_entry_type_str(entry->type), RDB_P_RID(id)); + rc = -DER_INVAL; + goto out_replicas; + } + + /* Remove it. 
*/ + if (replicas_len - i - 1 > 0) + memmove(&replicas[i], &replicas[i + 1], + (replicas_len - i - 1) * sizeof(*replicas)); + replicas_len--; + break; + default: + D_ERROR(DF_DB ": entry type %s (%d) not supported: " RDB_F_RID "\n", DP_DB(db), + rdb_raft_entry_type_str(entry->type), entry->type, RDB_P_RID(id)); + rc = -DER_NOTSUPPORTED; goto out_replicas; + } - rc = rdb_raft_store_replicas(db->d_lc, index, replicas, vtx); + rc = rdb_raft_store_replicas(db->d_uuid, db->d_lc, index, db->d_version, replicas, + replicas_len, vtx); out_replicas: - d_rank_list_free(replicas); + D_FREE(replicas); out: result = rdb_raft_lookup_result(db, index); if (result != NULL) *(int *)result = rc; if (rc != 0) - D_ERROR(DF_DB": failed to perform %s on rank %u at index "DF_U64": "DF_RC"\n", - DP_DB(db), rdb_raft_entry_type_str(entry->type), rank, index, DP_RC(rc)); + DL_ERROR(rc, DF_DB ": failed to do %s " RDB_F_RID " at index " DF_U64, DP_DB(db), + rdb_raft_entry_type_str(entry->type), RDB_P_RID(id), index); return rc; } @@ -1421,18 +1618,20 @@ rdb_raft_cb_log_pop(raft_server_t *raft, void *arg, raft_entry_t *entry, } static raft_node_id_t -rdb_raft_cb_log_get_node_id(raft_server_t *raft, void *arg, raft_entry_t *entry, - raft_index_t index) +rdb_raft_cb_log_get_node_id(raft_server_t *raft, void *arg, raft_entry_t *entry, raft_index_t index) { - D_ASSERTF(raft_entry_is_cfg_change(entry), "index=%ld type=%s\n", index, + struct rdb *db = arg; + + D_ASSERTF(raft_entry_is_cfg_change(entry), DF_DB ": index=%ld type=%s\n", DP_DB(db), index, rdb_raft_entry_type_str(entry->type)); - return rdb_raft_cfg_entry_rank(entry); + return rdb_replica_id_encode(rdb_raft_cfg_entry_node_id(entry, db->d_version)); } static void rdb_raft_cb_notify_membership_event(raft_server_t *raft, void *udata, raft_node_t *node, raft_entry_t *entry, raft_membership_e type) { + struct rdb *db = udata; struct rdb_raft_node *rdb_node = raft_node_get_udata(node); switch (type) { @@ -1454,7 +1653,7 @@ 
rdb_raft_cb_notify_membership_event(raft_server_t *raft, void *udata, raft_node_ * calloc instead of D_ALLOC_PTR to avoid being fault-injected. */ D_ASSERT(rdb_node != NULL); - rdb_node->dn_rank = rdb_raft_cfg_entry_rank(entry); + rdb_node->dn_rank = rdb_raft_cfg_entry_node_id(entry, db->d_version).rri_rank; raft_node_set_udata(node, rdb_node); break; case RAFT_MEMBERSHIP_REMOVE: @@ -1462,7 +1661,7 @@ rdb_raft_cb_notify_membership_event(raft_server_t *raft, void *udata, raft_node_ free(rdb_node); break; default: - D_ASSERTF(false, "invalid raft membership event type %s\n", + D_ASSERTF(false, DF_DB ": invalid raft membership event type %s\n", DP_DB(db), rdb_raft_entry_type_str(type)); } } @@ -1475,8 +1674,8 @@ rdb_raft_cb_log(raft_server_t *raft, raft_node_t *node, void *arg, raft_loglevel if (node == NULL) \ D_DEBUG(flag, DF_DB ": %s\n", DP_DB(db), buf); \ else \ - D_DEBUG(flag, DF_DB ": %s: rank=%u\n", DP_DB(db), buf, \ - ((struct rdb_raft_node *)raft_node_get_udata(node))->dn_rank); + D_DEBUG(flag, DF_DB ": %s: node=" RDB_F_RID "\n", DP_DB(db), buf, \ + RDB_P_RID(rdb_replica_id_decode(raft_node_get_id(node)))); struct rdb *db = raft_get_udata(raft); @@ -2106,31 +2305,24 @@ rdb_raft_append_apply_internal(struct rdb *db, msg_entry_t *mentry, } int -rdb_raft_add_replica(struct rdb *db, d_rank_t rank) +rdb_raft_append_apply_cfg(struct rdb *db, raft_logtype_e type, rdb_replica_id_t id) { - msg_entry_t entry = {}; - int result; - int rc; + msg_entry_t entry = {.type = type}; + int result; + int rc; - D_DEBUG(DB_MD, DF_DB": Replica Rank: %d\n", DP_DB(db), rank); - entry.type = RAFT_LOGTYPE_ADD_NODE; - entry.data.buf = &rank; - entry.data.len = sizeof(d_rank_t); - rc = rdb_raft_append_apply_internal(db, &entry, &result); - return (rc != 0) ? 
rc : result; -} + D_ASSERTF(raft_entry_is_cfg_change(&entry), "invalid type: %d\n", type); + D_DEBUG(DB_MD, DF_DB ": %s " RDB_F_RID "\n", DP_DB(db), rdb_raft_entry_type_str(type), + RDB_P_RID(id)); -int -rdb_raft_remove_replica(struct rdb *db, d_rank_t rank) -{ - msg_entry_t entry = {}; - int result; - int rc; + if (db->d_version >= RDB_LAYOUT_VERSION_REPLICA_ID) { + entry.data.buf = &id; + entry.data.len = sizeof(id); + } else { + entry.data.buf = &id.rri_rank; + entry.data.len = sizeof(id.rri_rank); + } - D_DEBUG(DB_MD, DF_DB": Replica Rank: %d\n", DP_DB(db), rank); - entry.type = RAFT_LOGTYPE_REMOVE_NODE; - entry.data.buf = &rank; - entry.data.len = sizeof(d_rank_t); rc = rdb_raft_append_apply_internal(db, &entry, &result); return (rc != 0) ? rc : result; } @@ -2326,33 +2518,71 @@ rdb_raft_destroy_lc(daos_handle_t pool, daos_handle_t mc, d_iov_t *key, * error. */ int -rdb_raft_init(daos_handle_t pool, daos_handle_t mc, const d_rank_list_t *replicas) -{ - daos_handle_t lc; - struct rdb_lc_record record; - uint64_t base; - int rc; - int rc_close; +rdb_raft_init(uuid_t db_uuid, daos_handle_t pool, daos_handle_t mc, rdb_replica_id_t *replicas, + int replicas_len, uint32_t layout_version) +{ + d_iov_t value; + daos_handle_t lc; + struct rdb_lc_record record; + uint64_t base; + struct rdb_replica_record *replica_records; + int i; + int rc; + int rc_close; - base = (replicas == NULL || replicas->rl_nr == 0) ? 0 : 1; + /* + * If replicas are specified, we are bootstrapping and shall initialize + * the LC at index 1 with replicas. Otherwise, we are not bootstrapping + * and shall initialize the LC to be empty. + */ + base = (replicas == NULL || replicas_len == 0) ? 
0 : 1; - /* Create log container; base is 1 since we store replicas at idx 1 */ rc = rdb_raft_create_lc(pool, mc, &rdb_mc_lc, base, 0 /* base_term */, 0 /* term */, &record /* lc_record */); - /* Return on failure or if there are no replicas to be stored */ - if (base == 0 || rc != 0) + if (rc != 0) return rc; - /* Record the configuration in the LC at index 1. */ + if (base == 0) + return 0; + rc = vos_cont_open(pool, record.dlr_uuid, &lc); - /* This really should not be happening.. */ - D_ASSERTF(rc == 0, "Open VOS container: "DF_RC"\n", DP_RC(rc)); + /* We are opening a container that we've just created. */ + D_ASSERTF(rc == 0, "open LC: " DF_RC "\n", DP_RC(rc)); - /* No initial configuration if rank list empty */ - rc = rdb_raft_store_replicas(lc, 1 /* base */, replicas, NULL /* vtx */); - if (rc != 0) - D_ERROR("failed to create list of replicas: "DF_RC"\n", - DP_RC(rc)); + D_ALLOC_ARRAY(replica_records, replicas_len); + if (replica_records == NULL) { + rc = -DER_NOMEM; + goto out_lc; + } + for (i = 0; i < replicas_len; i++) + replica_records[i].drr_id = replicas[i]; + rc = rdb_raft_store_replicas(db_uuid, lc, base, layout_version, replica_records, + replicas_len, NULL /* vtx */); + D_FREE(replica_records); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to initialize replicas", DP_UUID(db_uuid)); + goto out_lc; + } + + /* Initialize rdb_lc_replica_gen_next to max{replicas[].rri_gen} + 1. 
*/ + if (layout_version >= RDB_LAYOUT_VERSION_REPLICA_ID) { + uint32_t replica_gen_next = 0; + + for (i = 0; i < replicas_len; i++) + if (replicas[i].rri_gen > replica_gen_next) + replica_gen_next = replicas[i].rri_gen; + replica_gen_next++; + D_DEBUG(DB_MD, DF_UUID ": replica_gen_next=%u\n", DP_UUID(db_uuid), + replica_gen_next); + d_iov_set(&value, &replica_gen_next, sizeof(replica_gen_next)); + rc = rdb_lc_update(lc, base, RDB_LC_ATTRS, false /* crit */, 1, + &rdb_lc_replica_gen_next, &value, NULL /* vtx */); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to initialize next replica generation", + DP_UUID(db_uuid)); + } + +out_lc: rc_close = vos_cont_close(lc); return (rc != 0) ? rc : rc_close; } @@ -2449,9 +2679,18 @@ rdb_raft_load_entry(struct rdb *db, uint64_t index) return rdb_raft_rc(rc); } - D_DEBUG(DB_TRACE, DF_DB ": loaded entry " DF_U64 ": term=%ld type=%s buf=%p len=%u\n", - DP_DB(db), index, entry.term, rdb_raft_entry_type_str(entry.type), entry.data.buf, - entry.data.len); + if (raft_entry_is_cfg_change(&entry)) { + D_DEBUG(DB_MD, + DF_DB ": loaded cfg entry " DF_U64 ": term=%ld type=%s node=" RDB_F_RID + "\n", + DP_DB(db), index, entry.term, rdb_raft_entry_type_str(entry.type), + RDB_P_RID(rdb_raft_cfg_entry_node_id(&entry, db->d_version))); + } else { + D_DEBUG(DB_TRACE, + DF_DB ": loaded entry " DF_U64 ": term=%ld type=%s buf=%p len=%u\n", + DP_DB(db), index, entry.term, rdb_raft_entry_type_str(entry.type), + entry.data.buf, entry.data.len); + } return 0; } @@ -2650,14 +2889,13 @@ rdb_raft_discard_slc(struct rdb *db) int rdb_raft_dictate(struct rdb *db) { - struct rdb_lc_record lc_record = db->d_lc_record; - uint64_t term; - d_rank_list_t replicas; - d_rank_t self = dss_self_rank(); - d_iov_t keys[2]; - d_iov_t value; - uint64_t index = lc_record.dlr_tail; - int rc; + struct rdb_lc_record lc_record = db->d_lc_record; + uint64_t term; + struct rdb_replica_record replicas = {.drr_id = db->d_replica_id}; + d_iov_t keys[2]; + d_iov_t value; + uint64_t 
index = lc_record.dlr_tail; + int rc; /* * If an SLC exists, discard it, since it must be either stale or @@ -2701,11 +2939,10 @@ rdb_raft_dictate(struct rdb *db) * membership change entry that, for instance, adds a node other than * ourself, which contradicts with the new membership of only ourself. */ - replicas.rl_ranks = &self; - replicas.rl_nr = 1; - rc = rdb_raft_store_replicas(db->d_lc, index, &replicas, NULL /* vtx */); + rc = rdb_raft_store_replicas(db->d_uuid, db->d_lc, index, db->d_version, &replicas, + 1 /* replicas_len */, NULL /* vtx */); if (rc != 0) { - D_ERROR(DF_DB": failed to reset membership: "DF_RC"\n", DP_DB(db), DP_RC(rc)); + DL_ERROR(rc, DF_DB ": failed to reset membership", DP_DB(db)); return rc; } keys[0] = rdb_lc_entry_header; @@ -2868,10 +3105,10 @@ rdb_raft_close(struct rdb *db) static int rdb_raft_load(struct rdb *db) { - d_iov_t value; - uint64_t term; - int vote; - int rc; + d_iov_t value; + uint64_t term; + rdb_replica_id_t vote; + int rc; D_DEBUG(DB_MD, DF_DB": load persistent state: begin\n", DP_DB(db)); D_ASSERT(!db->d_raft_loaded); @@ -2881,16 +3118,21 @@ rdb_raft_load(struct rdb *db) if (rc == 0) { rc = raft_set_current_term(db->d_raft, term); D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); - } else if (rc != -DER_NONEXIST) { + } else if (rc == -DER_NONEXIST) { + term = 0; + } else { goto out; } - d_iov_set(&value, &vote, sizeof(vote)); + rdb_set_mc_vote_lookup_buf(db, &vote, &value); rc = rdb_mc_lookup(db->d_mc, RDB_MC_ATTRS, &rdb_mc_vote, &value); if (rc == 0) { - rc = raft_vote_for_nodeid(db->d_raft, vote); + rc = raft_vote_for_nodeid(db->d_raft, rdb_replica_id_encode(vote)); D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); - } else if (rc != -DER_NONEXIST) { + } else if (rc == -DER_NONEXIST) { + vote.rri_rank = -1; + vote.rri_gen = -1; + } else { goto out; } @@ -2899,11 +3141,11 @@ rdb_raft_load(struct rdb *db) goto out; D_DEBUG(DB_MD, - DF_DB ": term=" DF_U64 " vote=%d lc.uuid=" DF_UUID " lc.base=" DF_U64 + DF_DB ": term=" DF_U64 " 
vote=" RDB_F_RID " lc.uuid=" DF_UUID " lc.base=" DF_U64 " lc.base_term=" DF_U64 " lc.tail=" DF_U64 " lc.aggregated=" DF_U64 " lc.term=" DF_U64 " lc.seq=" DF_U64 "\n", - DP_DB(db), term, vote, DP_UUID(db->d_lc_record.dlr_uuid), db->d_lc_record.dlr_base, - db->d_lc_record.dlr_base_term, db->d_lc_record.dlr_tail, + DP_DB(db), term, RDB_P_RID(vote), DP_UUID(db->d_lc_record.dlr_uuid), + db->d_lc_record.dlr_base, db->d_lc_record.dlr_base_term, db->d_lc_record.dlr_tail, db->d_lc_record.dlr_aggregated, db->d_lc_record.dlr_term, db->d_lc_record.dlr_seq); db->d_raft_loaded = true; @@ -2938,7 +3180,7 @@ rdb_raft_start(struct rdb *db) goto err; } - raft_set_nodeid(db->d_raft, dss_self_rank()); + raft_set_nodeid(db->d_raft, rdb_replica_id_encode(db->d_replica_id)); if (db->d_new) raft_set_first_start(db->d_raft); raft_set_callbacks(db->d_raft, &rdb_raft_cbs, db); @@ -2971,12 +3213,6 @@ rdb_raft_start(struct rdb *db) if (rc != 0) goto err_callbackd; - D_DEBUG(DB_MD, - DF_DB": raft started: election_timeout=%dms request_timeout=%dms " - "lease_maintenance_grace=%dms compact_thres="DF_U64" ae_max_entries=%u " - "ae_max_size="DF_U64"\n", DP_DB(db), election_timeout, request_timeout, - lease_maintenance_grace, db->d_compact_thres, db->d_ae_max_entries, - db->d_ae_max_size); return 0; err_callbackd: @@ -3164,41 +3400,86 @@ rdb_raft_wait_applied(struct rdb *db, uint64_t index, uint64_t term) return rc; } +static int +rdb_replica_id_compare_void(const void *vx, const void *vy) +{ + const rdb_replica_id_t *x = vx; + const rdb_replica_id_t *y = vy; + + return rdb_replica_id_compare(*x, *y); +} + int -rdb_raft_get_ranks(struct rdb *db, d_rank_list_t **ranksp) +rdb_raft_get_replicas(struct rdb *db, rdb_replica_id_t **replicas_out, int *replicas_len_out) { - d_rank_list_t *ranks; - int n; - int i; - int rc; + rdb_replica_id_t *replicas; + int n; + int i; + int rc; ABT_mutex_lock(db->d_raft_mutex); n = raft_get_num_nodes(db->d_raft); - ranks = d_rank_list_alloc(n); - if (ranks == NULL) { + 
D_ALLOC_ARRAY(replicas, n); + if (replicas == NULL) { rc = -DER_NOMEM; goto mutex; } for (i = 0; i < n; i++) { - raft_node_t *node = raft_get_node_from_idx(db->d_raft, i); - struct rdb_raft_node *rdb_node = raft_node_get_udata(node); + raft_node_t *node = raft_get_node_from_idx(db->d_raft, i); + raft_node_id_t node_id = raft_node_get_id(node); - ranks->rl_ranks[i] = rdb_node->dn_rank; + replicas[i] = rdb_replica_id_decode(node_id); } - ranks->rl_nr = i; - d_rank_list_sort(ranks); + qsort(replicas, n, sizeof(*replicas), rdb_replica_id_compare_void); - *ranksp = ranks; + *replicas_out = replicas; + *replicas_len_out = n; rc = 0; mutex: ABT_mutex_unlock(db->d_raft_mutex); return rc; } +static int +rdb_lookup_for_request(crt_rpc_t *rpc, struct rdb **db_out) +{ + struct rdb_op_in *in = crt_req_get(rpc); + d_rank_t src_rank; + struct rdb *db; + int rc; + + rc = crt_req_src_rank_get(rpc, &src_rank); + D_ASSERTF(rc == 0, "crt_req_src_rank_get: " DF_RC "\n", DP_RC(rc)); + if (src_rank != in->ri_from.rri_rank) { + D_ERROR(DF_UUID ": inconsistent request: src_rank=%u from=" RDB_F_RID "\n", + DP_UUID(in->ri_uuid), src_rank, RDB_P_RID(in->ri_from)); + return -DER_PROTO; + } + + db = rdb_lookup(in->ri_uuid); + if (db == NULL) + return -DER_NONEXIST; + + if (db->d_stop) { + rdb_put(db); + return -DER_CANCELED; + } + + if (rdb_replica_id_compare(db->d_replica_id, in->ri_to) != 0) { + D_DEBUG(DB_MD, DF_DB ": replica ID mismatch: self=" RDB_F_RID " to=" RDB_F_RID "\n", + DP_DB(db), RDB_P_RID(db->d_replica_id), RDB_P_RID(in->ri_to)); + rdb_put(db); + return -DER_BAD_TARGET; + } + + *db_out = db; + return 0; +} + void rdb_requestvote_handler(crt_rpc_t *rpc) { @@ -3206,46 +3487,40 @@ rdb_requestvote_handler(crt_rpc_t *rpc) struct rdb_requestvote_out *out = crt_reply_get(rpc); struct rdb *db; char *s; - struct rdb_raft_state state; - d_rank_t srcrank; + struct rdb_raft_state state; + raft_node_id_t node_id = rdb_replica_id_encode(in->rvi_op.ri_from); int rc; s = in->rvi_msg.prevote ? 
" (prevote)" : ""; - rc = crt_req_src_rank_get(rpc, &srcrank); - D_ASSERTF(rc == 0, ""DF_RC"\n", DP_RC(rc)); - db = rdb_lookup(in->rvi_op.ri_uuid); - if (db == NULL) - D_GOTO(out, rc = -DER_NONEXIST); - if (db->d_stop) - D_GOTO(out_db, rc = -DER_CANCELED); + rc = rdb_lookup_for_request(rpc, &db); + if (rc != 0) + goto out; - D_DEBUG(DB_TRACE, DF_DB": handling raft rv%s from rank %u\n", - DP_DB(db), s, srcrank); + D_DEBUG(DB_TRACE, DF_DB ": handling raft rv%s from " RDB_F_RID "\n", DP_DB(db), s, + RDB_P_RID(in->rvi_op.ri_from)); ABT_mutex_lock(db->d_raft_mutex); rdb_raft_save_state(db, &state); - rc = raft_recv_requestvote(db->d_raft, - raft_get_node(db->d_raft, - srcrank), - &in->rvi_msg, &out->rvo_msg); + rc = raft_recv_requestvote(db->d_raft, raft_get_node(db->d_raft, node_id), &in->rvi_msg, + &out->rvo_msg); rc = rdb_raft_check_state(db, &state, rc); ABT_mutex_unlock(db->d_raft_mutex); if (rc != 0) { - D_ERROR(DF_DB": failed to process REQUESTVOTE%s from rank %u: " - "%d\n", DP_DB(db), s, srcrank, rc); + DL_ERROR(rc, DF_DB ": failed to process REQUESTVOTE%s from " RDB_F_RID, DP_DB(db), + s, RDB_P_RID(in->rvi_op.ri_from)); /* raft_recv_requestvote() always generates a valid reply. 
*/ rc = 0; } -out_db: rdb_put(db); out: out->rvo_op.ro_rc = rc; + out->rvo_op.ro_from = in->rvi_op.ri_to; + out->rvo_op.ro_to = in->rvi_op.ri_from; rc = crt_reply_send(rpc); if (rc != 0) - D_ERROR(DF_UUID": failed to send REQUESTVOTE%s reply to " - "rank %u: %d\n", DP_UUID(in->rvi_op.ri_uuid), s, - srcrank, rc); + DL_ERROR(rc, DF_UUID ": failed to send REQUESTVOTE%s reply to " RDB_F_RID, + DP_UUID(in->rvi_op.ri_uuid), s, RDB_P_RID(in->rvi_op.ri_from)); } void @@ -3254,44 +3529,38 @@ rdb_appendentries_handler(crt_rpc_t *rpc) struct rdb_appendentries_in *in = crt_req_get(rpc); struct rdb_appendentries_out *out = crt_reply_get(rpc); struct rdb *db; - struct rdb_raft_state state; - d_rank_t srcrank; + struct rdb_raft_state state; + raft_node_id_t node_id = rdb_replica_id_encode(in->aei_op.ri_from); int rc; - rc = crt_req_src_rank_get(rpc, &srcrank); - D_ASSERTF(rc == 0, ""DF_RC"\n", DP_RC(rc)); - - db = rdb_lookup(in->aei_op.ri_uuid); - if (db == NULL) - D_GOTO(out, rc = -DER_NONEXIST); - if (db->d_stop) - D_GOTO(out_db, rc = -DER_CANCELED); + rc = rdb_lookup_for_request(rpc, &db); + if (rc != 0) + goto out; - D_DEBUG(DB_TRACE, DF_DB": handling raft ae from rank %u\n", DP_DB(db), - srcrank); + D_DEBUG(DB_TRACE, DF_DB ": handling raft ae from " RDB_F_RID "\n", DP_DB(db), + RDB_P_RID(in->aei_op.ri_from)); ABT_mutex_lock(db->d_raft_mutex); rdb_raft_save_state(db, &state); - rc = raft_recv_appendentries(db->d_raft, - raft_get_node(db->d_raft, srcrank), - &in->aei_msg, &out->aeo_msg); + rc = raft_recv_appendentries(db->d_raft, raft_get_node(db->d_raft, node_id), &in->aei_msg, + &out->aeo_msg); rc = rdb_raft_check_state(db, &state, rc); ABT_mutex_unlock(db->d_raft_mutex); if (rc != 0) { - D_ERROR(DF_DB": failed to process APPENDENTRIES from rank %u: " - "%d\n", DP_DB(db), srcrank, rc); + DL_ERROR(rc, DF_DB ": failed to process APPENDENTRIES from " RDB_F_RID, DP_DB(db), + RDB_P_RID(in->aei_op.ri_from)); /* raft_recv_appendentries() always generates a valid reply. 
*/ rc = 0; } -out_db: rdb_put(db); out: out->aeo_op.ro_rc = rc; + out->aeo_op.ro_from = in->aei_op.ri_to; + out->aeo_op.ro_to = in->aei_op.ri_from; rc = crt_reply_send(rpc); if (rc != 0) - D_ERROR(DF_UUID": failed to send APPENDENTRIES reply to rank " - "%u: %d\n", DP_UUID(in->aei_op.ri_uuid), - srcrank, rc); + DL_ERROR(rc, DF_UUID ": failed to send APPENDENTRIES reply to " RDB_F_RID, + DP_UUID(in->aei_op.ri_uuid), RDB_P_RID(in->aei_op.ri_from)); } void @@ -3300,46 +3569,38 @@ rdb_installsnapshot_handler(crt_rpc_t *rpc) struct rdb_installsnapshot_in *in = crt_req_get(rpc); struct rdb_installsnapshot_out *out = crt_reply_get(rpc); struct rdb *db; - struct rdb_raft_state state; - d_rank_t srcrank; + struct rdb_raft_state state; + raft_node_id_t node_id = rdb_replica_id_encode(in->isi_op.ri_from); int rc; - rc = crt_req_src_rank_get(rpc, &srcrank); - D_ASSERTF(rc == 0, ""DF_RC"\n", DP_RC(rc)); - - db = rdb_lookup(in->isi_op.ri_uuid); - if (db == NULL) { - rc = -DER_NONEXIST; + rc = rdb_lookup_for_request(rpc, &db); + if (rc != 0) goto out; - } - if (db->d_stop) { - rc = -DER_CANCELED; - goto out_db; - } - D_DEBUG(DB_TRACE, DF_DB": handling raft is from rank %u\n", DP_DB(db), - srcrank); + D_DEBUG(DB_TRACE, DF_DB ": handling raft is from " RDB_F_RID "\n", DP_DB(db), + RDB_P_RID(in->isi_op.ri_from)); /* Receive the bulk data buffers before entering raft. 
*/ rc = rdb_raft_recv_is(db, rpc, &in->isi_local.rl_kds_iov, &in->isi_local.rl_data_iov); if (rc != 0) { - D_ERROR(DF_DB": failed to receive INSTALLSNAPSHOT chunk %ld" - "/"DF_U64": %d\n", DP_DB(db), in->isi_msg.last_idx, - in->isi_seq, rc); + DL_ERROR(rc, + DF_DB ": failed to receive INSTALLSNAPSHOT chunk %ld" + "/" DF_U64 " from " RDB_F_RID, + DP_DB(db), in->isi_msg.last_idx, in->isi_seq, + RDB_P_RID(in->isi_op.ri_from)); goto out_db; } ABT_mutex_lock(db->d_raft_mutex); rdb_raft_save_state(db, &state); - rc = raft_recv_installsnapshot(db->d_raft, - raft_get_node(db->d_raft, srcrank), - &in->isi_msg, &out->iso_msg); + rc = raft_recv_installsnapshot(db->d_raft, raft_get_node(db->d_raft, node_id), &in->isi_msg, + &out->iso_msg); rc = rdb_raft_check_state(db, &state, rc); ABT_mutex_unlock(db->d_raft_mutex); if (rc != 0) { - D_ERROR(DF_DB": failed to process INSTALLSNAPSHOT from rank " - "%u: %d\n", DP_DB(db), srcrank, rc); + DL_ERROR(rc, DF_DB ": failed to process INSTALLSNAPSHOT from " RDB_F_RID, DP_DB(db), + RDB_P_RID(in->isi_op.ri_from)); /* * raft_recv_installsnapshot() always generates a valid reply. 
*/ @@ -3352,11 +3613,12 @@ rdb_installsnapshot_handler(crt_rpc_t *rpc) rdb_put(db); out: out->iso_op.ro_rc = rc; + out->iso_op.ro_from = in->isi_op.ri_to; + out->iso_op.ro_to = in->isi_op.ri_from; rc = crt_reply_send(rpc); if (rc != 0) - D_ERROR(DF_UUID": failed to send INSTALLSNAPSHOT reply to rank " - "%u: %d\n", DP_UUID(in->isi_op.ri_uuid), - srcrank, rc); + DL_ERROR(rc, DF_UUID ": failed to send INSTALLSNAPSHOT reply to " RDB_F_RID, + DP_UUID(in->isi_op.ri_uuid), RDB_P_RID(in->isi_op.ri_from)); } void @@ -3368,18 +3630,28 @@ rdb_raft_process_reply(struct rdb *db, crt_rpc_t *rpc) struct rdb_requestvote_out *out_rv; struct rdb_appendentries_out *out_ae; struct rdb_installsnapshot_out *out_is; - d_rank_t rank; + struct rdb_op_out *out_op = out; + d_rank_t dst_rank; raft_node_t *node; raft_time_t *lease = NULL; int rc; - /* Get the destination of the request - that is the source - * rank of this reply. This CaRT API is based on request hdr. - */ - rc = crt_req_dst_rank_get(rpc, &rank); - D_ASSERTF(rc == 0, ""DF_RC"\n", DP_RC(rc)); + rc = crt_req_dst_rank_get(rpc, &dst_rank); + D_ASSERTF(rc == 0, "crt_req_dst_rank_get: " DF_RC "\n", DP_RC(rc)); + if (dst_rank != out_op->ro_from.rri_rank) { + D_ERROR(DF_DB ": inconsistent reply: dst_rank=%u from=" RDB_F_RID "\n", DP_DB(db), + dst_rank, RDB_P_RID(out_op->ro_from)); + return; + } + + if (rdb_replica_id_compare(db->d_replica_id, out_op->ro_to) != 0) { + D_DEBUG(DB_MD, + DF_DB ": replica ID mismatch: self=" RDB_F_RID " to=" RDB_F_RID " opc=%u\n", + DP_DB(db), RDB_P_RID(db->d_replica_id), RDB_P_RID(out_op->ro_to), opc); + return; + } - rc = ((struct rdb_op_out *)out)->ro_rc; + rc = out_op->ro_rc; if (rc != 0) { D_DEBUG(DB_MD, DF_DB": opc %u failed: %d\n", DP_DB(db), opc, rc); @@ -3404,8 +3676,10 @@ rdb_raft_process_reply(struct rdb *db, crt_rpc_t *rpc) int adjustment = d_hlc2msec(d_hlc_epsilon_get()) + 1 /* ms margin */; if (*lease < adjustment) { - D_ERROR(DF_DB": dropping %s response from rank %u: invalid lease: %ld\n", 
- DP_DB(db), opc == RDB_APPENDENTRIES ? "AE" : "IS", rank, *lease); + D_ERROR(DF_DB ": dropping %s response from " RDB_F_RID + ": invalid lease: %ld\n", + DP_DB(db), opc == RDB_APPENDENTRIES ? "AE" : "IS", + RDB_P_RID(out_op->ro_from), *lease); return; } *lease -= adjustment; @@ -3413,9 +3687,10 @@ rdb_raft_process_reply(struct rdb *db, crt_rpc_t *rpc) ABT_mutex_lock(db->d_raft_mutex); - node = raft_get_node(db->d_raft, rank); + node = raft_get_node(db->d_raft, rdb_replica_id_encode(out_op->ro_from)); if (node == NULL) { - D_DEBUG(DB_MD, DF_DB": rank %u not in current membership\n", DP_DB(db), rank); + D_DEBUG(DB_MD, DF_DB ": " RDB_F_RID " not in current membership\n", DP_DB(db), + RDB_P_RID(out_op->ro_from)); goto out_mutex; } @@ -3438,8 +3713,8 @@ rdb_raft_process_reply(struct rdb *db, crt_rpc_t *rpc) } rc = rdb_raft_check_state(db, &state, rc); if (rc != 0 && rc != -DER_NOTLEADER) - DL_ERROR(rc, DF_DB ": failed to process opc %u response from rank %u", DP_DB(db), - opc, rank); + DL_ERROR(rc, DF_DB ": failed to process opc %u response from " RDB_F_RID, DP_DB(db), + opc, RDB_P_RID(out_op->ro_from)); out_mutex: ABT_mutex_unlock(db->d_raft_mutex); diff --git a/src/rdb/rdb_rpc.c b/src/rdb/rdb_rpc.c index e4479e258d3..c987c9e8308 100644 --- a/src/rdb/rdb_rpc.c +++ b/src/rdb/rdb_rpc.c @@ -172,6 +172,21 @@ crt_proc_struct_rdb_local(crt_proc_t proc, crt_proc_op_t proc_op, return 0; } +int +crt_proc_rdb_replica_id_t(crt_proc_t proc, crt_proc_op_t proc_op, rdb_replica_id_t *p) +{ + int rc; + + rc = crt_proc_uint32_t(proc, proc_op, &p->rri_rank); + if (unlikely(rc)) + return rc; + rc = crt_proc_uint32_t(proc, proc_op, &p->rri_gen); + if (unlikely(rc)) + return rc; + + return 0; +} + CRT_RPC_DEFINE(rdb_op, DAOS_ISEQ_RDB_OP, DAOS_OSEQ_RDB_OP) static int @@ -218,24 +233,42 @@ struct crt_proto_format rdb_proto_fmt = { .cpf_base = DAOS_RPC_OPCODE(0, DAOS_RDB_MODULE, 0) }; +/* Create an RDB RPC and fill the rdb_op_in fields. 
*/ int -rdb_create_raft_rpc(crt_opcode_t opc, raft_node_t *node, crt_rpc_t **rpc) +rdb_create_raft_rpc(struct rdb *db, crt_opcode_t opc, raft_node_t *node, crt_rpc_t **rpc) { - crt_opcode_t opc_full; - crt_endpoint_t ep; + rdb_replica_id_t id = rdb_replica_id_decode(raft_node_get_id(node)); + crt_opcode_t opc_full; + crt_endpoint_t ep; struct dss_module_info *info = dss_get_module_info(); int rc; uint8_t rdb_ver; + struct rdb_op_in *in; rc = rdb_rpc_protocol(&rdb_ver); - if (rc) + if (rc != 0) { + DL_ERROR(rc, DF_DB ": failed to get RDB RPC protocol", DP_DB(db)); return rc; + } + opc_full = DAOS_RPC_OPCODE(opc, DAOS_RDB_MODULE, rdb_ver); - opc_full = DAOS_RPC_OPCODE(opc, DAOS_RDB_MODULE, rdb_ver); - ep.ep_grp = NULL; - ep.ep_rank = raft_node_get_id(node); - ep.ep_tag = daos_rpc_tag(DAOS_REQ_RDB, 0); - return crt_req_create(info->dmi_ctx, &ep, opc_full, rpc); + ep.ep_grp = NULL; + ep.ep_rank = id.rri_rank; + ep.ep_tag = daos_rpc_tag(DAOS_REQ_RDB, 0); + + rc = crt_req_create(info->dmi_ctx, &ep, opc_full, rpc); + if (rc != 0) { + DL_ERROR(rc, DF_DB ": failed to create RPC %u to " RDB_F_RID, DP_DB(db), opc, + RDB_P_RID(id)); + return rc; + } + + in = crt_req_get(*rpc); + uuid_copy(in->ri_uuid, db->d_uuid); + in->ri_from = db->d_replica_id; + in->ri_to = id; + + return 0; } struct rdb_raft_rpc { diff --git a/src/rdb/rdb_tx.c b/src/rdb/rdb_tx.c index 3e0f0617224..d924ee6b27e 100644 --- a/src/rdb/rdb_tx.c +++ b/src/rdb/rdb_tx.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -565,6 +566,7 @@ rdb_tx_create_kvs(struct rdb_tx *tx, const rdb_path_t *parent, .dto_attr = (struct rdb_kvs_attr *)attr }; + D_ASSERT(parent == &rdb_path_attrs || !rdb_path_is_attrs(parent)); return rdb_tx_append(tx, &op, false /* is_critical */); } @@ -590,6 +592,7 @@ rdb_tx_destroy_kvs(struct rdb_tx *tx, const rdb_path_t *parent, .dto_attr = NULL }; + D_ASSERT(parent == &rdb_path_attrs || !rdb_path_is_attrs(parent)); return rdb_tx_append(tx, &op, true /* is_critical */); } @@ -615,6 +618,7 @@ rdb_tx_update(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t *key, .dto_attr = NULL }; + D_ASSERT(kvs == &rdb_path_attrs || !rdb_path_is_attrs(kvs)); return rdb_tx_append(tx, &op, false /* is_critical */); } @@ -641,6 +645,7 @@ rdb_tx_update_critical(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t * .dto_attr = NULL }; + D_ASSERT(kvs == &rdb_path_attrs || !rdb_path_is_attrs(kvs)); return rdb_tx_append(tx, &op, true /* is_critical */); } @@ -664,6 +669,7 @@ rdb_tx_delete(struct rdb_tx *tx, const rdb_path_t *kvs, const d_iov_t *key) .dto_attr = NULL }; + D_ASSERT(kvs == &rdb_path_attrs || !rdb_path_is_attrs(kvs)); return rdb_tx_append(tx, &op, true /* is_critical */); } @@ -1116,8 +1122,12 @@ rdb_tx_query_pre(struct rdb_tx *tx, const rdb_path_t *path, } ABT_mutex_unlock(tx->dt_db->d_raft_mutex); - if (path == NULL) + if (path == NULL) { + D_ASSERT(kvs == NULL && index == NULL); return 0; + } + + D_ASSERT(path == &rdb_path_attrs || !rdb_path_is_attrs(path)); rc = rdb_kvs_lookup(tx->dt_db, path, i, true /* alloc */, kvs); if (rc != 0) diff --git a/src/rdb/tests/rdb_test.c b/src/rdb/tests/rdb_test.c index dcae690fc75..bd97be80d1b 100644 --- a/src/rdb/tests/rdb_test.c +++ b/src/rdb/tests/rdb_test.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2022 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -265,10 +266,12 @@ rdbt_test_path(void) static void rdbt_test_rsvc(void) { - char *svc_name = "tmp"; - d_iov_t svc_id; - uuid_t uuid; - int rc; + char *svc_name = "tmp"; + d_iov_t svc_id; + uuid_t uuid; + struct rdb_create_params create_params; + rdb_replica_id_t dummy_replicas[1] = {0}; + int rc; d_iov_set(&svc_id, svc_name, strlen(svc_name) + 1); uuid_generate(uuid); @@ -277,8 +280,15 @@ rdbt_test_rsvc(void) * A leader of an older term can't destroy a replica created by a * leader with a newer term. */ - MUST(ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 2 /* term */, true /* create */, - DB_CAP, 0 /* vos_df_version */, NULL /* replicas */, NULL /* arg */)); + create_params.rcp_size = DB_CAP; + create_params.rcp_vos_df_version = 0; + create_params.rcp_layout_version = 0; + create_params.rcp_id.rri_rank = dss_self_rank(); + create_params.rcp_id.rri_gen = 1; + create_params.rcp_replicas = NULL; + create_params.rcp_replicas_len = 0; + MUST(ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 2 /* term */, DS_RSVC_CREATE, + &create_params, NULL /* arg */)); rc = ds_rsvc_stop(DS_RSVC_CLASS_TEST, &svc_id, 1 /* term */, true /* destroy */); D_ASSERTF(rc == -DER_STALE, DF_RC"\n", DP_RC(rc)); @@ -286,13 +296,43 @@ rdbt_test_rsvc(void) * A leader of an older term can't destroy a replica touched by a * leader with a newer term. 
*/ - rc = ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 3 /* term */, true /* create */, - DB_CAP, 0 /* vos_df_version */, NULL /* replicas */, NULL /* arg */); + rc = ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 3 /* term */, DS_RSVC_CREATE, + &create_params, NULL /* arg */); D_ASSERTF(rc == -DER_ALREADY, DF_RC"\n", DP_RC(rc)); rc = ds_rsvc_stop(DS_RSVC_CLASS_TEST, &svc_id, 2 /* term */, true /* destroy */); D_ASSERTF(rc == -DER_STALE, DF_RC"\n", DP_RC(rc)); - MUST(ds_rsvc_stop(DS_RSVC_CLASS_TEST, &svc_id, 3 /* term */, true /* destroy */)); + /* + * When creating and bootstrapping a replica, abort if there's an + * existing replica. + */ + create_params.rcp_replicas = dummy_replicas; + create_params.rcp_replicas_len = 1; + rc = ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 4 /* term */, DS_RSVC_CREATE, + &create_params, NULL /* arg */); + D_ASSERTF(rc == -DER_EXIST, DF_RC "\n", DP_RC(rc)); + create_params.rcp_replicas = NULL; + create_params.rcp_replicas_len = 0; + + /* + * When creating a replica, destroy any existing replica with a lower + * generation. + */ + create_params.rcp_id.rri_gen = 2; + rc = ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 5 /* term */, DS_RSVC_CREATE, + &create_params, NULL /* arg */); + D_ASSERTF(rc == 0, DF_RC "\n", DP_RC(rc)); + + /* + * When creating a replica, abort if there's an existing replica with a + * higher generation. 
+ */ + create_params.rcp_id.rri_gen = 0; + rc = ds_rsvc_start(DS_RSVC_CLASS_TEST, &svc_id, uuid, 6 /* term */, DS_RSVC_CREATE, + &create_params, NULL /* arg */); + D_ASSERTF(rc == -DER_EXIST, DF_RC "\n", DP_RC(rc)); + + MUST(ds_rsvc_stop(DS_RSVC_CLASS_TEST, &svc_id, 7 /* term */, true /* destroy */)); } struct iterate_cb_arg { @@ -646,10 +686,12 @@ get_all_ranks(d_rank_list_t **list) static void rdbt_init_handler(crt_rpc_t *rpc) { - struct rdbt_init_in *in = crt_req_get(rpc); - d_rank_t rank; - d_rank_t ri; - d_rank_list_t *ranks; + struct rdbt_init_in *in = crt_req_get(rpc); + d_rank_t rank; + int i; + d_rank_list_t *ranks; + rdb_replica_id_t *replicas; + struct ds_rsvc_create_params create_params; MUST(crt_group_rank(NULL /* grp */, &rank)); get_all_ranks(&ranks); @@ -657,13 +699,24 @@ rdbt_init_handler(crt_rpc_t *rpc) if (in->tii_nreplicas < ranks->rl_nr) ranks->rl_nr = in->tii_nreplicas; + D_ALLOC_ARRAY(replicas, ranks->rl_nr); + D_ASSERT(replicas != NULL); + D_WARN("initializing rank %u: nreplicas=%u\n", rank, ranks->rl_nr); - for (ri = 0; ri < ranks->rl_nr; ri++) - D_WARN("ranks[%u]=%u\n", ri, ranks->rl_ranks[ri]); + for (i = 0; i < ranks->rl_nr; i++) { + replicas[i].rri_rank = ranks->rl_ranks[i]; + replicas[i].rri_gen = i + 1; + D_WARN(" replicas[%u]=" RDB_F_RID "\n", i, RDB_P_RID(replicas[i])); + } + create_params.scp_bootstrap = true; + create_params.scp_size = DB_CAP; + create_params.scp_vos_df_version = 0; + create_params.scp_layout_version = 0; + create_params.scp_replicas = replicas; + create_params.scp_replicas_len = ranks->rl_nr; MUST(ds_rsvc_dist_start(DS_RSVC_CLASS_TEST, &test_svc_id, in->tii_uuid, ranks, RDB_NIL_TERM, - DS_RSVC_CREATE, true /* bootstrap */, DB_CAP, - 0 /* vos_df_version*/)); + DS_RSVC_CREATE, &create_params)); crt_reply_send(rpc); } @@ -872,8 +925,7 @@ rdbt_dictate_handler(crt_rpc_t *rpc) ranks->rl_ranks[0] = in->rti_rank; ranks->rl_nr = 1; MUST(ds_rsvc_dist_start(DS_RSVC_CLASS_TEST, &test_svc_id, db_uuid, ranks, RDB_NIL_TERM, - 
DS_RSVC_DICTATE, false /* bootstrap */, 0 /* size */, - 0 /* vos_df_version */)); + DS_RSVC_DICTATE, NULL)); d_rank_list_free(ranks); out->rto_rc = 0; diff --git a/src/rdb/tests/rdbt.c b/src/rdb/tests/rdbt.c index 0d76aa376ca..4ec816c409e 100644 --- a/src/rdb/tests/rdbt.c +++ b/src/rdb/tests/rdbt.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2022 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -405,6 +406,7 @@ rdbt_add_replica_rank(crt_group_t *grp, d_rank_t ldr_rank, d_rank_t new_rank, if (out->rtmo_failed != NULL) fprintf(stderr, "ERR: adding replica %u (reply rank %u)\n", new_rank, out->rtmo_failed->rl_ranks[0]); + d_rank_list_free(replicas_to_add); destroy_rpc(rpc); return rc; } @@ -433,6 +435,7 @@ rdbt_remove_replica_rank(crt_group_t *group, d_rank_t ldr_rank, if (out->rtmo_failed != NULL) fprintf(stderr, "ERR: removing replica %u (reply rank %u)\n", rem_rank, out->rtmo_failed->rl_ranks[0]); + d_rank_list_free(replicas_to_remove); destroy_rpc(rpc); return rc; } diff --git a/src/rebuild/README.md b/src/rebuild/README.md index 33f6c32d7ca..61407959f60 100644 --- a/src/rebuild/README.md +++ b/src/rebuild/README.md @@ -206,8 +206,10 @@ struct daos_rebuild_status { /** Maximum supported layout version */ uint16_t rs_max_supported_layout_ver; - /** padding of rebuild status */ - int16_t rs_padding16; + /** See daos_rebuild_status_flag. */ + uint8_t rs_flags; + /** Do not access this field by name. 
*/ + uint8_t rs_reserved_; /* Failure on which rank */ int32_t rs_fail_rank; diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c index c7650ed8248..afb85154615 100644 --- a/src/rebuild/rebuild_iv.c +++ b/src/rebuild/rebuild_iv.c @@ -186,6 +186,13 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, if (rpt->rt_leader_term != src_iv->riv_leader_term) goto out; + if (ref_rc != 0) { + rc = ref_rc; + DL_WARN(rc, DF_UUID "bypass refresh, IV class id %d.", + DP_UUID(entry->ns->iv_pool_uuid), key->class_id); + goto out; + } + uuid_copy(dst_iv->riv_pool_uuid, src_iv->riv_pool_uuid); dst_iv->riv_master_rank = src_iv->riv_master_rank; dst_iv->riv_global_done = src_iv->riv_global_done; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 0272014e025..61f8d86680c 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -68,9 +68,9 @@ rebuild_obj_fill_buf(daos_handle_t ih, d_iov_t *key_iov, shards[count] = obj_val->shard; arg->count++; - D_DEBUG(DB_REBUILD, "send oid/con "DF_UOID"/"DF_UUID" ephs "DF_U64 - "shard %d cnt %d tgt_id %d\n", DP_UOID(oids[count]), - DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], + D_DEBUG(DB_REBUILD, + "send oid/con " DF_UOID "/" DF_UUID " ephs " DF_X64 " shard %d cnt %d tgt_id %d\n", + DP_UOID(oids[count]), DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], arg->count, arg->tgt_id); rc = dbtree_iter_delete(ih, NULL); @@ -587,7 +587,7 @@ rebuild_obj_ult(void *data) struct rebuild_obj_arg *arg = data; struct rebuild_tgt_pool_tracker *rpt = arg->rpt; - ds_migrate_object(rpt->rt_pool, rpt->rt_poh_uuid, rpt->rt_coh_uuid, arg->co_uuid, + ds_migrate_object(rpt->rt_pool_uuid, rpt->rt_poh_uuid, rpt->rt_coh_uuid, arg->co_uuid, rpt->rt_rebuild_ver, 
rpt->rt_rebuild_gen, rpt->rt_stable_epoch, rpt->rt_rebuild_op, &arg->oid, &arg->epoch, &arg->punched_epoch, &arg->shard, 1, arg->tgt_index, rpt->rt_new_layout_ver); @@ -617,7 +617,7 @@ rebuild_object_local(struct rebuild_tgt_pool_tracker *rpt, uuid_t co_uuid, arg->tgt_index = tgt_index; arg->shard = shard; - rc = dss_ult_create(rebuild_obj_ult, arg, DSS_XS_SYS, 0, 0, NULL); + rc = dss_ult_create(rebuild_obj_ult, arg, DSS_XS_VOS, tgt_index, 0, NULL); if (rc) { D_FREE(arg); rpt_put(rpt); @@ -896,11 +896,12 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, while (cont_child->sc_ec_agg_active && rpt->rt_rebuild_op != RB_OP_RECLAIM && rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { - D_ASSERTF(rpt->rt_pool->sp_rebuilding >= 0, DF_UUID" rebuilding %d\n", - DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_rebuilding); + D_ASSERTF(atomic_load(&rpt->rt_pool->sp_rebuilding) >= 0, + DF_UUID " rebuilding %d\n", DP_UUID(rpt->rt_pool_uuid), + atomic_load(&rpt->rt_pool->sp_rebuilding)); /* Wait for EC aggregation to abort before discard the object */ D_INFO(DF_RB " " DF_UUID " wait for ec agg abort, rebuilding %d.\n", DP_RB_RPT(rpt), - DP_UUID(entry->ie_couuid), rpt->rt_pool->sp_rebuilding); + DP_UUID(entry->ie_couuid), atomic_load(&rpt->rt_pool->sp_rebuilding)); dss_sleep(1000); if (rpt->rt_abort || rpt->rt_finishing) { D_DEBUG(DB_REBUILD, DF_RB " " DF_UUID " rebuild abort %u/%u.\n", @@ -1077,13 +1078,21 @@ static void rebuild_scan_leader(void *data) { struct rebuild_tgt_pool_tracker *rpt = data; - struct rebuild_pool_tls *tls; - int rc; - bool wait = false; - - D_DEBUG(DB_REBUILD, DF_RB " check resync %u/%u < %u\n", DP_RB_RPT(rpt), - rpt->rt_pool->sp_dtx_resync_version, rpt->rt_global_dtx_resync_version, - rpt->rt_rebuild_ver); + struct rebuild_pool_tls *tls; + int rc; + + if (rpt->rt_pool->sp_gl_dtx_resync_version >= rpt->rt_rebuild_ver) { + D_DEBUG(DB_REBUILD, DF_RB " sp_gl_dtx_resync_version %d exceed rt_rebuild_ver %d.", + DP_RB_RPT(rpt), 
rpt->rt_pool->sp_gl_dtx_resync_version, + rpt->rt_rebuild_ver); + if (rpt->rt_global_dtx_resync_version < rpt->rt_pool->sp_gl_dtx_resync_version) + rpt->rt_global_dtx_resync_version = rpt->rt_pool->sp_gl_dtx_resync_version; + goto do_scan; + } else { + D_DEBUG(DB_REBUILD, DF_RB " check resync %u/%u < %u\n", DP_RB_RPT(rpt), + rpt->rt_pool->sp_dtx_resync_version, rpt->rt_global_dtx_resync_version, + rpt->rt_rebuild_ver); + } /* Wait for dtx resync to finish */ while (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver) { @@ -1092,7 +1101,6 @@ rebuild_scan_leader(void *data) if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver) { D_INFO(DF_RB " wait for global dtx %u\n", DP_RB_RPT(rpt), rpt->rt_global_dtx_resync_version); - wait = true; ABT_cond_wait(rpt->rt_global_dtx_wait_cond, rpt->rt_lock); } ABT_mutex_unlock(rpt->rt_lock); @@ -1102,23 +1110,21 @@ rebuild_scan_leader(void *data) D_GOTO(out, rc = -DER_SHUTDOWN); } } + if (rpt->rt_pool->sp_gl_dtx_resync_version < rpt->rt_global_dtx_resync_version) { + rpt->rt_pool->sp_gl_dtx_resync_version = rpt->rt_global_dtx_resync_version; + D_INFO(DF_RB " update sp_gl_dtx_resync_version to %d", DP_RB_RPT(rpt), + rpt->rt_pool->sp_gl_dtx_resync_version); + } - if (wait) - D_INFO(DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); - else - D_DEBUG(DB_REBUILD, DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); - +do_scan: + D_INFO(DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); rc = ds_pool_thread_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_scanner, rpt, DSS_ULT_DEEP_STACK); if (rc) D_GOTO(out, rc); - if (wait) - D_INFO(DF_RB " rebuild scan collective done\n", DP_RB_RPT(rpt)); - else - D_DEBUG(DB_REBUILD, DF_RB "rebuild scan collective done\n", DP_RB_RPT(rpt)); - + D_INFO(DF_RB " rebuild scan collective done\n", DP_RB_RPT(rpt)); ABT_mutex_lock(rpt->rt_lock); rc = ds_pool_task_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, 
rebuild_scan_done, rpt, 0); @@ -1263,7 +1269,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } - rpt->rt_pool->sp_rebuilding++; /* reset in rebuild_tgt_fini */ + atomic_fetch_add(&rpt->rt_pool->sp_rebuilding, 1); /* reset in rebuild_tgt_fini */ rpt_get(rpt); /* step-3: start scan leader */ diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 373db1dbb4d..91185bb5cc3 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -288,11 +288,18 @@ rebuild_leader_set_status(struct rebuild_global_pool_tracker *rgt, return; } + if (status->dtx_resync_version != resync_ver) + D_DEBUG(DB_REBUILD, DF_RB " rank %d, update dtx_resync_version from %d to %d", + DP_RB_RGT(rgt), rank, status->dtx_resync_version, resync_ver); status->dtx_resync_version = resync_ver; - if (flags & SCAN_DONE) + if ((flags & SCAN_DONE) && !status->scan_done) { + D_DEBUG(DB_REBUILD, DF_RB " rank %d is scan_done", DP_RB_RGT(rgt), rank); status->scan_done = 1; - if (flags & PULL_DONE) + } + if ((flags & PULL_DONE) && !status->pull_done) { + D_DEBUG(DB_REBUILD, DF_RB " rank %d is pull_done", DP_RB_RGT(rgt), rank); status->pull_done = 1; + } } static void @@ -309,6 +316,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t D_INFO("rank %u is not included in this rebuild.\n", rank); } +#define RB_DTX_RESYNC_VER_SKIP ((uint32_t)-1) static uint32_t rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) { @@ -318,7 +326,7 @@ rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) D_ASSERT(rgt->rgt_servers_number > 0); D_ASSERT(rgt->rgt_servers != NULL); for (i = 0; i < rgt->rgt_servers_number; i++) { - if (rgt->rgt_servers[i].dtx_resync_version == (uint32_t)(-1)) + if 
(rgt->rgt_servers[i].dtx_resync_version == RB_DTX_RESYNC_VER_SKIP) continue; if (min > rgt->rgt_servers[i].dtx_resync_version) @@ -565,6 +573,10 @@ rebuild_status_completed_update_partial(const uuid_t pool_uuid, int32_t rs_state rs_inlist = rebuild_status_completed_lookup(pool_uuid); if (rs_inlist != NULL) { + /* possible enhancement: only overwrite rs_inlist->rs_errno if rs_errno != 0 + * e.g., if marking a failed rebuild as done after Fail_reclaim, keep original + * rs_errno. + */ rs_inlist->rs_errno = rs_errno; rs_inlist->rs_state = rs_state; return 0; @@ -648,11 +660,14 @@ rebuild_tgt_query(struct rebuild_tgt_pool_tracker *rpt, struct ds_migrate_status dms = { 0 }; struct rebuild_pool_tls *tls; struct rebuild_tgt_query_arg arg; + bool global_scan_done; int rc; + /* Get rt_global_scan_done before querying dms.dm_migrating status */ + global_scan_done = rpt->rt_global_scan_done; + arg.rpt = rpt; arg.status = status; - if (rpt->rt_rebuild_op != RB_OP_RECLAIM && rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { rc = ds_migrate_query_status(rpt->rt_pool_uuid, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen, rpt->rt_rebuild_op, @@ -678,7 +693,7 @@ rebuild_tgt_query(struct rebuild_tgt_pool_tracker *rpt, status->obj_count += dms.dm_obj_count; status->rec_count = dms.dm_rec_count; status->size = dms.dm_total_size; - if (status->scanning || dms.dm_migrating) + if (!global_scan_done || status->scanning || dms.dm_migrating) status->rebuilding = true; else status->rebuilding = false; @@ -876,14 +891,62 @@ enum { }; static bool -rebuild_is_stoppable(struct rebuild_global_pool_tracker *rgt, bool force) +rebuild_is_stoppable(struct rebuild_global_pool_tracker *rgt, bool force, int *rcp) { - if ((rgt->rgt_opc == RB_OP_REBUILD) || (rgt->rgt_opc == RB_OP_UPGRADE)) + /* NAK if nothing is rebuilding */ + if (rgt == NULL) { + *rcp = -DER_NONEXIST; + return false; + } + + /* NAK if another rebuild is queued for the same pool (it would run after this one stopped) + */ + if 
(!d_list_empty(&rebuild_gst.rg_queue_list)) { + struct rebuild_task *task; + + d_list_for_each_entry(task, &rebuild_gst.rg_queue_list, dst_list) { + if (uuid_compare(task->dst_pool_uuid, rgt->rgt_pool_uuid) == 0) { + *rcp = -DER_NO_PERM; + return false; + } + } + } + + if ((rgt->rgt_opc == RB_OP_REBUILD) || (rgt->rgt_opc == RB_OP_UPGRADE)) { + *rcp = 0; return true; + } - if ((rgt->rgt_opc == RB_OP_FAIL_RECLAIM) && force && (rgt->rgt_num_op_freclaim_fail > 0)) + /* Defer stop for many Fail_reclaim cases (until after it finishes). Do not return errors. + * Only allow force-stop of repeating failures in Fail_reclaim + */ + if (rgt->rgt_opc == RB_OP_FAIL_RECLAIM && force) { + if (rgt->rgt_num_op_freclaim_fail == 0) { + D_INFO(DF_RB + ": cannot force-stop op:Fail_reclaim with 0 failures - defer stop " + "until after it finishes\n", + DP_RB_RGT(rgt)); + *rcp = 0; + return false; + } + D_INFO(DF_RB ": force-stop in op:Fail_reclaim after %u failures\n", DP_RB_RGT(rgt), + rgt->rgt_num_op_freclaim_fail); + *rcp = 0; return true; + } else if (rgt->rgt_opc == RB_OP_FAIL_RECLAIM) { + D_INFO(DF_RB ": defer stop until after op:Fail_reclaim finishes\n", DP_RB_RGT(rgt)); + *rcp = 0; + return false; + } + + /* NAK if this rebuild is Reclaim (i.e., it's effectively done) */ + if (rgt->rgt_opc == RB_OP_RECLAIM) { + *rcp = -DER_BUSY; + return false; + } + /* Not expected */ + *rcp = -DER_MISC; return false; } @@ -892,34 +955,35 @@ int ds_rebuild_admin_stop(struct ds_pool *pool, uint32_t force) { struct rebuild_global_pool_tracker *rgt; + int rc = 0; /* look up the running rebuild and mark it as aborted (and by the administrator) */ rgt = rebuild_global_pool_tracker_lookup(pool->sp_uuid, -1 /* ver */, -1 /* gen */); - if (rgt == NULL) { - /* nothing running, make it a no-op */ - D_INFO(DF_UUID ": received request to stop rebuild - but nothing found to stop\n", - DP_UUID(pool->sp_uuid)); - return 0; - } - /* admin stop command does not terminate reclaim/fail_reclaim jobs (unless forced) 
*/ - if (rebuild_is_stoppable(rgt, force)) { + /* admin stop command only for specific cases (and force option for failing op:Fail_reclaim) + */ + if (rebuild_is_stoppable(rgt, force, &rc)) { D_INFO(DF_RB ": stopping rebuild force=%u opc %u(%s)\n", DP_RB_RGT(rgt), force, rgt->rgt_opc, RB_OP_STR(rgt->rgt_opc)); rgt->rgt_abort = 1; rgt->rgt_status.rs_errno = -DER_OP_CANCELED; } else { - D_INFO(DF_RB ": NOT stopping rebuild during opc %u(%s)\n", DP_RB_RGT(rgt), - rgt->rgt_opc, RB_OP_STR(rgt->rgt_opc)); + if (rgt) { + D_INFO(DF_RB ": NOT stopping rebuild force=%u opc %u(%s), rc=%d\n", + DP_RB_RGT(rgt), force, rgt->rgt_opc, RB_OP_STR(rgt->rgt_opc), rc); + } else { + DL_INFO(rc, DF_UUID ": nothing found to stop", DP_UUID(pool->sp_uuid)); + return rc; + } } - /* admin stop command does not terminate op:Fail_reclaim, but it is remembered to avoid - * retrying the original op:Rebuild. + /* admin stop command does not usually terminate op:Fail_reclaim, but it is always + * remembered to avoid retrying the original op:Rebuild. 
*/ if (rgt->rgt_abort || (rgt->rgt_opc == RB_OP_FAIL_RECLAIM)) rgt->rgt_stop_admin = 1; rgt_put(rgt); - return 0; + return rc; } /* @@ -954,53 +1018,69 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, char sbuf[RBLD_SBUF_LEN]; double now; char *str; - d_rank_list_t excluded = {0}; + d_rank_list_t rank_list = {0}; bool rebuild_abort = false; int i; + now = ABT_get_wtime(); ABT_rwlock_rdlock(pool->sp_lock); rc = map_ranks_init(pool->sp_map, - PO_COMP_ST_UP | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW, - &excluded); + PO_COMP_ST_UP | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | + PO_COMP_ST_NEW, + &rank_list); if (rc != 0) { D_INFO(DF_RB ": get rank list: %d\n", DP_RB_RGT(rgt), rc); ABT_rwlock_unlock(pool->sp_lock); goto sleep; } - for (i = 0; i < excluded.rl_nr; i++) { + for (i = 0; i < rank_list.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, rank_list.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { if (dom->do_comp.co_status == PO_COMP_ST_UP) { if (dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_in_ver=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_in_ver); + D_INFO(DF_RB ": cancel rebuild due to new REINT, " + "co_rank %d, co_in_ver %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_in_ver); rebuild_abort = true; break; - } else { - continue; } } else if (dom->do_comp.co_status == PO_COMP_ST_DOWN) { if (dom->do_comp.co_fseq > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_fseq=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_fseq); + D_INFO(DF_RB ": cancel rebuild due to new DOWN, " + "co_rank %d, co_fseq %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_fseq); rebuild_abort = true; break; } } } - D_INFO(DF_RB " exclude rank %d/%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank, - dom->do_comp.co_status); - rebuild_leader_set_status(rgt, 
dom->do_comp.co_rank, - -1, SCAN_DONE | PULL_DONE); + + if (now - last_print > 20) + D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt), + dom->do_comp.co_rank, dom->do_comp.co_status); + + /* Some engines don't participate the rebuild that will not report + * progress/completion or dtx resync version through IV, mark the complete/ + * skip. + * 1) PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks + * 2) PO_COMP_ST_UP but co_in_ver > rebuild_ver also will be excluded from + * rebuild request, see rebuild_scan_broadcast(). + */ + if (dom->do_comp.co_status != PO_COMP_ST_UP || + dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) + rebuild_leader_set_status(rgt, dom->do_comp.co_rank, + RB_DTX_RESYNC_VER_SKIP, + SCAN_DONE | PULL_DONE); } ABT_rwlock_unlock(pool->sp_lock); - map_ranks_fini(&excluded); + map_ranks_fini(&rank_list); if (rebuild_abort) { rgt->rgt_abort = 1; @@ -1044,7 +1124,6 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, break; } - now = ABT_get_wtime(); /* print something at least for each 10 seconds */ if (now - last_print > 10) { last_print = now; @@ -1300,11 +1379,15 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker dom = pool_map_find_dom_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); D_ASSERT(dom != NULL); - D_DEBUG(DB_REBUILD, DF_RB " rank %u co_in_ver %u\n", DP_RB_RGT(rgt), - up_ranks.rl_ranks[i], dom->do_comp.co_in_ver); - if (dom->do_comp.co_in_ver < rgt->rgt_rebuild_ver) + D_DEBUG(DB_REBUILD, DF_RB " rank %u co_in_ver %u, rebuild_ver %u.\n", + DP_RB_RGT(rgt), up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, + rgt->rgt_rebuild_ver); + if (dom->do_comp.co_in_ver <= rgt->rgt_rebuild_ver) continue; + D_INFO(DF_RB " bypass UP rank %u co_in_ver %u exceed rebuild_ver %u\n", + DP_RB_RGT(rgt), up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, + rgt->rgt_rebuild_ver); excluded->rl_ranks[nr++] = up_ranks.rl_ranks[i]; } excluded->rl_nr = nr; @@ -1314,13 +1397,11 @@ rebuild_scan_broadcast(struct ds_pool 
*pool, struct rebuild_global_pool_tracker rc = ds_pool_bcast_create(dss_get_module_info()->dmi_ctx, pool, DAOS_REBUILD_MODULE, REBUILD_OBJECTS_SCAN, rebuild_ver, &rpc, NULL, excluded, NULL); if (rc != 0) { - DL_ERROR(rc, DF_RB " pool map broadcast failed", DP_RB_RGT(rgt)); + DL_ERROR(rc, DF_RB " failed to create scan broadcast request", DP_RB_RGT(rgt)); D_GOTO(out, rc); } rsi = crt_req_get(rpc); - D_DEBUG(DB_REBUILD, DF_RB " scan broadcast\n", DP_RB_RGT(rgt)); - uuid_copy(rsi->rsi_pool_uuid, pool->sp_uuid); rsi->rsi_ns_id = pool->sp_iv_ns->iv_ns_id; rsi->rsi_leader_term = rgt->rgt_leader_term; @@ -1339,11 +1420,13 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker rso = crt_reply_get(rpc); if (rc == 0) rc = rso->rso_status; + else + DL_ERROR(rc, DF_RB " scan broadcast send failed.", DP_RB_RGT(rgt)); rgt->rgt_init_scan = 1; rgt->rgt_stable_epoch = rso->rso_stable_epoch; - D_DEBUG(DB_REBUILD, DF_RB " " DF_RC " got stable/reclaim epoch " DF_X64 "/" DF_X64 "\n", - DP_RB_RGT(rgt), DP_RC(rc), rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch); + DL_INFO(rc, DF_RB " got stable/reclaim epoch " DF_X64 "/" DF_X64, DP_RB_RGT(rgt), + rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch); crt_req_decref(rpc); out: if (excluded) @@ -1578,6 +1661,12 @@ rebuild_try_merge_tgts(struct ds_pool *pool, uint32_t map_ver, if (delay_sec != (uint64_t)(-1)) merge_task->dst_schedule_time = daos_gettime_coarse() + delay_sec; } + /* For the case of new rebuild task in queue, and then rebuild stop's fail reclaim + * complete and re-scheduled the original rebuild task with delay -1. 
+ */ + if (merge_pre_task->dst_schedule_time != (uint64_t)(-1) && + delay_sec == (uint64_t)(-1)) + merge_task = merge_pre_task; } else if (merge_post_task != NULL && merge_post_task->dst_rebuild_op == rebuild_op) { if ((merge_post_task->dst_schedule_time == (uint64_t)(-1) && delay_sec == (uint64_t)(-1)) || @@ -1827,6 +1916,25 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, * fails, it will be used to discard all of the previous rebuild data * (reclaim - 1 see obj_reclaim()), but keep the in-flight I/O data. */ + if (rgt->rgt_stop_admin) { + rc = ds_rebuild_schedule( + pool, task->dst_reclaim_ver - 1 /* map_ver */, + rgt->rgt_stable_epoch, task->dst_new_layout_version, + &task->dst_tgts, RB_OP_FAIL_RECLAIM, + task->dst_rebuild_op /* retry_rebuild_op */, + task->dst_map_ver /* retry_map_ver */, rgt->rgt_stop_admin, + task, delay_sec); + DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, + DF_RB ": errno " DF_RC ", schedule %u(%s)", + DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), + RB_OP_FAIL_RECLAIM, RB_OP_STR(RB_OP_FAIL_RECLAIM)); + D_GOTO(complete, rc); + } + + /* revert pool map and defer scheduling a retry until Fail_reclaim is done + */ + retry_rebuild_task(task, rgt, &retry_opc); + rc = ds_rebuild_schedule( pool, task->dst_reclaim_ver - 1 /* map_ver */, rgt->rgt_stable_epoch, task->dst_new_layout_version, &task->dst_tgts, RB_OP_FAIL_RECLAIM, @@ -1836,10 +1944,6 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, DF_RB ": errno " DF_RC ", schedule %u(%s)", DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), RB_OP_FAIL_RECLAIM, RB_OP_STR(RB_OP_FAIL_RECLAIM)); - - /* revert pool map and defer scheduling a retry until Fail_reclaim is done - */ - retry_rebuild_task(task, rgt, &retry_opc); D_GOTO(complete, rc); } @@ -1899,10 +2003,26 @@ rebuild_task_complete_schedule(struct rebuild_task *task, struct ds_pool *pool, DL_CDEBUG(rc1, DLOG_ERR, DLOG_INFO, rc1, DF_RB ": updated, state %d errno " DF_RC, DP_RB_RGT(rgt), 
rgt->rgt_status.rs_state, DP_RC(rgt->rgt_status.rs_errno)); + + /* re-schedule the stopped original rebuild task with delay -1, to be merged with + * following rebuild task, to avoid losing the task->dst_tgts. + */ + if (task->dst_retry_rebuild_op == RB_OP_REBUILD) { + rc = ds_rebuild_schedule( + pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch, + task->dst_new_layout_version, &task->dst_tgts, + task->dst_retry_rebuild_op, RB_OP_NONE /* retry_rebuild_op */, + 0 /* retry_map_ver */, false /* stop_admin */, task, + -1 /* delay_sec */); + DL_CDEBUG(rc, DLOG_ERR, DLOG_INFO, rc, + DF_RB ": errno " DF_RC ", schedule retry %u(%s) with delay -1", + DP_RB_RGT(rgt), DP_RC(rgt->rgt_status.rs_errno), + task->dst_retry_rebuild_op, + RB_OP_STR(task->dst_retry_rebuild_op)); + } } else if ((task->dst_rebuild_op == RB_OP_FAIL_RECLAIM) && (task->dst_retry_rebuild_op != RB_OP_NONE)) { - /* Fail_reclaim done (and a stop command wasn't received during) - retry original - * rebuild */ + /* Fail_reclaim done (and a stop command wasn't received during) - retry rebuild. */ rc1 = ds_rebuild_schedule(pool, task->dst_retry_map_ver, rgt->rgt_reclaim_epoch, task->dst_new_layout_version, &task->dst_tgts, task->dst_retry_rebuild_op, @@ -2588,11 +2708,12 @@ regenerate_task_of_type(struct ds_pool *pool, pool_comp_state_t match_states, ui return rc; } - -/* Regenerate the rebuild tasks when changing the leader. */ +/* Regenerate rebuild tasks when changing the leader, or manually starting rebuilds. + * auto_recovery (true for leader change, false for manual) applies to both sys_self_heal and prop. 
+ */ int ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys_self_heal, - uint64_t delay_sec) + bool auto_recovery, uint64_t delay_sec) { struct daos_prop_entry *entry; char *env; @@ -2600,7 +2721,7 @@ ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys rebuild_gst.rg_abort = 0; - if (!(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) { + if (auto_recovery && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) { D_DEBUG(DB_REBUILD, DF_UUID ": pool_rebuild disabled in sys_self_heal\n", DP_UUID(pool->sp_uuid)); return DER_SUCCESS; @@ -2622,10 +2743,8 @@ ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys } entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SELF_HEAL); - D_ASSERT(entry != NULL); - if (entry->dpe_val & (DAOS_SELF_HEAL_AUTO_REBUILD | DAOS_SELF_HEAL_DELAY_REBUILD) && - !pool->sp_disable_rebuild) { + if (is_pool_rebuild_allowed(pool, entry->dpe_val /* self_heal */, auto_recovery)) { rc = regenerate_task_of_type( pool, PO_COMP_ST_DOWN, entry->dpe_val & DAOS_SELF_HEAL_DELAY_REBUILD ? 
-1 : delay_sec); @@ -2636,7 +2755,7 @@ ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys if (rc != 0) return rc; } else { - D_DEBUG(DB_REBUILD, DF_UUID" self healing is disabled\n", + D_DEBUG(DB_REBUILD, "Pool " DF_UUID " self healing is disabled\n", DP_UUID(pool->sp_uuid)); } @@ -2673,7 +2792,8 @@ ds_rebuild_admin_start(struct ds_pool *pool) goto out; } - rc = ds_rebuild_regenerate_task(pool, &prop, DS_MGMT_SELF_HEAL_ALL, 0); + rc = ds_rebuild_regenerate_task(pool, &prop, DS_MGMT_SELF_HEAL_ALL /* sys_self_heal */, + false /* auto_recovery */, 0 /* delay_sec */); daos_prop_fini(&prop); if (rc) DL_ERROR(rc, DF_UUID ": regenerate rebuild task failed", DP_UUID(pool->sp_uuid)); @@ -2732,8 +2852,8 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_INFO(DF_RB " finishing rebuild rpt refcount %u, pool refcount %u\n", DP_RB_RPT(rpt), rpt->rt_refcount, daos_lru_ref_count(&rpt->rt_pool->sp_entry)); - D_ASSERT(rpt->rt_pool->sp_rebuilding > 0); - rpt->rt_pool->sp_rebuilding--; + D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); + atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); rpt->rt_pool->sp_rebuild_scan = 0; ABT_mutex_lock(rpt->rt_lock); @@ -2775,6 +2895,7 @@ rebuild_tgt_status_check_ult(void *arg) { struct rebuild_tgt_pool_tracker *rpt = arg; struct sched_req_attr attr = { 0 }; + uint32_t reported_dtx_resyc_ver = 0; D_ASSERT(rpt != NULL); sched_req_attr_init(&attr, SCHED_REQ_MIGRATE, &rpt->rt_pool_uuid); @@ -2878,6 +2999,12 @@ rebuild_tgt_status_check_ult(void *arg) rpt->rt_reported_obj_cnt = status.obj_count; rpt->rt_reported_rec_cnt = status.rec_count; rpt->rt_reported_size = status.size; + if (iv.riv_dtx_resyc_version > reported_dtx_resyc_ver) { + D_DEBUG(DB_REBUILD, + DF_RB "reported riv_dtx_resyc_version %d", + DP_RB_RPT(rpt), iv.riv_dtx_resyc_version); + reported_dtx_resyc_ver = iv.riv_dtx_resyc_version; + } } else { DL_WARN(rc, DF_RB " rebuild iv update failed", DP_RB_RPT(rpt)); /* Already finished rebuild, cannot 
find rebuild status on leader diff --git a/src/rsvc/rpc.h b/src/rsvc/rpc.h index 60fb5f94862..bd7e1d2cbf7 100644 --- a/src/rsvc/rpc.h +++ b/src/rsvc/rpc.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * RPC operation codes @@ -49,6 +50,8 @@ enum rsvc_operation { extern struct crt_proto_format rsvc_proto_fmt; +/* clang-format off */ + #define DAOS_ISEQ_RSVC_START /* input fields */ \ ((d_iov_t) (sai_svc_id) CRT_VAR) \ ((uuid_t) (sai_db_uuid) CRT_VAR) \ @@ -56,17 +59,22 @@ extern struct crt_proto_format rsvc_proto_fmt; ((uint32_t) (sai_mode) CRT_VAR) \ ((uint32_t) (sai_flags) CRT_VAR) \ ((uint32_t) (sai_vos_df_version) CRT_VAR) \ + ((uint32_t) (sai_layout_version) CRT_VAR) \ + ((uint32_t) (sai_padding) CRT_VAR) \ ((uint64_t) (sai_size) CRT_VAR) \ ((uint64_t) (sai_term) CRT_VAR) \ - ((d_rank_list_t) (sai_ranks) CRT_PTR) + ((rdb_replica_id_t) (sai_replicas) CRT_ARRAY) #define DAOS_OSEQ_RSVC_START /* output fields (rc: err count) */ \ ((int32_t) (sao_rc) CRT_VAR) \ ((int32_t) (sao_rc_errval) CRT_VAR) +/* clang-format on */ CRT_RPC_DECLARE(rsvc_start, DAOS_ISEQ_RSVC_START, DAOS_OSEQ_RSVC_START) +/* clang-format off */ + #define DAOS_ISEQ_RSVC_STOP /* input fields */ \ ((d_iov_t) (soi_svc_id) CRT_VAR) \ ((uint32_t) (soi_class) CRT_VAR) \ @@ -76,6 +84,8 @@ CRT_RPC_DECLARE(rsvc_start, DAOS_ISEQ_RSVC_START, DAOS_OSEQ_RSVC_START) #define DAOS_OSEQ_RSVC_STOP /* output fields */ \ ((int32_t) (soo_rc) CRT_VAR) +/* clang-format on */ + CRT_RPC_DECLARE(rsvc_stop, DAOS_ISEQ_RSVC_STOP, DAOS_OSEQ_RSVC_STOP) int diff --git a/src/rsvc/srv.c b/src/rsvc/srv.c index af0324c10d9..3f2b599eb2b 100644 --- a/src/rsvc/srv.c +++ b/src/rsvc/srv.c @@ -802,16 +802,16 @@ start_mode_str(enum ds_rsvc_start_mode mode) } static bool -self_only(d_rank_list_t *replicas) +self_only(struct rdb_create_params *p) { - return (replicas != NULL && replicas->rl_nr == 1 && - replicas->rl_ranks[0] == dss_self_rank()); + return p->rcp_replicas != NULL && p->rcp_replicas_len == 1 && + 
rdb_replica_id_compare(p->rcp_replicas[0], p->rcp_id) == 0; } static int start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t term, - enum ds_rsvc_start_mode mode, size_t size, uint32_t vos_df_version, d_rank_list_t *replicas, - void *arg, struct ds_rsvc **svcp) + enum ds_rsvc_start_mode mode, struct rdb_create_params *create_params, void *arg, + struct ds_rsvc **svcp) { struct rdb_storage *storage; struct ds_rsvc *svc = NULL; @@ -823,8 +823,8 @@ start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t term, svc->s_ref++; if (mode == DS_RSVC_CREATE) - rc = rdb_create(svc->s_db_path, svc->s_db_uuid, term, size, vos_df_version, - replicas, &rsvc_rdb_cbs, svc, &storage); + rc = rdb_create(svc->s_db_path, svc->s_db_uuid, term, create_params, &rsvc_rdb_cbs, + svc, &storage); else rc = rdb_open(svc->s_db_path, svc->s_db_uuid, term, &rsvc_rdb_cbs, svc, &storage); if (rc != 0) @@ -840,7 +840,7 @@ start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t term, if (rc != 0) goto err_storage; - if (mode == DS_RSVC_CREATE && self_only(replicas) && + if (mode == DS_RSVC_CREATE && self_only(create_params) && rsvc_class(class)->sc_bootstrap != NULL) { rc = bootstrap_self(svc, arg); if (rc != 0) @@ -944,19 +944,15 @@ ds_rsvc_stop_nodb(enum ds_rsvc_class_id class, d_iov_t *id) } /** - * Start a replicated service. If \a mode is not DS_RSVC_CREATE, all remaining - * input parameters are ignored; otherwise, create the replica first. If \a - * replicas is NULL, all remaining input parameters are ignored; otherwise, - * bootstrap the replicated service. + * Start a replicated service. If \a mode is DS_RSVC_CREATE, create the replica + * first; otherwise, \a create_params is ignored. 
* * \param[in] class replicated service class * \param[in] id replicated service ID * \param[in] db_uuid DB UUID * \param[in] caller_term caller term if not RDB_NIL_TERM (see rdb_open) * \param[in] mode mode of starting the replicated service - * \param[in] size replica size in bytes - * \param[in] vos_df_version version of VOS durable format - * \param[in] replicas optional initial membership + * \param[in] create_params parameters used when \a mode is DS_RSVC_CREATE * \param[in] arg argument for cbs.sc_bootstrap * * \retval -DER_ALREADY replicated service already started @@ -965,8 +961,7 @@ ds_rsvc_stop_nodb(enum ds_rsvc_class_id class, d_iov_t *id) */ int ds_rsvc_start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t caller_term, - enum ds_rsvc_start_mode mode, size_t size, uint32_t vos_df_version, - d_rank_list_t *replicas, void *arg) + enum ds_rsvc_start_mode mode, struct rdb_create_params *create_params, void *arg) { struct ds_rsvc *svc = NULL; d_list_t *entry; @@ -976,38 +971,74 @@ ds_rsvc_start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t entry = d_hash_rec_find(&rsvc_hash, id->iov_buf, id->iov_len); if (entry != NULL) { + rdb_replica_id_t rid; + svc = rsvc_obj(entry); - D_DEBUG(DB_MD, "%s: found: stop=%d\n", svc->s_name, svc->s_stop); - if (mode == DS_RSVC_DICTATE && !svc->s_stop) { + rid = rdb_get_replica_id(svc->s_db); + D_DEBUG(DB_MD, "%s: found " RDB_F_RID ": stop=%d mode=%s replicas=%p\n", + svc->s_name, RDB_P_RID(rid), svc->s_stop, start_mode_str(mode), + mode == DS_RSVC_CREATE ? create_params->rcp_replicas : NULL); + if (mode == DS_RSVC_CREATE && create_params->rcp_replicas != NULL) { + D_ERROR("%s: creating and bootstrapping existing replica not allowed\n", + svc->s_name); + rc = -DER_EXIST; + goto out_svc; + } else if (mode == DS_RSVC_CREATE && rid.rri_gen < create_params->rcp_id.rri_gen) { + int n = 10; + + /* + * Destroy the older replica and continue. 
Note that the destroy only + * happens when the last svc reference is released. + */ + D_INFO("%s: destroying older replica " RDB_F_RID " for " RDB_F_RID "\n", + svc->s_name, RDB_P_RID(rid), RDB_P_RID(create_params->rcp_id)); + rc = ds_rsvc_stop(class, id, caller_term, true /* destroy */); + if (rc != 0) { + DL_ERROR(rc, "%s: failed to destroy existing replica", svc->s_name); + goto out_svc; + } + while (svc->s_ref > 1 && n > 0) { + dss_sleep(1000); + n--; + } + if (svc->s_ref > 1) { + D_ERROR("%s: gave up waiting for other service references\n", + svc->s_name); + rc = -DER_CANCELED; + goto out_svc; + } + } else if (mode == DS_RSVC_CREATE && rid.rri_gen > create_params->rcp_id.rri_gen) { + D_ERROR("%s: found newer replica: " RDB_F_RID " > " RDB_F_RID "\n", + svc->s_name, RDB_P_RID(rid), RDB_P_RID(create_params->rcp_id)); + rc = -DER_EXIST; + goto out_svc; + } else if (mode == DS_RSVC_DICTATE && !svc->s_stop) { /* * If we need to dictate, and the service is not * stopping, then stop it, which should not fail in * this case, and continue. 
*/ rc = ds_rsvc_stop(class, id, caller_term, false /* destroy */); - D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); - ds_rsvc_put(svc); + D_ASSERTF(rc == 0, DF_RC "\n", DP_RC(rc)); } else { if (caller_term != RDB_NIL_TERM) { rc = rdb_ping(svc->s_db, caller_term); if (rc != 0) { D_CDEBUG(rc == -DER_STALE, DB_MD, DLOG_ERR, "%s: failed to ping local replica\n", svc->s_name); - ds_rsvc_put(svc); - goto out; + goto out_svc; } } if (svc->s_stop) rc = -DER_CANCELED; else rc = -DER_ALREADY; - ds_rsvc_put(svc); - goto out; + goto out_svc; } + ds_rsvc_put(svc); } - rc = start(class, id, db_uuid, caller_term, mode, size, vos_df_version, replicas, arg, - &svc); + rc = start(class, id, db_uuid, caller_term, mode, create_params, arg, &svc); if (rc != 0) goto out; @@ -1020,6 +1051,7 @@ ds_rsvc_start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t } D_DEBUG(DB_MD, "%s: started replicated service\n", svc->s_name); +out_svc: ds_rsvc_put(svc); out: if (rc != 0 && rc != -DER_ALREADY && !(mode == DS_RSVC_CREATE && rc == -DER_EXIST)) @@ -1161,21 +1193,53 @@ int ds_rsvc_add_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks, size_t size, uint32_t vos_df_version) { - int rc; + int i; + int rc = 0; - rc = ds_rsvc_dist_start(svc->s_class, &svc->s_id, svc->s_db_uuid, ranks, svc->s_term, - DS_RSVC_CREATE, false /* bootstrap */, size, vos_df_version); + /* Add one by one to reduce waste of replica generations. 
*/ + for (i = 0; i < ranks->rl_nr; i++) { + d_rank_t r = ranks->rl_ranks[i]; + d_rank_list_t rl; + rdb_replica_id_t id; + int ids_len = 1; + struct ds_rsvc_create_params create_params; - /* TODO: Attempt to only add replicas that were successfully started */ - if (rc != 0) - goto out_stop; - rc = rdb_add_replicas(svc->s_db, ranks); -out_stop: - /* Clean up ranks that were not added */ - if (ranks->rl_nr > 0) { - D_ASSERT(rc != 0); - ds_rsvc_dist_stop(svc->s_class, &svc->s_id, ranks, NULL, svc->s_term, - true /* destroy */); + rl.rl_ranks = &r; + rl.rl_nr = 1; + + id.rri_rank = r; + + /* This allocation cannot be rolled back. */ + rc = rdb_alloc_replica_gen(svc->s_db, svc->s_term, &id.rri_gen); + if (rc != 0) + break; + + create_params.scp_bootstrap = false; + create_params.scp_size = size; + create_params.scp_vos_df_version = vos_df_version; + create_params.scp_layout_version = rdb_get_version(svc->s_db); + create_params.scp_replicas = &id; + create_params.scp_replicas_len = 1; + + rc = ds_rsvc_dist_start(svc->s_class, &svc->s_id, svc->s_db_uuid, &rl, svc->s_term, + DS_RSVC_CREATE, &create_params); + if (rc != 0) + break; + + rc = rdb_modify_replicas(svc->s_db, RDB_REPLICA_ADD, &id, &ids_len); + if (rc != 0) { + ds_rsvc_dist_stop(svc->s_class, &svc->s_id, &rl, NULL, svc->s_term, + true /* destroy */); + break; + } + } + + /* Remove all i successfully-added ranks from ranks. 
*/ + if (i > 0) { + ranks->rl_nr -= i; + if (ranks->rl_nr > 0) + memmove(&ranks->rl_ranks[0], &ranks->rl_ranks[i], + ranks->rl_nr * sizeof(ranks->rl_ranks[0])); } return rc; } @@ -1209,22 +1273,70 @@ ds_rsvc_add_replicas(enum ds_rsvc_class_id class, d_iov_t *id, d_rank_list_t *ra } int -ds_rsvc_remove_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks) +ds_rsvc_remove_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks, bool destroy) { - d_rank_list_t *stop_ranks; - int rc; + d_rank_list_t *stop_ranks; + rdb_replica_id_t *all; + int all_len; + rdb_replica_id_t *to_remove; + int to_remove_len = 0; + int i; + int rc; - rc = daos_rank_list_dup(&stop_ranks, ranks); + rc = d_rank_list_dup(&stop_ranks, ranks); if (rc != 0) - return rc; - rc = rdb_remove_replicas(svc->s_db, ranks); + goto out; - /* filter out failed ranks */ - daos_rank_list_filter(ranks, stop_ranks, true /* exclude */); - if (stop_ranks->rl_nr > 0) - ds_rsvc_dist_stop(svc->s_class, &svc->s_id, stop_ranks, NULL, svc->s_term, - true /* destroy */); + /* Fill to_remove with replica IDs of ranks. */ + rc = rdb_get_replicas(svc->s_db, &all, &all_len); + if (rc != 0) + goto out_stop_ranks; + D_ALLOC_ARRAY(to_remove, ranks->rl_nr); + if (to_remove == NULL) { + rc = -DER_NOMEM; + goto out_all; + } + for (i = 0; i < ranks->rl_nr; i++) { + d_rank_t rank = ranks->rl_ranks[i]; + int j; + + for (j = 0; j < all_len; j++) { + if (all[j].rri_rank == rank) { + to_remove[to_remove_len] = all[j]; + to_remove_len++; + break; + } + } + if (j == all_len) { + D_ERROR("%s: rank %u not found in replica list\n", svc->s_name, rank); + rc = -DER_NONEXIST; + goto out_to_remove; + } + } + + rc = rdb_modify_replicas(svc->s_db, RDB_REPLICA_REMOVE, to_remove, &to_remove_len); + + /* Update ranks with to_remove (those that couldn't be removed). 
*/ + D_ASSERTF(ranks->rl_nr >= to_remove_len, "%d >= %d\n", ranks->rl_nr, to_remove_len); + ranks->rl_nr = to_remove_len; + for (i = 0; i < to_remove_len; i++) + ranks->rl_ranks[i] = to_remove[i].rri_rank; + + if (destroy) { + /* filter out failed ranks */ + d_rank_list_filter(ranks, stop_ranks, true /* exclude */); + if (stop_ranks->rl_nr > 0) + ds_rsvc_dist_stop(svc->s_class, &svc->s_id, stop_ranks, NULL, svc->s_term, + true /* destroy */); + } + +out_to_remove: + D_FREE(to_remove); +out_all: + D_FREE(all); +out_stop_ranks: d_rank_list_free(stop_ranks); +out: return rc; } @@ -1238,7 +1350,7 @@ ds_rsvc_remove_replicas(enum ds_rsvc_class_id class, d_iov_t *id, rc = ds_rsvc_lookup_leader(class, id, &svc, hint); if (rc != 0) return rc; - rc = ds_rsvc_remove_replicas_s(svc, ranks); + rc = ds_rsvc_remove_replicas_s(svc, ranks, true /* destroy */); ds_rsvc_set_hint(svc, hint); ds_rsvc_put_leader(svc); return rc; @@ -1294,22 +1406,22 @@ bcast_create(crt_opcode_t opc, bool filter_invert, d_rank_list_t *filter_ranks, * \param[in] ranks list of replica ranks * \param[in] caller_term caller term if not RDB_NIL_TERM (see rdb_open) * \param[in] mode mode of starting the replicated service - * \param[in] bootstrap create with an initial list of replicas if \a mode is DS_RSVC_CREATE - * \param[in] size size of each replica in bytes if \a mode is DS_RSVC_CREATE - * \param[in] vos_df_version version of VOS durable format if \a mode is DS_RSVC_CREATE + * \param[in] create_params parameters used when \a mode is DS_RSVC_CREATE */ int ds_rsvc_dist_start(enum ds_rsvc_class_id class, d_iov_t *id, const uuid_t dbid, const d_rank_list_t *ranks, uint64_t caller_term, enum ds_rsvc_start_mode mode, - bool bootstrap, size_t size, uint32_t vos_df_version) + struct ds_rsvc_create_params *create_params) { crt_rpc_t *rpc; struct rsvc_start_in *in; struct rsvc_start_out *out; int rc; - D_ASSERT(!bootstrap || ranks != NULL); - D_ASSERT(mode != DS_RSVC_DICTATE || ranks->rl_nr == 1); + D_ASSERT(mode 
!= DS_RSVC_CREATE || + (create_params != NULL && create_params->scp_replicas != NULL && + create_params->scp_replicas_len > 0)); + D_ASSERT(mode != DS_RSVC_DICTATE || (ranks != NULL && ranks->rl_nr == 1)); D_DEBUG(DB_MD, DF_UUID": %s DB\n", DP_UUID(dbid), start_mode_str(mode)); rc = bcast_create(RSVC_START, ranks != NULL /* filter_invert */, @@ -1318,21 +1430,23 @@ ds_rsvc_dist_start(enum ds_rsvc_class_id class, d_iov_t *id, const uuid_t dbid, goto out; in = crt_req_get(rpc); in->sai_class = class; - rc = daos_iov_copy(&in->sai_svc_id, id); - if (rc != 0) - goto out_rpc; + in->sai_svc_id = *id; uuid_copy(in->sai_db_uuid, dbid); in->sai_mode = mode; - if (mode == DS_RSVC_CREATE && bootstrap) - in->sai_flags |= RDB_AF_BOOTSTRAP; - in->sai_size = size; - in->sai_vos_df_version = vos_df_version; in->sai_term = caller_term; - in->sai_ranks = (d_rank_list_t *)ranks; + if (mode == DS_RSVC_CREATE) { + if (create_params->scp_bootstrap) + in->sai_flags |= RDB_AF_BOOTSTRAP; + in->sai_size = create_params->scp_size; + in->sai_vos_df_version = create_params->scp_vos_df_version; + in->sai_layout_version = create_params->scp_layout_version; + in->sai_replicas.ca_arrays = create_params->scp_replicas; + in->sai_replicas.ca_count = create_params->scp_replicas_len; + } rc = dss_rpc_send(rpc); if (rc != 0) - goto out_mem; + goto out_rpc; out = crt_reply_get(rpc); rc = out->sao_rc; @@ -1345,8 +1459,6 @@ ds_rsvc_dist_start(enum ds_rsvc_class_id class, d_iov_t *id, const uuid_t dbid, rc = out->sao_rc_errval; } -out_mem: - daos_iov_free(&in->sai_svc_id); out_rpc: crt_req_decref(rpc); out: @@ -1358,23 +1470,44 @@ ds_rsvc_start_handler(crt_rpc_t *rpc) { struct rsvc_start_in *in = crt_req_get(rpc); struct rsvc_start_out *out = crt_reply_get(rpc); - bool bootstrap = in->sai_flags & RDB_AF_BOOTSTRAP; + struct rdb_create_params create_params; + bool create = in->sai_mode == DS_RSVC_CREATE; int rc; - if (bootstrap && in->sai_ranks == NULL) { - rc = -DER_PROTO; - goto out; - } + if (create) { + 
d_rank_t self_rank = dss_self_rank(); + rdb_replica_id_t self; + bool bootstrap = in->sai_flags & RDB_AF_BOOTSTRAP; + int i; - if (in->sai_mode == DS_RSVC_DICTATE && - (in->sai_ranks == NULL || in->sai_ranks->rl_nr != 1)) { - rc = -DER_PROTO; - goto out; + if (in->sai_replicas.ca_arrays == NULL || in->sai_replicas.ca_count == 0) { + D_ERROR(DF_UUID ": no replica IDs\n", DP_UUID(in->sai_db_uuid)); + rc = -DER_PROTO; + goto out; + } + + /* Find self replica ID in in->sai_replicas. */ + for (i = 0; i < in->sai_replicas.ca_count; i++) + if (in->sai_replicas.ca_arrays[i].rri_rank == self_rank) + break; + if (i == in->sai_replicas.ca_count) { + D_ERROR(DF_UUID ": self not in replica IDs: self=%u replicas=" DF_U64 "\n", + DP_UUID(in->sai_db_uuid), self_rank, in->sai_replicas.ca_count); + rc = -DER_PROTO; + goto out; + } + self = in->sai_replicas.ca_arrays[i]; + + create_params.rcp_size = in->sai_size; + create_params.rcp_vos_df_version = in->sai_vos_df_version; + create_params.rcp_layout_version = in->sai_layout_version; + create_params.rcp_id = self; + create_params.rcp_replicas = bootstrap ? in->sai_replicas.ca_arrays : NULL; + create_params.rcp_replicas_len = bootstrap ? in->sai_replicas.ca_count : 0; } rc = ds_rsvc_start(in->sai_class, &in->sai_svc_id, in->sai_db_uuid, in->sai_term, - in->sai_mode, in->sai_size, in->sai_vos_df_version, - bootstrap ? in->sai_ranks : NULL, NULL /* arg */); + in->sai_mode, create ? 
&create_params : NULL, NULL /* arg */); if (rc == -DER_ALREADY) rc = 0; diff --git a/src/tests/SConscript b/src/tests/SConscript index 6467ef54b4e..be22f9a0bd2 100644 --- a/src/tests/SConscript +++ b/src/tests/SConscript @@ -38,7 +38,7 @@ def build_tests(env, prereqs): tenv = denv.Clone() tenv.require('argobots', 'pmdk') - libs_server += ['vos', 'bio', 'abt', 'numa'] + libs_server += ['vos', 'bio', 'ssl', 'abt', 'numa'] vos_engine = tenv.StaticObject(['vos_engine.c']) vos_perf = tenv.d_program('vos_perf', diff --git a/src/tests/ftest/aggregation/continuous_write.yaml b/src/tests/ftest/aggregation/continuous_write.yaml index 4e8e76ff4f9..56ff036a86f 100644 --- a/src/tests/ftest/aggregation/continuous_write.yaml +++ b/src/tests/ftest/aggregation/continuous_write.yaml @@ -11,13 +11,11 @@ server_config: 0: targets: 1 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server_0.log storage: auto pool: - scm_size: 100MB + scm_size: 128MB nvme_size: 1GiB container: diff --git a/src/tests/ftest/aggregation/multiple_pool_cont.yaml b/src/tests/ftest/aggregation/multiple_pool_cont.yaml index 3ced0823351..9a4d5d0b540 100644 --- a/src/tests/ftest/aggregation/multiple_pool_cont.yaml +++ b/src/tests/ftest/aggregation/multiple_pool_cont.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 1300 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,19 +24,20 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: size: 40% svcn: 1 + container: type: POSIX control_method: daos + ior: client_processes: np: 12 @@ -47,5 +49,6 @@ ior: block_size: 
'512M' dfs_oclass: "EC_2P1G1" dfs_dir_oclass: "EC_2P1G1" + runtime: total_runtime: 800 # total seconds to run diff --git a/src/tests/ftest/aggregation/punching.yaml b/src/tests/ftest/aggregation/punching.yaml index b62953919d4..a0e5e11bef6 100644 --- a/src/tests/ftest/aggregation/punching.yaml +++ b/src/tests/ftest/aggregation/punching.yaml @@ -1,3 +1,6 @@ +launch: + !filter-only : /run/pool/default # yamllint disable-line rule:colons + hosts: test_servers: 2 test_clients: 2 @@ -9,10 +12,12 @@ server_config: 0: log_mask: INFO storage: auto -pool: - scm_size: 8000000000 - nvme_size: 80000000000 - svcn: 1 +pool: !mux + default: + size: 100% + md_on_ssd_p2: + size: 100% + mem_ratio: 25 container: type: POSIX control_method: daos diff --git a/src/tests/ftest/aggregation/space_rb.py b/src/tests/ftest/aggregation/space_rb.py index 3f4b426da85..717631e28e2 100644 --- a/src/tests/ftest/aggregation/space_rb.py +++ b/src/tests/ftest/aggregation/space_rb.py @@ -1,5 +1,6 @@ """ (C) Copyright 2024 Intel Corporation. + (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -96,7 +97,11 @@ def test_space_rb(self): # 2. Call dmg pool get-prop and verify that Rebuild space ratio (space_rb) is 50%. self.log_step( "Call dmg pool get-prop and verify that Rebuild space ratio (space_rb) is 50%.") - expected_space_rb = int(self.params.get("properties", '/run/pool/*').split(":")[1]) + expected_space_rb = 0 + for pool_property in self.params.get("properties", "/run/pool/*", "").split(","): + if pool_property.startswith("space_rb:"): + expected_space_rb = int(pool_property.split(":")[1]) + break self.verify_space_rb_property(pool=pool_1, expected_space_rb=expected_space_rb) # 3. Run IOR to fill 50% of SCM. 
diff --git a/src/tests/ftest/aggregation/space_rb.yaml b/src/tests/ftest/aggregation/space_rb.yaml index c415cfc2418..c5d71faaf03 100644 --- a/src/tests/ftest/aggregation/space_rb.yaml +++ b/src/tests/ftest/aggregation/space_rb.yaml @@ -11,14 +11,12 @@ server_config: 0: targets: 4 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server_0.log storage: auto pool: size: 80G - properties: space_rb:50 + properties: space_rb:50,rd_fac:0 container: type: POSIX diff --git a/src/tests/ftest/cart/dual_iface_server.c b/src/tests/ftest/cart/dual_iface_server.c index 21993da7c4b..44f41a95c24 100644 --- a/src/tests/ftest/cart/dual_iface_server.c +++ b/src/tests/ftest/cart/dual_iface_server.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -242,8 +243,7 @@ server_main(d_rank_t my_rank, const char *str_port, const char *str_interface, struct stat st; crt_init_options_t init_opts = {0}; - d_setenv("FI_UNIVERSE_SIZE", "1024", 1); - d_setenv("D_LOG_MASK", "ERR", 1); + d_setenv("D_LOG_MASK", "ERR", 0); d_setenv("D_PORT_AUTO_ADJUST", "1", 1); /* rank, num_attach_retries, is_server, assert_on_error */ diff --git a/src/tests/ftest/cart/test_ep_cred_client.c b/src/tests/ftest/cart/test_ep_cred_client.c index 0e491c58bc2..83686667d76 100644 --- a/src/tests/ftest/cart/test_ep_cred_client.c +++ b/src/tests/ftest/cart/test_ep_cred_client.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -61,7 +62,6 @@ test_run() D_ASSERTF(rc == 0, "crt_group_config_path_set failed %d\n", rc); } - opt.cio_use_credits = 1; opt.cio_ep_credits = test.tg_credits; DBG_PRINT("Number of credits: %d Number of burst: %d\n", diff --git a/src/tests/ftest/cart/test_ep_cred_server.c b/src/tests/ftest/cart/test_ep_cred_server.c index 8ed17c398ce..7a24dd3b3c2 100644 --- a/src/tests/ftest/cart/test_ep_cred_server.c +++ b/src/tests/ftest/cart/test_ep_cred_server.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,7 +21,6 @@ test_run(d_rank_t my_rank) DBG_PRINT("local group: %s remote group: %s\n", test.tg_local_group_name, test.tg_remote_group_name); - opt.cio_use_credits = 1; opt.cio_ep_credits = test.tg_credits; rc = crtu_srv_start_basic(test.tg_local_group_name, &test.tg_crt_ctx, &test.tg_tid, &grp, diff --git a/src/tests/ftest/cart/test_multisend_client.c b/src/tests/ftest/cart/test_multisend_client.c index 42b5364fab8..eda15df973a 100644 --- a/src/tests/ftest/cart/test_multisend_client.c +++ b/src/tests/ftest/cart/test_multisend_client.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -14,15 +14,15 @@ static void rpc_cb_common(const struct crt_cb_info *info) { - crt_bulk_t *p_blk; + crt_bulk_t blk; int rc; - p_blk = (crt_bulk_t *)info->cci_arg; + blk = (crt_bulk_t)info->cci_arg; D_ASSERTF(info->cci_rc == 0, "rpc response failed. 
rc: %d\n", info->cci_rc); - if (p_blk && *p_blk) { - rc = crt_bulk_free(*p_blk); + if (blk != CRT_BULK_NULL) { + rc = crt_bulk_free(blk); if (rc) D_ERROR("bulk free failed with %d\n", rc); } @@ -151,6 +151,7 @@ test_run() /* TODO: for now rdma is disabled when forcing all rpcs to the same rank */ if (test.tg_force_rank == -1) { rc = d_sgl_init(&sgl, 1); + D_ASSERTF(rc == 0, "d_sgl_init() failed; rc: %d\n", rc); sgl.sg_iovs[0].iov_buf = dma_buff + (chunk_size * chunk_index); @@ -165,14 +166,16 @@ test_run() input->chunk_size = chunk_size; input->chunk_index = chunk_index; input->do_put = test.tg_do_put; + } else { + D_WARN("Disabling rdma transfer for forced rank for now\n"); input->chunk_size = 0; input->bulk_hdl = CRT_BULK_NULL; input->chunk_index = 0; input->do_put = false; } - rc = crt_req_send(rpc_req, rpc_cb_common, &bulk_hdl[chunk_index]); + rc = crt_req_send(rpc_req, rpc_cb_common, input->bulk_hdl); D_ASSERTF(rc == 0, "crt_req_send() failed. rc: %d\n", rc); if (test.tg_test_mode == TEST_MODE_SYNC) diff --git a/src/tests/ftest/cart/test_multisend_common.h b/src/tests/ftest/cart/test_multisend_common.h index 8caf06445ce..58ac1c9545b 100644 --- a/src/tests/ftest/cart/test_multisend_common.h +++ b/src/tests/ftest/cart/test_multisend_common.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2022 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -186,6 +187,12 @@ test_parse_args(int argc, char **argv) break; case 'm': test.tg_test_mode = atoi(optarg); + if ((test.tg_test_mode != TEST_MODE_ASYNC) && + (test.tg_test_mode != TEST_MODE_SYNC)) { + printf("Unknown test_mode=%d specified, defaulting to sync", + test.tg_test_mode); + test.tg_test_mode = TEST_MODE_SYNC; + } break; case 'n': test.tg_num_iterations = atoi(optarg); diff --git a/src/tests/ftest/cart/util/cart_logtest.py b/src/tests/ftest/cart/util/cart_logtest.py index e0cfe5bd3bb..decfa9fd015 100755 --- a/src/tests/ftest/cart/util/cart_logtest.py +++ b/src/tests/ftest/cart/util/cart_logtest.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # # (C) Copyright 2018-2024 Intel Corporation +# (C) Copyright 2025 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent @@ -14,11 +15,10 @@ import cart_logparse -HAVE_TABULATE = True try: import tabulate except ImportError: - HAVE_TABULATE = False + tabulate = None class LogCheckError(Exception): @@ -766,7 +766,7 @@ def report(self): errors.append("ERROR: Opcode {}: Alloc'd Total = {}, Dealloc'd Total = {}". 
format(operation, counts['ALLOCATED'], counts['DEALLOCATED'])) - if HAVE_TABULATE: + if tabulate is not None: print('Opcode State Transition Tally') print(tabulate.tabulate(table, headers=headers, diff --git a/src/tests/ftest/container/boundary.yaml b/src/tests/ftest/container/boundary.yaml index 5d1b8ad52a3..8dfd797a69e 100644 --- a/src/tests/ftest/container/boundary.yaml +++ b/src/tests/ftest/container/boundary.yaml @@ -10,8 +10,6 @@ server_config: engines: 0: targets: 4 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log env_vars: - DD_MASK=group_metadata_only @@ -19,8 +17,6 @@ server_config: storage: auto 1: targets: 4 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log env_vars: - DD_MASK=group_metadata_only @@ -28,7 +24,7 @@ server_config: storage: auto pool: - scm_size: 200M + scm_size: 512M label: pool set_logmasks: False @@ -48,7 +44,7 @@ boundary_test: !mux num_containers: 30000 with_io: false test_2: - num_pools: 100 + num_pools: 45 num_containers: 200 with_io: false test_with_io: diff --git a/src/tests/ftest/container/list.yaml b/src/tests/ftest/container/list.yaml index 77ca02a2ca3..a21f8014a98 100644 --- a/src/tests/ftest/container/list.yaml +++ b/src/tests/ftest/container/list.yaml @@ -2,7 +2,7 @@ hosts: test_servers: 1 test_clients: 1 -timeout: 360 +timeout: 720 server_config: name: daos_server diff --git a/src/tests/ftest/container/multiple_delete.yaml b/src/tests/ftest/container/multiple_delete.yaml index aa3447201d5..49843934f20 100644 --- a/src/tests/ftest/container/multiple_delete.yaml +++ b/src/tests/ftest/container/multiple_delete.yaml @@ -11,16 +11,12 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: INFO storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO storage: auto diff --git 
a/src/tests/ftest/container/per_server_fault_domain.yaml b/src/tests/ftest/container/per_server_fault_domain.yaml index 2a741319f8e..3efa3128538 100644 --- a/src/tests/ftest/container/per_server_fault_domain.yaml +++ b/src/tests/ftest/container/per_server_fault_domain.yaml @@ -12,15 +12,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server_0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server_1.log storage: auto diff --git a/src/tests/ftest/container/query_properties.yaml b/src/tests/ftest/container/query_properties.yaml index 22cb146db98..f84193026a4 100644 --- a/src/tests/ftest/container/query_properties.yaml +++ b/src/tests/ftest/container/query_properties.yaml @@ -25,7 +25,7 @@ container: properties: cksum:crc16,cksum_size:16384,srv_cksum:on expected_get_prop: - layout_type: "POSIX (1)" + layout_type: "POSIX" cksum: "crc16" cksum_size: 16384 srv_cksum: "on" diff --git a/src/tests/ftest/container/snapshot_aggregation.yaml b/src/tests/ftest/container/snapshot_aggregation.yaml index ca18d0b9894..b55d173060f 100644 --- a/src/tests/ftest/container/snapshot_aggregation.yaml +++ b/src/tests/ftest/container/snapshot_aggregation.yaml @@ -1,7 +1,9 @@ timeout: 360 + hosts: test_servers: 1 test_clients: 3 + server_config: name: daos_server engines_per_host: 2 @@ -10,25 +12,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto system_ram_reserved: 8 + pool: scm_size: 80G nvme_size: 100G target_list: [0, 1] + container: control_method: daos type: POSIX + ior: flags: "-w -k" ppn: 2 diff --git a/src/tests/ftest/control/daos_agent_support_collect_log.yaml 
b/src/tests/ftest/control/daos_agent_support_collect_log.yaml index 1f6b4134142..e5fe5acce8c 100644 --- a/src/tests/ftest/control/daos_agent_support_collect_log.yaml +++ b/src/tests/ftest/control/daos_agent_support_collect_log.yaml @@ -1,20 +1,18 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 120 + server_config: name: daos_server engines_per_host: 2 engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/control/daos_server_support_collect_log.yaml b/src/tests/ftest/control/daos_server_support_collect_log.yaml index 4fc6f224400..9ac8518685c 100644 --- a/src/tests/ftest/control/daos_server_support_collect_log.yaml +++ b/src/tests/ftest/control/daos_server_support_collect_log.yaml @@ -1,19 +1,17 @@ hosts: test_servers: 3 + timeout: 120 + server_config: name: daos_server engines_per_host: 2 engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml index 97f7ea867ee..8b7ccf4e1bc 100644 --- a/src/tests/ftest/control/daos_system_query.yaml +++ b/src/tests/ftest/control/daos_system_query.yaml @@ -11,7 +11,6 @@ server_config: 0: targets: 4 nr_xs_helpers: 0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,7 +22,6 @@ server_config: 1: targets: 4 nr_xs_helpers: 0 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: diff --git a/src/tests/ftest/control/dmg_pool_query_test.py b/src/tests/ftest/control/dmg_pool_query_test.py index f2280833bdf..f2d8b544f09 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.py 
+++ b/src/tests/ftest/control/dmg_pool_query_test.py @@ -1,6 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -77,9 +77,11 @@ def test_pool_query_basic(self): "rebuild": { "status": self.params.get("rebuild_status", path="/run/exp_vals/rebuild/*"), "state": self.params.get("state", path="/run/exp_vals/rebuild/*"), + "derived_state": self.params.get("state", path="/run/exp_vals/rebuild/*"), "objects": self.params.get("objects", path="/run/exp_vals/rebuild/*"), "records": self.params.get("records", path="/run/exp_vals/rebuild/*"), - "total_objects": self.params.get("total_objects", path="/run/exp_vals/rebuild/*") + "total_objects": self.params.get("total_objects", path="/run/exp_vals/rebuild/*"), + 'degraded': self.params.get("degraded", path="/run/exp_vals/rebuild/*") }, "tier_stats": [ { diff --git a/src/tests/ftest/control/dmg_pool_query_test.yaml b/src/tests/ftest/control/dmg_pool_query_test.yaml index 6217024b764..17d5df77bad 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.yaml +++ b/src/tests/ftest/control/dmg_pool_query_test.yaml @@ -46,6 +46,7 @@ exp_vals: objects: 0 records: 0 total_objects: 0 + degraded: False pool_uuids: uuids: diff --git a/src/tests/ftest/control/dmg_scale.yaml b/src/tests/ftest/control/dmg_scale.yaml index 84f4e35bc4d..58ee5e85d20 100644 --- a/src/tests/ftest/control/dmg_scale.yaml +++ b/src/tests/ftest/control/dmg_scale.yaml @@ -15,16 +15,12 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto targets: 8 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto targets: 8 diff --git a/src/tests/ftest/control/dmg_support_collect_log.yaml 
b/src/tests/ftest/control/dmg_support_collect_log.yaml index 794ca16cacc..913693122b7 100644 --- a/src/tests/ftest/control/dmg_support_collect_log.yaml +++ b/src/tests/ftest/control/dmg_support_collect_log.yaml @@ -1,19 +1,17 @@ hosts: test_servers: 3 + timeout: 200 + server_config: name: daos_server engines_per_host: 2 engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/control/dmg_system_start.yaml b/src/tests/ftest/control/dmg_system_start.yaml index a741830e835..5acb169f123 100644 --- a/src/tests/ftest/control/dmg_system_start.yaml +++ b/src/tests/ftest/control/dmg_system_start.yaml @@ -5,7 +5,6 @@ server_config: engines_per_host: 2 engines: 0: - fabric_iface_port: 31416 log_file: daos_server0.log targets: 4 nr_xs_helpers: 0 @@ -14,7 +13,6 @@ server_config: class: ram scm_mount: /mnt/daos0 1: - fabric_iface_port: 31516 log_file: daos_server1.log targets: 4 nr_xs_helpers: 0 diff --git a/src/tests/ftest/control/dmg_telemetry_io_basic.yaml b/src/tests/ftest/control/dmg_telemetry_io_basic.yaml index ac1529b01ee..a11d9fef932 100644 --- a/src/tests/ftest/control/dmg_telemetry_io_basic.yaml +++ b/src/tests/ftest/control/dmg_telemetry_io_basic.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 1 test_clients: 1 + timeout: 150 + server_config: name: daos_server engines_per_host: 2 @@ -9,26 +11,27 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: scm_size: 2G + container: type: POSIX control_method: daos + block_sizes: [10M, 500M] transfer_sizes: [256K, 1M] + telemetry_metrics: io_test_metrics_valid: [0, 18446744073709552000] + ior: api: POSIX 
flags: "-v -w -k" diff --git a/src/tests/ftest/daos_perf/large.yaml b/src/tests/ftest/daos_perf/large.yaml index 55131bac852..9f07ece7b9f 100644 --- a/src/tests/ftest/daos_perf/large.yaml +++ b/src/tests/ftest/daos_perf/large.yaml @@ -1,18 +1,16 @@ hosts: test_servers: 2 test_clients: 2 + # some run can take long to run, but needs to be verified # by running consecutively for accurate time. timeout: 3600 + job_manager: class_name: Orterun mpi_type: openmpi manager_timeout: 3600 -pool: - size: 1TB -container: - type: POSIX - control_method: daos + server_config: name: daos_server engines_per_host: 2 @@ -20,17 +18,20 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + +pool: + size: 1TB + +container: + type: POSIX + daos_perf: test_command: 'U;p F;p V O;p' test_type: daos diff --git a/src/tests/ftest/daos_perf/small.yaml b/src/tests/ftest/daos_perf/small.yaml index 8006d5d9647..6298aea44f9 100644 --- a/src/tests/ftest/daos_perf/small.yaml +++ b/src/tests/ftest/daos_perf/small.yaml @@ -1,16 +1,21 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 540 + job_manager: class_name: Orterun mpi_type: openmpi manager_timeout: 480 + pool: size: 500GB + container: type: POSIX control_method: daos + server_config: name: daos_server engines_per_host: 2 @@ -18,25 +23,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + daos_perf: test_command: 'U;p F;p V O;p' test_type: daos diff --git 
a/src/tests/ftest/daos_racer/multi.yaml b/src/tests/ftest/daos_racer/multi.yaml index f013e664ca2..85b1fc983b9 100644 --- a/src/tests/ftest/daos_racer/multi.yaml +++ b/src/tests/ftest/daos_racer/multi.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 10800 + server_config: name: daos_server engines_per_host: 2 @@ -9,19 +11,16 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + daos_racer: runtime: 7200 clush_timeout: 10080 diff --git a/src/tests/ftest/daos_racer/parallel.yaml b/src/tests/ftest/daos_racer/parallel.yaml index 2c0d9f67847..9c79b82efb4 100644 --- a/src/tests/ftest/daos_racer/parallel.yaml +++ b/src/tests/ftest/daos_racer/parallel.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 4 test_clients: 4 + timeout: 1800 + server_config: name: daos_server engines_per_host: 2 @@ -9,23 +11,21 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: "ERR" storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: "ERR" storage: auto + job_manager: class_name: Orterun mpi_type: openmpi manager_timeout: 630 + daos_racer: runtime: 600 clush_timeout: 900 diff --git a/src/tests/ftest/daos_racer/simple.yaml b/src/tests/ftest/daos_racer/simple.yaml index 518a011f0db..6eda8bb011b 100644 --- a/src/tests/ftest/daos_racer/simple.yaml +++ b/src/tests/ftest/daos_racer/simple.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 1800 + server_config: name: daos_server engines_per_host: 2 @@ -9,19 +11,16 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log 
log_mask: "ERR" storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: "ERR" storage: auto + daos_racer: runtime: 600 clush_timeout: 900 diff --git a/src/tests/ftest/daos_test/dfs.yaml b/src/tests/ftest/daos_test/dfs.yaml index 982611e9fc0..e27476b60bb 100644 --- a/src/tests/ftest/daos_test/dfs.yaml +++ b/src/tests/ftest/daos_test/dfs.yaml @@ -3,13 +3,14 @@ hosts: test_servers: 4 test_clients: 4 + timeout: 4000 + timeouts: test_daos_dfs_unit: 2030 test_daos_dfs_parallel: 2060 test_daos_dfs_sys: 90 -pool: - scm_size: 8G + server_config: name: daos_server engines_per_host: 2 @@ -18,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log env_vars: - D_LOG_FILE_APPEND_PID=1 @@ -31,8 +30,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log env_vars: - D_LOG_FILE_APPEND_PID=1 @@ -43,12 +40,18 @@ server_config: scm_mount: /mnt/daos1 transport_config: allow_insecure: True + agent_config: transport_config: allow_insecure: True + dmg: transport_config: allow_insecure: True + +pool: + scm_size: 8G + daos_tests: test_name: test_daos_dfs_unit: DAOS_DFS_Unit diff --git a/src/tests/ftest/daos_test/nvme_recovery.yaml b/src/tests/ftest/daos_test/nvme_recovery.yaml index 66b6c3581d5..0788f0d5c23 100644 --- a/src/tests/ftest/daos_test/nvme_recovery.yaml +++ b/src/tests/ftest/daos_test/nvme_recovery.yaml @@ -2,13 +2,13 @@ # required quantity is indicated by the placeholders hosts: test_servers: 2 + timeout: 600 + # Remove this once DAOS-5134 is resolved setup: start_servers_once: false -pool: - scm_size: 8G - nvme_size: 16G + server_config: name: daos_server engines_per_host: 2 @@ -16,8 +16,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG 
env_vars: @@ -26,8 +24,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG env_vars: @@ -35,12 +31,19 @@ server_config: storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + +pool: + scm_size: 8G + nvme_size: 16G + daos_tests: num_clients: 1 test_name: diff --git a/src/tests/ftest/daos_test/rebuild.yaml b/src/tests/ftest/daos_test/rebuild.yaml index a76c093565a..2befad907b8 100644 --- a/src/tests/ftest/daos_test/rebuild.yaml +++ b/src/tests/ftest/daos_test/rebuild.yaml @@ -15,8 +15,6 @@ timeouts: test_rebuild_35: 180 test_rebuild_36: 200 test_rebuild_37: 250 -pool: - nvme_size: 0G server_config: name: daos_server @@ -25,8 +23,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -37,8 +33,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -57,6 +51,9 @@ dmg: transport_config: allow_insecure: false +pool: + nvme_size: 0G + daos_tests: num_clients: 1 num_replicas: 1 diff --git a/src/tests/ftest/daos_test/suite.py b/src/tests/ftest/daos_test/suite.py index 3c31ef9ec7e..3be9b5afe7b 100644 --- a/src/tests/ftest/daos_test/suite.py +++ b/src/tests/ftest/daos_test/suite.py @@ -302,51 +302,51 @@ def test_daos_rebuild_simple(self): """ self.run_subtest() - def test_daos_rebuild_simple_interactive(self): - """Jira ID: DAOS-17354 + def test_daos_drain_simple(self): + """Jira ID: DAOS-1568 Test Description: - Run daos_test -v --rebuild_interactive + Run daos_test -b Use cases: Core tests for daos_test - :avocado: tags=all,full_regression + :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium,provider :avocado: 
tags=daos_test,daos_core_test,rebuild - :avocado: tags=DaosCoreTest,test_daos_rebuild_simple_interactive + :avocado: tags=DaosCoreTest,test_daos_drain_simple """ self.run_subtest() - def test_daos_drain_simple(self): + def test_daos_extend_simple(self): """Jira ID: DAOS-1568 Test Description: - Run daos_test -b + Run daos_test -B Use cases: Core tests for daos_test :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium,provider - :avocado: tags=daos_test,daos_core_test - :avocado: tags=DaosCoreTest,test_daos_drain_simple + :avocado: tags=daos_test,daos_core_test,rebuild + :avocado: tags=DaosCoreTest,test_daos_extend_simple """ self.run_subtest() - def test_daos_extend_simple(self): - """Jira ID: DAOS-1568 + def test_daos_rebuild_interactive(self): + """Jira ID: DAOS-17358 Test Description: - Run daos_test -B + Run daos_test -j Use cases: Core tests for daos_test :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium,provider - :avocado: tags=daos_test,daos_core_test - :avocado: tags=DaosCoreTest,test_daos_extend_simple + :avocado: tags=daos_test,daos_core_test,rebuild + :avocado: tags=DaosCoreTest,test_daos_rebuild_interactive """ self.run_subtest() diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index 5761ef1de1e..0a3f6a19fe0 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -2,6 +2,7 @@ # required quantity is indicated by the placeholders hosts: test_servers: 4 + # Note that subtests below can set their own timeout so this # should be a general average of all tests not including outliers # (I'm looking at you "rebuild tests") @@ -9,7 +10,7 @@ timeout: 600 timeouts: test_daos_degraded_mode: 450 test_daos_management: 110 - test_daos_pool: 180 + test_daos_pool: 240 test_daos_container: 700 test_daos_epoch: 125 test_daos_verify_consistency: 105 @@ -23,9 +24,9 @@ timeouts: test_daos_epoch_recovery: 104 test_daos_md_replication: 104 test_daos_rebuild_simple: 1800 - 
test_daos_rebuild_simple_interactive: 2100 - test_daos_drain_simple: 3600 + test_daos_drain_simple: 3720 test_daos_extend_simple: 3600 + test_daos_rebuild_interactive: 1185 test_daos_oid_allocator: 640 test_daos_checksum: 500 test_daos_rebuild_ec: 9000 @@ -34,10 +35,7 @@ timeouts: test_daos_dedup: 220 test_daos_upgrade: 300 test_daos_pipeline: 60 -pool: - # This will create 8G of SCM and 16G of NVMe size of pool. - scm_size: 8G - nvme_size: 16G + server_config: name: daos_server engines_per_host: 2 @@ -45,8 +43,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -60,8 +56,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -75,12 +69,20 @@ server_config: transport_config: allow_insecure: true system_ram_reserved: 64 + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + +pool: + # This will create 8G of SCM and 16G of NVMe size of pool. 
+ scm_size: 8G + nvme_size: 16G + daos_tests: num_clients: test_daos_degraded_mode: 1 @@ -101,9 +103,9 @@ daos_tests: test_daos_epoch_recovery: 1 test_daos_md_replication: 2 test_daos_rebuild_simple: 1 - test_daos_rebuild_simple_interactive: 1 test_daos_drain_simple: 1 test_daos_extend_simple: 1 + test_daos_rebuild_interactive: 1 test_daos_oid_allocator: 1 test_daos_checksum: 1 test_daos_rebuild_ec: 1 @@ -131,7 +133,6 @@ daos_tests: test_daos_epoch_recovery: DAOS_Epoch_Recovery test_daos_md_replication: DAOS_MD_Replication test_daos_rebuild_simple: DAOS_Rebuild_Simple - test_daos_rebuild_simple_interactive: DAOS_Rebuild_Simple_Interactive test_daos_drain_simple: DAOS_Drain_Simple test_daos_oid_allocator: DAOS_OID_Allocator test_daos_checksum: DAOS_Checksum @@ -140,6 +141,7 @@ daos_tests: test_daos_degraded_ec: DAOS_Degraded_EC test_daos_dedup: DAOS_Dedup test_daos_extend_simple: DAOS_Extend_Simple + test_daos_rebuild_interactive: DAOS_Rebuild_Interactive test_daos_upgrade: DAOS_Upgrade test_daos_pipeline: DAOS_Pipeline daos_test: @@ -161,9 +163,9 @@ daos_tests: test_daos_epoch_recovery: o test_daos_md_replication: R test_daos_rebuild_simple: v - test_daos_rebuild_simple_interactive: v test_daos_drain_simple: b test_daos_extend_simple: B + test_daos_rebuild_interactive: j test_daos_oid_allocator: O test_daos_checksum: z test_daos_rebuild_ec: S @@ -178,9 +180,9 @@ daos_tests: test_daos_md_replication: -s5 test_daos_degraded_mode: -s7 test_daos_rebuild_simple: -s3 - test_daos_rebuild_simple_interactive: -s3 --rebuild_interactive test_daos_drain_simple: -s3 test_daos_extend_simple: -s3 + test_daos_rebuild_interactive: -s3 test_daos_oid_allocator: -s5 stopped_ranks: test_daos_degraded_mode: [5, 6, 7] diff --git a/src/tests/ftest/daos_vol/bigio.yaml b/src/tests/ftest/daos_vol/bigio.yaml index 3103218ff79..668e834b39d 100644 --- a/src/tests/ftest/daos_vol/bigio.yaml +++ b/src/tests/ftest/daos_vol/bigio.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 1 test_clients: 1 + timeout: 
1000 + server_config: name: daos_server crt_timeout: 60 @@ -10,8 +12,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 4 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log env_vars: - D_LOG_FILE_APPEND_PID=1 @@ -21,19 +21,20 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 4 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log env_vars: - D_LOG_FILE_APPEND_PID=1 - FI_LOG_LEVEL=warn - D_LOG_STDERR_IN_LOG=1 storage: auto + pool: size: 50% + container: type: POSIX control_method: daos + daos_vol_tests: testname: h5_partest_t_bigio client_processes: 6 diff --git a/src/tests/ftest/datamover/large_dir.yaml b/src/tests/ftest/datamover/large_dir.yaml index 887540271f7..dcad40f8ad8 100644 --- a/src/tests/ftest/datamover/large_dir.yaml +++ b/src/tests/ftest/datamover/large_dir.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 420 + server_config: name: daos_server engines_per_host: 2 @@ -9,24 +11,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 95% + container: type: POSIX control_method: daos + mdtest: client_ppn: dcp: 32 @@ -43,10 +44,12 @@ mdtest: depth: 4 branching_factor: 4 bytes: 4096 + dcp: bufsize: 4M chunksize: 128M client_processes: ppn: 32 + datamover: posix_root: "self.workdir" diff --git a/src/tests/ftest/datamover/large_file.yaml b/src/tests/ftest/datamover/large_file.yaml index de060d01125..9a273ef9783 100644 --- a/src/tests/ftest/datamover/large_file.yaml +++ b/src/tests/ftest/datamover/large_file.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 420 + server_config: name: daos_server engines_per_host: 2 @@ -9,25 +11,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: 
ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 95% svcn: 1 + container: type: POSIX control_method: daos + ior: client_ppn: dcp: 20 @@ -40,10 +41,12 @@ ior: transfer_size: 4M block_size: '1G' # aggregate of 20G for dcp and 10G for fs_copy dfs_oclass: EC_4P2GX + dcp: bufsize: 4M chunksize: 128M client_processes: ppn: 32 + datamover: posix_root: "self.workdir" diff --git a/src/tests/ftest/datamover/obj_large_posix.yaml b/src/tests/ftest/datamover/obj_large_posix.yaml index 781aed81517..245883e765b 100644 --- a/src/tests/ftest/datamover/obj_large_posix.yaml +++ b/src/tests/ftest/datamover/obj_large_posix.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 360 + server_config: name: daos_server engines_per_host: 2 @@ -9,24 +11,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: - size: 60G + size: 100G + container: type: POSIX control_method: daos + mdtest: client_processes: np: 30 @@ -41,6 +42,7 @@ mdtest: depth: 2 branching_factor: 2 bytes: 4096 + dcp: client_processes: np: 30 diff --git a/src/tests/ftest/datamover/serial_large_posix.yaml b/src/tests/ftest/datamover/serial_large_posix.yaml index 0081c0faa80..2f587af3b46 100644 --- a/src/tests/ftest/datamover/serial_large_posix.yaml +++ b/src/tests/ftest/datamover/serial_large_posix.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 480 + server_config: name: daos_server engines_per_host: 2 @@ -9,24 +11,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: 
daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: - size: 60G + size: 100G + container: type: POSIX control_method: daos + mdtest: client_processes: np: 30 @@ -41,12 +42,15 @@ mdtest: depth: 2 branching_factor: 2 bytes: 4096 + dserialize: client_processes: np: 16 + ddeserialize: client_processes: np: 16 + dfuse: disable_caching: true enable_local_flock: true diff --git a/src/tests/ftest/deployment/agent_failure.yaml b/src/tests/ftest/deployment/agent_failure.yaml index 9944e6624f5..474b0d16a7b 100644 --- a/src/tests/ftest/deployment/agent_failure.yaml +++ b/src/tests/ftest/deployment/agent_failure.yaml @@ -17,15 +17,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/deployment/basic_checkout.yaml b/src/tests/ftest/deployment/basic_checkout.yaml index 79b9d105442..c68564a8dd9 100644 --- a/src/tests/ftest/deployment/basic_checkout.yaml +++ b/src/tests/ftest/deployment/basic_checkout.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 2700 + server_config: name: daos_server engines_per_host: 2 @@ -9,20 +11,18 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 50% - properties: ec_cell_sz:128KiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:128KiB + container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on @@ -38,18 +38,21 @@ ior_easy: &ior_easy_base sw_deadline: 30 sw_wearout: 1 sw_status_file: 
stoneWallingStatusFile + ior_dfs_sx: <<: *ior_easy_base api: DFS dfs_oclass: SX dfs_chunk: 1MiB transfer_size: 1MiB + ior_dfs_ec_8p2gx: <<: *ior_easy_base api: DFS dfs_oclass: EC_8P2GX dfs_chunk: 8MiB transfer_size: 8MiB + ior_dfs_ec_16p2gx: <<: *ior_easy_base api: DFS @@ -71,16 +74,19 @@ mdtest_easy: &mdtest_easy_base stonewall_timer: 30 stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false + mdtest_dfs_s1: <<: *mdtest_easy_base dfs_oclass: S1 dfs_dir_oclass: SX dfs_chunk: 1MiB + mdtest_dfs_ec_8p2g1: <<: *mdtest_easy_base dfs_oclass: EC_8P2G1 dfs_dir_oclass: RP_3GX dfs_chunk: 8MiB + mdtest_dfs_ec_16p2g1: <<: *mdtest_easy_base dfs_oclass: EC_16P2G1 @@ -156,7 +162,9 @@ mdtest: - [POSIX, 4096, 4096, 2, 10, 5, ' '] - [DFS, 4096, 4096, 1, 25, 20, '-u'] - [POSIX, 0, 0, 2, 10, 5, '-u -C -T -r'] + dfuse: disable_caching: true + hdf5_vol: plugin_path: /usr/lib64/mpich/lib diff --git a/src/tests/ftest/deployment/critical_integration.yaml b/src/tests/ftest/deployment/critical_integration.yaml index 764c564eba0..212d121a6e8 100644 --- a/src/tests/ftest/deployment/critical_integration.yaml +++ b/src/tests/ftest/deployment/critical_integration.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 300 + server_config: name: daos_server engines_per_host: 2 @@ -9,17 +11,14 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + check_remote_root_access: false # this is needed as on aurora cluster we use libfabric provided # by HPE, which does not reside in regular location. 
diff --git a/src/tests/ftest/deployment/disk_failure.yaml b/src/tests/ftest/deployment/disk_failure.yaml index 8f1f5826e9e..f318f3e2c99 100644 --- a/src/tests/ftest/deployment/disk_failure.yaml +++ b/src/tests/ftest/deployment/disk_failure.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 900 + daos_server: pattern_timeout: 60 + server_config: name: daos_server engines_per_host: 2 @@ -11,27 +14,26 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto targets: 16 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto targets: 16 + pool: size: 80% svcn: 3 + container: type: POSIX control_method: daos oclass: RP_2GX properties: "cksum:crc16,rf:1" + ior: client_processes: np: 2 diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml index 727a0bfa794..ce1a0b36667 100644 --- a/src/tests/ftest/deployment/io_sys_admin.yaml +++ b/src/tests/ftest/deployment/io_sys_admin.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 700 + server_config: name: daos_server engines_per_host: 2 @@ -9,37 +11,40 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + dmg: dmg_sub_command: storage storage: storage_sub_command: scan + pool_1: - scm_size: 256MiB + scm_size: 2GiB nvme_size: 16GiB + pool_2: scm_size: 10GiB nvme_size: 50GiB + pool_3: size: 90% + container_1: properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos + container_2: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos oclass: RP_2GX + container_3: type: POSIX properties: 
cksum:crc16,cksum_size:16384,srv_cksum:on @@ -48,6 +53,7 @@ container_3: pool: size: 40% + container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on @@ -72,6 +78,7 @@ largefilecount: mdtest_oclass: # Run once with S1 and then with EC_16P2G1 - S1 - EC_2P1G1 + ior: client_processes: ppn: 30 @@ -82,6 +89,7 @@ ior: signature: 123 transfer_size: '1Mib' block_size: '1Mib' + dfuse: disable_caching: True enable_local_flock: true @@ -98,11 +106,13 @@ mdtest: write_bytes: 4096 read_bytes: 4096 depth: 0 + dcp: bufsize: "64MB" chunksize: "128MB" client_processes: np: 16 + hdf5_vol: plugin_path: /usr/lib64/mpich/lib diff --git a/src/tests/ftest/deployment/ior_per_rank.yaml b/src/tests/ftest/deployment/ior_per_rank.yaml index 3475c594484..7c890651b0a 100644 --- a/src/tests/ftest/deployment/ior_per_rank.yaml +++ b/src/tests/ftest/deployment/ior_per_rank.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 1500 + server_config: name: daos_server engines_per_host: 2 @@ -9,26 +11,25 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: mode: 146 size: 350G # Cannot use percentage, as it does not work when using pool create for per rank. 
- properties: ec_cell_sz:128KiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:128KiB + container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos oclass: SX + ior: client_processes: ppn: 32 diff --git a/src/tests/ftest/deployment/network_failure.yaml b/src/tests/ftest/deployment/network_failure.yaml index 37536cdd9bd..12101be4d98 100644 --- a/src/tests/ftest/deployment/network_failure.yaml +++ b/src/tests/ftest/deployment/network_failure.yaml @@ -13,16 +13,12 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log targets: 8 storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log targets: 8 storage: auto diff --git a/src/tests/ftest/deployment/server_rank_failure.yaml b/src/tests/ftest/deployment/server_rank_failure.yaml index 3f4374cc011..3877aebca29 100644 --- a/src/tests/ftest/deployment/server_rank_failure.yaml +++ b/src/tests/ftest/deployment/server_rank_failure.yaml @@ -14,8 +14,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: INFO storage: auto @@ -25,8 +23,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO storage: auto diff --git a/src/tests/ftest/deployment/target_failure.yaml b/src/tests/ftest/deployment/target_failure.yaml index e2053cb0972..99c9af40fec 100644 --- a/src/tests/ftest/deployment/target_failure.yaml +++ b/src/tests/ftest/deployment/target_failure.yaml @@ -11,15 +11,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git 
a/src/tests/ftest/dfuse/caching_check.py b/src/tests/ftest/dfuse/caching_check.py index 852a24f0dfd..61980fd4aaa 100644 --- a/src/tests/ftest/dfuse/caching_check.py +++ b/src/tests/ftest/dfuse/caching_check.py @@ -1,5 +1,6 @@ """ (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -36,22 +37,21 @@ def test_dfuse_caching_check(self): :avocado: tags=daosio,dfuse :avocado: tags=DfuseCachingCheck,test_dfuse_caching_check """ - # get params - flags = self.params.get("iorflags", '/run/ior/*') + # Get params + ior_flags_write = self.params.get("flags_write", self.ior_cmd.namespace) + ior_flags_read = self.params.get("flags_read", self.ior_cmd.namespace) read_x = self.params.get("read_x", "/run/ior/*", 1) - # update flag - self.ior_cmd.update_params(flags=flags[0]) - self.log_step('Write to the dfuse mount point') + self.ior_cmd.update_params(flags=ior_flags_write) self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) self.log_step('Get baseline read performance from dfuse with caching disabled') - self.ior_cmd.update_params(flags=flags[1]) + self.ior_cmd.update_params(flags=ior_flags_read) base_read_arr = [] - out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False, create_cont=False) base_read_arr.append(IorCommand.get_ior_metrics(out)) - out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) + out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False, create_cont=False) base_read_arr.append(IorCommand.get_ior_metrics(out)) # the index of max_mib @@ -62,12 +62,11 @@ def test_dfuse_caching_check(self): self.dfuse.update_params(disable_caching=False) self.dfuse.run() - self.log_step('Get first read performance with caching enabled') - out = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False) - 
base_read_arr.append(IorCommand.get_ior_metrics(out)) + self.log_step('Discard first read performance with caching enabled') + _ = self.run_ior_with_pool(fail_on_warning=False, stop_dfuse=False, create_cont=False) self.log_step('Get cached read performance') - out = self.run_ior_with_pool(fail_on_warning=False) + out = self.run_ior_with_pool(fail_on_warning=False, create_cont=False) with_caching = IorCommand.get_ior_metrics(out) self.log_step('Verify cached read performance is greater than first read') @@ -78,4 +77,4 @@ def test_dfuse_caching_check(self): for base_read in base_read_arr: actual_change = percent_change(base_read[0][max_mib], with_caching[0][max_mib]) if actual_change < read_x: - self.fail('Expected a speedup of {} but got {}'.format(read_x, actual_change)) + self.fail(f'Expected a speedup of {read_x} but got {actual_change}') diff --git a/src/tests/ftest/dfuse/caching_check.yaml b/src/tests/ftest/dfuse/caching_check.yaml index 42a9b3f3dc3..71e453114d4 100644 --- a/src/tests/ftest/dfuse/caching_check.yaml +++ b/src/tests/ftest/dfuse/caching_check.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 300 + server_config: name: daos_server engines_per_host: 1 @@ -10,24 +12,25 @@ server_config: log_mask: INFO storage: auto system_ram_reserved: 64 + pool: size: 50% + container: type: POSIX - control_method: daos + ior: client_processes: ppn: 32 test_file: testFile api: POSIX - dfs_destroy: false transfer_size: 1M block_size: 1G - dfs_oclass: "EC_2P1G1" - read_x: 3 # 300% - iorflags: - - "-v -w -k -G 3" - - "-v -r -k -G 3" + dfs_oclass: EC_2P1G1 + read_x: 2.5 # 250% + flags_write: "-v -w -k -G 3" + flags_read: "-v -r -k -G 3" + dfuse: disable_caching: true disable_wb_caching: true diff --git a/src/tests/ftest/dfuse/fio_pil4dfs_small.yaml b/src/tests/ftest/dfuse/fio_pil4dfs_small.yaml index 66491601a06..7269a12acff 100644 --- a/src/tests/ftest/dfuse/fio_pil4dfs_small.yaml +++ b/src/tests/ftest/dfuse/fio_pil4dfs_small.yaml @@ -1,7 +1,9 @@ 
hosts: test_servers: 2 test_clients: 1 + timeout: 1000 + server_config: name: daos_server engines_per_host: 2 @@ -9,28 +11,28 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 1600000000 nvme_size: 20000000000 + container: type: POSIX control_method: daos @@ -41,6 +43,7 @@ container: properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 rf2: properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:2 + fio: names: - global @@ -70,6 +73,7 @@ fio: rw: 'randrw' test: numjobs: 1 + dfuse: mount_dir: "/tmp/daos_dfuse" thread_count: 8 diff --git a/src/tests/ftest/dfuse/fio_small.yaml b/src/tests/ftest/dfuse/fio_small.yaml index ed343aa4875..089e5e57ccd 100644 --- a/src/tests/ftest/dfuse/fio_small.yaml +++ b/src/tests/ftest/dfuse/fio_small.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 1000 + server_config: name: daos_server engines_per_host: 2 @@ -9,28 +11,28 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 1600000000 nvme_size: 20000000000 + container: type: POSIX control_method: daos @@ -41,6 +43,7 @@ container: properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 rf2: properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:2 + fio: names: - global @@ -70,6 +73,7 @@ 
fio: rw: 'randrw' test: numjobs: 1 + dfuse: mount_dir: "/tmp/daos_dfuse" thread_count: 8 diff --git a/src/tests/ftest/dfuse/pil4dfs_dcache.yaml b/src/tests/ftest/dfuse/pil4dfs_dcache.yaml index 719451ba5ba..34b301878c6 100644 --- a/src/tests/ftest/dfuse/pil4dfs_dcache.yaml +++ b/src/tests/ftest/dfuse/pil4dfs_dcache.yaml @@ -10,14 +10,10 @@ server_config: engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/dfuse/pil4dfs_fio.yaml b/src/tests/ftest/dfuse/pil4dfs_fio.yaml index e5b62446fbf..24101b84412 100644 --- a/src/tests/ftest/dfuse/pil4dfs_fio.yaml +++ b/src/tests/ftest/dfuse/pil4dfs_fio.yaml @@ -10,15 +10,11 @@ server_config: engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: INFO storage: auto 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO storage: auto diff --git a/src/tests/ftest/erasurecode/aggregation.yaml b/src/tests/ftest/erasurecode/aggregation.yaml index 5b289b7eaa5..a7addf82078 100644 --- a/src/tests/ftest/erasurecode/aggregation.yaml +++ b/src/tests/ftest/erasurecode/aggregation.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 2500 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,25 +15,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 90% pool_query_timeout: 30 + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git 
a/src/tests/ftest/erasurecode/cell_size.py b/src/tests/ftest/erasurecode/cell_size.py index d9e470f25cb..1e688345bfd 100644 --- a/src/tests/ftest/erasurecode/cell_size.py +++ b/src/tests/ftest/erasurecode/cell_size.py @@ -1,6 +1,6 @@ ''' (C) Copyright 2020-2023 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -36,7 +36,7 @@ def test_ec_cell_size(self): transfersize_blocksize = self.params.get("transfersize_blocksize", '/run/ior/*') for cell_size in pool_cell_sizes: - self.pool = self.get_pool(properties=f"ec_cell_sz:{cell_size}") + self.pool = self.get_pool(properties=f"rd_fac:0,space_rb:0,ec_cell_sz:{cell_size}") for dfs_oclass in dfs_oclass_list: self.ior_cmd.dfs_oclass.update(dfs_oclass) for transfer_size, block_size in transfersize_blocksize: diff --git a/src/tests/ftest/erasurecode/cell_size.yaml b/src/tests/ftest/erasurecode/cell_size.yaml index 5f22ee37a5f..bdcca24d7b5 100644 --- a/src/tests/ftest/erasurecode/cell_size.yaml +++ b/src/tests/ftest/erasurecode/cell_size.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 1600 + server_config: name: daos_server engines_per_host: 2 @@ -9,19 +11,16 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 93% cell_sizes: @@ -29,8 +28,10 @@ pool: - 64KiB - 128KiB - 1MiB + container: type: POSIX + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/cell_size_property.py b/src/tests/ftest/erasurecode/cell_size_property.py index e8e23608cd7..dcfceababb1 100644 --- a/src/tests/ftest/erasurecode/cell_size_property.py +++ b/src/tests/ftest/erasurecode/cell_size_property.py @@ 
-1,6 +1,6 @@ ''' (C) Copyright 2020-2023 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -57,10 +57,10 @@ def test_ec_pool_property(self): for pool_cell_size in pool_cell_sizes: # Create the pool - self.pool = self.get_pool(properties=f"ec_cell_sz:{pool_cell_size}") + self.pool = self.get_pool(properties=f"rd_fac:0,space_rb:0,ec_cell_sz:{pool_cell_size}") # Verify pool EC cell size - pool_prop_expected = int(self.pool.properties.value.split(":")[1]) + pool_prop_expected = int(self.pool.properties.value.split(",")[-1].split(":")[1]) self.assertEqual( pool_prop_expected, self.pool.get_property("ec_cell_sz"), "pool get-prop ec_cell_sz does not match set property") @@ -72,7 +72,8 @@ def test_ec_pool_property(self): # Use the default pool property for container and do not update if cont_cell != pool_prop_expected: - self.container.properties.update(f"ec_cell_sz:{cont_cell}") + self.container.properties.update( + f"cksum:off,srv_cksum:off,ec_cell_sz:{cont_cell}") # Create the container and open handle self.container.create() diff --git a/src/tests/ftest/erasurecode/cell_size_property.yaml b/src/tests/ftest/erasurecode/cell_size_property.yaml index 0710442abdc..d2aaa8fc024 100644 --- a/src/tests/ftest/erasurecode/cell_size_property.yaml +++ b/src/tests/ftest/erasurecode/cell_size_property.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 900 + server_config: name: daos_server engines_per_host: 2 @@ -10,25 +12,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 93% cell_sizes: - 4096 - 65536 - 131072 + container: type: POSIX cell_sizes: @@ 
-36,6 +36,7 @@ container: - 65536 - 131072 - 1048576 + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/ior_smoke.yaml b/src/tests/ftest/erasurecode/ior_smoke.yaml index 17b8ef5b43b..f10a60fd3ce 100644 --- a/src/tests/ftest/erasurecode/ior_smoke.yaml +++ b/src/tests/ftest/erasurecode/ior_smoke.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 900 + setup: start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -13,24 +16,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 93% + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/mdtest_smoke.yaml b/src/tests/ftest/erasurecode/mdtest_smoke.yaml index 4e565f3a83b..4a5b0543027 100644 --- a/src/tests/ftest/erasurecode/mdtest_smoke.yaml +++ b/src/tests/ftest/erasurecode/mdtest_smoke.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 600 + server_config: name: daos_server engines_per_host: 2 @@ -10,24 +12,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 53% + container: type: POSIX control_method: daos + mdtest: client_processes: np_48: diff --git a/src/tests/ftest/erasurecode/multiple_rank_failure.yaml b/src/tests/ftest/erasurecode/multiple_rank_failure.yaml index cd0d2e77bb5..88806efa2f3 100644 --- a/src/tests/ftest/erasurecode/multiple_rank_failure.yaml +++ 
b/src/tests/ftest/erasurecode/multiple_rank_failure.yaml @@ -15,15 +15,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/erasurecode/multiple_target_failure.yaml b/src/tests/ftest/erasurecode/multiple_target_failure.yaml index cd0d2e77bb5..88806efa2f3 100644 --- a/src/tests/ftest/erasurecode/multiple_target_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_target_failure.yaml @@ -15,15 +15,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/erasurecode/offline_rebuild.yaml b/src/tests/ftest/erasurecode/offline_rebuild.yaml index d583887c24f..b84689f101b 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild.yaml @@ -7,11 +7,14 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 + timeout: 1200 + setup: # Test variants use different server counts, so ensure servers are stopped after each run start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -20,22 +23,21 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/offline_rebuild_aggregation.yaml 
b/src/tests/ftest/erasurecode/offline_rebuild_aggregation.yaml index f199835e4d2..8f66eaf8f73 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild_aggregation.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild_aggregation.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 6000 + setup: start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -12,23 +15,22 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 90% pool_query_timeout: 30 + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/offline_rebuild_single.yaml b/src/tests/ftest/erasurecode/offline_rebuild_single.yaml index 210dd4589f6..0c94ac46f82 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild_single.yaml @@ -7,12 +7,16 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 1 + setup: start_servers_once: False + timeout: 900 + agent_config: #cache_expiration: 1 disable_caching: true + server_config: name: daos_server engines_per_host: 2 @@ -20,25 +24,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% pool_query_timeout: 30 + container: type: POSIX single_data_set: # [object_qty, record_qty, dkey, akey, data_size] - [1, 1, 1, 1, 4194304] + objectclass: dfs_oclass_list: #- [EC_Object_Class, Minimum number of servers] diff --git a/src/tests/ftest/erasurecode/online_rebuild.yaml 
b/src/tests/ftest/erasurecode/online_rebuild.yaml index 74a6eb29ba5..20708ea4d2c 100644 --- a/src/tests/ftest/erasurecode/online_rebuild.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild.yaml @@ -7,13 +7,17 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 + timeout: 1000 + setup: start_agents_once: False start_servers_once: False + agent_config: #cache_expiration: 1 disable_caching: true + server_config: name: daos_server engines_per_host: 2 @@ -21,27 +25,27 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% + container: type: POSIX control_method: daos + daos: container: destroy: env_vars: - CRT_TIMEOUT=10 + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 5e60eae8eab..39459319e7f 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -7,10 +7,13 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 + timeout: 1500 + setup: start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -18,25 +21,33 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: INFO storage: auto + env_vars: + - NA_OFI_UNEXPECTED_TAG_MSG=0 + 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log log_mask: INFO storage: auto + env_vars: + - NA_OFI_UNEXPECTED_TAG_MSG=0 + +client: + env_vars: + - NA_OFI_UNEXPECTED_TAG_MSG=0 + pool: size: 93% + container: type: POSIX control_method: daos properties: rd_fac:2 + mdtest: client_processes: np: 4 diff 
--git a/src/tests/ftest/erasurecode/online_rebuild_single.yaml b/src/tests/ftest/erasurecode/online_rebuild_single.yaml index 781605d841b..e053eb98671 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_single.yaml @@ -7,9 +7,12 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 1 + timeout: 1200 + setup: start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -17,26 +20,25 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% pool_query_timeout: 30 properties: rd_fac:2 + container: type: POSIX single_data_set: # [object_qty, record_qty, dkey, akey, data_size] - [1, 1, 1, 1, 4194304] + objectclass: dfs_oclass_list: #- [EC_Object_Class, Minimum number of servers] diff --git a/src/tests/ftest/erasurecode/rank_failure.yaml b/src/tests/ftest/erasurecode/rank_failure.yaml index 51efe637095..f7f29738dc8 100644 --- a/src/tests/ftest/erasurecode/rank_failure.yaml +++ b/src/tests/ftest/erasurecode/rank_failure.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 6 test_clients: 1 + timeout: 1800 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,21 +15,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: scm_size: 8G + gen_io_conf: ranks: "11" targets: "4" diff --git a/src/tests/ftest/erasurecode/rebuild_disabled.yaml b/src/tests/ftest/erasurecode/rebuild_disabled.yaml index e970e043972..3cf796c63bf 100644 --- 
a/src/tests/ftest/erasurecode/rebuild_disabled.yaml +++ b/src/tests/ftest/erasurecode/rebuild_disabled.yaml @@ -7,11 +7,14 @@ hosts: 10_server: test_servers: server-[1-5] test_clients: 3 + timeout: 3500 + setup: # Test variants use different server counts, so ensure servers are stopped after each run start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -19,23 +22,22 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% pool_query_timeout: 30 + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml index 006e75079fb..a91040c59e8 100644 --- a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml +++ b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml @@ -7,11 +7,14 @@ hosts: 10_server: test_servers: server-[1-5] test_clients: 1 + timeout: 400 + setup: # Test variants use different server counts, so ensure servers are stopped after each run start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -20,26 +23,25 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log storage: auto + pool: size: 93% pool_query_timeout: 30 + container: type: POSIX control_method: daos single_data_set: # [object_qty, record_qty, dkey, akey, data_size] - [1, 1, 1, 1, 4194304] + objectclass: dfs_oclass_list: #- [EC_Object_Class, Minimum number of servers] diff --git 
a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml index 6ec1a98faff..677e4f9a961 100644 --- a/src/tests/ftest/erasurecode/rebuild_fio.yaml +++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml @@ -7,10 +7,13 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 1 + timeout: 1500 + setup: start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -18,8 +21,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: ERR targets: 2 @@ -27,16 +28,16 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log log_mask: ERR targets: 2 storage: auto + pool: size: 60% aggregation_timeout: 180 set_logmasks: False + container: type: POSIX control_method: daos @@ -45,6 +46,7 @@ container: properties: rd_fac:1 rf2: properties: rd_fac:2 + fio: names: - test @@ -64,6 +66,7 @@ fio: randrw: rw: 'randrw' rw_read: 'randrw' + dfuse: mount_dir: "/tmp/daos_dfuse" disable_caching: True diff --git a/src/tests/ftest/erasurecode/restart.yaml b/src/tests/ftest/erasurecode/restart.yaml index 0adb0c937b4..d8cc0c1a844 100644 --- a/src/tests/ftest/erasurecode/restart.yaml +++ b/src/tests/ftest/erasurecode/restart.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 1500 + setup: start_agents_once: False start_servers_once: False + server_config: name: daos_server engines_per_host: 2 @@ -12,26 +15,25 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 40% svcn: 3 pool_query_timeout: 30 + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: 
@@ -53,5 +55,6 @@ ior: - ["EC_4P1GX", 6] - ["EC_4P2GX", 6] - ["EC_8P2GX", 10] + aggregation: threshold: "70%" diff --git a/src/tests/ftest/erasurecode/space_usage.yaml b/src/tests/ftest/erasurecode/space_usage.yaml index dc33af3aaec..f20803d261e 100644 --- a/src/tests/ftest/erasurecode/space_usage.yaml +++ b/src/tests/ftest/erasurecode/space_usage.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 600 + server_config: name: daos_server engines_per_host: 2 @@ -9,25 +11,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 95% - properties: ec_cell_sz:128KiB,reclaim:disabled + properties: rd_fac:0,space_rb:0,ec_cell_sz:128KiB,reclaim:disabled + container: type: POSIX control_method: daos + ior: &ior_base client_processes: ppn: 32 @@ -36,11 +37,14 @@ ior: &ior_base transfer_size: 32KiB # Partial update block_size: 16GiB # Aggregate 500 GiB flags: "-w -C -e -g -G 27 -k -Q 1 -v" + ior_ec_4p2gx: <<: *ior_base dfs_oclass: EC_4P2GX + ior_ec_4p1gx: <<: *ior_base dfs_oclass: EC_4P1GX + space_usage: max_diff_percent: 0.05 diff --git a/src/tests/ftest/erasurecode/truncate.yaml b/src/tests/ftest/erasurecode/truncate.yaml index 4cfe9433251..64a28c0ae5b 100644 --- a/src/tests/ftest/erasurecode/truncate.yaml +++ b/src/tests/ftest/erasurecode/truncate.yaml @@ -7,10 +7,13 @@ hosts: 10_server: test_servers: server-[1-5] test_clients: 1 + timeout: 300 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -18,21 +21,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - 
fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 50% + container: type: POSIX control_method: daos @@ -41,6 +42,7 @@ container: properties: rd_fac:1 rf2: properties: rd_fac:2 + fio: names: - test @@ -54,6 +56,7 @@ fio: read_write: rw: 'write' truncate_size: '26214400' # 25Mb + dfuse: mount_dir: "/tmp/daos_dfuse" caching: !mux diff --git a/src/tests/ftest/fault_injection/ec.yaml b/src/tests/ftest/fault_injection/ec.yaml index ee607de4ae2..2a9b80c1b9c 100644 --- a/src/tests/ftest/fault_injection/ec.yaml +++ b/src/tests/ftest/fault_injection/ec.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 7 test_clients: 1 + timeout: 300 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,24 +15,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 93% properties: ec_cell_sz:64KiB + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:2 + ior: api: "DFS" client_processes: @@ -46,6 +48,7 @@ ior: - "EC_2P2GX" - "EC_4P2GX" - "EC_8P2GX" + fio: names: - global @@ -62,8 +65,10 @@ fio: rw: 'rw' test: numjobs: 1 + dfuse: mount_dir: "/tmp/daos_dfuse" + faults: fault_list: - DAOS_CSUM_CORRUPT_FETCH diff --git a/src/tests/ftest/fault_injection/pool.yaml b/src/tests/ftest/fault_injection/pool.yaml index 5f1315f134b..dc31c0bc6f5 100644 --- a/src/tests/ftest/fault_injection/pool.yaml +++ b/src/tests/ftest/fault_injection/pool.yaml @@ -12,21 +12,17 @@ server_config: pinned_numa_node: 0 targets: 4 nr_xs_helpers: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 targets: 4 nr_xs_helpers: 0 - fabric_iface: ib1 - fabric_iface_port: 31417 
log_file: daos_server1.log storage: auto pool: - size: 20G + size: 40G nranks: 4 rebuild_timeout: 120 pool_query_timeout: 60 diff --git a/src/tests/ftest/interoperability/upgrade_downgrade_base.py b/src/tests/ftest/interoperability/upgrade_downgrade_base.py index 5cfd839409a..e1d3777edf8 100644 --- a/src/tests/ftest/interoperability/upgrade_downgrade_base.py +++ b/src/tests/ftest/interoperability/upgrade_downgrade_base.py @@ -261,7 +261,8 @@ def install_daos(self, version, servers, clients): if servers: self.log.info("Installing version %s on servers, %s", version, servers) - if not install_packages(self.log, servers, server_packages, 'root').passed: + result = install_packages(self.log, servers, server_packages, 'root', allowerasing=True) + if not result.passed: self.fail(f"Failed to install version {version} on servers") self.current_server_version = Version(version) result = run_remote(self.log, NodeSet(servers[0]), 'dmg version') @@ -274,7 +275,8 @@ def install_daos(self, version, servers, clients): # Install on clients if clients: self.log.info("Installing version %s on clients, %s", version, clients) - if not install_packages(self.log, clients, client_packages, 'root').passed: + result = install_packages(self.log, clients, client_packages, 'root', allowerasing=True) + if not result.passed: self.fail(f"Failed to install version {version} on clients") self.current_client_version = Version(version) result = run_remote(self.log, clients, 'daos version') diff --git a/src/tests/ftest/io/io_consistency.yaml b/src/tests/ftest/io/io_consistency.yaml index d2fa53d73d2..43b783bacb2 100644 --- a/src/tests/ftest/io/io_consistency.yaml +++ b/src/tests/ftest/io/io_consistency.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 210 + server_config: name: daos_server engines_per_host: 2 @@ -9,25 +11,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: 
auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: scm_size: 5000000000 nvme_size: 20000000000 + container: type: POSIX control_method: daos + ior: client_processes: np: 10 @@ -51,5 +52,6 @@ ior: objectclass: SX: dfs_oclass: "SX" + dfuse: disable_caching: true diff --git a/src/tests/ftest/io/large_file_count.yaml b/src/tests/ftest/io/large_file_count.yaml index 6ff375cf3a9..3c7fe6499e0 100644 --- a/src/tests/ftest/io/large_file_count.yaml +++ b/src/tests/ftest/io/large_file_count.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 5000 + server_config: name: daos_server engines_per_host: 2 @@ -9,22 +11,21 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 95% + container: type: POSIX control_method: daos + largefilecount: api: - DFS @@ -36,6 +37,7 @@ largefilecount: mdtest_oclass: # Run once with S1 and then with EC_16P2G1 - S1 - EC_2P1G1 + ior: np: 30 dfs_destroy: false @@ -46,6 +48,7 @@ ior: block_size: '7G' env_vars: - D_IL_REPORT=1 + dfuse: disable_caching: true diff --git a/src/tests/ftest/io/macsio_test.yaml b/src/tests/ftest/io/macsio_test.yaml index 663e4efc67c..ea3f5605577 100644 --- a/src/tests/ftest/io/macsio_test.yaml +++ b/src/tests/ftest/io/macsio_test.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 1 test_clients: 3 + timeout: 120 + server_config: name: daos_server engines_per_host: 2 @@ -9,23 +11,22 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: scm_size: 5G nvme_size: 
10G + container: control_method: daos type: POSIX + macsio: interface: hdf5 parallel_file_mode: SIF 1 @@ -36,6 +37,7 @@ macsio: num_dumps: 2 debug_level: 1 processes: 6 + job_manager: !mux mpich: class_name: Mpirun diff --git a/src/tests/ftest/io/seg_count.yaml b/src/tests/ftest/io/seg_count.yaml index 8014a3c6711..7a2d5bd5c8e 100644 --- a/src/tests/ftest/io/seg_count.yaml +++ b/src/tests/ftest/io/seg_count.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 4 test_clients: 4 + timeout: 2000 + server_config: name: daos_server engines_per_host: 2 @@ -9,22 +11,21 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 95% + container: type: POSIX control_method: daos + ior: client_processes: !mux slots_16: diff --git a/src/tests/ftest/io/small_file_count.yaml b/src/tests/ftest/io/small_file_count.yaml index 79e02c3d787..2a33a3ef934 100644 --- a/src/tests/ftest/io/small_file_count.yaml +++ b/src/tests/ftest/io/small_file_count.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 750 + server_config: name: daos_server engines_per_host: 2 @@ -9,23 +11,22 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: scm_size: 40G nvme_size: 300G + container: type: POSIX control_method: daos + largefilecount: api: - DFS @@ -37,6 +38,7 @@ largefilecount: mdtest_oclass: # Run once with S1 and then with EC_16P2G1 - S1 - EC_2P1G1 + ior: np: 30 dfs_destroy: false @@ -47,6 +49,7 @@ ior: block_size: '2G' env_vars: - D_IL_REPORT=1 + dfuse: disable_caching: true diff --git a/src/tests/ftest/io/unaligned_io.yaml 
b/src/tests/ftest/io/unaligned_io.yaml index bf1a6c19a85..3bcc450c8c5 100644 --- a/src/tests/ftest/io/unaligned_io.yaml +++ b/src/tests/ftest/io/unaligned_io.yaml @@ -1,6 +1,8 @@ hosts: test_servers: 4 + timeout: 900 + server_config: name: daos_server engines_per_host: 2 @@ -9,8 +11,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -25,8 +25,6 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -37,8 +35,10 @@ server_config: - D_LOG_FILE_APPEND_PID=1 - COVFILE=/tmp/test.cov storage: auto + pool: scm_size: 12G + datasize: sizes: - 20 diff --git a/src/tests/ftest/ior/crash.yaml b/src/tests/ftest/ior/crash.yaml index ba9579894d6..62c9c23f1f6 100644 --- a/src/tests/ftest/ior/crash.yaml +++ b/src/tests/ftest/ior/crash.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 1 test_clients: 3 + timeout: 800 + server_config: name: daos_server engines_per_host: 2 @@ -9,24 +11,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 90% + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/ior/hard.yaml b/src/tests/ftest/ior/hard.yaml index e04de4c0414..71394243cff 100644 --- a/src/tests/ftest/ior/hard.yaml +++ b/src/tests/ftest/ior/hard.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 3 + timeout: 1000 + server_config: name: daos_server engines_per_host: 2 @@ -9,8 +11,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR 
storage: @@ -21,8 +21,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: @@ -30,12 +28,15 @@ server_config: class: dcpm scm_list: ["/dev/pmem1"] scm_mount: /mnt/daos1 + pool: scm_size: 500G + container: type: POSIX control_method: daos properties: dedup:memcmp + ior: client_processes: np: 48 diff --git a/src/tests/ftest/ior/hard_rebuild.yaml b/src/tests/ftest/ior/hard_rebuild.yaml index 54a89fc237f..32665ab18ad 100644 --- a/src/tests/ftest/ior/hard_rebuild.yaml +++ b/src/tests/ftest/ior/hard_rebuild.yaml @@ -7,10 +7,13 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 + timeout: 1000 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -18,24 +21,23 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto + pool: size: 90% + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/ior/intercept_multi_client.yaml b/src/tests/ftest/ior/intercept_multi_client.yaml index 94a0508fbbb..2fcf2a0454a 100644 --- a/src/tests/ftest/ior/intercept_multi_client.yaml +++ b/src/tests/ftest/ior/intercept_multi_client.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 6 -timeout: 1000 + +timeout: 760 + server_config: name: daos_server engines_per_host: 2 @@ -9,25 +11,22 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: WARN storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: WARN storage: auto + pool: size: 90% - svcn: 1 + 
container: type: POSIX - control_method: daos + ior: env_vars: - D_LOG_MASK=WARN @@ -35,13 +34,13 @@ ior: client_processes: ppn: 16 test_file: testFile - repetitions: 3 sw_deadline: 60 flags: "-v -w -r -R" - dfs_oclass: "SX" - block_size: '100G' + dfs_oclass: SX + block_size: 100G write_x: 0.10 # Max 10% performance difference. read_x: 0.10 # Loosely derived from 3% stddev + 8% actual deviation. + enforce_performance: false # Skip enformance in CI since it is flaky. transfersize: !mux 512B: transfer_size: '512B' @@ -49,5 +48,6 @@ ior: transfer_size: '4K' 1M: transfer_size: '1M' + dfuse: disable_caching: true diff --git a/src/tests/ftest/ior/small.yaml b/src/tests/ftest/ior/small.yaml index c9bc6d90ef8..9f5145debc8 100644 --- a/src/tests/ftest/ior/small.yaml +++ b/src/tests/ftest/ior/small.yaml @@ -15,15 +15,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index a62abe0a508..e79f5fbfd61 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ (C) Copyright 2018-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -310,15 +310,22 @@ def _run(self, args): message = f"Error detecting tests that match tags: {' '.join(args.tags)}" return self.get_exit_status(1, message, "Setup", sys.exc_info()) + logger.info("Finished detecting tests") + # Verify at least one test was requested if not group.tests: message = f"No tests found for tags: {' '.join(args.tags)}" return self.get_exit_status(1, message, "Setup", sys.exc_info()) + logger.info("BANG: 1") + # Done if just listing tests matching the tags if args.list and not args.modify: + logger.info(f"args.list: {args.list}, args.modify: {args.modify}") return self.get_exit_status(0, "Listing tests complete") + logger.info("BANG: 2") + # Setup the fuse configuration try: setup_fuse_config(logger, args.test_servers | args.test_clients) @@ -326,7 +333,7 @@ def _run(self, args): # Warn but don't fail message = "Issue detected setting up the fuse configuration" setup_result.warn_test(logger, "Setup", message, sys.exc_info()) - + logger.info("BANG: 3") # Setup override systemctl files try: clients = args.test_clients if args.test_clients else args.test_servers @@ -335,7 +342,7 @@ def _run(self, args): except LaunchException: message = "Issue detected setting up the systemctl configuration" return self.get_exit_status(1, message, "Setup", sys.exc_info()) - + logger.info("BANG: 4") # Get the core file pattern information core_files = {} if args.process_cores: @@ -347,7 +354,7 @@ def _run(self, args): return self.get_exit_status(1, message, "Setup", sys.exc_info()) else: logger.debug("Not collecting core files") - + logger.info("BANG: 5") # Determine if bullseye code coverage collection is enabled code_coverage = CodeCoverage(test_env) # pylint: disable=unsupported-binary-operation @@ -364,14 +371,14 @@ def _run(self, args): except StorageException: message = "Error 
detecting storage information for test yaml files" status |= self.get_exit_status(1, message, "Setup", sys.exc_info()) - + logger.info("BANG: 6") if args.modify: return self.get_exit_status(0, "Modifying test yaml files complete") - + logger.info("BANG: 7") # Configure slurm if any tests use partitions test_status = group.setup_slurm( logger, self.slurm_setup, self.slurm_install, self.user, self.result) - + logger.info("BANG: 8") # Split the timer for the test result to account for any non-test execution steps as not # to double report the test time accounted for in each individual test result setup_result.end() @@ -382,7 +389,7 @@ def _run(self, args): not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files, args.logs_threshold, args.user_create, code_coverage, self.job_results_dir, self.logdir, args.clear_mounts, cleanup_files) - + logger.info("BANG: 9") # Convert the test status to a launch.py status status |= summarize_run(logger, self.mode, test_status) @@ -392,6 +399,8 @@ def _run(self, args): # Restart the timer for the test result to account for any non-test execution steps setup_result.start() + logger.info("BANG: 10") + # Return the appropriate return code and mark the test result to account for any non-test # execution steps complete return self.get_exit_status(status, "Executing tests complete") @@ -731,6 +740,7 @@ def main(): args = parser.parse_args() # Override arguments via the mode + logger.info(f"args.mode: {args.mode}") if args.mode == "ci": args.archive = True args.include_localhost = True @@ -764,4 +774,4 @@ def main(): logger.addHandler(get_console_handler("%(message)s", logging.DEBUG)) main() else: - logger = logging.getLogger() + logger = logging.getLogger() \ No newline at end of file diff --git a/src/tests/ftest/mdtest/small.yaml b/src/tests/ftest/mdtest/small.yaml index 8fd080664f4..afa3efd2e3d 100644 --- a/src/tests/ftest/mdtest/small.yaml +++ b/src/tests/ftest/mdtest/small.yaml @@ -15,15 +15,11 @@ 
server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: diff --git a/src/tests/ftest/nvme/enospace.py b/src/tests/ftest/nvme/enospace.py index 89376c9427d..1b85b26e1f4 100644 --- a/src/tests/ftest/nvme/enospace.py +++ b/src/tests/ftest/nvme/enospace.py @@ -461,6 +461,7 @@ def run_enospace_foreground(self, log_file): # Fill 75% of current SCM free space. Aggregation is Enabled so NVMe space will # start to fill up. + # pylint: disable-next=logging-too-few-args self.log.info('--Filling 75% of the current SCM free space--') try: self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75) @@ -477,6 +478,7 @@ def run_enospace_foreground(self, log_file): # Fill 60% of current SCM free space. This time, NVMe will be Full so data will # not be moved to NVMe and continue to fill up SCM. SCM will be full and this # command is expected to fail with DER_NOSPACE. + # pylint: disable-next=logging-too-few-args self.log.info('--Filling 60% of the current SCM free space--') try: self.start_ior_load( diff --git a/src/tests/ftest/nvme/enospace.yaml b/src/tests/ftest/nvme/enospace.yaml index 43bdd6b787b..396ddee7cfc 100644 --- a/src/tests/ftest/nvme/enospace.yaml +++ b/src/tests/ftest/nvme/enospace.yaml @@ -1,9 +1,11 @@ hosts: test_servers: 2 test_clients: 2 + # Few tests are in loop for ~10 times and single loop # is taking ~600 seconds to finish so larger timeout to run tests. 
timeout: 7500 + server_config: name: daos_server engines_per_host: 2 @@ -11,25 +13,24 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 targets: 1 storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 targets: 1 storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 5G nvme_size: 5G @@ -43,6 +44,7 @@ container: control_method: daos register_cleanup: False # Skip teardown destroy. Test manually destroys containers. type: POSIX + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/nvme/fragmentation.yaml b/src/tests/ftest/nvme/fragmentation.yaml index 835a0d70ac3..279f2c50498 100644 --- a/src/tests/ftest/nvme/fragmentation.yaml +++ b/src/tests/ftest/nvme/fragmentation.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 6000 + server_config: name: daos_server engines_per_host: 2 @@ -9,22 +11,21 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 95% + container: type: POSIX control_method: daos + ior: num_repeat: 30 num_parallel_job: 10 diff --git a/src/tests/ftest/nvme/health.py b/src/tests/ftest/nvme/health.py index d23cb8427a4..16460bf680b 100644 --- a/src/tests/ftest/nvme/health.py +++ b/src/tests/ftest/nvme/health.py @@ -1,5 +1,6 @@ ''' (C) Copyright 2020-2024 Intel Corporation. 
+ (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -47,24 +48,51 @@ def test_monitor_for_large_pools(self): # Calculate the potential number of pools and use up to the max from config potential_num_pools = int((nvme_per_engine / (min_nvme_per_target * targets_per_engine))) actual_num_pools = min(max_num_pools, potential_num_pools) + self.log.debug("Calculating pool size and quantity") + self.log.debug( + " scm_per_engine: %s * %s = %s", + space_per_engine['scm'], total_pool_percentage, scm_per_engine) + self.log.debug( + " nvme_per_engine: %s * %s = %s", + space_per_engine['nvme'], total_pool_percentage, nvme_per_engine) + self.log.debug( + " potential_num_pools: %s / (%s * %s) = %s", + nvme_per_engine, min_nvme_per_target, targets_per_engine, potential_num_pools) # consider 1GiB RDB memory consume for MD-on-SSD rdb_size = 1073741824 if self.server_managers[0].manager.job.using_control_metadata: - min_scm_per_pool = 104857600 + min_scm_per_pool = 1073741824 potential_num_pools = int(scm_per_engine / (min_scm_per_pool + rdb_size)) actual_num_pools = min(potential_num_pools, actual_num_pools) + self.log.debug( + " potential_num_pools (md on ssd): %s / (%s + %s) = %s", + scm_per_engine, scm_per_engine, rdb_size, potential_num_pools) + + self.log.debug( + " actual_num_pools: min(%s, %s) = %s", + max_num_pools, potential_num_pools, actual_num_pools) # Split available space across the number of pools to be created scm_per_pool = int(scm_per_engine / actual_num_pools) + self.log.debug( + " scm_per_pool: %s / %s = %s", + scm_per_engine, actual_num_pools, scm_per_pool) if self.server_managers[0].manager.job.using_control_metadata: + self.log.debug( + " scm_per_pool (md on ssd): %s - %s = %s", + scm_per_pool, rdb_size, int(scm_per_pool - rdb_size)) scm_per_pool = int(scm_per_pool - rdb_size) + nvme_per_pool = int(nvme_per_engine / actual_num_pools) + self.log.debug( + " nvme_per_pool: %s / %s = %s", + 
nvme_per_engine, actual_num_pools, nvme_per_pool) # Create the pools pool_list = [] for pool_num in range(actual_num_pools): - self.log.info("-- Creating pool number = %s", pool_num) + self.log.info("-- Creating pool number %s of %s", pool_num + 1, actual_num_pools) try: pool_list.append(self.get_pool(scm_size=scm_per_pool, nvme_size=nvme_per_pool)) except TestFail as error: diff --git a/src/tests/ftest/nvme/health.yaml b/src/tests/ftest/nvme/health.yaml index 5eb9ea7f3e3..e8827d15070 100644 --- a/src/tests/ftest/nvme/health.yaml +++ b/src/tests/ftest/nvme/health.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 900 + server_config: name: daos_server engines_per_host: 2 @@ -10,26 +12,25 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto transport_config: allow_insecure: True + agent_config: transport_config: allow_insecure: True + dmg: transport_config: allow_insecure: True + pool: max_num_pools: 40 total_pool_percentage: 95 diff --git a/src/tests/ftest/nvme/io.yaml b/src/tests/ftest/nvme/io.yaml index d671c4f84d9..972149d633b 100644 --- a/src/tests/ftest/nvme/io.yaml +++ b/src/tests/ftest/nvme/io.yaml @@ -13,7 +13,7 @@ server_config: storage: auto pool: - properties: reclaim:disabled + properties: rd_fac:0,space_rb:0,reclaim:disabled container: control_method: daos diff --git a/src/tests/ftest/nvme/io_verification.yaml b/src/tests/ftest/nvme/io_verification.yaml index a183f319cd2..48fbb787825 100644 --- a/src/tests/ftest/nvme/io_verification.yaml +++ b/src/tests/ftest/nvme/io_verification.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 4000 + server_config: name: daos_server engines_per_host: 2 @@ -9,30 +11,33 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: 
ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: num_pools: 4 + pool_0: size: 20% + pool_1: size: 30% + pool_2: size: 50% + pool_3: size: 60% + container: type: POSIX control_method: daos + ior: client_processes: np: 16 diff --git a/src/tests/ftest/nvme/object.yaml b/src/tests/ftest/nvme/object.yaml index 9df02748652..6fdac0a0a41 100644 --- a/src/tests/ftest/nvme/object.yaml +++ b/src/tests/ftest/nvme/object.yaml @@ -1,9 +1,11 @@ hosts: test_servers: 3 test_clients: 1 + timeouts: test_nvme_object_single_pool: 270 test_nvme_object_multiple_pools: 16000 + server_config: name: daos_server engines_per_host: 2 @@ -11,26 +13,26 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool_1: scm_size: 4GB nvme_size: 20G + pool_2: scm_size: 4GB nvme_size: 100GB + pool_3: scm_size: 4GB nvme_size: 350GB + container: object_qty: 10 record_size: diff --git a/src/tests/ftest/nvme/pool_capacity.py b/src/tests/ftest/nvme/pool_capacity.py index dea4d5bedca..c66f40927f5 100644 --- a/src/tests/ftest/nvme/pool_capacity.py +++ b/src/tests/ftest/nvme/pool_capacity.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. 
+ (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -93,10 +94,7 @@ def run_test_create_delete(self, num_pool=2, num_cont=5, total_count=100): self.log.info("Running test %s", loop_count) offset = loop_count * num_pool for val in range(offset, offset + num_pool): - self.pool.append( - self.get_pool( - namespace="/run/pool_qty_{}/*".format(num_pool), - properties="reclaim:disabled")) + self.pool.append(self.get_pool(namespace=f"/run/pool_qty_{num_pool}/*")) display_string = "pool{} space at the Beginning".format(val) self.pool[-1].display_pool_daos_space(display_string) @@ -151,8 +149,7 @@ def run_test(self, num_pool=1): # Create the IOR threads threads = [] for val in range(0, num_pool): - self.pool.append(self.get_pool(namespace="/run/pool_qty_{}/*".format(num_pool), - properties="reclaim:disabled")) + self.pool.append(self.get_pool(namespace=f"/run/pool_qty_{num_pool}/*")) display_string = "pool{} space at the Beginning".format(val) self.pool[-1].display_pool_daos_space(display_string) diff --git a/src/tests/ftest/nvme/pool_capacity.yaml b/src/tests/ftest/nvme/pool_capacity.yaml index 135bb746207..cfe5c31704f 100644 --- a/src/tests/ftest/nvme/pool_capacity.yaml +++ b/src/tests/ftest/nvme/pool_capacity.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 1800 + server_config: name: daos_server engines_per_host: 1 @@ -10,20 +12,30 @@ server_config: nr_xs_helpers: 1 log_file: daos_server0.log storage: auto -pool: - mode: 146 - name: daos_server + +pool_common: &pool_common + properties: rd_fac:0,space_rb:0,reclaim:disabled + pool_qty_1: size: 50% + <<: *pool_common + pool_qty_2: size: 25% + <<: *pool_common + pool_qty_3: size: 16% + <<: *pool_common + pool_qty_10: size: 5% + <<: *pool_common + container: type: POSIX control_method: daos + ior: no_parallel_job: 10 clientslots: diff --git a/src/tests/ftest/nvme/pool_exclude.yaml b/src/tests/ftest/nvme/pool_exclude.yaml index 
1b576c018d7..f08fd7f24d4 100644 --- a/src/tests/ftest/nvme/pool_exclude.yaml +++ b/src/tests/ftest/nvme/pool_exclude.yaml @@ -1,13 +1,16 @@ hosts: test_servers: 5 test_clients: 3 + # If we define the server under test_servers, launch.py will convert it to the # actual server name passed into --test_servers. If above test_servers is hosts, # it'll be used as one of the servers at test startup time, so use something # other than hosts. timeout: 1000 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -15,29 +18,28 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: ERR storage: auto + pool: scm_size: 50000000000 nvme_size: 300000000000 svcn: 4 rebuild_timeout: 180 pool_query_timeout: 30 + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:2 + ior_flags_common: &ior_flags_common write_flags: "-w -F -k -G 1" read_flags: "-F -r -R -k -G 1" @@ -49,6 +51,7 @@ ior_flags_common: &ior_flags_common - ["NA", "NA", 4000, 50000000] # [4K, 51M] - ["NA", "NA", 1000000, 500000000] # [1M, 512M] - ["NA", "NA", 1000000000, 8000000000] # [1G, 8G] + ior: client_processes: np: 48 @@ -68,11 +71,15 @@ ior: <<: *ior_flags_common dfs_oclass: EC_8P2GX dfs_dir_oclass: EC_8P2GX + loop_test: iterations: 2 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false diff --git a/src/tests/ftest/nvme/pool_extend.py b/src/tests/ftest/nvme/pool_extend.py index 876050a8561..485c6b3ee88 100644 --- a/src/tests/ftest/nvme/pool_extend.py +++ b/src/tests/ftest/nvme/pool_extend.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2023 Intel Corporation. 
+ (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -54,7 +55,7 @@ def run_nvme_pool_extend(self, num_pool, oclass=None): pools = [] for _ in range(0, num_pool): pools.append(self.get_pool(namespace="/run/pool_qty_{}/*".format(num_pool), - properties="reclaim:disabled")) + properties="rd_fac:0,space_rb:0,reclaim:disabled")) # On each pool (max 3), extend the ranks # eg: ranks : 4,5 ; 6,7; 8,9. diff --git a/src/tests/ftest/nvme/pool_extend.yaml b/src/tests/ftest/nvme/pool_extend.yaml index 5664bb725d2..62744c5d5e5 100644 --- a/src/tests/ftest/nvme/pool_extend.yaml +++ b/src/tests/ftest/nvme/pool_extend.yaml @@ -1,15 +1,19 @@ hosts: test_servers: server-[1-2] test_clients: 2 + # If we define the server under test_servers, launch.py will convert it to the # actual server name passed into --test_servers. If above test_servers is hosts, # it'll be used as one of the servers at test startup time, so use something # other than hosts. 
extra_servers: test_servers: server-[3-5] + timeout: 1000 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -17,31 +21,33 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log storage: auto + pool: svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 + pool_qty_1: size: "50%" + pool_qty_2: size: "25%" + pool_qty_3: size: "16%" + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 + ior_flags_common: &ior_flags_common write_flags: "-w -F -k -G 1" read_flags: "-F -r -R -k -G 1" @@ -53,6 +59,7 @@ ior_flags_common: &ior_flags_common - ["NA", "NA", 4000, 5000000] # [4K, 5.1M] - ["NA", "NA", 1000000, 500000000] # [1M, 512M] - ["NA", "NA", 1000000000, 8000000000] # [1G, 8G] + ior: client_processes: np: 48 @@ -68,11 +75,15 @@ ior: <<: *ior_flags_common dfs_oclass: EC_2P1GX dfs_dir_oclass: EC_2P1GX + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false diff --git a/src/tests/ftest/osa/dmg_negative_test.yaml b/src/tests/ftest/osa/dmg_negative_test.yaml index b2adbe6584f..f9e943defd6 100644 --- a/src/tests/ftest/osa/dmg_negative_test.yaml +++ b/src/tests/ftest/osa/dmg_negative_test.yaml @@ -1,14 +1,18 @@ hosts: test_servers: server-[1-2] test_clients: 1 + # If we define the server under test_servers, launch.py will convert it to the # actual server name passed into --test_servers. If above test_servers is hosts, # it'll be used as one of the servers at test startup time, so use something # other than hosts. 
extra_servers: test_servers: server-3 + timeout: 1800 + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -16,8 +20,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -26,19 +28,20 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 4 + container: properties: cksum:crc16,cksum_size:16384 + test_sequence: dmg_cmd_test: # Rank list, target_list, Pass/Fail Condition diff --git a/src/tests/ftest/osa/offline_drain.py b/src/tests/ftest/osa/offline_drain.py index 04cb98f67e8..e7ba3b1b25d 100644 --- a/src/tests/ftest/osa/offline_drain.py +++ b/src/tests/ftest/osa/offline_drain.py @@ -23,23 +23,21 @@ def setUp(self): """Set up for test case.""" super().setUp() self.dmg_command = self.get_dmg_command() - self.ranks = self.params.get("rank_list", '/run/test_ranks/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') self.ior_test_sequence = self.params.get( "ior_test_sequence", '/run/ior/iorflags/*') # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) - def run_offline_drain_test(self, num_pool, data=False, oclass=None, pool_fillup=0, - num_ranks=1): + def run_offline_drain_test(self, num_pool, ranks, data=False, oclass=None, pool_fillup=0): """Run the offline drain without data. Args: num_pool (int) : total pools to create for testing purposes. + ranks (list) : Ranks to drain. data (bool) : whether pool has no data or to create some data in pool. Defaults to False. oclass (str): DAOS object class (eg: RP_2G1,etc) - num_ranks (int): Number of ranks to drain. Defaults to 1. 
""" # Create a pool pool = {} @@ -48,12 +46,6 @@ def run_offline_drain_test(self, num_pool, data=False, oclass=None, pool_fillup= if oclass is None: oclass = self.ior_cmd.dfs_oclass.value - # Get a random rank(s) based on num_ranks input. - ranklist = list(self.server_managers[0].ranks.keys()) - # For tests which uses num_ranks equal to 1, use the YAML file information. - if num_ranks > 1: - self.ranks = [",".join(map(str, self.random.sample(ranklist, k=num_ranks)))] - # Exclude target : random two targets (target idx : 0-7) exc = self.random.randint(0, 6) target_list.append(exc) @@ -92,7 +84,7 @@ def run_offline_drain_test(self, num_pool, data=False, oclass=None, pool_fillup= # Drain ranks and targets for val in range(0, num_pool): # Drain ranks provided in YAML file - for index, rank in enumerate(self.ranks): + for index, rank in enumerate(ranks): self.pool = pool[val] # If we are testing using multiple pools, reintegrate # the rank back and then drain. @@ -164,7 +156,8 @@ def test_osa_offline_drain(self): :avocado: tags=OSAOfflineDrain,test_osa_offline_drain """ self.log.info("Offline Drain : Basic Drain") - self.run_offline_drain_test(1, True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_drain_without_checksum(self): """Test ID: DAOS-7159. @@ -178,7 +171,8 @@ def test_osa_offline_drain_without_checksum(self): """ self.test_with_checksum = self.params.get("test_with_checksum", "/run/checksum/*") self.log.info("Offline Drain : Without Checksum") - self.run_offline_drain_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_drain_during_aggregation(self): """Test ID: DAOS-7159. 
@@ -193,7 +187,8 @@ def test_osa_offline_drain_during_aggregation(self): self.test_during_aggregation = self.params.get( "test_with_aggregation", "/run/aggregation/*") self.log.info("Offline Drain : During Aggregation") - self.run_offline_drain_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_drain_oclass(self): """Test ID: DAOS-7159. @@ -207,8 +202,9 @@ def test_osa_offline_drain_oclass(self): """ self.test_with_checksum = self.params.get("test_with_checksum", "/run/checksum/*") self.log.info("Offline Drain : Oclass") + ranks = self.get_random_test_ranks() for oclass in self.test_oclass: - self.run_offline_drain_test(1, data=True, oclass=oclass) + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks, oclass=oclass) def test_osa_offline_drain_multiple_pools(self): """Test ID: DAOS-7159. @@ -221,7 +217,8 @@ def test_osa_offline_drain_multiple_pools(self): :avocado: tags=OSAOfflineDrain,test_osa_offline_drain_multiple_pools """ self.log.info("Offline Drain : Multiple Pools") - self.run_offline_drain_test(2, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=2, data=True, ranks=ranks) def test_osa_offline_drain_during_rebuild(self): """Test ID: DAOS-7159. @@ -235,7 +232,8 @@ def test_osa_offline_drain_during_rebuild(self): """ self.test_during_rebuild = self.params.get("test_with_rebuild", "/run/rebuild/*") self.log.info("Offline Drain : During Rebuild") - self.run_offline_drain_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_drain_after_snapshot(self): """Test ID: DAOS-8057. 
@@ -249,7 +247,8 @@ def test_osa_offline_drain_after_snapshot(self): """ self.test_with_snapshot = self.params.get("test_with_snapshot", "/run/snapshot/*") self.log.info("Offline Drain : After taking snapshot") - self.run_offline_drain_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_drain_with_less_pool_space(self): """Test ID: DAOS-7160. @@ -264,7 +263,9 @@ def test_osa_offline_drain_with_less_pool_space(self): self.log.info("Offline Drain : Test with less pool space") oclass = self.params.get("pool_test_oclass", '/run/pool_capacity/*') pool_fillup = self.params.get("pool_fillup", '/run/pool_capacity/*') - self.run_offline_drain_test(1, data=True, oclass=oclass, pool_fillup=pool_fillup) + ranks = self.get_random_test_ranks() + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks, oclass=oclass, + pool_fillup=pool_fillup) def test_osa_offline_drain_with_multiple_ranks(self): """Test ID: DAOS-4753. 
@@ -277,4 +278,5 @@ def test_osa_offline_drain_with_multiple_ranks(self): :avocado: tags=OSAOfflineDrain,test_osa_offline_drain_with_multiple_ranks """ self.log.info("Offline Drain : Test with multiple ranks") - self.run_offline_drain_test(1, data=True, num_ranks=2) + ranks = self.get_random_test_ranks(join_ranks=False) + self.run_offline_drain_test(num_pool=1, data=True, ranks=ranks) diff --git a/src/tests/ftest/osa/offline_drain.yaml b/src/tests/ftest/osa/offline_drain.yaml index a8776edd4a4..61c9aae2849 100644 --- a/src/tests/ftest/osa/offline_drain.yaml +++ b/src/tests/ftest/osa/offline_drain.yaml @@ -1,10 +1,14 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 2400 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +16,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: INFO,MEM=ERR env_vars: @@ -22,36 +24,40 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: INFO,MEM=ERR env_vars: - DD_MASK=mgmt,md storage: auto + pool: scm_size: 12000000000 nvme_size: 108000000000 svcn: 4 rebuild_timeout: 240 - properties: scrub:timed + properties: rd_fac:0,space_rb:0,scrub:timed + container: type: POSIX control_method: daos oclass: RP_3G6 properties: cksum:crc64,cksum_size:16384,srv_cksum:on,rd_fac:2 + dkeys: single: no_of_dkeys: - 50 + akeys: single: no_of_akeys: - 10 + record: 1KB: length: - 1024 + ior: clientslots: slots: 48 @@ -69,6 +75,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [12000000000, 108000000000, 500000, 500000000] + mdtest: api: DFS client_processes: @@ -87,21 +94,25 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_2G8 - RP_4G1 - EC_2P1G1 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false + snapshot: test_with_snapshot: true -test_ranks: - rank_list: ["2", "5"] + pool_capacity: pool_fillup: 10 pool_test_oclass: RP_2GX diff --git a/src/tests/ftest/osa/offline_extend.yaml b/src/tests/ftest/osa/offline_extend.yaml index 599d514db37..40eb0467f41 100644 --- a/src/tests/ftest/osa/offline_extend.yaml +++ b/src/tests/ftest/osa/offline_extend.yaml @@ -1,12 +1,17 @@ hosts: test_servers: server-[1-2] test_clients: 1 + extra_servers: test_servers: server-3 + timeout: 1100 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -14,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -24,23 +27,24 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 2 - properties: scrub:lazy + properties: rd_fac:0,space_rb:0,scrub:lazy + container: type: POSIX control_method: daos oclass: RP_2G1 properties: cksum:crc64,cksum_size:16384,srv_cksum:on,rd_fac:1 + ior: clientslots: slots: 48 @@ -58,6 +62,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [6000000000, 54000000000, 500000, 500000000] + mdtest: api: DFS client_processes: @@ -76,19 +81,26 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_2G8 - EC_2P1G1 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false + snapshot: test_with_snapshot: true + test_ranks: rank_list: ["4", "5"] diff --git a/src/tests/ftest/osa/offline_parallel_test.yaml b/src/tests/ftest/osa/offline_parallel_test.yaml index e8dd4563339..b17bf53705f 100644 --- a/src/tests/ftest/osa/offline_parallel_test.yaml +++ b/src/tests/ftest/osa/offline_parallel_test.yaml @@ -1,12 +1,17 @@ hosts: test_servers: server-[1-2] test_clients: 1 + extra_servers: test_servers: server-3 + timeout: 700 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -14,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -24,22 +27,23 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 4 + container: type: POSIX control_method: daos oclass: RP_2G8 properties: cksum:crc64,cksum_size:16384,srv_cksum:on,rd_fac:1 + ior: clientslots: slots: 48 @@ -57,6 +61,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [6000000000, 54000000000, 500000, 500000000] + mdtest: api: DFS client_processes: @@ -75,16 +80,22 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_2G8 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false + system_stop_start: flags: true diff --git a/src/tests/ftest/osa/offline_reintegration.py b/src/tests/ftest/osa/offline_reintegration.py index 7f39b161f73..8d92ce8a556 100644 --- a/src/tests/ftest/osa/offline_reintegration.py +++ b/src/tests/ftest/osa/offline_reintegration.py @@ -1,6 +1,6 @@ """ (C) Copyright 2020-2023 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -32,13 +32,14 @@ def setUp(self): self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.dmg_command.exit_status_exception = True - def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None, - pool_fillup=0, num_ranks=1): + def run_offline_reintegration_test(self, num_pool, ranks, data=False, server_boot=False, + oclass=None, pool_fillup=0): # pylint: disable=too-many-branches """Run the offline reintegration without data. Args: num_pool (int) : total pools to create for testing purposes. + ranks (list) : Ranks to exclude and reintegrate during the testing. data (bool) : whether pool has no data or to create some data in pool. Defaults to False. server_boot (bool) : Perform system stop/start on a rank. Defaults to False. @@ -47,6 +48,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False operations. num_ranks (int): Number of ranks to drain. Defaults to 1. """ + # Figure out an additional unique rank to stop during rebuild. 
+ # Used when self.test_during_rebuild is True + all_ranks = list(map(str, self.server_managers[0].ranks.keys())) + all_exclude_ranks = ','.join(ranks).split(',') + rank_during_rebuild = self.random.choice(list(set(all_ranks) - set(all_exclude_ranks))) + # Create 'num_pool' number of pools pools = [] if oclass is None: @@ -81,15 +88,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False if self.test_during_aggregation is True: self.run_ior_thread("Write", oclass, test_seq) - if num_ranks > 1: - # Exclude ranks from a random pool - ranklist = list(self.server_managers[0].ranks.keys()) - ranks = [",".join(map(str, self.random.sample(ranklist, k=num_ranks)))] - else: - # Exclude ranks 0 and 3 from a random pool (when num_ranks equal to 1) - ranks = ["0", "3"] - - self.pool = self.random.choice(pools) # nosec + self.pool = self.random.choice(pools) for loop in range(0, self.loop_test_cnt): self.log.info( "==> (Loop %s/%s) Excluding ranks %s from %s", @@ -102,8 +101,8 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False initial_free_space = self.pool.get_total_free_space(refresh=True) if server_boot is False: if (self.test_during_rebuild is True and index == 0): - # Exclude rank 5 - output = self.pool.exclude("5") + # Exclude an additional rank + output = self.pool.exclude(rank_during_rebuild) self.print_and_assert_on_rebuild_failure(output) if self.test_during_aggregation is True: self.delete_extra_container(self.pool) @@ -122,10 +121,9 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False output = self.dmg_command.system_stop(ranks=rank, force=True) self.print_and_assert_on_rebuild_failure(output) output = self.dmg_command.system_start(ranks=rank) - # Just try to reintegrate rank 5 + # Just try to reintegrate the additional rank if (self.test_during_rebuild is True and index == 2): - # Reintegrate rank 5 - output = self.pool.reintegrate("5") + output = 
self.pool.reintegrate(rank_during_rebuild) self.print_and_assert_on_rebuild_failure(output) pver_exclude = self.pool.get_version(True) @@ -134,10 +132,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False # Check pool version incremented after pool exclude # pver_exclude should be greater than # pver_begin + 1 (1 target + exclude) - self.assertTrue(pver_exclude > (pver_begin + 1), - "Pool Version Error: After exclude") - self.assertTrue(initial_free_space > free_space_after_exclude, - "Expected free space after exclude is less than initial") + if not pver_exclude > (pver_begin + 1): + self.fail(f"Pool version after exclude: {pver_exclude} !> {pver_begin + 1}") + if not initial_free_space > free_space_after_exclude: + self.fail( + "Expected free space after exclude: " + f"{initial_free_space} !> {free_space_after_exclude}") # Reintegrate the ranks which was excluded self.log.info( @@ -161,12 +161,14 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False free_space_after_reintegration = self.pool.get_total_free_space(refresh=True) pver_reint = self.pool.get_version(True) self.log.info("Pool Version after reintegrate %d", pver_reint) - # Check pool version incremented after pool reintegrate - self.assertTrue(pver_reint > pver_exclude, "Pool Version Error: After reintegrate") - self.assertTrue(free_space_after_reintegration > free_space_after_exclude, - "Expected free space after reintegration is less than exclude") - - display_string = "{} space at the End".format(str(self.pool)) + if not pver_reint > pver_exclude: + self.fail(f"Pool version after reintegrate: {pver_reint} !> {pver_exclude}") + if not free_space_after_reintegration > free_space_after_exclude: + self.fail( + "Expected free space after reintegrate: " + f"{free_space_after_reintegration} !> {free_space_after_exclude}") + + display_string = f"{str(self.pool)} space at the End" self.pool.display_pool_daos_space(display_string) # Finally check 
whether the written data can be accessed. @@ -195,7 +197,8 @@ def test_osa_offline_reintegration_without_checksum(self): """ self.test_with_checksum = self.params.get("test_with_checksum", '/run/checksum/*') self.log.info("Offline Reintegration : Without Checksum") - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923. @@ -208,7 +211,8 @@ def test_osa_offline_reintegration_multiple_pools(self): :avocado: tags=OSAOfflineReintegration,test_osa_offline_reintegration_multiple_pools """ self.log.info("Offline Reintegration : Multiple Pools") - self.run_offline_reintegration_test(5, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=5, data=True, ranks=ranks) def test_osa_offline_reintegration_server_stop(self): """Test ID: DAOS-6748. @@ -221,7 +225,8 @@ def test_osa_offline_reintegration_server_stop(self): :avocado: tags=OSAOfflineReintegration,test_osa_offline_reintegration_server_stop """ self.log.info("Offline Reintegration : System Start/Stop") - self.run_offline_reintegration_test(1, data=True, server_boot=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, server_boot=True, ranks=ranks) def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923. @@ -236,7 +241,8 @@ def test_osa_offline_reintegrate_during_rebuild(self): self.loop_test_cnt = self.params.get("iterations", '/run/loop_test/*') self.test_during_rebuild = self.params.get("test_with_rebuild", '/run/rebuild/*') self.log.info("Offline Reintegration : Rebuild") - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923. 
@@ -249,8 +255,10 @@ def test_osa_offline_reintegration_oclass(self): :avocado: tags=OSAOfflineReintegration,test_osa_offline_reintegration_oclass """ self.log.info("Offline Reintegration : Object Class") + ranks = self.get_random_test_ranks() for oclass in self.test_oclass: - self.run_offline_reintegration_test(1, data=True, server_boot=False, oclass=oclass) + self.run_offline_reintegration_test(num_pool=1, data=True, server_boot=False, + oclass=oclass, ranks=ranks) def test_osa_offline_reintegrate_during_aggregation(self): """Test ID: DAOS-6923. @@ -265,7 +273,8 @@ def test_osa_offline_reintegrate_during_aggregation(self): self.test_during_aggregation = self.params.get("test_with_aggregation", '/run/aggregation/*') self.log.info("Offline Reintegration : Aggregation") - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegration_with_rf(self): """Test ID: DAOS-6923. @@ -280,7 +289,8 @@ def test_osa_offline_reintegration_with_rf(self): """ self.log.info("Offline Reintegration : RF") self.test_with_rf = self.params.get("test_with_rf", '/run/test_rf/*') - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegrate_with_blank_node(self): """Test ID: DAOS-6923. @@ -294,7 +304,8 @@ def test_osa_offline_reintegrate_with_blank_node(self): """ self.test_with_blank_node = self.params.get("test_with_blank_node", '/run/blank_node/*') self.log.info("Offline Reintegration : Test with blank node") - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegrate_after_snapshot(self): """Test ID: DAOS-8057. 
@@ -308,7 +319,8 @@ def test_osa_offline_reintegrate_after_snapshot(self): """ self.test_with_snapshot = self.params.get("test_with_snapshot", '/run/snapshot/*') self.log.info("Offline Reintegration : Test with snapshot") - self.run_offline_reintegration_test(1, data=True) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) def test_osa_offline_reintegrate_with_less_pool_space(self): """Test ID: DAOS-7160. @@ -323,7 +335,9 @@ def test_osa_offline_reintegrate_with_less_pool_space(self): self.log.info("Offline Reintegration : Test with less pool space") oclass = self.params.get("pool_test_oclass", '/run/pool_capacity/*') pool_fillup = self.params.get("pool_fillup", '/run/pool_capacity/*') - self.run_offline_reintegration_test(1, data=True, oclass=oclass, pool_fillup=pool_fillup) + ranks = self.get_random_test_ranks() + self.run_offline_reintegration_test(num_pool=1, data=True, oclass=oclass, + pool_fillup=pool_fillup, ranks=ranks) def test_osa_offline_reintegrate_with_multiple_ranks(self): """Test ID: DAOS-4753. 
@@ -336,4 +350,5 @@ def test_osa_offline_reintegrate_with_multiple_ranks(self): :avocado: tags=OSAOfflineReintegration,test_osa_offline_reintegrate_with_multiple_ranks """ self.log.info("Offline Reintegration : Test with multiple ranks") - self.run_offline_reintegration_test(1, data=True, num_ranks=2) + ranks = self.get_random_test_ranks(join_ranks=False) + self.run_offline_reintegration_test(num_pool=1, data=True, ranks=ranks) diff --git a/src/tests/ftest/osa/offline_reintegration.yaml b/src/tests/ftest/osa/offline_reintegration.yaml index 359c7bab9b6..659186f7dbc 100644 --- a/src/tests/ftest/osa/offline_reintegration.yaml +++ b/src/tests/ftest/osa/offline_reintegration.yaml @@ -1,12 +1,17 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 1300 + timeouts: test_osa_offline_reintegrate_with_less_pool_space: 1800 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -14,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -24,8 +27,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -33,27 +34,32 @@ server_config: storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 4 rebuild_timeout: 240 pool_query_timeout: 30 - properties: scrub:timed,scrub_freq:1 + properties: rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1 + container: type: POSIX control_method: daos oclass: RP_3G6 properties: cksum:crc64,cksum_size:16384,srv_cksum:on,rd_fac:2 + ior: - clientslots: - slots: 48 + client_processes: + np: 2 test_file: /testFile repetitions: 2 dfs_destroy: false @@ -68,6 +74,7 @@ ior: # The values are set to be in the 
multiples of 10. # Values are appx GB. - [6000000000, 54000000000, 500000, 500000000] + mdtest: api: DFS client_processes: @@ -86,24 +93,33 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_4G1 - EC_2P2G1 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false + test_rf: test_with_rf: true + blank_node: test_with_blank_node: true + snapshot: test_with_snapshot: true + pool_capacity: pool_fillup: 10 pool_test_oclass: RP_2GX diff --git a/src/tests/ftest/osa/online_drain.py b/src/tests/ftest/osa/online_drain.py index 98cba3dc94f..f044ad38af5 100644 --- a/src/tests/ftest/osa/online_drain.py +++ b/src/tests/ftest/osa/online_drain.py @@ -32,14 +32,14 @@ def setUp(self): self.dmg_command.exit_status_exception = True self.pool = None - def run_online_drain_test(self, num_pool, oclass=None, app_name="ior", num_ranks=1): + def run_online_drain_test(self, num_pool, ranks, oclass=None, app_name="ior"): """Run the Online drain without data. Args: num_pool (int) : total pools to create for testing purposes. + ranks (list) : list of ranks to drain oclass (str) : Object class type (RP_2G1, etc) app_name (str) : application to run on parallel (ior or mdtest). Defaults to ior. - num_ranks (int): Number of ranks to drain. Defaults to 1. """ # Create a pool pool = {} @@ -51,10 +51,6 @@ def run_online_drain_test(self, num_pool, oclass=None, app_name="ior", num_ranks targets = int(self.server_managers[-1].get_config_value('targets')) t_string = ','.join(map(str, self.random.sample(range(targets), 2))) - # Get random rank(s) from the rank list. 
- ranklist = list(self.server_managers[0].ranks.keys()) - rank = ",".join(map(str, self.random.sample(ranklist, k=num_ranks))) - for val in range(0, num_pool): pool[val] = add_pool(self, connect=False) pool[val].set_property("reclaim", "disabled") @@ -89,7 +85,7 @@ def run_online_drain_test(self, num_pool, oclass=None, app_name="ior", num_ranks self.log.info("Pool Version at the beginning %s", pver_begin) # Get initial total space (scm+nvme) initial_total_space = self.pool.get_total_space(refresh=True) - output = self.pool.drain(rank, t_string) + output = self.pool.drain(ranks, t_string) self.print_and_assert_on_rebuild_failure(output) total_space_after_drain = self.pool.get_total_space(refresh=True) @@ -127,7 +123,8 @@ def test_osa_online_drain(self): :avocado: tags=OSAOnlineDrain,test_osa_online_drain """ self.log.info("Online Drain : With Checksum") - self.run_online_drain_test(1) + ranks = self.get_random_test_ranks() + self.run_online_drain_test(num_pool=1, ranks=ranks) def test_osa_online_drain_no_csum(self): """Test ID: DAOS-6909 @@ -143,7 +140,8 @@ def test_osa_online_drain_no_csum(self): self.log.info("Online Drain : No Checksum") self.test_with_checksum = self.params.get("test_with_checksum", '/run/checksum/*') - self.run_online_drain_test(1) + ranks = self.get_random_test_ranks() + self.run_online_drain_test(num_pool=1, ranks=ranks) def test_osa_online_drain_oclass(self): """Test ID: DAOS-6909 @@ -157,8 +155,9 @@ def test_osa_online_drain_oclass(self): :avocado: tags=OSAOnlineDrain,test_osa_online_drain_oclass """ self.log.info("Online Drain : Oclass") + ranks = self.get_random_test_ranks() for oclass in self.test_oclass: - self.run_online_drain_test(1, oclass=oclass) + self.run_online_drain_test(num_pool=1, oclass=oclass, ranks=ranks) def test_osa_online_drain_with_aggregation(self): """Test ID: DAOS-6909 @@ -174,7 +173,8 @@ def test_osa_online_drain_with_aggregation(self): self.log.info("Online Drain : Aggregation") self.test_during_aggregation = 
self.params.get("test_with_aggregation", '/run/aggregation/*') - self.run_online_drain_test(1) + ranks = self.get_random_test_ranks() + self.run_online_drain_test(num_pool=1, ranks=ranks) def test_osa_online_drain_mdtest(self): """Test ID: DAOS-4750 @@ -188,7 +188,8 @@ def test_osa_online_drain_mdtest(self): :avocado: tags=OSAOnlineDrain,test_osa_online_drain_mdtest """ self.log.info("Online Drain : With Mdtest") - self.run_online_drain_test(1, app_name="mdtest") + ranks = self.get_random_test_ranks() + self.run_online_drain_test(1, app_name="mdtest", ranks=ranks) def test_osa_online_drain_with_multiple_ranks(self): """Test ID: DAOS-4753. @@ -201,4 +202,5 @@ def test_osa_online_drain_with_multiple_ranks(self): :avocado: tags=OSAOnlineDrain,test_osa_online_drain_with_multiple_ranks """ self.log.info("Online Drain : Test with multiple ranks") - self.run_online_drain_test(1, num_ranks=2) + ranks = self.get_random_test_ranks(join_ranks=False) + self.run_online_drain_test(num_pool=1, ranks=ranks) diff --git a/src/tests/ftest/osa/online_drain.yaml b/src/tests/ftest/osa/online_drain.yaml index 738683694b1..9dbcfbbec44 100644 --- a/src/tests/ftest/osa/online_drain.yaml +++ b/src/tests/ftest/osa/online_drain.yaml @@ -1,12 +1,17 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 1000 + job_manager: manager_timeout: 300 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -14,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -24,24 +27,25 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 12G nvme_size: 108G rebuild_timeout: 120 pool_query_timeout: 30 - properties: scrub:timed,scrub_freq:1 + properties: 
rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1 + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 oclass: RP_2G4 + ior: client_processes: np: 2 @@ -59,6 +63,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. - [4000000000, 18000000000, 40000, 500000000, PASS] # [4G, 18G, 40K, 510M, PASS] + mdtest: api: DFS client_processes: @@ -77,16 +82,21 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_3G6 - RP_4G1 - S1 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false diff --git a/src/tests/ftest/osa/online_extend.yaml b/src/tests/ftest/osa/online_extend.yaml index 0d2db08dcbd..144c6441371 100644 --- a/src/tests/ftest/osa/online_extend.yaml +++ b/src/tests/ftest/osa/online_extend.yaml @@ -1,18 +1,24 @@ hosts: test_servers: server-[1-2] test_clients: 1 + # If we define the server under test_servers, launch.py will convert it to the # actual server name passed into --test_servers. If above test_servers is hosts, # it'll be used as one of the servers at test startup time, so use something # other than hosts. 
extra_servers: test_servers: server-3 + timeout: 1000 + job_manager: manager_timeout: 330 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -20,8 +26,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -30,25 +34,26 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 12000000000 nvme_size: 108000000000 svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: scrub:lazy + properties: rd_fac:0,space_rb:0,scrub:lazy + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 oclass: RP_2G1 + ior: client_processes: np: 2 @@ -66,6 +71,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [4000000000, 18000000000, 40000, 500000000, PASS] # [4G, 18G, 40K, 510M, PASS] + mdtest: api: DFS client_processes: @@ -84,19 +90,26 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + daos_racer: runtime: 480 clush_timeout: 1000 + test_obj_class: oclass: - S1 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false + test_ranks: rank_list: ["4,5"] diff --git a/src/tests/ftest/osa/online_parallel_test.yaml b/src/tests/ftest/osa/online_parallel_test.yaml index 63668dfca69..4eedc63838f 100644 --- a/src/tests/ftest/osa/online_parallel_test.yaml +++ b/src/tests/ftest/osa/online_parallel_test.yaml @@ -1,10 +1,14 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 1110 + job_manager: manager_timeout: 400 + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +16,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -22,22 +24,23 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 12000000000 nvme_size: 108000000000 svcn: 4 + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on oclass: RP_2G1 + ior: no_parallel_job: 2 client_processes: @@ -57,6 +60,7 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [4000000000, 18000000000, 4000, 50000000, PASS] # [4G, 18G, 4K, 51M, PASS] + daos_racer: runtime: 480 clush_timeout: 1000 diff --git a/src/tests/ftest/osa/online_reintegration.py b/src/tests/ftest/osa/online_reintegration.py index 332e93a871f..4420d12ed56 100644 --- a/src/tests/ftest/osa/online_reintegration.py +++ b/src/tests/ftest/osa/online_reintegration.py @@ -43,17 +43,17 @@ def daos_racer_thread(self): self.daos_racer.get_params(self) self.daos_racer.run() - def run_online_reintegration_test(self, num_pool, racer=False, server_boot=False, oclass=None, - num_ranks=1): + def run_online_reintegration_test(self, num_pool, ranks, racer=False, server_boot=False, + oclass=None): """Run the Online reintegration without data. Args: num_pool (int) : total pools to create for testing purposes. + ranks (list) : list of ranks to reintegrate. racer (bool) : whether pool has no data or to create some data in pool. Defaults to False. server_boot (bool) : Perform system stop/start on a rank. Defaults to False. oclass (str) : daos object class string (eg: "RP_2G8"). Defaults to None. - num_ranks (int): Number of ranks to drain. Defaults to 1. 
""" if oclass is None: oclass = self.ior_cmd.dfs_oclass.value @@ -61,9 +61,6 @@ def run_online_reintegration_test(self, num_pool, racer=False, server_boot=False # Create a pool pool = {} - ranklist = list(self.server_managers[0].ranks.keys()) - rank = ",".join(map(str, self.random.sample(ranklist, k=num_ranks))) - # Start the daos_racer thread if racer is True: daos_racer_thread = threading.Thread(target=self.daos_racer_thread) @@ -100,13 +97,13 @@ def run_online_reintegration_test(self, num_pool, racer=False, server_boot=False # Get initial total free space (scm+nvme) initial_free_space = self.pool.get_total_free_space(refresh=True) if server_boot is False: - output = self.pool.exclude(rank) + output = self.pool.exclude(ranks) else: - output = self.dmg_command.system_stop(ranks=rank, force=True) + output = self.dmg_command.system_stop(ranks=ranks, force=True) self.pool.wait_for_rebuild_to_start() self.pool.wait_for_rebuild_to_end() self.log.info(output) - output = self.dmg_command.system_start(ranks=rank) + output = self.dmg_command.system_start(ranks=ranks) self.pool.wait_for_rebuild_to_start() self.print_and_assert_on_rebuild_failure(output) @@ -120,7 +117,7 @@ def run_online_reintegration_test(self, num_pool, racer=False, server_boot=False self.assertTrue(pver_exclude > (pver_begin + 8), "Pool Version Error: After exclude") self.assertTrue(initial_free_space > free_space_after_exclude, "Expected space after exclude is less than initial") - output = self.pool.reintegrate(rank) + output = self.pool.reintegrate(ranks) self.print_and_assert_on_rebuild_failure(output) free_space_after_reintegration = self.pool.get_total_free_space(refresh=True) @@ -164,7 +161,8 @@ def test_osa_online_reintegration(self): :avocado: tags=OSAOnlineReintegration,test_osa_online_reintegration """ self.log.info("Online Reintegration : Basic test") - self.run_online_reintegration_test(1) + ranks = self.get_random_test_ranks(total_ranks=1) + self.run_online_reintegration_test(num_pool=1, 
ranks=ranks) def test_osa_online_reintegration_server_stop(self): """Test ID: DAOS-5920. @@ -177,7 +175,8 @@ def test_osa_online_reintegration_server_stop(self): :avocado: tags=OSAOnlineReintegration,test_osa_online_reintegration_server_stop """ self.log.info("Online Reintegration : System stop/start") - self.run_online_reintegration_test(1, server_boot=True) + ranks = self.get_random_test_ranks(total_ranks=1) + self.run_online_reintegration_test(num_pool=1, server_boot=True, ranks=ranks) def test_osa_online_reintegration_without_csum(self): """Test ID: DAOS-5075. @@ -191,7 +190,8 @@ def test_osa_online_reintegration_without_csum(self): """ self.log.info("Online Reintegration : No Checksum") self.test_with_checksum = self.params.get("test_with_checksum", "/run/checksum/*") - self.run_online_reintegration_test(1) + ranks = self.get_random_test_ranks(total_ranks=1) + self.run_online_reintegration_test(num_pool=1, ranks=ranks) def test_osa_online_reintegration_with_aggregation(self): """Test ID: DAOS-6715. @@ -206,7 +206,8 @@ def test_osa_online_reintegration_with_aggregation(self): self.test_during_aggregation = self.params.get("test_with_aggregation", '/run/aggregation/*') self.log.info("Online Reintegration : Aggregation") - self.run_online_reintegration_test(1) + ranks = self.get_random_test_ranks(total_ranks=1) + self.run_online_reintegration_test(num_pool=1, ranks=ranks) def test_osa_online_reintegration_oclass(self): """Test ID: DAOS-6715. @@ -219,8 +220,9 @@ def test_osa_online_reintegration_oclass(self): :avocado: tags=OSAOnlineReintegration,test_osa_online_reintegration_oclass """ self.log.info("Online Reintegration : Object Class") + ranks = self.get_random_test_ranks(total_ranks=1) for oclass in self.test_oclass: - self.run_online_reintegration_test(1, oclass=oclass) + self.run_online_reintegration_test(num_pool=1, oclass=oclass, ranks=ranks) def test_osa_online_reintegration_with_multiple_ranks(self): """Test ID: DAOS-4753. 
@@ -233,4 +235,5 @@ def test_osa_online_reintegration_with_multiple_ranks(self): :avocado: tags=OSAOnlineReintegration,test_osa_online_reintegration_with_multiple_ranks """ self.log.info("Online Reintegration : Multiple ranks") - self.run_online_reintegration_test(1, oclass="RP_3G1", num_ranks=2) + ranks = self.get_random_test_ranks(join_ranks=False) + self.run_online_reintegration_test(num_pool=1, oclass="RP_3G1", ranks=ranks) diff --git a/src/tests/ftest/osa/online_reintegration.yaml b/src/tests/ftest/osa/online_reintegration.yaml index 79b313604be..531e4c16e20 100644 --- a/src/tests/ftest/osa/online_reintegration.yaml +++ b/src/tests/ftest/osa/online_reintegration.yaml @@ -1,12 +1,17 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 1110 + job_manager: manager_timeout: 300 + setup: start_servers_once: false + skip_add_log_msg: true + server_config: name: daos_server engines_per_host: 2 @@ -14,8 +19,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -24,25 +27,26 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 12000000000 nvme_size: 108000000000 svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: scrub:timed + properties: rd_fac:0,space_rb:0,scrub:timed + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 oclass: RP_2G1 + ior: client_processes: np: 2 @@ -60,9 +64,11 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. 
- [4000000000, 18000000000, 40000, 500000000, PASS] # [4G, 18G, 40K, 510M, PASS] + daos_racer: runtime: 480 clush_timeout: 1000 + mdtest: api: DFS client_processes: @@ -81,15 +87,20 @@ mdtest: read_bytes: 32768 verbosity_value: 1 depth: 0 + test_obj_class: oclass: - RP_3G6 - RP_4G1 + loop_test: iterations: 3 + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true + checksum: test_with_checksum: false diff --git a/src/tests/ftest/performance/ior_easy.yaml b/src/tests/ftest/performance/ior_easy.yaml index b846b179277..139f199f53e 100644 --- a/src/tests/ftest/performance/ior_easy.yaml +++ b/src/tests/ftest/performance/ior_easy.yaml @@ -12,23 +12,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto pool: size: 95% - properties: ec_cell_sz:1MiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:1MiB container: type: POSIX diff --git a/src/tests/ftest/performance/ior_hard.yaml b/src/tests/ftest/performance/ior_hard.yaml index 91b897dbb0b..deeb807bded 100644 --- a/src/tests/ftest/performance/ior_hard.yaml +++ b/src/tests/ftest/performance/ior_hard.yaml @@ -12,23 +12,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto pool: size: 95% - properties: ec_cell_sz:128KiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:128KiB container: type: POSIX diff --git a/src/tests/ftest/performance/mdtest_easy.yaml b/src/tests/ftest/performance/mdtest_easy.yaml index d2925536b79..01444ec8519 100644 --- a/src/tests/ftest/performance/mdtest_easy.yaml +++ 
b/src/tests/ftest/performance/mdtest_easy.yaml @@ -12,23 +12,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto pool: size: 95% - properties: ec_cell_sz:1MiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:1MiB container: type: POSIX diff --git a/src/tests/ftest/performance/mdtest_hard.yaml b/src/tests/ftest/performance/mdtest_hard.yaml index 0599ea61319..f5c49502f58 100644 --- a/src/tests/ftest/performance/mdtest_hard.yaml +++ b/src/tests/ftest/performance/mdtest_hard.yaml @@ -12,23 +12,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR storage: auto pool: size: 95% - properties: ec_cell_sz:1MiB + properties: rd_fac:0,space_rb:0,ec_cell_sz:1MiB container: type: POSIX diff --git a/src/tests/ftest/pool/create.py b/src/tests/ftest/pool/create.py index 433dd88110d..89bd8a2bd75 100644 --- a/src/tests/ftest/pool/create.py +++ b/src/tests/ftest/pool/create.py @@ -1,5 +1,6 @@ """ (C) Copyright 2021-2023 Intel Corporation. +(C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -116,11 +117,19 @@ def test_create_no_space_loop(self): "existing pool on one server consuming the required space.") # Create the third of three pools which should succeed. 
- pools[2].create() + attempts = 1 + while attempts <= 3: + pools[2].create() + if pools[2].dmg.result.exit_status == 0: + break + self.log.info( + "Loop %s: Pool create on ranks %s failed in %s/3 attempts", + index, pools[2].target_list.value, attempts) + attempts += 1 if pools[2].dmg.result.exit_status != 0: self.fail( "Creating a large capacity pool that spans across all but the first server " - "should succeed.") + f"should succeed - failed after {attempts} loops.") # Destroy the third of three pools so it can be created again in the next loop pools[2].destroy() diff --git a/src/tests/ftest/pool/create.yaml b/src/tests/ftest/pool/create.yaml index a1e7f0d30c9..87a5a385fab 100644 --- a/src/tests/ftest/pool/create.yaml +++ b/src/tests/ftest/pool/create.yaml @@ -1,11 +1,13 @@ hosts: test_servers: 3 test_clients: 1 + timeouts: test_create_max_pool_scm_only: 180 test_create_max_pool: 300 test_create_no_space: 300 - test_create_no_space_loop: 3500 + test_create_no_space_loop: 3700 + server_config: name: daos_server engines_per_host: 2 @@ -13,21 +15,19 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool_1: scm_size: 1 svcn: 1 quantity: 1 + pool_2: size: 90% svcn: 1 diff --git a/src/tests/ftest/pool/create_all_hw.py b/src/tests/ftest/pool/create_all_hw.py index 2762cd07407..66f0cec8080 100644 --- a/src/tests/ftest/pool/create_all_hw.py +++ b/src/tests/ftest/pool/create_all_hw.py @@ -1,5 +1,6 @@ """ (C) Copyright 2022-2023 Intel Corporation. 
+(C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -108,17 +109,20 @@ def test_two_pools_hw(self): pool_half_deltas_bytes = self.get_deltas("test_two_pools_hw", "pool_half") pool_full_deltas_bytes = self.get_deltas("test_two_pools_hw", "pool_full") distribution_deltas_bytes = self.get_deltas("test_two_pools_hw", "distribution") + # pylint: disable-next=logging-too-few-args self.log.info( "Test pool creation of two pools with 50% and 100% of the available storage") for name in ('pool_half', 'pool_full', 'distribution'): val = locals()["{}_deltas_bytes".format(name)] self.log_deltas(*val, prefix=name) + # pylint: disable-next=logging-format-truncated self.log.info("Creating first pool with half of the available storage: size=50%") self.check_pool_half_storage(*pool_half_deltas_bytes) self.log.info("Checking data distribution among the different engines") self.check_pool_distribution(*distribution_deltas_bytes) + # pylint: disable-next=logging-format-truncated self.log.info("Creating second pool with all the available storage: size=100%") self.check_pool_full_storage(*pool_full_deltas_bytes) diff --git a/src/tests/ftest/pool/create_all_hw.yaml b/src/tests/ftest/pool/create_all_hw.yaml index b47802955d6..c4fcf5cb770 100644 --- a/src/tests/ftest/pool/create_all_hw.yaml +++ b/src/tests/ftest/pool/create_all_hw.yaml @@ -44,8 +44,6 @@ server_config: engines: 0: pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: 0: @@ -57,8 +55,6 @@ server_config: bdev_list: ["aaaa:aa:aa.a"] 1: pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: 0: diff --git a/src/tests/ftest/pool/create_all_vm.py b/src/tests/ftest/pool/create_all_vm.py index eeaee46b9db..43830531712 100644 --- a/src/tests/ftest/pool/create_all_vm.py +++ b/src/tests/ftest/pool/create_all_vm.py @@ -156,6 +156,7 @@ def test_two_pools_vm(self): "distribution", 
"/run/test_two_pools_vm/deltas/*", 0) + # pylint: disable-next=logging-too-few-args self.log.info( "Test pool creation of two pools with 50% and 100% of the available storage") for name in ('pool_half', 'pool_full', 'distribution'): @@ -163,11 +164,13 @@ def test_two_pools_vm(self): self.log.info("\t- %s=%s (%d Bytes)", name, bytes_to_human(val), val) self.log.info("\t- scm_hugepages_enabled=%s", self.scm_hugepages_enabled) + # pylint: disable-next=logging-format-truncated self.log.info("Creating first pool with half of the available storage: size=50%") self.check_pool_half_storage(pool_half_delta_bytes) self.log.info("Checking data distribution among the different engines") self.check_pool_distribution(distribution_delta_bytes) + # pylint: disable-next=logging-format-truncated self.log.info("Creating second pool with all the available storage: size=100%") self.check_pool_full_storage(pool_full_delta_bytes) diff --git a/src/tests/ftest/pool/create_capacity.yaml b/src/tests/ftest/pool/create_capacity.yaml index 6d8e9b359b6..06fa03b82c8 100644 --- a/src/tests/ftest/pool/create_capacity.yaml +++ b/src/tests/ftest/pool/create_capacity.yaml @@ -14,8 +14,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG targets: 1 @@ -28,8 +26,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG targets: 1 diff --git a/src/tests/ftest/pool/destroy.yaml b/src/tests/ftest/pool/destroy.yaml index f2da22876bd..6e42554177c 100644 --- a/src/tests/ftest/pool/destroy.yaml +++ b/src/tests/ftest/pool/destroy.yaml @@ -16,7 +16,7 @@ server_config: class: ram scm_mount: /mnt/daos system_ram_reserved: 1 -timeout: 360 +timeout: 720 pool: scm_size: 1G container: diff --git a/src/tests/ftest/pool/eviction_metrics.py b/src/tests/ftest/pool/eviction_metrics.py new file mode 100644 index 00000000000..8318f290d35 
--- /dev/null +++ b/src/tests/ftest/pool/eviction_metrics.py @@ -0,0 +1,124 @@ +""" + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import json +import math + +from job_manager_utils import get_job_manager +from mdtest_utils import MDTEST_NAMESPACE, run_mdtest +from telemetry_test_base import TestWithTelemetry + + +class EvictionMetrics(TestWithTelemetry): + """ + Tests DAOS client eviction from a pool that the client is using. + + :avocado: recursive + """ + + def test_eviction_metrics(self): + """Verify page eviction on the pool + + 1. Create a pool with a mem ratio of 100% (for pmem or phase 1) or 25% (for phase 2) + 2. Collect a baseline for the pool eviction metrics + 3. Run mdtest -a DFS to generate many small files larger than mem size + 4. Collect new page eviction metrics + 5. Verify page eviction + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=pool + :avocado: tags=EvictionMetrics,test_eviction_metrics + """ + write_bytes = self.params.get('write_bytes', MDTEST_NAMESPACE, None) + processes = self.params.get('processes', MDTEST_NAMESPACE, None) + ppn = self.params.get('ppn', MDTEST_NAMESPACE, None) + + evict_metrics = list(self.telemetry.ENGINE_POOL_VOS_CACHE_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + pool = self.get_pool(connect=False) + try: + _result = json.loads(pool.dmg.result.stdout) + tier_bytes_scm = int(_result["response"]["tier_bytes"][0]) + mem_file_bytes = int(_result["response"]["mem_file_bytes"]) + except Exception as error: # pylint: disable=broad-except + self.fail(f"Error extracting data for dmg pool create output: {error}") + + # Calculate the mdtest files_per_process based upon the scm size and other mdtest params + _write_processes = processes + if ppn is not None: + _write_processes = ppn * len(self.host_info.clients.hosts) + files_per_process = math.floor(mem_file_bytes / (write_bytes * 
_write_processes)) + if tier_bytes_scm > mem_file_bytes: + # Write more (110%) files to exceed mem_file_bytes and cause eviction + mdtest_params = {"num_of_files_dirs": math.ceil(files_per_process * 1.10)} + else: + # Write less (30%) files to avoid out of space errors + mdtest_params = {"num_of_files_dirs": math.floor(files_per_process * 0.30)} + + self.log.debug("-" * 60) + self.log.debug("Pool %s create data:", pool) + self.log.debug(" tier_bytes_scm: %s", tier_bytes_scm) + self.log.debug(" mem_file_bytes: %s", mem_file_bytes) + self.log.debug(" mem_ratio.value: %s", pool.mem_ratio.value) + self.log.debug("Mdtest write parameters:") + self.log.debug(" write_bytes: %s", write_bytes) + if ppn is not None: + self.log.debug(" ppn / nodes: %s / %s", ppn, len(self.host_info.clients.hosts)) + else: + self.log.debug(" processes: %s", processes) + self.log.debug(" files_per_process: %s", files_per_process) + self.log.debug(" num_of_files_dirs: %s", mdtest_params["num_of_files_dirs"]) + self.log.debug(" expected to write: %s", + _write_processes * write_bytes * mdtest_params["num_of_files_dirs"]) + self.log.debug("-" * 60) + + self.log_step('Creating a container (dmg container create)') + container = self.get_container(pool) + + self.log_step( + 'Collect pool eviction metrics after creating a pool (dmg telemetry metrics query)') + expected_ranges = self.telemetry.collect_data(evict_metrics) + for metric in expected_ranges: + for label in expected_ranges[metric]: + if pool.mem_ratio.value is not None and metric.endswith('_hit'): + expected_ranges[metric][label] = [0, 100] # 0-100 (phase 2) + elif pool.mem_ratio.value is not None and metric.endswith('_miss'): + expected_ranges[metric][label] = [0, 5] # 0-5 (phase 2) + elif pool.mem_ratio.value is not None and metric.endswith('_ne'): + expected_ranges[metric][label] = [0, 5] # 0-5 (phase 2) + else: + expected_ranges[metric][label] = [0, 0] # 0 only + self.log.debug("%s expected_ranges: %s", pool, expected_ranges) + + 
self.log_step('Verify pool eviction metrics after pool creation') + if not self.telemetry.verify_data(expected_ranges): + self.fail('Pool eviction metrics verification failed after pool creation') + + self.log_step('Writing data to the pool (mdtest -a DFS)') + manager = get_job_manager(self, subprocess=False, timeout=None) + run_mdtest( + self, self.hostlist_clients, self.workdir, None, container, processes, ppn, manager, + mdtest_params=mdtest_params) + + self.log_step( + 'Collect pool eviction metrics after writing data (dmg telemetry metrics query)') + expected_ranges = self.telemetry.collect_data(evict_metrics) + for metric in expected_ranges: + for label in expected_ranges[metric]: + if pool.mem_ratio.value is None: + expected_ranges[metric][label] = [0, 0] # 0 only (phase 1) + elif metric.endswith('_page_flush'): + expected_ranges[metric][label] = [0] # 0 or greater (phase 2) + else: + expected_ranges[metric][label] = [1] # 1 or greater (phase 2) + self.log.debug("%s expected_ranges: %s", pool, expected_ranges) + + self.log_step('Verify pool eviction metrics after writing data') + if not self.telemetry.verify_data(expected_ranges): + self.fail('Pool eviction metrics verification failed after writing data') + + self.log_step('Test passed') diff --git a/src/tests/ftest/pool/eviction_metrics.yaml b/src/tests/ftest/pool/eviction_metrics.yaml new file mode 100644 index 00000000000..f9026a02b9c --- /dev/null +++ b/src/tests/ftest/pool/eviction_metrics.yaml @@ -0,0 +1,42 @@ +launch: + !filter-only : /run/pool/default # yamllint disable-line rule:colons + +hosts: + test_servers: 1 + test_clients: 3 + +timeout: 120 + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: auto + +pool: !mux + default: + size: 10G + md_on_ssd_p2: + size: 10G + mem_ratio: 25 + +container: + type: POSIX + oclass: S1 + dir_oclass: SX + +mdtest: + dfs_oclass: S1 + dfs_dir_oclass: SX + dfs_destroy: False + manager: "MPICH" + ppn: 
32 + test_dir: "/" + api: DFS + flags: "-C -F -G 27 -N 1 -Y -u -L" + branching_factor: 1 + write_bytes: 3072 + read_bytes: 3072 diff --git a/src/tests/ftest/pool/list_verbose.py b/src/tests/ftest/pool/list_verbose.py index f75320b17e5..3928b86de81 100644 --- a/src/tests/ftest/pool/list_verbose.py +++ b/src/tests/ftest/pool/list_verbose.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -25,7 +25,7 @@ class ListVerboseTest(IorTestBase): def create_expected(self, pool, scm_free, nvme_free, scm_imbalance, nvme_imbalance, targets_disabled=0, scm_size=None, nvme_size=None, state=None, rebuild_state=None, - ranks_disabled=None): + ranks_disabled=None, rebuild_degraded=False): # pylint: disable=too-many-arguments """Create expected dmg pool list output to compare against the actual. @@ -42,6 +42,8 @@ def create_expected(self, pool, scm_free, nvme_free, scm_imbalance, state (str, optional): Expected pool state. Defaults to None. rebuild_state (str, optional): Expected pool rebuild state. Defaults to None. ranks_disabled (list, optional): List of disabled ranks. Defaults to None. + rebuild_degraded (bool, optional): Whether rebuild status flag `degraded` is set. + Defaults to False. Returns: dict: Expected in the same format of actual. 
@@ -80,10 +82,13 @@ def create_expected(self, pool, scm_free, nvme_free, scm_imbalance, "rebuild": { "status": 0, "state": rebuild_state, + "derived_state": rebuild_state, "objects": 0, "records": 0, - "total_objects": 0 + "total_objects": 0, + 'degraded': rebuild_degraded }, + "self_heal_policy": "", # NB: tests should not expect min/max/mean values "tier_stats": [ { @@ -184,7 +189,7 @@ def verify_scm_size(self, actual, created, rank_count): self.assertTrue(diff < threshold, msg) def verify_pool_lists(self, targets_disabled, scm_size, nvme_size, state, rebuild_state, - ranks_disabled): + ranks_disabled, rebuild_degraded): """Call dmg pool list and verify. self.pool should be a list. The elements of the inputs should @@ -197,6 +202,7 @@ def verify_pool_lists(self, targets_disabled, scm_size, nvme_size, state, rebuil state (list): List of pool state for pools. rebuild_state (list): List of pool rebuild state for pools. ranks_disabled (list): List of disabled ranks for pools. + rebuild_degraded (list): List of rebuild status flag `degraded` for pools. Returns: list: a list of dictionaries containing information for each pool from the dmg @@ -243,7 +249,8 @@ def verify_pool_lists(self, targets_disabled, scm_size, nvme_size, state, rebuil nvme_size=nvme_size[index], state=state[index], rebuild_state=rebuild_state[index], - ranks_disabled=ranks_disabled[index])) + ranks_disabled=ranks_disabled[index], + rebuild_degraded=rebuild_degraded[index])) # Sort pools by UUID. actual_pools.sort(key=lambda item: item.get("uuid")) @@ -310,9 +317,11 @@ def test_fields_basic(self): state = ["Ready"] rebuild_state = ["idle"] ranks_disabled = [[]] + rebuild_degraded = [False] self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) # 3. Create second pool. 
self.log_step("Create second pool") @@ -327,9 +336,11 @@ def test_fields_basic(self): state.append("Ready") rebuild_state.append("idle") ranks_disabled.append([]) + rebuild_degraded.append(False) self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) # 5. Exclude target 7 in rank 1 of pool 1. self.log_step("Exclude target 7 in rank 1 of pool 1") @@ -347,10 +358,12 @@ def test_fields_basic(self): nvme_size[0] = reduced_nvme_size state[0] = "TargetsExcluded" rebuild_state[0] = "busy" + rebuild_degraded[0] = True self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) # 7-11. Destroy and verify until the pools are gone. while self.pool: @@ -363,10 +376,12 @@ def test_fields_basic(self): scm_size.pop() nvme_size.pop() ranks_disabled.pop() + rebuild_degraded.pop() self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) def verify_used_imbalance(self, storage): """Verification steps for test_used_imbalance. 
@@ -394,9 +409,11 @@ def verify_used_imbalance(self, storage): state = ["Ready"] rebuild_state = ["idle"] ranks_disabled = [[]] + rebuild_degraded = [False] actual_pools_before = self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) # 3. Store free. free_before, _ = self.get_free_imbalance(actual_pools_before[0], storage) @@ -414,7 +431,8 @@ def verify_used_imbalance(self, storage): # obtained from actual. actual_pools_after = self.verify_pool_lists( targets_disabled=targets_disabled, scm_size=scm_size, nvme_size=nvme_size, - state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled) + state=state, rebuild_state=rebuild_state, ranks_disabled=ranks_disabled, + rebuild_degraded=rebuild_degraded) # Obtain the new free and imbalance. free_after, imbalance_after = self.get_free_imbalance( diff --git a/src/tests/ftest/pool/mem_ratio.py b/src/tests/ftest/pool/mem_ratio.py index 822e393ea2b..64b86358951 100644 --- a/src/tests/ftest/pool/mem_ratio.py +++ b/src/tests/ftest/pool/mem_ratio.py @@ -22,16 +22,21 @@ def check_insufficient_size(self, error): Args: error (Exception): the error raised during pool creation """ - allowed_errors = [ - "Insufficient scm size", - "No space on storage target", - "requested NVMe capacity too small"] - pattern = f"({'|'.join(allowed_errors)})" self.log.debug("Verifying Pool creation failure: %s", error) - result = self.server_managers[0].search_engine_logs(pattern) - if not result.passed: - raise error - self.log.debug("Pool create failure expected due to: '%s'", pattern) + pattern_methods = ( + ("(Insufficient scm size|No space on storage target)", + self.server_managers[0].search_engine_logs), + ("requested NVMe capacity too small", + self.server_managers[0].search_control_logs) + ) + for pattern, 
method in pattern_methods: + result = method(pattern) + for data in result.output: + if data.passed and not data.timeout: + # Expected failure detected in at least one of the logs + self.log.debug("Pool create failure expected due to: '%s'", pattern) + return + raise error @staticmethod def readable_bytes(size): diff --git a/src/tests/ftest/pool/pda.yaml b/src/tests/ftest/pool/pda.yaml index 9fdec66ca4f..3b98ced5bfc 100644 --- a/src/tests/ftest/pool/pda.yaml +++ b/src/tests/ftest/pool/pda.yaml @@ -18,7 +18,7 @@ pool: scm_size: 1G pool_1: scm_size: 1G - properties: ec_pda:2,rp_pda:4 + properties: rd_fac:0,space_rb:0,ec_pda:2,rp_pda:4 container: type: POSIX control_method: daos diff --git a/src/tests/ftest/pool/rf.yaml b/src/tests/ftest/pool/rf.yaml index d3b9761d349..0cb7c5bf389 100644 --- a/src/tests/ftest/pool/rf.yaml +++ b/src/tests/ftest/pool/rf.yaml @@ -6,7 +6,7 @@ timeout: 300 server_config: engines_per_host: 1 - system_ram_reserved: 16 + system_ram_reserved: 21 engines: 0: storage: diff --git a/src/tests/ftest/pool/target_query.yaml b/src/tests/ftest/pool/target_query.yaml index e9e4fc53a7d..d146f50c38f 100644 --- a/src/tests/ftest/pool/target_query.yaml +++ b/src/tests/ftest/pool/target_query.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 5 test_clients: 1 + timeout: 300 + server_config: name: daos_server engines_per_host: 2 @@ -9,26 +11,25 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log targets: 2 storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31517 log_file: daos_server1.log targets: 2 storage: auto + pool: scm_size: 2G nvme_size: 16G pool_query_timeout: 30 + container: type: POSIX control_method: daos + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/pool/verify_dtx.py b/src/tests/ftest/pool/verify_dtx.py new file mode 100644 index 00000000000..869d0c42a27 --- /dev/null +++ b/src/tests/ftest/pool/verify_dtx.py @@ 
-0,0 +1,152 @@ +""" + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import json +import math + +from job_manager_utils import get_job_manager +from mdtest_utils import MDTEST_NAMESPACE, run_mdtest +from telemetry_test_base import TestWithTelemetry + + +class VerifyDTXMetrics(TestWithTelemetry): + """ + Ensures DTX is involved with MD on SSD phase 2 pool. + + :avocado: recursive + """ + + def test_verify_dtx_metrics(self): + """Ensure DTX is involved with MD on SSD phase 2 pool. + + 1. Create a pool with a mem ratio of 100% (for pmem or phase 1) or 25% (for phase 2) + 2. Collect a baseline for the DTX metrics + 3. Run mdtest -a DFS to write data with different object classes + 4. Collect new DTX metrics + 5. Verify DTX metrics + + :avocado: tags=all,full_regression + :avocado: tags=hw,large + :avocado: tags=pool + :avocado: tags=VerifyDTXMetrics,test_verify_dtx_metrics + """ + # pylint: disable=too-many-branches + write_bytes = self.params.get('write_bytes', MDTEST_NAMESPACE, None) + processes = self.params.get('processes', MDTEST_NAMESPACE, None) + ppn = self.params.get('ppn', MDTEST_NAMESPACE, None) + object_classes = self.params.get('object_classes', '/run/*') + + dtx_metrics = list(self.telemetry.ENGINE_POOL_VOS_CACHE_METRICS[:1]) + dtx_metrics += list(self.telemetry.ENGINE_IO_DTX_COMMITTED_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + pool = self.get_pool(connect=False) + try: + _result = json.loads(pool.dmg.result.stdout) + tier_bytes_scm = int(_result['response']['tier_bytes'][0]) + mem_file_bytes = int(_result['response']['mem_file_bytes']) + total_engines = len(_result['response']['tgt_ranks']) + except Exception as error: # pylint: disable=broad-except + self.fail(f'Error extracting data for dmg pool create output: {error}') + + # Calculate the mdtest files_per_process based upon the scm size and other mdtest params + _write_procs = processes + _mdtest_cmds = 
len(object_classes) + if ppn is not None: + _write_procs = ppn * len(self.host_info.clients.hosts) + files_per_process = math.floor(mem_file_bytes / (write_bytes * _write_procs * _mdtest_cmds)) + if tier_bytes_scm > mem_file_bytes: + # Write more (225%) files to exceed mem_file_bytes and cause eviction + num_of_files_dirs = math.ceil(files_per_process * 2.25) + else: + # Write less (75%) files to avoid out of space errors + num_of_files_dirs = math.floor(files_per_process * 0.75) + + self.log.debug("-" * 60) + self.log.debug("Pool %s create data:", pool) + self.log.debug(" tier_bytes_scm (per engine/total): %s / %s", + tier_bytes_scm, tier_bytes_scm * total_engines) + self.log.debug(" mem_file_bytes (per engine/total): %s / %s", + mem_file_bytes, mem_file_bytes * total_engines) + self.log.debug(" mem_ratio.value: %s", pool.mem_ratio.value) + self.log.debug(" total_engines: %s", total_engines) + self.log.debug("Mdtest write parameters:") + self.log.debug(" write_bytes per mdtest: %s", write_bytes) + if ppn is not None: + self.log.debug(" processes (ppn * nodes): %s * %s = %s", + ppn, len(self.host_info.clients.hosts), _write_procs) + else: + self.log.debug(" processes: %s", processes) + self.log.debug(" files_per_process per mtest: %s", files_per_process) + self.log.debug(" number of mdtest commands: %s", _mdtest_cmds) + self.log.debug(" num_of_files_dirs per mdtest: %s", num_of_files_dirs) + self.log.debug(" total expected to write: %s", + _mdtest_cmds * _write_procs * write_bytes * num_of_files_dirs) + self.log.debug("-" * 60) + + self.log_step('Collect DTX metrics after creating a pool (dmg telemetry metrics query)') + expected_ranges = self.telemetry.collect_data(dtx_metrics) + for metric in expected_ranges: + for label in expected_ranges[metric]: + expected_ranges[metric][label] = [0, 0] # 0 only + if pool.mem_ratio.value is not None: + suffixes = [ + '_dtx_committed_max', + '_dtx_committed_mean', + '_dtx_committed_samples', + '_dtx_committed_stddev', + 
'_dtx_committed_sum', + '_dtx_committed_sumsquares' + ] + if any(map(metric.endswith, suffixes)): + expected_ranges[metric][label] = [0] # 0 or greater (phase 2) + self.log.debug('%s expected_ranges: %s', pool, expected_ranges) + + self.log_step('Verify DTX metrics after pool creation') + if not self.telemetry.verify_data(expected_ranges): + self.fail('DTX metrics verification failed after pool creation') + + manager = get_job_manager(self, subprocess=False, timeout=None) + processes = self.params.get('processes', MDTEST_NAMESPACE, None) + ppn = self.params.get('ppn', MDTEST_NAMESPACE, None) + for oclass in object_classes: + self.log_step(f'Write data into a containers with the {oclass} object classes (mdtest)') + container = self.get_container(pool, oclass=oclass, dir_oclass=oclass) + run_mdtest( + self, self.hostlist_clients, self.workdir, None, container, processes, ppn, manager, + mdtest_params={'dfs_oclass': oclass, 'dfs_dir_oclass': oclass, + 'num_of_files_dirs': num_of_files_dirs}) + + self.log_step('Collect DTX metrics after writing data (dmg telemetry metrics query)') + expected_ranges = self.telemetry.collect_data(dtx_metrics) + for metric in expected_ranges: + for label in expected_ranges[metric]: + if metric.endswith('_dtx_committed'): + expected_ranges[metric][label] = [0] # 0 or greater + elif metric.endswith('_dtx_committed_max'): + expected_ranges[metric][label] = [100] # 100 or greater + elif metric.endswith('_dtx_committed_mean'): + expected_ranges[metric][label] = [50] # 50 or greater + elif metric.endswith('_dtx_committed_min'): + expected_ranges[metric][label] = [0] # 0 or greater + elif metric.endswith('_dtx_committed_sum'): + expected_ranges[metric][label] = [1000] # 1000 or greater + elif metric.endswith('_dtx_committed_sumsquares'): + expected_ranges[metric][label] = [100000] # 100,000 or greater + elif metric.endswith('_vos_cache_page_evict'): + if pool.mem_ratio.value is None: + expected_ranges[metric][label] = [0, 0] # 0 only (phase 1) + 
else: + expected_ranges[metric][label] = [1] # 1 or greater (phase 2) + else: + # e.g. *_dtx_committed_samples, *_dtx_committed_stddev + expected_ranges[metric][label] = [1] # 1 or greater + self.log.debug('%s expected_ranges: %s', pool, expected_ranges) + + self.log_step('Verify DTX metrics after writing data') + if not self.telemetry.verify_data(expected_ranges): + self.fail('DTX metrics verification failed after writing data') + + self.log_step('Test passed') diff --git a/src/tests/ftest/pool/verify_dtx.yaml b/src/tests/ftest/pool/verify_dtx.yaml new file mode 100644 index 00000000000..e0947402b6b --- /dev/null +++ b/src/tests/ftest/pool/verify_dtx.yaml @@ -0,0 +1,64 @@ +launch: + !filter-only : /run/pool/default # yamllint disable-line rule:colons + +hosts: + test_servers: 5 + test_clients: 3 + +timeout: 600 + +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + pinned_numa_node: 0 + nr_xs_helpers: 0 + log_file: daos_server0.log + storage: auto + 1: + pinned_numa_node: 1 + nr_xs_helpers: 0 + log_file: daos_server1.log + storage: auto + +pool: !mux + default: + size: 575G + md_on_ssd_p2: + size: 575G + mem_ratio: 25 + +container: + type: POSIX + +mdtest: + dfs_destroy: False + manager: "MPICH" + ppn: 32 + test_dir: "/" + api: DFS + flags: "-C -F -G 27 -N 1 -Y -u -L" + branching_factor: 1 + write_bytes: 3072 + read_bytes: 3072 + +object_classes: + - "EC_2P1G1" + - "EC_2P1G8" + - "EC_2P1GX" + - "EC_2P2G1" + - "EC_2P2G8" + - "EC_2P2GX" + - "EC_4P1G1" + - "EC_4P1G8" + - "EC_4P1GX" + - "EC_4P2G1" + - "EC_4P2G8" + - "EC_4P2GX" + - "EC_4P3G1" + - "EC_4P3G8" + - "EC_4P3GX" + - "EC_8P2G1" + - "EC_8P2G8" + - "EC_8P2GX" diff --git a/src/tests/ftest/pytorch/checkpoint.py b/src/tests/ftest/pytorch/checkpoint.py index fd450768db5..685491dacc9 100644 --- a/src/tests/ftest/pytorch/checkpoint.py +++ b/src/tests/ftest/pytorch/checkpoint.py @@ -1,9 +1,10 @@ """ (C) Copyright 2025 Google LLC - (C) Copyright 2025 Enakta Labs Ltd + (C) Copyright 2025-2026 
Enakta Labs Ltd SPDX-License-Identifier: BSD-2-Clause-Patent """ +import errno import os import uuid @@ -73,6 +74,41 @@ def test_checkpoint_chunking(self): chunk_size=chunk_size, chunks_limit=chunks_limit, workers=worker) + def test_checkpoint_nested_directories(self): + """ Test Pytorch Checkpoint interface with nested directories + Test Description: Ensure that parent directories are created for the checkpoint path + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=pytorch + :avocado: tags=PytorchCheckpointTest,test_checkpoint_nested_directories + """ + + pool = self.get_pool() + container = self.get_container(pool) + + d1, d2 = str(uuid.uuid4()), str(uuid.uuid4()) + files = ["/file.pt", f"/{d1}/file.pt", f"/{d1}/{d2}/file.pt"] + + # by default parent directories should be created + with Checkpoint(pool.identifier, container.identifier) as pt: + for name in files: + with pt.writer(name) as w: + w.write(os.urandom(4096)) + + # ensure that it fails with expected exception + try: + with Checkpoint(pool.identifier, container.identifier) as pt: + fname = f"/{str(uuid.uuid4())}/file.pt" + with pt.writer(fname, ensure_path=False) as w: + w.write(os.urandom(4096)) + raise RuntimeError("expected OSError with errno.ENOENT") + except OSError as e: + if e.errno != errno.ENOENT: + raise RuntimeError(f"expected errno.ENOENT, got {os.strerror(e.errno)}") from e + except Exception as e: + raise RuntimeError(f"unexpected error: {e}") from e + def _test_checkpoint(self, pool, cont, writes, chunk_size=0, chunks_limit=0, workers=0): """Creates a checkpoint with the given parameters, writes the given data to it, then reads written data back from it and compares it with the expected writes. 
@@ -80,19 +116,17 @@ def _test_checkpoint(self, pool, cont, writes, chunk_size=0, chunks_limit=0, wor self.log.info("Checkpoint test: writes=%s, chunk_size=%s, chunks_limit=%s, workers=%s", len(writes), chunk_size, chunks_limit, workers) - chkp = Checkpoint(pool, cont, transfer_chunk_size=chunk_size, chunks_limit=chunks_limit, - workers=workers) - - expected = bytearray() - fname = str(uuid.uuid4()) - with chkp.writer(fname) as w: - for chunk in writes: - w.write(chunk) - expected.extend(chunk) - - actual = chkp.reader(fname) - if expected != actual.getvalue(): - self.fail( - f"checkpoint did not read back the expected content for {len(writes)} writes," - f"chunk_size={chunk_size}, chunks_limit={chunks_limit}, workers={workers}") - del chkp + with Checkpoint(pool, cont, transfer_chunk_size=chunk_size, chunks_limit=chunks_limit, + workers=workers) as chkp: + expected = bytearray() + fname = str(uuid.uuid4()) + with chkp.writer(fname) as w: + for chunk in writes: + w.write(chunk) + expected.extend(chunk) + + actual = chkp.reader(fname) + if expected != actual.getvalue(): + self.fail( + f"checkpoint did not read back the expected content for {len(writes)} writes," + f"chunk_size={chunk_size}, chunks_limit={chunks_limit}, workers={workers}") diff --git a/src/tests/ftest/rebuild/auto_recovery_policy.py b/src/tests/ftest/rebuild/auto_recovery_policy.py new file mode 100644 index 00000000000..c9e335948d0 --- /dev/null +++ b/src/tests/ftest/rebuild/auto_recovery_policy.py @@ -0,0 +1,454 @@ +""" + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import re +import time +from functools import partial + +from apricot import TestWithServers +from data_utils import assert_val_in_list +from general_utils import list_to_str + + +class RbldAutoRecoveryPolicy(TestWithServers): + """Rebuild test cases related to Auto Recovery Policies. 
+ + :avocado: recursive + """ + + def test_rebuild_auto_recovery_policy(self): + """Jira ID: DAOS-17420. + + Test Description: Verify Rebuild Auto Recovery Policy + + Scenario 1: System Creation and default self_heal. + Scenario 2: Disabling and Enabling Self-Heal. + Scenario 3: Online System Maintenance. + Scenario 4: Offline System Maintenance. + Scenario 5: Normal System Restart. + Scenario 6: Unexpected System Restart. + Scenario 7: Problematic Pools. + + See each corresponding _verify_scenario_X method for detailed steps. + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=pool,rebuild,self_heal + :avocado: tags=RbldAutoRecoveryPolicy,test_rebuild_auto_recovery_policy + """ + self.log_step('Setup pool') + pool = self.get_pool(connect=False) + + # Run just the scenarios requested, or all by default + total_scenarios = 0 + scenarios_passed = 0 + scenarios_to_verify = set(self.params.get('scenarios_to_verify', '/run/test/*', ['all'])) + for method in filter(lambda x: '_verify_scenario_' in x, dir(self)): + total_scenarios += 1 + scenario_number = int(method.split('_')[-1]) + if scenario_number not in scenarios_to_verify and 'all' not in scenarios_to_verify: + self.log.warning('Skipping scenario %s', scenario_number) + continue + self.log.info('Running scenario %s', scenario_number) + scenario_method = getattr(self, method) + scenario_method(pool) + scenarios_passed += 1 + + self.log_step('Destroy pool') + pool.destroy() + + self.log_step(f'Test passed on {scenarios_passed}/{total_scenarios} scenarios') + + def _verify_scenario_1(self, pool): + """Scenario 1: System Creation and default self_heal. + + Verify the default self_heal properties at the system and pool level. + + See self.log_step() calls for test steps. 
+ + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + self.log_step('Scenario 1 - Verify default system self_heal policy') + response = dmg.system_get_prop(properties='self_heal')['response'] + actual_value = response[0]['value'] + expected_value = 'exclude;pool_exclude;pool_rebuild' + if actual_value != expected_value: + self.fail( + f'Expected system self_heal policy to be {expected_value}, ' + f'but got {actual_value}') + + self.log_step('Scenario 1 - Verify default pool self_heal policy') + response = pool.get_prop(name='self_heal')['response'] + actual_value = response[0]['value'] + expected_value = 'exclude;rebuild' + if actual_value != expected_value: + self.fail( + f'Expected pool self_heal policy to be {expected_value}, ' + f'but got {actual_value}') + + def _verify_scenario_2(self, pool): + """Scenario 2: Disabling and Enabling Self-Heal. + + Verify disabling self_heal prevents exclusions and rebuilds. + Verify enabling self_heal allows exclusions and rebuilds. + + See self.log_step() calls for test steps. 
+ + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get 2 distinct sets of ranks to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + ranks_y = sorted(self.random.sample(list(set(all_ranks) - set(ranks_x)), k=1)) + + self.log_step('Scenario 2 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + self.log_step('Scenario 2 - Stop a rank and verify it is not excluded') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, 'stopped') + self._wait_detection_delay() + self._verify_rank_state(ranks_x, 'stopped') + + self.log_step( + 'Scenario 2 - Enable system self_heal and invoke dmg system self-heal eval') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + dmg.system_self_heal_eval() + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + + self.log_step('Scenario 2 - Verify ranks are excluded and rebuilt in the pool') + self._verify_rank_state(ranks_x, 'excluded') + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + + self.log_step( + 'Scenario 2 - Stop another rank and verify it is excluded and rebuilt in the pool') + dmg.system_stop(ranks=ranks_y) + self.server_managers[0].update_expected_states(ranks_y, ['stopped', 'excluded']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(ranks_y, 'excluded') + + self.log_step( + 'Scenario 2 - Reintegrate stopped ranks to bring system back to original state') + stopped_ranks_str = list_to_str(ranks_x + ranks_y) + dmg.system_start(stopped_ranks_str) + dmg.system_reintegrate(stopped_ranks_str) + self.server_managers[0].update_expected_states(ranks_x + ranks_y, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(all_ranks, 'joined') + + def 
_verify_scenario_3(self, pool): + """Scenario 3: Online System Maintenance. + + Verify self_heal can be set such that ranks are excluded but not rebuilt. + + See self.log_step() calls for test steps. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a random rank to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + + self.log_step('Scenario 3 - Set system.self_heal.pool_rebuild = disabled') + dmg.system_set_prop('self_heal:exclude;pool_exclude') + dmg.system_get_prop(properties='self_heal') + + self.log_step('Scenario 3 - Stop a rank and verify it is excluded without rebuild') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + self._wait_detection_delay() + self._verify_rank_state(ranks_x, 'excluded') + pool.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}}) + # Targets should be down but not down_out + pool.verify_query_targets_state(ranks_x, 'down') + + self.log_step('Scenario 3 - Restart the rank and make sure it rejoins') + dmg.system_start(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 3 - Reintegrate the rank and wait for rebuild') + dmg.system_reintegrate(list_to_str(ranks_x)) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + + # The pool version changes after exclusion, + # but should not changed after resetting self_heal + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step('Scenario 3 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + 
self.log_step('Scenario 3 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_4(self, pool): + """Scenario 4: Offline System Maintenance. + + Verify disabling self_heal prevents exclusions even when the ranks restart. + + See self.log_step() calls for test steps. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks + all_ranks = list(self.server_managers[0].ranks.keys()) + + self.log_step('Scenario 4 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step( + 'Scenario 4 - Stop more ranks than the pool RF and verify there are no exclusions') + pool_rf = int(re.findall(r'rd_fac:([0-9]+)', pool.properties.value)[0]) + self.assertGreater( + len(all_ranks), pool_rf, 'Not enough ranks to stop more than pool RF') + ranks_over_rf = sorted(self.random.sample(all_ranks, k=pool_rf + 1)) + dmg.system_stop(ranks=list_to_str(ranks_over_rf)) + self.server_managers[0].update_expected_states(ranks_over_rf, ['stopped']) + self._wait_detection_delay() + self._verify_rank_state(ranks_over_rf, 'stopped') + + self.log_step('Scenario 4 - Restart the stopped ranks and make sure they rejoin') + dmg.system_start(ranks=list_to_str(ranks_over_rf)) + self.server_managers[0].update_expected_states(ranks_over_rf, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 4 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + 
self.log_step('Scenario 4 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_5(self, pool): + """Scenario 5: Normal System Restart. + + Verify disabling self_heal prevents exclusions even when the system restarts. + + See self.log_step() calls for test steps. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks + all_ranks = list(self.server_managers[0].ranks.keys()) + + self.log_step('Scenario 5 - Disable system self_heal') + dmg.system_set_prop('self_heal:none') + + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + + self.log_step('Scenario 5 - Stop the system and verify no ranks are excluded') + dmg.system_stop() + self.server_managers[0].update_expected_states(all_ranks, ['stopped']) + self._wait_detection_delay() + self._verify_rank_state(all_ranks, 'stopped') + + self.log_step('Scenario 5 - Restart the system and make sure all ranks rejoin') + dmg.system_start() + self.server_managers[0].update_expected_states(all_ranks, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 5 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 5 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_6(self, pool): + 
"""Scenario 6: Unexpected System Restart. + + Verify disabling self_heal immediately after unexpected system restart prevents exclusions. + + See self.log_step() calls for test steps. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a list of all ranks except 1 + all_ranks = list(self.server_managers[0].ranks.keys()) + all_ranks_minus_1 = sorted(self.random.sample(all_ranks, k=len(all_ranks) - 1)) + + self.log_step('Scenario 6 - Simulate restart with dmg system stop') + # We expect the pool version to stay the same through this scenario since + # there are no exclusions or rebuilds + self.log.info('Save current pool version') + pool_version = pool.query()['response']['version'] + dmg.system_stop() + self.server_managers[0].update_expected_states(all_ranks, ['stopped']) + + self.log_step('Scenario 6 - Start all but 1 rank and immediately disable self-heal') + dmg.system_start(ranks=list_to_str(all_ranks_minus_1)) + self.server_managers[0].update_expected_states(all_ranks_minus_1, ['joined']) + dmg.system_set_prop('self_heal:none') + + self.log_step('Scenario 6 - Verify all but 1 rank rejoins') + self._verify_rank_state(all_ranks_minus_1, 'joined', tries=5, delay=3) + + self.log_step('Scenario 6 - Restart the last rank and make sure it rejoins') + dmg.system_start() + self.server_managers[0].update_expected_states(all_ranks, ['joined']) + self._verify_rank_state(all_ranks, 'joined', tries=5, delay=3) + + self.log_step('Scenario 6 - Reset system self_heal to default') + dmg.system_set_prop('self_heal:exclude;pool_exclude;pool_rebuild') + + self.log_step('Scenario 6 - Verify dmg system self-heal eval does not trigger rebuild') + dmg.system_self_heal_eval() + self._wait_detection_delay() + pool.verify_query({ + 'disabled_ranks': [], + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}, + 'version': pool_version}) + + def _verify_scenario_7(self, pool): + """Scenario 7: Problematic Pools. 
+ + Verify disabling self_heal on specific pools prevents exclusions and rebuilds for + those pools only. + + See self.log_step() calls for test steps. + + Args: + pool (TestPool): The pool to use + """ + dmg = self.get_dmg_command() + + # Get a random rank to stop + all_ranks = list(self.server_managers[0].ranks.keys()) + ranks_x = sorted(self.random.sample(all_ranks, k=1)) + + self.log_step('Scenario 7 - Create a second pool') + pool2 = self.get_pool(connect=False) + + self.log_step('Scenario 7 - Disable self_heal rebuild on just the second pool') + pool2.set_prop('self_heal:exclude') + pool2.query() + + self.log_step('Scenario 7 - Stop a rank and wait for the detection delay') + dmg.system_stop(ranks=ranks_x) + self.server_managers[0].update_expected_states(ranks_x, ['stopped', 'excluded']) + self._wait_detection_delay() + + self.log_step( + 'Scenario 7 - Verify the rank is excluded and rebuilds in first pool only') + self._verify_rank_state(ranks_x, 'excluded') + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + pool.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': 'done'}}) + self.log_step( + 'Scenario 7 - Verify the rank is excluded and does not rebuild in second pool') + pool2.verify_query({ + 'disabled_ranks': ranks_x, + 'rebuild': { + 'state': partial(assert_val_in_list, allowed_list=['done', 'idle'])}}) + # Targets should be down but not down_out + pool2.verify_query_targets_state(ranks_x, 'down') + + self.log_step( + 'Scenario 7 - Reintegrate stopped ranks to bring system back to original state') + stopped_ranks_str = list_to_str(ranks_x) + dmg.system_start(stopped_ranks_str) + dmg.system_reintegrate(stopped_ranks_str) + self.server_managers[0].update_expected_states(ranks_x, ['joined']) + pool.wait_for_rebuild_to_start(interval=1) + pool.wait_for_rebuild_to_end(interval=3) + self._verify_rank_state(all_ranks, 'joined') + + self.log_step('Scenario 7 - Destroy second pool') + pool2.destroy() + + 
def _verify_rank_state(self, ranks, state, tries=1, delay=3): + """Verify the state of the given ranks. + + Args: + ranks (list): The list of ranks to verify. + state (str): The expected state of the ranks. + tries (int, optional): Number of attempts to verify the state. Defaults to 1. + delay (int, optional): Delay between attempts in seconds. Defaults to 3. + """ + for current_try in range(tries): + current_state = self.server_managers[0].get_current_state() + + # All ranks are in expected state + if set(current_state[rank]['state'] for rank in ranks) == {state}: + return + + # Retry + if current_try < tries - 1: + self.log.info( + 'Not all ranks are in expected state %s. Retrying in %s seconds...', + state, delay) + time.sleep(delay) + continue + + # Final attempt failed + for rank in ranks: + if current_state[rank]['state'] != state: + self.fail( + f'Expected rank {rank} to be in state {state}, ' + f'but current state is {current_state[rank]["state"]}') + + def _wait_detection_delay(self): + """Wait for the detection delay.""" + # The detection delay shall be a couple of SWIM periods (1s) + SWIM suspicion timeout (20s) + # + CRT_EVENT_DELAY (1s) + some margin of error (?) 
+ # This is difficult to calculate so set to 30 based on current environment + detection_delay = 30 + self.log.info('Waiting for detection delay of %s seconds', detection_delay) + time.sleep(detection_delay) diff --git a/src/tests/ftest/rebuild/auto_recovery_policy.yaml b/src/tests/ftest/rebuild/auto_recovery_policy.yaml new file mode 100644 index 00000000000..98499991195 --- /dev/null +++ b/src/tests/ftest/rebuild/auto_recovery_policy.yaml @@ -0,0 +1,28 @@ +hosts: + test_servers: 7 + test_clients: 1 + +timeout: 900 + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 + +pool: + size: 10G + properties: rd_fac:2 + pool_query_timeout: 30 + register_cleanup: False # if something goes wrong, this will likely timeout + +test: + scenarios_to_verify: + - all diff --git a/src/tests/ftest/rebuild/basic.py b/src/tests/ftest/rebuild/basic.py index 8a8f8148f3c..a6b0801775a 100644 --- a/src/tests/ftest/rebuild/basic.py +++ b/src/tests/ftest/rebuild/basic.py @@ -25,7 +25,7 @@ def test_rebuild_basic(self): Multiple pool rebuild, single client, various record/object counts :avocado: tags=all,daily_regression - :avocado: tags=vm + :avocado: tags=hw,large :avocado: tags=rebuild,pool,daos_cmd :avocado: tags=RbldBasic,test_rebuild_basic """ diff --git a/src/tests/ftest/rebuild/basic.yaml b/src/tests/ftest/rebuild/basic.yaml index 80fbb15d9ce..100c3716b8d 100644 --- a/src/tests/ftest/rebuild/basic.yaml +++ b/src/tests/ftest/rebuild/basic.yaml @@ -19,7 +19,6 @@ server_config: 0: class: ram scm_mount: /mnt/daos - system_ram_reserved: 1 pool: size: 1G diff --git a/src/tests/ftest/rebuild/cascading_failures.yaml b/src/tests/ftest/rebuild/cascading_failures.yaml index abc15a45f29..294332d854b 100644 --- a/src/tests/ftest/rebuild/cascading_failures.yaml +++ b/src/tests/ftest/rebuild/cascading_failures.yaml @@ -20,7 +20,7 @@ server_config: pool: size: 1G 
pool_query_timeout: 30 - properties: rd_fac:2 + properties: rd_fac:2,space_rb:0 container: akey_size: 5 dkey_size: 5 diff --git a/src/tests/ftest/rebuild/container_create_race.yaml b/src/tests/ftest/rebuild/container_create_race.yaml index 19876bb6cdb..892af3461e0 100644 --- a/src/tests/ftest/rebuild/container_create_race.yaml +++ b/src/tests/ftest/rebuild/container_create_race.yaml @@ -13,16 +13,12 @@ server_config: targets: 2 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: targets: 2 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto @@ -32,7 +28,7 @@ testparams: pool: scm_size: 8G pool_query_timeout: 15 - properties: rd_fac:1 + properties: rd_fac:1,space_rb:0 container: type: POSIX diff --git a/src/tests/ftest/rebuild/continues_after_stop.yaml b/src/tests/ftest/rebuild/continues_after_stop.yaml index 5f332674687..85861c95d1a 100644 --- a/src/tests/ftest/rebuild/continues_after_stop.yaml +++ b/src/tests/ftest/rebuild/continues_after_stop.yaml @@ -11,15 +11,11 @@ server_config: 0: targets: 4 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server_0.log storage: auto 1: targets: 4 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server_1.log storage: auto diff --git a/src/tests/ftest/rebuild/interactive.py b/src/tests/ftest/rebuild/interactive.py new file mode 100644 index 00000000000..181a00391e6 --- /dev/null +++ b/src/tests/ftest/rebuild/interactive.py @@ -0,0 +1,212 @@ +""" + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time +from functools import partial + +from apricot import TestWithServers +from data_utils import assert_val_in_list +from exception_utils import CommandFailure +from ior_utils import get_ior +from job_manager_utils import get_job_manager + + +class 
RbldInteractive(TestWithServers): + """Test class for interactive rebuild tests. + + :avocado: recursive + """ + + def test_rebuild_interactive(self): + """ + Use Cases: + Pool rebuild with interactive start/stop. + + :avocado: tags=all,daily_regression + :avocado: tags=hw,large + :avocado: tags=rebuild,pool + :avocado: tags=RbldInteractive,test_rebuild_interactive + """ + self.log_step("Setup pool") + pool = self.get_pool(connect=False) + + # Collect server configuration information + server_count = len(self.hostlist_servers) + engines_per_host = int(self.server_managers[0].get_config_value('engines_per_host') or 1) + targets_per_engine = int(self.server_managers[0].get_config_value('targets')) + self.log.info( + 'Running with %s servers, %s engines per server, and %s targets per engine', + server_count, engines_per_host, targets_per_engine) + + self.log_step('Create container and run IOR') + cont_ior = self.get_container(pool, namespace='/run/cont_ior/*') + ior_flags_write = self.params.get('flags_write', '/run/ior/*') + ior_ppn = self.params.get('ppn', '/run/ior/*') + + job_manager = get_job_manager(self, subprocess=False) + ior = get_ior( + self, job_manager, self.hostlist_clients, self.workdir, None, namespace='/run/ior/*') + ior.manager.job.update_params(flags=ior_flags_write, dfs_oclass=cont_ior.oclass.value) + ior.run(cont_ior.pool, cont_ior, None, ior_ppn, display_space=False) + + self.__run_rebuild_interactive( + pool, cont_ior, ior, + num_ranks_to_exclude=1, + exclude_method='dmg pool exclude', + reint_method='dmg pool reintegrate') + + self.log_step('Test Passed') + + def __run_rebuild_interactive(self, pool, cont_ior, ior, + num_ranks_to_exclude, exclude_method, reint_method): + """Run interactive rebuild test sequence. 
+ + Args: + pool (TestPool): pool to use + cont_ior (TestContainer): container used for IOR + iort (Ior): the Ior object + num_ranks_to_exclude (int): number of ranks to exclude/reintegrate + exclude_method (str): method to exclude ranks. Must be in + - 'dmg pool exclude' + - 'dmg system exclude' + reint_method (str): method to reintegrate ranks. Must be in + - 'dmg pool reintegrate' + - 'dmg system reintegrate' + """ + + ior_flags_read = self.params.get('flags_read', '/run/ior/*') + ior_ppn = self.params.get('ppn', '/run/ior/*') + + self.log_step('Verify pool state before rebuild') + self.__verify_pool_query( + pool, rebuild_status=0, rebuild_state=['idle', 'done'], disabled_ranks=[]) + + ranks_to_exclude = self.random.sample( + list(self.server_managers[0].ranks.keys()), k=num_ranks_to_exclude) + self.log_step(f'Exclude random rank {ranks_to_exclude}') + if exclude_method == 'dmg pool exclude': + pool.exclude(ranks_to_exclude) + elif exclude_method == 'dmg system exclude': + pool.dmg.system_exclude(ranks_to_exclude) + else: + self.fail(f'Unsupported exclude_method: {exclude_method}') + + self.log_step(f'{exclude_method} - Wait for rebuild to start') + pool.wait_for_rebuild_to_start(interval=1) + + self.log_step(f'{exclude_method} - Manually stop rebuild') + for i in range(3): + try: + pool.rebuild_stop() + break + except CommandFailure as error: + if i == 2 or 'DER_NONEXIST' not in str(error): + raise + self.log.info('Assuming rebuild is not started yet. 
Retrying in 3 seconds...') + time.sleep(3) + + self.log_step(f'{exclude_method} - Wait for rebuild to stop') + pool.wait_for_rebuild_to_stop(interval=3) + + self.log_step(f'{exclude_method} - Verify pool state after rebuild stopped') + self.__verify_pool_query( + pool, rebuild_status=-2027, rebuild_state=['idle'], + disabled_ranks=ranks_to_exclude) + + self.log_step(f'{exclude_method} - Verify IOR after rebuild stopped') + ior.manager.job.update_params(flags=ior_flags_read) + ior.run(cont_ior.pool, cont_ior, None, ior_ppn, display_space=False) + + self.log_step(f'{exclude_method} - Manually start rebuild') + pool.rebuild_start() + + self.log_step(f'{exclude_method} - Wait for rebuild to start') + pool.wait_for_rebuild_to_start(interval=1) + + self.log_step(f'{exclude_method} - Wait for rebuild to end') + pool.wait_for_rebuild_to_end(interval=3) + + self.log_step(f'{exclude_method} - Verify pool state after rebuild completed') + self.__verify_pool_query( + pool, rebuild_status=0, rebuild_state=['idle', 'done'], + disabled_ranks=ranks_to_exclude) + + self.log_step(f'{exclude_method} - Verify IOR after rebuild completed') + ior.manager.job.update_params(flags=ior_flags_read) + ior.run(cont_ior.pool, cont_ior, None, ior_ppn, display_space=False) + + self.log_step('Reintegrate excluded ranks') + if reint_method == 'dmg pool reintegrate': + pool.reintegrate(ranks_to_exclude) + elif reint_method == 'dmg system reintegrate': + pool.dmg.system_reintegrate(ranks_to_exclude) + else: + self.fail(f'Unsupported reint_method: {reint_method}') + + self.log_step(f'{reint_method} - Wait for rebuild to start') + pool.wait_for_rebuild_to_start(interval=1) + + self.log_step(f'{reint_method} - Manually stop rebuild') + for i in range(3): + try: + pool.rebuild_stop() + break + except CommandFailure as error: + if i == 2 or 'DER_NONEXIST' not in str(error): + raise + self.log.info('Assuming rebuild is not started yet. 
Retrying in 3 seconds...') + time.sleep(3) + + self.log_step(f'{reint_method} - Wait for rebuild to stop') + pool.wait_for_rebuild_to_stop(interval=3) + + self.log_step(f'{reint_method} - Verify pool state after rebuild stopped') + self.__verify_pool_query( + pool, rebuild_status=-2027, rebuild_state=['idle'], + disabled_ranks=[]) + + self.log_step(f'{reint_method} - Verify IOR after rebuild stopped') + ior.manager.job.update_params(flags=ior_flags_read) + ior.run(cont_ior.pool, cont_ior, None, ior_ppn, display_space=False) + + self.log_step(f'{reint_method} - Manually start rebuild') + pool.rebuild_start() + + self.log_step(f'{reint_method} - Wait for rebuild to start') + pool.wait_for_rebuild_to_start(interval=1) + + self.log_step(f'{reint_method} - Wait for rebuild to end') + pool.wait_for_rebuild_to_end(interval=3) + + self.log_step(f'{reint_method} - Verify pool state after rebuild completed') + self.__verify_pool_query( + pool, rebuild_status=0, rebuild_state=['idle', 'done'], disabled_ranks=[]) + + self.log_step(f'{reint_method} - Verify IOR after rebuild completed') + ior.manager.job.update_params(flags=ior_flags_read) + ior.run(cont_ior.pool, cont_ior, None, ior_ppn, display_space=False) + + def __verify_pool_query(self, pool, rebuild_status, rebuild_state, disabled_ranks): + """Verify pool query. 
+ + Args: + pool (TestPool): pool to query + rebuild_status (int): expected rebuild status + rebuild_state (str/list): expected rebuild state + disabled_ranks (list): expected disabled ranks + + """ + try: + pool.verify_query( + { + 'rebuild': { + 'status': rebuild_status, + 'state': partial(assert_val_in_list, allowed_list=rebuild_state) + }, + 'disabled_ranks': disabled_ranks + }, + use_cached_query=True) + except AssertionError as error: + self.fail(f'Unexpected pool query response: {str(error)}') diff --git a/src/tests/ftest/rebuild/interactive.yaml b/src/tests/ftest/rebuild/interactive.yaml new file mode 100644 index 00000000000..0cd95906695 --- /dev/null +++ b/src/tests/ftest/rebuild/interactive.yaml @@ -0,0 +1,49 @@ +hosts: + test_servers: 7 + test_clients: 1 + +timeout: 400 + +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + pinned_numa_node: 0 + nr_xs_helpers: 1 + log_file: daos_server0.log + log_mask: DEBUG,MEM=ERR + env_vars: + - DD_MASK=group_metadata_only,io,epc,rebuild + - D_LOG_FILE_APPEND_PID=1 + - D_LOG_FILE_APPEND_RANK=1 + storage: auto + 1: + pinned_numa_node: 1 + nr_xs_helpers: 1 + log_file: daos_server1.log + log_mask: DEBUG,MEM=ERR + env_vars: + - DD_MASK=group_metadata_only,io,epc,rebuild + - D_LOG_FILE_APPEND_PID=1 + - D_LOG_FILE_APPEND_RANK=1 + storage: auto + +pool: + size: 90% + pool_query_timeout: 30 + properties: rd_fac:3 + +cont_ior: + type: POSIX + properties: rd_fac:3 + oclass: EC_8P3GX + +ior: + ppn: 16 + test_file: /testFile + api: DFS + transfer_size: 1M + block_size: 128M + flags_write: "-v -w -k -G 1 -F" + flags_read: "-v -r -R -k -G 1 -F" diff --git a/src/tests/ftest/rebuild/mdtest.yaml b/src/tests/ftest/rebuild/mdtest.yaml index 37022e4e44f..07d11c38d12 100644 --- a/src/tests/ftest/rebuild/mdtest.yaml +++ b/src/tests/ftest/rebuild/mdtest.yaml @@ -12,8 +12,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log 
log_mask: INFO storage: auto @@ -21,8 +19,6 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO storage: auto diff --git a/src/tests/ftest/rebuild/no_cap.yaml b/src/tests/ftest/rebuild/no_cap.yaml index 5ab0b089a8a..2f4f287a5a8 100644 --- a/src/tests/ftest/rebuild/no_cap.yaml +++ b/src/tests/ftest/rebuild/no_cap.yaml @@ -3,7 +3,9 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 360 + server_config: name: daos_server # reduce cart timeout to make IV update return timeout @@ -15,26 +17,25 @@ server_config: targets: 1 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: targets: 1 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + container: control_method: daos properties: "rd_fac:1" + pool: scm_size: 1G pool_query_timeout: 30 pool_query_interval: 1 test_data_list: [1048576] oclass: "OC_RP_4G1" + rebuild: rank_to_kill: 1 diff --git a/src/tests/ftest/rebuild/pool_destroy_race.yaml b/src/tests/ftest/rebuild/pool_destroy_race.yaml index f1a43623dbf..987332712e8 100644 --- a/src/tests/ftest/rebuild/pool_destroy_race.yaml +++ b/src/tests/ftest/rebuild/pool_destroy_race.yaml @@ -14,16 +14,12 @@ server_config: targets: 4 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: targets: 4 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/rebuild/with_ior.yaml b/src/tests/ftest/rebuild/with_ior.yaml index 963180310ea..34c6f2d9eda 100644 --- a/src/tests/ftest/rebuild/with_ior.yaml +++ b/src/tests/ftest/rebuild/with_ior.yaml @@ -3,9 +3,11 @@ hosts: test_clients: 1 timeout: 360 + agent_config: #cache_expiration: 1 disable_caching: true + server_config: 
name: daos_server engines_per_host: 2 @@ -14,8 +16,6 @@ server_config: targets: 2 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: INFO storage: auto @@ -23,8 +23,6 @@ server_config: targets: 2 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO storage: auto diff --git a/src/tests/ftest/recovery/cat_recov_core.yaml b/src/tests/ftest/recovery/cat_recov_core.yaml index 23200a8b403..73a5a583283 100644 --- a/src/tests/ftest/recovery/cat_recov_core.yaml +++ b/src/tests/ftest/recovery/cat_recov_core.yaml @@ -1,15 +1,16 @@ hosts: test_servers: 4 -timeout: 5700 + +timeout: 1H40M + server_config: name: daos_server engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -20,11 +21,11 @@ server_config: - FI_LOG_LEVEL=warn - D_LOG_STDERR_IN_LOG=1 storage: auto + 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 0 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -35,15 +36,22 @@ server_config: - FI_LOG_LEVEL=warn - D_LOG_STDERR_IN_LOG=1 storage: auto + transport_config: allow_insecure: true - system_ram_reserved: 64 + +pool: + scm_size: 6G + nvme_size: 80G + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + daos_tests: num_clients: test_daos_cat_recov_core: 1 diff --git a/src/tests/ftest/recovery/check_policy.yaml b/src/tests/ftest/recovery/check_policy.yaml index 12e26ea6686..55d0111c106 100644 --- a/src/tests/ftest/recovery/check_policy.yaml +++ b/src/tests/ftest/recovery/check_policy.yaml @@ -8,14 +8,10 @@ server_config: engines_per_host: 2 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log nr_xs_helpers: 1 storage: auto 1: - fabric_iface: ib1 - 
fabric_iface_port: 31417 log_file: daos_server1.log nr_xs_helpers: 1 storage: auto diff --git a/src/tests/ftest/recovery/check_repair.yaml b/src/tests/ftest/recovery/check_repair.yaml index 65141559ced..f162fc0a68c 100644 --- a/src/tests/ftest/recovery/check_repair.yaml +++ b/src/tests/ftest/recovery/check_repair.yaml @@ -8,14 +8,10 @@ server_config: engines_per_host: 2 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log nr_xs_helpers: 1 storage: auto 1: - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log nr_xs_helpers: 1 storage: auto diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index d56d81aa2d3..150e1a540e4 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -76,8 +76,9 @@ def test_start_back_to_back(self): result in Operation already performed error. In that case, repeat. When the first pool is fixed, the second start should work. 6. Query checker and verify that they’re fixed. - 7. Disable checker and start system. - 8. Verify that the faults are actually fixed. + 7. Clear the checker inconsistency reports. + 8. Disable checker and start system. + 9. Verify that the faults are actually fixed. Jira ID: DAOS-17860 @@ -86,14 +87,12 @@ def test_start_back_to_back(self): :avocado: tags=recovery,cat_recov :avocado: tags=DMGCheckStartCornerCaseTest,test_start_back_to_back """ - # 1. Create two pools and a container. self.log_step("Create two pools and a container.") pool_1 = self.get_pool(connect=False) pool_2 = self.get_pool(connect=False) container_1 = self.get_container(pool=pool_1) container_2 = self.get_container(pool=pool_2) - # 2. Inject fault on both containers. 
self.log_step("Inject fault on both containers.") daos_command = self.get_daos_command() daos_command.faults_container( @@ -103,16 +102,13 @@ def test_start_back_to_back(self): pool=pool_2.identifier, cont=container_2.identifier, location="DAOS_CHK_CONT_ORPHAN") - # 3. Enable checker. self.log_step("Enable checker.") dmg_command = self.get_dmg_command() dmg_command.check_enable() - # 4. Start with the first pool. self.log_step("Start with the first pool.") dmg_command.check_start(pool=pool_1.identifier) - # 5. Immediately after starting the first pool, start the second pool. self.log_step("Immediately after starting the first pool, start the second pool.") pool_2_started = False for count in range(8): @@ -130,15 +126,19 @@ def test_start_back_to_back(self): time.sleep(5) self.assertTrue(pool_2_started, "dmg check start pool_2 failed after 40 sec!") - # 6. Query checker and verify that they’re fixed. self.log_step("Query checker and verify that they’re fixed.") wait_for_check_complete(dmg=dmg_command) - # 7. Disable checker and start system. + self.log_step("Clear the checker inconsistency reports.") + dmg_command.check_stop() + # Start with --reset clears the inconsistency reports. Old inconsistency reports + # may cause subsequent tests to fail because the tests don't expect them. The + # tests expect the system to be clean at the beginning. + dmg_command.check_start(reset=True) + self.log_step("Disable checker and start system.") dmg_command.check_disable() - # 8. Verify that the faults are actually fixed. self.log_step("Verify that the faults are actually fixed.") # In this case, check that the containers were removed. container_list_out_1 = daos_command.pool_list_containers(pool=pool_1.identifier) @@ -153,3 +153,121 @@ def test_start_back_to_back(self): # Containers were removed by the checker. container_1.skip_cleanup() container_2.skip_cleanup() + + def test_two_pools_healthy(self): + """Test to pass in two pool labels where one is healthy pool. + + 1. 
Create three pools and one container. + 2. Inject container bad label into one of them. + 3. Enable checker and set policy to --all-interactive. + 4. Call dmg check start with two different healthy pool labels. + 5. Call dmg check start with two same healthy pool labels. + 6. Call dmg check start with healthy pool and corrupted pool. + 7. Repair with option 2 (original container label) and wait for checker to finish. + 8. Call dmg check start with healthy pool and invalid label. + 9. Disable checker and verify that the fault is actually fixed. + + Jira ID: DAOS-17858 + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=recovery,cat_recov + :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_healthy + """ + # 1. Create three pools and one container. + self.log_step("Create three pools and one container.") + pool_1 = self.get_pool(connect=False) + pool_2 = self.get_pool(connect=False) + pool_3 = self.get_pool(connect=False) + container = self.get_container(pool=pool_3) + + # 2. Inject container bad label into one of them. + self.log_step("Inject container bad label into one of them.") + daos_command = self.get_daos_command() + daos_command.faults_container( + pool=pool_3.identifier, cont=container.identifier, + location="DAOS_CHK_CONT_BAD_LABEL") + + # 3. Enable checker and set policy to --all-interactive. + self.log_step("Enable checker and set policy to --all-interactive.") + dmg_command = self.get_dmg_command() + dmg_command.check_enable() + dmg_command.check_set_policy(all_interactive=True) + + # 4. Call dmg check start with two different healthy pool labels. 
+ self.log_step("Call dmg check start with two different healthy pool labels.") + healthy_diff = pool_1.identifier + " " + pool_2.identifier + try: + dmg_command.check_start(pool=healthy_diff) + msg = ("dmg check start with two different healthy pool labels worked as " + "expected.") + self.log.info(msg) + except CommandFailure as command_failure: + msg = (f"dmg check start with two different healthy pool labels failed! " + f"{command_failure}") + self.fail(msg) + # Need to stop before starting again. + dmg_command.check_stop() + + # 5. Call dmg check start with two same healthy pool labels. + self.log_step("Call dmg check start with two same healthy pool labels.") + healthy_same = pool_1.identifier + " " + pool_1.identifier + try: + dmg_command.check_start(pool=healthy_same) + msg = ("dmg check start with two same healthy pool labels worked as " + "expected.") + self.log.info(msg) + except CommandFailure as command_failure: + msg = (f"dmg check start with two same healthy pool labels failed! " + f"{command_failure}") + self.fail(msg) + dmg_command.check_stop() + + # 6. Call dmg check start with healthy pool and corrupted pool. + self.log_step("Call dmg check start with healthy pool and corrupted pool.") + healthy_corrupted = pool_1.identifier + " " + pool_3.identifier + dmg_command.check_start(pool=healthy_corrupted) + + # 7. Repair with option 2 and wait for checker to finish. + self.log_step("Repair with option 2 and wait for checker to finish.") + # Wait for the checker to detect the inconsistent container label. + query_reports = None + for _ in range(8): + check_query_out = dmg_command.check_query(pool=pool_3.uuid) + # Status becomes RUNNING immediately, but it may take a while to detect the + # inconsistency. If detected, "reports" field is filled. 
+ if check_query_out["response"]["status"] == "RUNNING": + query_reports = check_query_out["response"]["reports"] + if query_reports: + break + time.sleep(5) + if not query_reports: + self.fail("Checker didn't detect any inconsistency!") + fault_msg = query_reports[0]["msg"] + expected_fault = "inconsistent container label" + if expected_fault not in fault_msg: + self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}") + # Obtain the seq num (ID) to repair. + seq = query_reports[0]["seq"] + # Repair with action 2, which is to use the original container label. + dmg_command.check_repair(seq_num=str(seq), action="2") + wait_for_check_complete(dmg=dmg_command) + dmg_command.check_stop() + + # 8. Call dmg check start with healthy pool and invalid label. + self.log_step("Call dmg check start with healthy pool and invalid label.") + healthy_invalid = pool_1.identifier + " TestPool0" + try: + dmg_command.check_start(pool=healthy_invalid) + self.fail("dmg check start with healthy and invalid pool labels worked!") + except CommandFailure as command_failure: + exp_msg = "unable to find pool service" + if exp_msg not in str(command_failure): + self.fail(f"{exp_msg} is not in the error message!") + + # 9. Disable checker and verify that the fault is actually fixed. 
+ self.log_step("Disable checker and verify that the fault is actually fixed.") + dmg_command.check_disable() + expected_props = {"label": container.label.value} + label_verified = container.verify_prop(expected_props=expected_props) + self.assertTrue(label_verified, "Container label isn't fixed!") diff --git a/src/tests/ftest/recovery/check_start_corner_case.yaml b/src/tests/ftest/recovery/check_start_corner_case.yaml index 2863ca4e4f0..ed1816f3b8c 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.yaml +++ b/src/tests/ftest/recovery/check_start_corner_case.yaml @@ -2,21 +2,17 @@ hosts: test_servers: 1 test_clients: 1 -timeout: 4M +timeout: 5M server_config: name: daos_server engines_per_host: 2 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log nr_xs_helpers: 1 storage: auto 1: - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log nr_xs_helpers: 1 storage: auto diff --git a/src/tests/ftest/recovery/check_start_options.py b/src/tests/ftest/recovery/check_start_options.py index 2d17edb95b8..7d91e1ec8be 100644 --- a/src/tests/ftest/recovery/check_start_options.py +++ b/src/tests/ftest/recovery/check_start_options.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -10,6 +10,11 @@ from recovery_utils import query_detect from run_utils import command_as_user, run_remote +# Enum values used in this test +ENUM_CIC_POOL_NONEXIST_ON_MS = 4 +ENUM_CIA_INTERACT = 1 +ENUM_CIA_STALE = 0xffff + class DMGCheckStartOptionsTest(TestWithServers): """Test dmg check start options. @@ -25,6 +30,8 @@ class DMGCheckStartOptionsTest(TestWithServers): :avocado: recursive """ + MAX_QUERY_RETRY = 8 # max retries for check query + def test_check_start_reset(self): """Test dmg check start --reset. @@ -36,13 +43,12 @@ def test_check_start_reset(self): from "unchecked". 4. 
Verify that the orphan pool is detected. 5. Stop the checker. The state is now at "stopped". - 6. Remove the pool directory from the mount point. - 7. Start the checker without --reset. State is back to "checking". - 8. Verify that the action entry is still there. - 9. Stop the checker. State is "stopped". - 10. Start the checker with --reset. The state should have transitioned to + 6. Start the checker without --reset. State is back to "checking". + 7. Verify that the action entry is still there. + 8. Stop the checker. State is "stopped". + 9. Start the checker with --reset. The state should have transitioned to "unchecked", then "checking". - 11. Verify that the action entry is empty and the status is COMPLETED. + 10. Verify that the action entry is empty and the status is COMPLETED. Jira ID: DAOS-17623 @@ -61,10 +67,9 @@ def test_check_start_reset(self): dmg_command.faults_mgmt_svc_pool( pool=pool.identifier, checker_report_class="CIC_POOL_NONEXIST_ON_MS") - # 3. Start the checker with interactive mode. - self.log_step("Start the checker with interactive mode.") + # 3. Start the checker. + self.log_step("Start the checker.") dmg_command.check_enable() - dmg_command.check_set_policy(all_interactive=True) dmg_command.check_start() # 4. Verify that the orphan pool is detected. @@ -75,36 +80,18 @@ def test_check_start_reset(self): self.log_step("Stop the checker.") dmg_command.check_stop() - # 6. Remove the pool directory from the mount point. - self.log_step("Remove the pool directory from the mount point.") - pool_path = self.server_managers[0].get_vos_path(pool) - pool_out = check_file_exists( - hosts=self.hostlist_servers, filename=pool_path, sudo=True) - if not pool_out[0]: - msg = ("MD-on-SSD cluster. Contents under mount point are removed by control " - "plane after system stop.") - self.log.info(msg) - dmg_command.system_start() - # return results in PASS. 
- return - command = command_as_user(command=f"rm -rf {pool_path}", user="root") - remove_result = run_remote( - log=self.log, hosts=self.hostlist_servers, command=command) - if not remove_result.passed: - self.fail(f"Failed to remove {pool_path} from {remove_result.failed_hosts}") - - # 7. Start the checker without --reset. + # 6. Start the checker without --reset. self.log_step("Start the checker without --reset.") dmg_command.check_start() - # 8. Verify that the action entry is still there. - self.log_step("Verify that the action entry is still there.") - # At this point, the status is STOPPED (it will not turn to RUNNING), so just - # check whether msg contains "orphan pool". + # 7. Verify that the action entry is still there. + self.log_step("Verify that the old action entry is still there.") check_query_out = dmg_command.check_query() query_reports = check_query_out["response"]["reports"] if not query_reports: self.fail("Checker didn't detect any inconsistency!") + if len(query_reports) != 1: + self.fail(f"Expected only one report, but multiple reports found: {query_reports}") fault_msg = query_reports[0]["msg"] if "orphan pool" not in fault_msg: msg = (f"Checker didn't detect the orphan pool (2)! Fault msg = " @@ -112,15 +99,15 @@ def test_check_start_reset(self): dmg_command.check_disable() self.fail(msg) - # 9. Stop the checker. + # 8. Stop the checker. self.log_step("Stop the checker.") dmg_command.check_stop() - # 10. Start the checker with --reset. + # 9. Start the checker with --reset. self.log_step("Start the checker with --reset.") dmg_command.check_start(reset=True) - # 11. Verify that the action entry is empty and the status is COMPLETED. + # 10. Verify that the action entry is empty and the status is COMPLETED. self.log_step( "Verify that the action entry is empty and the status is COMPLETED.") repair_reports = None @@ -141,8 +128,152 @@ def test_check_start_reset(self): # Disable the checker to prepare for the tearDown. 
dmg_command.check_disable() - # The pool is orphan pool, so skip the cleanup. - pool.skip_cleanup() + + def get_reports(self, cmd): + """Helper function - get the reports from the check query""" + check_query_out = cmd.check_query() + return check_query_out["response"]["reports"] + + def expect_reports(self, query_reports, exp_reports): + """Helper function - verify expected check reports are found in actual query reports""" + if not query_reports: + self.fail("Checker didn't detect any inconsistency!") + for exp in exp_reports: + found = False + for report in query_reports: + if ( + report["pool_uuid"].lower() == exp["pool_uuid"].lower() + and report["class"] == exp["class"] + and report["action"] == exp["action"] + ): + found = True + break + if not found: + self.fail(f"expected report {exp} not found") + + def test_check_start_interactive(self): + """Test dmg check start's effects on interactive actions. + + 1. Create 2 pools. + 2. Inject faults on both pools. + 3. Start the checker with interactive mode for all. + 4. Verify that the first pool's issue is found. + 5. Stop the checker. + 6. Start the checker on the second pool. + 7. Verify that the first pool's action is now STALE, and the second pool's fault appears. + 8. Stop the checker. + 9. Start the checker on the first pool. + 10. Verify that the first pool's action is not STALE, but the second pool's action is STALE. + 11. Stop the checker. + 12. Start the checker with no pool specified. + 13. Check that both pools have non-stale actions. + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=recovery,cat_recov + :avocado: tags=DMGCheckStartOptionsTest,test_check_start_interactive + """ + # 1. Create a pool. + self.log_step("Create a pool") + pool1 = self.get_pool(connect=False, size="50%") + pool2 = self.get_pool(connect=False, size="50%") + + # 2. Inject pool faults. 
+ self.log_step("Inject pool faults") + dmg_command = self.get_dmg_command() + dmg_command.faults_mgmt_svc_pool( + pool=pool1.identifier, checker_report_class="CIC_POOL_NONEXIST_ON_MS") + dmg_command.faults_mgmt_svc_pool( + pool=pool2.identifier, checker_report_class="CIC_POOL_NONEXIST_ON_MS") + + # 3. Enable the checker with interactive policies. + self.log_step("Enable the checker with interactive policies") + dmg_command.check_enable() + dmg_command.check_set_policy(all_interactive=True) + + # 4. Start the checker on pool 1. + self.log_step("Start the checker on pool1") + dmg_command.check_start(pool=pool1.uuid) + + # 5. Verify the interactive action + self.log_step("Verify the interactive action for pool1") + reports = self.get_reports(dmg_command) + self.expect_reports(reports, [{ + "pool_uuid": pool1.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_INTERACT, + }]) + + # 6. Stop the checker. + self.log_step("Stop the checker") + dmg_command.check_stop() + + # 7. Start the checker on pool2. + self.log_step("Start the checker on pool2") + dmg_command.check_start(pool=pool2.uuid) + + # 8. Verify pool2 action is INTERACT, pool1 is STALE. + self.log_step("Verify the interactive and stale actions") + reports = self.get_reports(dmg_command) + self.expect_reports(reports, [{ + "pool_uuid": pool1.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_STALE, + }, { + "pool_uuid": pool2.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_INTERACT, + }]) + + # 9. Stop the checker. + self.log_step("Stop the checker") + dmg_command.check_stop() + + # 10. Start the checker on pool1. + self.log_step("Start the checker on pool1") + dmg_command.check_start(pool=pool1.uuid) + + # 11. Verify pool1 action is INTERACT, pool2 is STALE. 
+ self.log_step("Verify the interactive and stale actions") + reports = self.get_reports(dmg_command) + self.expect_reports(reports, [{ + "pool_uuid": pool1.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_INTERACT, + }, { + "pool_uuid": pool2.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_STALE, + }]) + + # 12. Stop the checker. + self.log_step("Stop the checker") + dmg_command.check_stop() + + # 13. Start the checker on the whole system. + self.log_step("Start the checker on the whole system") + dmg_command.check_start() + + # 14. Verify both pool actions are INTERACT. + self.log_step("Verify the interactive actions") + reports = self.get_reports(dmg_command) + self.expect_reports(reports, [{ + "pool_uuid": pool1.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_INTERACT, + }, { + "pool_uuid": pool2.uuid, + "class": ENUM_CIC_POOL_NONEXIST_ON_MS, + "action": ENUM_CIA_INTERACT, + }]) + + # 15. Repair both of the injected faults. + self.log_step("Repairing all findings with default option") + for report in reports: + dmg_command.check_repair(seq_num=report["seq"], action=0) + + # Disable the checker to prepare for the tearDown. + dmg_command.check_disable() def test_check_start_failout(self): """Test dmg check start --failout=on. @@ -190,7 +321,7 @@ def test_check_start_failout(self): # 6. Remove the pool directory from the mount point. self.log_step("Remove the pool directory from the mount point.") - pool_path = self.server_managers[0].get_vos_path(pool) + pool_path = self.server_managers[0].get_vos_paths(pool)[0] pool_out = check_file_exists( hosts=self.hostlist_servers, filename=pool_path, sudo=True) if not pool_out[0]: @@ -233,6 +364,36 @@ def test_check_start_failout(self): # The pool is orphan pool, so skip the cleanup. pool.skip_cleanup() + def query_nr_reports(self, dmg_command, nr_exp_reports): + """ + Query until the number of expected reports are found or max retries reached. 
+ """ + query_wait = 10 # initial wait after starting the check, in seconds + query_sleep = 5 # wait period between retries in seconds + + time.sleep(query_wait) + query_reports = None + for _ in range(self.MAX_QUERY_RETRY): + check_query_out = dmg_command.check_query() + # Even if "status" is RUNNING, "reports" may be null/None, so check both. + status = check_query_out["response"]["status"] + query_reports = check_query_out["response"]["reports"] + if query_reports and len(query_reports) > 0: + self.log.debug("found %d reports, need %d", len(query_reports), nr_exp_reports) + if status == "RUNNING" and query_reports and len(query_reports) >= nr_exp_reports: + break + time.sleep(query_sleep) + + if not query_reports: + if nr_exp_reports > 0: + self.fail("Checker didn't detect any inconsistency!") + else: + return query_reports + + if len(query_reports) < nr_exp_reports: + self.fail(f"Expected at least {nr_exp_reports} reports, but found {len(query_reports)}") + return query_reports + def test_check_start_find_orphans(self): """Test dmg check start --find-orphans. @@ -265,7 +426,7 @@ def test_check_start_find_orphans(self): """ # 1. Create a pool and a container. self.log_step("Create a pool and a container.") - pool_1 = self.get_pool(connect=False) + pool_1 = self.get_pool(connect=False, size="45%") container = self.get_container(pool=pool_1) # 2. Inject non orphan pool fault such as orphan container. @@ -286,16 +447,7 @@ def test_check_start_find_orphans(self): # 4. Check that orphan container is detected. self.log_step("Check that orphan container is detected.") - for _ in range(8): - check_query_out = dmg_command.check_query() - # Even if "status" is RUNNING, "reports" may be null/None, so check both. 
- status = check_query_out["response"]["status"] - query_reports = check_query_out["response"]["reports"] - if status == "RUNNING" and query_reports: - break - time.sleep(5) - if not query_reports: - self.fail("Checker didn't detect any inconsistency!") + query_reports = self.query_nr_reports(dmg_command, 1) fault_msg = query_reports[0]["msg"] orphan_container = "orphan container" if orphan_container not in fault_msg: @@ -310,7 +462,7 @@ def test_check_start_find_orphans(self): # 6. Create an orphan pool. self.log_step("Create an orphan pool.") - pool_2 = self.get_pool(connect=False) + pool_2 = self.get_pool(connect=False, size="45%") dmg_command.faults_mgmt_svc_pool( pool=pool_2.identifier, checker_report_class="CIC_POOL_NONEXIST_ON_MS") @@ -321,14 +473,7 @@ def test_check_start_find_orphans(self): # 8. Check that orphan pool isn't detected. self.log_step("Check that orphan pool isn't detected.") - for _ in range(8): - check_query_out = dmg_command.check_query() - if check_query_out["response"]["status"] == "RUNNING": - query_reports = check_query_out["response"]["reports"] - break - time.sleep(5) - if not query_reports: - self.fail("Checker didn't detect any inconsistency!") + query_reports = self.query_nr_reports(dmg_command, 1) orphan_pool = "orphan pool" # Now we have multiple faults, so iterate query_reports. for query_report in query_reports: @@ -343,14 +488,7 @@ def test_check_start_find_orphans(self): # 10. Verify that the orphan pool is detected this time. 
self.log_step("Verify that the orphan pool is detected this time.") - for _ in range(8): - check_query_out = dmg_command.check_query() - if check_query_out["response"]["status"] == "RUNNING": - query_reports = check_query_out["response"]["reports"] - break - time.sleep(5) - if not query_reports: - self.fail("Checker didn't detect any inconsistency!") + query_reports = self.query_nr_reports(dmg_command, 2) orphan_pool_found = False pool_2_seq_num = None for query_report in query_reports: @@ -370,7 +508,7 @@ def test_check_start_find_orphans(self): dmg_command.check_repair(seq_num=pool_2_seq_num, action="0") repair_phase = None orphan_pool_repaired = False - for _ in range(8): + for _ in range(self.MAX_QUERY_RETRY): check_query_out = dmg_command.check_query() if check_query_out["response"]["status"] == "RUNNING": # Check the "phase" field of pool_2. Look for CSP_DONE. diff --git a/src/tests/ftest/recovery/check_start_options.yaml b/src/tests/ftest/recovery/check_start_options.yaml index 3d1825a81ed..ad603b0d728 100644 --- a/src/tests/ftest/recovery/check_start_options.yaml +++ b/src/tests/ftest/recovery/check_start_options.yaml @@ -12,9 +12,6 @@ server_config: engines_per_host: 1 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 - log_file: daos_server0.log nr_xs_helpers: 1 storage: auto diff --git a/src/tests/ftest/recovery/check_stop.yaml b/src/tests/ftest/recovery/check_stop.yaml index 2863ca4e4f0..d4ba8437916 100644 --- a/src/tests/ftest/recovery/check_stop.yaml +++ b/src/tests/ftest/recovery/check_stop.yaml @@ -9,14 +9,10 @@ server_config: engines_per_host: 2 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log nr_xs_helpers: 1 storage: auto 1: - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log nr_xs_helpers: 1 storage: auto diff --git a/src/tests/ftest/recovery/container_cleanup.yaml b/src/tests/ftest/recovery/container_cleanup.yaml index d207e4193e5..5a2a97b4619 100644 --- 
a/src/tests/ftest/recovery/container_cleanup.yaml +++ b/src/tests/ftest/recovery/container_cleanup.yaml @@ -14,7 +14,7 @@ server_config: storage: auto pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/container_list_consolidation.yaml b/src/tests/ftest/recovery/container_list_consolidation.yaml index d207e4193e5..5a2a97b4619 100644 --- a/src/tests/ftest/recovery/container_list_consolidation.yaml +++ b/src/tests/ftest/recovery/container_list_consolidation.yaml @@ -14,7 +14,7 @@ server_config: storage: auto pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 17993cfba05..21ac91230b4 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -10,12 +10,9 @@ from apricot import TestWithServers from ddb_utils import DdbCommand -from exception_utils import CommandFailure -from file_utils import distribute_files -from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors, - run_command) +from general_utils import create_string_buffer, get_random_string, report_errors from pydaos.raw import DaosObjClass, IORequest -from run_utils import get_clush_command +from run_utils import command_as_user, run_remote def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey, @@ -78,35 +75,6 @@ def insert_objects(context, container, object_count, dkey_count, akey_count, bas return (ioreqs, dkeys, akeys, data_list) -def copy_remote_to_local(remote_file_path, test_dir, remote): - """Copy the given file from the server node to the local test node and retrieve - the original name. 
- - Args: - remote_file_path (str): File path to copy to local. - test_dir (str): Test directory. Usually self.test_dir. - remote (str): Remote hostname to copy file from. - """ - # Use clush --rcopy to copy the file from the remote server node to the local test - # node. clush will append . to the file when copying. - args = f"--rcopy {remote_file_path} --dest {test_dir}" - clush_command = get_clush_command(hosts=remote, args=args, timeout=60) - try: - run_command(command=clush_command, timeout=None) - except DaosTestError as error: - raise DaosTestError( - f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error - - # Remove the appended . from the copied file. - current_file_path = "".join([remote_file_path, ".", remote]) - mv_command = f"mv {current_file_path} {remote_file_path}" - try: - run_command(command=mv_command) - except DaosTestError as error: - raise DaosTestError( - f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error - - class DdbTest(TestWithServers): """Test ddb subcommands. @@ -125,6 +93,18 @@ def __init__(self, *args, **kwargs): self.random_akey = get_random_string(10) self.random_data = get_random_string(10) + def run_cmd_check_result(self, command): + """Run given command as root and check its result. + + Args: + command (str): Command to execute. + """ + command_root = command_as_user(command=command, user="root") + result = run_remote( + log=self.log, hosts=self.hostlist_servers, command=command_root) + if not result.passed: + self.fail(f"{command} failed on {result.failed_hosts}!") + def test_recovery_ddb_ls(self): """Test ddb ls. @@ -138,35 +118,66 @@ def test_recovery_ddb_ls(self): 6. Reset the container and the pool to prepare for the cleanup. :avocado: tags=all,full_regression - :avocado: tags=vm + :avocado: tags=hw,medium :avocado: tags=recovery :avocado: tags=DdbTest,ddb_cmd,test_recovery_ddb_ls """ - # Create a pool and a container. + # This is where we load pool for MD-on-SSD. 
It's called tmpfs_mount in ddb + # prov_mem documentation, but use daos_load_path here for clarity. + daos_load_path = "/mnt/daos_load" + md_on_ssd = self.server_managers[0].manager.job.using_control_metadata + if md_on_ssd: + self.log_step("MD-on-SSD: Create a directory to load pool data under /mnt.") + self.run_cmd_check_result(command=f"mkdir {daos_load_path}") + + self.log_step("Create a pool and a container.") pool = self.get_pool() container = self.get_container(pool) - # Find the vos file name. e.g., /mnt/daos0//vos-0. - vos_paths = self.server_managers[0].get_vos_files(pool) - if not vos_paths: - self.fail(f"vos file wasn't found in {self.server_managers[0].get_vos_path(pool)}") - ddb_command = DdbCommand(self.server_managers[0].hosts[0:1], self.bin, vos_paths[0]) + if md_on_ssd: + vos_path = '""' + else: + # Find the vos file name. e.g., /mnt/daos0//vos-0. + vos_paths = self.server_managers[0].get_vos_files(pool) + if not vos_paths: + self.fail("vos file wasn't found!") + vos_path = vos_paths[0] + + ddb_command = DdbCommand( + server_host=self.server_managers[0].hosts[0:1], path=self.bin, + vos_path=vos_path) errors = [] object_count = self.object_count dkey_count = self.dkey_count akey_count = self.akey_count - # Insert objects with API. + self.log_step("Insert objects with API.") insert_objects( context=self.context, container=container, object_count=object_count, dkey_count=dkey_count, akey_count=akey_count, base_dkey=self.random_dkey, base_akey=self.random_akey, base_data=self.random_data) - # Need to stop the server to use ddb. + self.log_step("Stop server to use ddb.") self.get_dmg_command().system_stop() - # 1. Verify container UUID. 
+ db_path = None + if md_on_ssd: + self.log_step(f"MD-on-SSD: Load pool dir to {daos_load_path}") + db_path = os.path.join( + self.server_managers[0].manager.job.yaml.metadata_params.path.value, + "daos_control", "engine0") + ddb_command.prov_mem(db_path=db_path, tmpfs_mount=daos_load_path) + + self.log_step("Verify container UUID.") + if md_on_ssd: + # "ddb ls" command for MD-on-SSD is quite different. + # PMEM: ddb /mnt/daos//vos-0 ls + # MD-on-SSD: ddb --db_path=/var/tmp/daos_testing/control_metadata/daos_control + # /engine0 --vos_path /mnt/daos_load//vos-0 ls + ddb_command.db_path.update(value=" ".join(["--db_path", db_path])) + ddb_command.vos_path.update( + value=os.path.join(daos_load_path, pool.uuid.lower(), "vos-0")) cmd_result = ddb_command.list_component() # Sample output. # Listing contents of '/' @@ -183,10 +194,11 @@ def test_recovery_ddb_ls(self): actual_uuid = match.group(1).lower() expected_uuid = container.uuid.lower() if actual_uuid != expected_uuid: - msg = f"Unexpected container UUID! Expected = {expected_uuid}; Actual = {actual_uuid}" - errors.append(msg) + errors.append( + f"Unexpected container UUID! Expected = {expected_uuid}; Actual = " + f"{actual_uuid}") - # 2. Verify object count in the container. + self.log_step("Verify object count in the container.") cmd_result = ddb_command.list_component(component_path="[0]") # Sample output. # Listing contents of 'CONT: (/[0]) /3082b7d3-32f9-41ea-bcbf-5d6450c1b34f' @@ -204,17 +216,21 @@ def test_recovery_ddb_ls(self): f"Unexpected object count! Expected = {object_count}; " f"Actual = {actual_object_count}") - # 3. Verify there are two dkeys for every object. Also verify the dkey string and - # the size. + msg = ("Verify there are two dkeys for every object. 
Also verify the dkey string " + "and the size.") + self.log_step(msg) dkey_regex = f"/{uuid_regex}/{object_id_regex}/(.*)" actual_dkey_count = 0 for obj_index in range(object_count): - component_path = f"[0]/[{obj_index}]" - cmd_result = ddb_command.list_component(component_path=component_path) - # Sample output. - # /d4e0c836-17bd-4df3-b255-929732486bab/281479271677953.0.0/ - # [0] 'Sample dkey 0 0' (15) - # [1] 'Sample dkey 0 1' (15) + cmd_result = ddb_command.list_component(component_path=f"[0]/[{obj_index}]") + # Sample output. There are three lines, but a line break is added to fit into + # the code. + # Listing contents of 'OBJ: (/[0]/[0]) + # /a78b65a1-31f4-440b-95e1-b4ead193b3f1/281479271677953.0.0.2' + # DKEY: (/[0]/[0]/[0]) + # /a78b65a1-31f4-440b-95e1-b4ead193b3f1/281479271677953.0.0.2/GSWOPOF1EX 0 0 + # DKEY: (/[0]/[0]/[1]) + # /a78b65a1-31f4-440b-95e1-b4ead193b3f1/281479271677953.0.0.2/GSWOPOF1EX 0 1 match = re.findall(dkey_regex, cmd_result.joined_stdout) actual_dkey_count += len(match) @@ -227,30 +243,34 @@ def test_recovery_ddb_ls(self): f"Actual = {actual_dkey}") errors.append(msg) - # Verify there are two dkeys for every object. + self.log_step("Verify there are two dkeys for every object.") expected_dkey_count = object_count * dkey_count if actual_dkey_count != expected_dkey_count: - msg = (f"Unexpected number of dkeys! Expected = {expected_dkey_count}; " - f"Actual = {actual_dkey_count}") - errors.append(msg) + errors.append( + f"Unexpected number of dkeys! Expected = {expected_dkey_count}; " + f"Actual = {actual_dkey_count}") - # 4. Verify there is one akey for every dkey. Also verify the key string and the - # size. + self.log_step( + "Verify there is one akey for every dkey. 
Also verify the key string and " + "the size.") akey_count = 0 for obj_index in range(object_count): for dkey_index in range(dkey_count): - component_path = f"[0]/[{obj_index}]/[{dkey_index}]" - cmd_result = ddb_command.list_component(component_path=component_path) + cmd_result = ddb_command.list_component( + component_path=f"[0]/[{obj_index}]/[{dkey_index}]") ls_out = cmd_result.joined_stdout msg = (f"List akeys obj_index = {obj_index}, dkey_index = {dkey_index}, " f"stdout = {ls_out}") self.log.info(msg) - # Output is in the same format as dkey, so use the same regex. - # /d4e0c836-17bd-4df3-b255-929732486bab/281479271677954.0.0/' - # Sample dkey 1 0'/ - # [0] 'Sample akey 1 0 0' (17) + # Output is in the same format as dkey, so use the same regex. There are + # two lines, but line breaks are added to fit into the code. + # Listing contents of 'DKEY: (/[0]/[0]/[0]) + # /a78b65a1-31f4-440b-95e1-b4ead193b3f1/281479271677953.0.0.2/ + # GSWOPOF1EX 0 0' + # AKEY: (/[0]/[0]/[0]/[0]) + # /a78b65a1-31f4-440b-95e1-b4ead193b3f1/281479271677953.0.0.2/ + # GSWOPOF1EX 0 0/OOJ2TNAHS7 0 0 0 match = re.findall(f"{dkey_regex}/(.*)", ls_out) - akey_count += len(match) # Verify akey string. As in dkey, ignore the numbers at the end. @@ -260,28 +280,30 @@ def test_recovery_ddb_ls(self): f"Expected = {self.random_akey}; Actual = {actual_akey}") errors.append(msg) - # Verify there is one akey for every dkey. + self.log_step("Verify there is one akey for every dkey.") if expected_dkey_count != akey_count: msg = (f"Unexpected number of akeys! Expected = {expected_dkey_count}; " f"Actual = {akey_count}") errors.append(msg) - # 5. Restart the server for the cleanup. + if md_on_ssd: + self.log_step(f"MD-on-SSD: Clean {daos_load_path}") + self.run_cmd_check_result(command=f"umount {daos_load_path}") + self.run_cmd_check_result(command=f"rm -rf {daos_load_path}") + + self.log_step("Restart the server for the cleanup.") self.get_dmg_command().system_start() - # 6. 
Reset the container and the pool to prepare for the cleanup. + self.log_step("Reset the container and the pool to prepare for the cleanup.") container.close() pool.disconnect() pool.connect() container.open() - self.get_dmg_command().system_start() - self.log.info("##### Errors #####") report_errors(test=self, errors=errors) - self.log.info("##################") def test_recovery_ddb_rm(self): - """Test rm. + """Test ddb rm. 1. Create a pool and a container. Insert objects, dkeys, and akeys. 2. Stop the server to use ddb. @@ -299,29 +321,48 @@ def test_recovery_ddb_rm(self): 14. Call ddb rm to remove the object. 15. Restart the server to use daos command. 16. Reset the container and pool so that cleanup works. - 17. Call "daos container list-objects " to verify that the - object was removed. + 17. Call "daos container list-objects " to verify that the object was + removed. :avocado: tags=all,full_regression - :avocado: tags=vm + :avocado: tags=hw,medium :avocado: tags=recovery :avocado: tags=DdbTest,ddb_cmd,test_recovery_ddb_rm """ - # 1. Create a pool and a container. Insert objects, dkeys, and akeys. + # This is where we load pool for MD-on-SSD. It's called tmpfs_mount in ddb prov_mem + # documentation, but use daos_load_path here for clarity. + daos_load_path = "/mnt/daos_load" + md_on_ssd = self.server_managers[0].manager.job.using_control_metadata + if md_on_ssd: + self.log_step("MD-on-SSD: Create a directory to load pool data under /mnt.") + self.run_cmd_check_result(command=f"mkdir {daos_load_path}") + + self.log_step("Create a pool and a container.") pool = self.get_pool(connect=True) container = self.get_container(pool) - # Insert one object with one dkey and one akey with API. + if md_on_ssd: + vos_path = '""' + else: + # Find the vos file name. e.g., /mnt/daos0//vos-0. 
+ vos_paths = self.server_managers[0].get_vos_files(pool) + if not vos_paths: + self.fail("vos file wasn't found!") + vos_path = vos_paths[0] + + ddb_command = DdbCommand( + server_host=self.server_managers[0].hosts[0:1], path=self.bin, vos_path=vos_path) + + self.log_step("Insert one object with one dkey and one akey with API.") obj_dataset = insert_objects( - context=self.context, container=container, object_count=1, - dkey_count=1, akey_count=2, base_dkey=self.random_dkey, - base_akey=self.random_akey, base_data=self.random_data) + context=self.context, container=container, object_count=1, dkey_count=1, akey_count=2, + base_dkey=self.random_dkey, base_akey=self.random_akey, base_data=self.random_data) ioreqs = obj_dataset[0] dkeys_inserted = obj_dataset[1] akeys_inserted = obj_dataset[2] - # For debugging/reference, check that the dkey and the akey we just inserted are - # returned from the API. + # For debugging/reference, check that the dkey and the akey we just inserted are returned + # from the API. akeys_api = ioreqs[0].list_akey(dkey=dkeys_inserted[0]) self.log.info("akeys from API (before) = %s", akeys_api) dkeys_api = ioreqs[0].list_dkey() @@ -332,24 +373,34 @@ def test_recovery_ddb_rm(self): pool=pool.identifier, cont=container.uuid) self.log.info("Object list (before) = %s", list_obj_out["response"]) - # 2. Need to stop the server to use ddb. + self.log_step("Stop the server to use ddb.") dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 3. Find the vos file name. - vos_paths = self.server_managers[0].get_vos_files(pool) - if not vos_paths: - self.fail(f"vos file wasn't found in {self.server_managers[0].get_vos_path(pool)}") - ddb_command = DdbCommand(self.server_managers[0].hosts[0:1], self.bin, vos_paths[0]) - - # 4. Call ddb rm to remove the akey. 
+ db_path = None + if md_on_ssd: + self.log_step(f"MD-on-SSD: Load pool dir to {daos_load_path}") + db_path = os.path.join( + self.server_managers[0].manager.job.yaml.metadata_params.path.value, "daos_control", + "engine0") + ddb_command.prov_mem(db_path=db_path, tmpfs_mount=daos_load_path) + + self.log_step("Call ddb rm to remove the akey.") + if md_on_ssd: + # "ddb rm" command for MD-on-SSD is quite different. + # PMEM: ddb /mnt/daos//vos-0 rm + # MD-on-SSD: ddb -w --db_path=/var/tmp/daos_testing/control_metadata/daos_control + # /engine0 --vos_path /mnt/daos_load//vos-0 rm + ddb_command.db_path.update(value=" ".join(["--db_path", db_path])) + ddb_command.vos_path.update( + value=os.path.join(daos_load_path, pool.uuid.lower(), "vos-0")) cmd_result = ddb_command.remove_component(component_path="[0]/[0]/[0]/[0]") self.log.info("rm akey stdout = %s", cmd_result.joined_stdout) - # 5. Restart the server to use the API. + self.log_step("Restart the server to use the API.") dmg_command.system_start() - # 6. Reset the object, container, and pool to use the API after server restart. + self.log_step("Reset the object, container, and pool to use the API after server restart.") ioreqs[0].obj.close() container.close() pool.disconnect() @@ -357,10 +408,9 @@ def test_recovery_ddb_rm(self): container.open() ioreqs[0].obj.open() - # 7. Call list_akey() in pydaos API to verify that the akey was removed. + self.log_step("Call list_akey() in pydaos API to verify that the akey was removed.") akeys_api = ioreqs[0].list_akey(dkey=dkeys_inserted[0]) self.log.info("akeys from API (after) = %s", akeys_api) - errors = [] expected_len = len(akeys_inserted) - 1 actual_len = len(akeys_api) @@ -369,17 +419,17 @@ def test_recovery_ddb_rm(self): f"Actual = {actual_len}") errors.append(msg) - # 8. Stop the server to use ddb. + self.log_step("Stop the server to use ddb.") dmg_command.system_stop() - # 9. Call ddb rm to remove the dkey. 
+ self.log_step("Call ddb rm to remove the dkey.") cmd_result = ddb_command.remove_component(component_path="[0]/[0]/[0]") self.log.info("rm dkey stdout = %s", cmd_result.joined_stdout) - # 10. Restart the server to use the API. + self.log_step("Restart the server to use the API.") dmg_command.system_start() - # 11. Reset the object, container, and pool to use the API after server restart. + self.log_step("Reset the object, container, and pool to use the API after server restart.") ioreqs[0].obj.close() container.close() pool.disconnect() @@ -387,10 +437,9 @@ def test_recovery_ddb_rm(self): container.open() ioreqs[0].obj.open() - # 12. Call list_dkey() in pydaos API to verify that the dkey was removed. + self.log_step("Call list_dkey() in pydaos API to verify that the dkey was removed.") dkeys_api = ioreqs[0].list_dkey() self.log.info("dkeys from API (after) = %s", dkeys_api) - expected_len = len(dkeys_inserted) - 1 actual_len = len(dkeys_api) if actual_len != expected_len: @@ -398,29 +447,27 @@ def test_recovery_ddb_rm(self): f"Actual = {actual_len}") errors.append(msg) - # 13. Stop the server to use ddb. + self.log_step("Stop the server to use ddb.") dmg_command.system_stop() - # 14. Call ddb rm to remove the object. + self.log_step("Call ddb rm to remove the object.") cmd_result = ddb_command.remove_component(component_path="[0]/[0]") self.log.info("rm object stdout = %s", cmd_result.joined_stdout) - # 15. Restart the server to use daos command. + self.log_step("Restart the server to use daos command.") dmg_command.system_start() - # 16. Reset the container and pool so that cleanup works. + self.log_step("Reset the container and pool so that cleanup works.") container.close() pool.disconnect() pool.connect() container.open() - # 17. Call "daos container list-objects " to verify that - # the object was removed. 
+ self.log_step("Call daos container list-objects to verify that the object was removed.") list_obj_out = self.get_daos_command().container_list_objects( pool=pool.identifier, cont=container.uuid) obj_list = list_obj_out["response"] self.log.info("Object list (after) = %s", obj_list) - expected_len = len(ioreqs) - 1 if obj_list: actual_len = len(obj_list) @@ -431,194 +478,9 @@ def test_recovery_ddb_rm(self): f"Actual = {actual_len}") errors.append(msg) - self.log.info("##### Errors #####") - report_errors(test=self, errors=errors) - self.log.info("##################") - - def test_recovery_ddb_load(self): - """Test ddb value_load. - - 1. Create a pool and a container. - 2. Insert one object with one dkey with the API. - 3. Stop the server to use ddb. - 4. Find the vos file name. e.g., /mnt/daos0//vos-0. - 5. Load new data into [0]/[0]/[0]/[0] - 6. Restart the server. - 7. Reset the object, container, and pool to use the API. - 8. Verify the data in the akey with single_fetch(). - - :avocado: tags=all,full_regression - :avocado: tags=vm - :avocado: tags=recovery - :avocado: tags=DdbTest,ddb_cmd,test_recovery_ddb_load - """ - # 1. Create a pool and a container. - pool = self.get_pool(connect=True) - container = self.get_container(pool) - - # 2. Insert one object with one dkey with API. - obj_dataset = insert_objects( - context=self.context, container=container, object_count=1, - dkey_count=1, akey_count=1, base_dkey=self.random_dkey, - base_akey=self.random_akey, base_data=self.random_data) - ioreqs = obj_dataset[0] - dkeys_inserted = obj_dataset[1] - akeys_inserted = obj_dataset[2] - data_list = obj_dataset[3] - - # For debugging/reference, call single_fetch and get the data just inserted. - # Pass in size + 1 to single_fetch to avoid the no-space error. - data_size = len(data_list[0]) + 1 - data = ioreqs[0].single_fetch( - dkey=dkeys_inserted[0], akey=akeys_inserted[0], size=data_size) - self.log.info("data (before) = %s", data.value.decode('utf-8')) - - # 3. 
Stop the server to use ddb. - dmg_command = self.get_dmg_command() - dmg_command.system_stop() - - # 4. Find the vos file name. - host = self.server_managers[0].hosts[0:1] - vos_paths = self.server_managers[0].get_vos_files(pool) - if not vos_paths: - self.fail(f"vos file wasn't found in {self.server_managers[0].get_vos_path(pool)}") - ddb_command = DdbCommand(host, self.bin, vos_paths[0]) - - # 5. Load new data into [0]/[0]/[0]/[0] - # Create a file in test node. - load_file_path = os.path.join(self.test_dir, "new_data.txt") - new_data = "New akey data 0123456789" - with open(load_file_path, "w", encoding="utf-8") as file: - file.write(new_data) - - # Copy the created file to server node. - result = distribute_files(self.log, host, load_file_path, load_file_path, False) - if not result.passed: - raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}") - - # The file with the new data is ready. Run ddb load. - ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path) - - # 6. Restart the server. - dmg_command.system_start() - - # 7. Reset the object, container, and pool to use the API after server restart. - ioreqs[0].obj.close() - container.close() - pool.disconnect() - pool.connect() - container.open() - ioreqs[0].obj.open() - - # 8. Verify the data in the akey with single_fetch(). - data_size = len(new_data) + 1 - data = ioreqs[0].single_fetch( - dkey=dkeys_inserted[0], akey=akeys_inserted[0], size=data_size) - actual_data = data.value.decode('utf-8') - self.log.info("data (after) = %s", actual_data) - - errors = [] - if new_data != actual_data: - msg = f"ddb load failed! Expected = {new_data}; Actual = {actual_data}" - errors.append(msg) - - self.log.info("##### Errors #####") - report_errors(test=self, errors=errors) - self.log.info("##################") - - def test_recovery_ddb_dump_value(self): - """Test ddb dump_value. - - 1. Create a pool and a container. - 2. Insert one object with one dkey with API. 
- 3. Stop the server to use ddb. - 4. Find the vos file name. e.g., /mnt/daos0//vos-0. - 5. Dump the two akeys to files. - 6. Verify the content of the files. - 7. Restart the server for the cleanup. - 8. Reset the object, container, and pool to prepare for the cleanup. - - :avocado: tags=all,full_regression - :avocado: tags=vm - :avocado: tags=recovery - :avocado: tags=DdbTest,ddb_cmd,test_recovery_ddb_dump_value - """ - # 1. Create a pool and a container. - pool = self.get_pool(connect=True) - container = self.get_container(pool) - - # 2. Insert one object with one dkey with API. - obj_dataset = insert_objects( - context=self.context, container=container, object_count=1, - dkey_count=1, akey_count=2, base_dkey=self.random_dkey, - base_akey=self.random_akey, base_data=self.random_data) - ioreqs = obj_dataset[0] - data_list = obj_dataset[3] - - # 3. Stop the server to use ddb. - dmg_command = self.get_dmg_command() - dmg_command.system_stop() - - # 4. Find the vos file name. - vos_paths = self.server_managers[0].get_vos_files(pool) - if not vos_paths: - self.fail(f"vos file wasn't found in {self.server_managers[0].get_vos_path(pool)}") - ddb_command = DdbCommand(self.server_managers[0].hosts[0:1], self.bin, vos_paths[0]) - - # 5. Dump the two akeys to files. - akey1_file_path = os.path.join(self.test_dir, "akey1.txt") - ddb_command.value_dump( - component_path="[0]/[0]/[0]/[0]", out_file_path=akey1_file_path) - akey2_file_path = os.path.join(self.test_dir, "akey2.txt") - ddb_command.value_dump( - component_path="[0]/[0]/[0]/[1]", out_file_path=akey2_file_path) - - # Copy them from remote server node to local test node. - copy_remote_to_local( - remote_file_path=akey1_file_path, test_dir=self.test_dir, - remote=self.hostlist_servers[0]) - copy_remote_to_local( - remote_file_path=akey2_file_path, test_dir=self.test_dir, - remote=self.hostlist_servers[0]) - - # 6. Verify the content of the files. 
- actual_akey1_data = None - with open(akey1_file_path, "r", encoding="utf-8") as file: - actual_akey1_data = file.readlines()[0] - actual_akey2_data = None - with open(akey2_file_path, "r", encoding="utf-8") as file: - actual_akey2_data = file.readlines()[0] - - errors = [] - str_data_list = [] - # Convert the data to string. - for data in data_list: - str_data_list.append(data.value.decode("utf-8")) - # Verify that we were able to obtain the data and akey1 and akey2 aren't the same. - if actual_akey1_data is None or actual_akey2_data is None or \ - actual_akey1_data == actual_akey2_data: - msg = (f"Invalid dumped value! Dumped akey1 data = {actual_akey1_data}; " - f"Dumped akey2 data = {actual_akey2_data}") - errors.append(msg) - # Verify that the data we obtained with ddb are the ones we wrote. The order isn't - # deterministic, so check with "in". - if actual_akey1_data not in str_data_list or \ - actual_akey2_data not in str_data_list: - msg = (f"Unexpected dumped value! Dumped akey data 1 = {actual_akey1_data}; Dumped " - f"akey data 2 = {actual_akey2_data}; Expected data list = {str_data_list}") - errors.append(msg) - - # 7. Restart the server for the cleanup. - dmg_command.system_start() - - # 8. Reset the object, container, and pool to prepare for the cleanup. 
- ioreqs[0].obj.close() - container.close() - pool.disconnect() - pool.connect() - container.open() - ioreqs[0].obj.open() + if md_on_ssd: + self.log_step(f"MD-on-SSD: Clean {daos_load_path}") + self.run_cmd_check_result(command=f"umount {daos_load_path}") + self.run_cmd_check_result(command=f"rm -rf {daos_load_path}") - self.log.info("##### Errors #####") report_errors(test=self, errors=errors) - self.log.info("##################") diff --git a/src/tests/ftest/recovery/ddb.yaml b/src/tests/ftest/recovery/ddb.yaml index a89fa7beb29..81d6b803aa4 100644 --- a/src/tests/ftest/recovery/ddb.yaml +++ b/src/tests/ftest/recovery/ddb.yaml @@ -2,19 +2,19 @@ hosts: test_servers: 1 test_clients: 1 -timeout: 1800 +timeout: 7M server_config: name: daos_server engines_per_host: 1 engines: 0: + log_file: daos_server0.log + nr_xs_helpers: 1 + # Objects are placed in different targets, or in different vos-x, so we need to use + # 1 target to make the test steps simpler. targets: 1 - storage: - 0: - class: ram - scm_mount: /mnt/daos - system_ram_reserved: 1 + storage: auto # In CI, all tests in ddb.py are ran in a single launch.py execution. In that case, the # test_dir (/var/tmp/daos_testing/) in the server node will not be created @@ -24,4 +24,4 @@ setup: start_servers_once: False pool: - scm_size: 1G + scm_size: 50G diff --git a/src/tests/ftest/recovery/ddb_pmem.py b/src/tests/ftest/recovery/ddb_pmem.py new file mode 100644 index 00000000000..5c3f5f80df2 --- /dev/null +++ b/src/tests/ftest/recovery/ddb_pmem.py @@ -0,0 +1,298 @@ +""" + (C) Copyright 2022-2024 Intel Corporation. 
+ (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import ctypes +import os + +from apricot import TestWithServers +from ddb_utils import DdbCommand +from exception_utils import CommandFailure +from file_utils import distribute_files +from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors, + run_command) +from pydaos.raw import DaosObjClass, IORequest +from run_utils import get_clush_command + + +def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey, base_akey, + base_data): + """Insert objects, dkeys, akeys, and data into the container. + + Args: + context (DaosContext): + container (TestContainer): Container to insert objects. + object_count (int): Number of objects to insert. + dkey_count (int): Number of dkeys to insert. + akey_count (int): Number of akeys to insert. + base_dkey (str): Base dkey. Index numbers will be appended to it. + base_akey (str):Base akey. Index numbers will be appended to it. + base_data (str):Base data that goes inside akey. Index numbers will be appended to it. + + Returns: + tuple: Inserted objects, dkeys, akeys, and data as (ioreqs, dkeys, akeys, data_list) + """ + ioreqs = [] + dkeys = [] + akeys = [] + data_list = [] + + container.open() + + for obj_index in range(object_count): + # Insert object. + ioreqs.append(IORequest( + context=context, container=container.container, obj=None, + objtype=DaosObjClass.OC_S1)) + + for dkey_index in range(dkey_count): + # Prepare the dkey to insert into the object. + dkey_str = " ".join([base_dkey, str(obj_index), str(dkey_index)]).encode("utf-8") + dkeys.append(create_string_buffer(value=dkey_str, size=len(dkey_str))) + + for akey_index in range(akey_count): + # Prepare the akey to insert into the dkey. 
+ akey_str = " ".join( + [base_akey, str(obj_index), str(dkey_index), str(akey_index)]).encode("utf-8") + akeys.append(create_string_buffer(value=akey_str, size=len(akey_str))) + + # Prepare the data to insert into the akey. + data_str = " ".join( + [base_data, str(obj_index), str(dkey_index), str(akey_index)]).encode("utf-8") + data_list.append(create_string_buffer(value=data_str, size=len(data_str))) + c_size = ctypes.c_size_t(ctypes.sizeof(data_list[-1])) + + # Insert dkeys, akeys, and the data. + ioreqs[-1].single_insert( + dkey=dkeys[-1], akey=akeys[-1], value=data_list[-1], size=c_size) + + return (ioreqs, dkeys, akeys, data_list) + + +def copy_remote_to_local(remote_file_path, test_dir, remote): + """Copy the given file from the server node to the local test node and retrieve the original + name. + + Args: + remote_file_path (str): File path to copy to local. + test_dir (str): Test directory. Usually self.test_dir. + remote (NodeSet): Remote hostname to copy file from. + """ + # Use clush --rcopy to copy the file from the remote server node to the local test + # node. clush will append . to the file when copying. + args = f"--rcopy {remote_file_path} --dest {test_dir}" + clush_command = get_clush_command(hosts=remote, args=args, timeout=60) + try: + run_command(command=clush_command, timeout=None) + except DaosTestError as error: + raise DaosTestError(f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error + + # Remove the appended . from the copied file. + current_file_path = "".join([remote_file_path, ".", remote]) + mv_command = f"mv {current_file_path} {remote_file_path}" + try: + run_command(command=mv_command) + except DaosTestError as error: + raise DaosTestError( + f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error + + +class DdbPMEMTest(TestWithServers): + """Test ddb subcommands. 
+ + :avocado: recursive + """ + + def __init__(self, *args, **kwargs): + """Initialize a DdbPMEMTest object.""" + super().__init__(*args, **kwargs) + # Number of objects and keys to insert/expect. + self.object_count = 5 + self.dkey_count = 2 + self.akey_count = 1 + # Generate random keys and data to insert into the object. + self.random_dkey = get_random_string(10) + self.random_akey = get_random_string(10) + self.random_data = get_random_string(10) + + def test_recovery_ddb_load(self): + """Test ddb value_load. + + 1. Create a pool and a container. + 2. Insert one object with one dkey with the API. + 3. Stop the server to use ddb. + 4. Find the vos file name. e.g., /mnt/daos0//vos-0. + 5. Load new data into [0]/[0]/[0]/[0] + 6. Restart the server. + 7. Reset the object, container, and pool to use the API. + 8. Verify the data in the akey with single_fetch(). + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=recovery + :avocado: tags=DdbPMEMTest,ddb_cmd,test_recovery_ddb_load + """ + self.log_step("Create a pool and a container.") + pool = self.get_pool(connect=True) + container = self.get_container(pool) + + self.log_step("Insert one object with one dkey with API.") + obj_dataset = insert_objects( + context=self.context, container=container, object_count=1, dkey_count=1, akey_count=1, + base_dkey=self.random_dkey, base_akey=self.random_akey, base_data=self.random_data) + ioreqs = obj_dataset[0] + dkeys_inserted = obj_dataset[1] + akeys_inserted = obj_dataset[2] + data_list = obj_dataset[3] + + # For debugging/reference, call single_fetch and get the data just inserted. + # Pass in size + 1 to single_fetch to avoid the no-space error. 
+ data_size = len(data_list[0]) + 1 + data = ioreqs[0].single_fetch( + dkey=dkeys_inserted[0], akey=akeys_inserted[0], size=data_size) + self.log.info("data (before) = %s", data.value.decode('utf-8')) + + self.log_step("Stop the server to use ddb.") + dmg_command = self.get_dmg_command() + dmg_command.system_stop() + + self.log_step("Find the vos file name.") + host = self.server_managers[0].hosts[0:1] + vos_paths = self.server_managers[0].get_vos_files(pool) + if not vos_paths: + self.fail("vos file wasn't found!") + ddb_command = DdbCommand(host, self.bin, vos_paths[0]) + + self.log_step("Load new data into [0]/[0]/[0]/[0]; Create a file in test node.") + load_file_path = os.path.join(self.test_dir, "new_data.txt") + new_data = get_random_string(20) + with open(load_file_path, "w", encoding="utf-8") as file: + file.write(new_data) + + self.log_step("Copy the created file to server node.") + result = distribute_files(self.log, host, load_file_path, load_file_path, False) + if not result.passed: + raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}") + + self.log_step("The file with the new data is ready. Run ddb load.") + ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path) + + self.log_step("Restart the server.") + dmg_command.system_start() + + self.log_step("Reset the object, container, and pool to use the API after server restart.") + ioreqs[0].obj.close() + container.close() + pool.disconnect() + pool.connect() + container.open() + ioreqs[0].obj.open() + + self.log_step("Verify the data in the akey with single_fetch().") + data_size = len(new_data) + 1 + data = ioreqs[0].single_fetch( + dkey=dkeys_inserted[0], akey=akeys_inserted[0], size=data_size) + actual_data = data.value.decode('utf-8') + self.log.info("data (after) = %s", actual_data) + errors = [] + if new_data != actual_data: + msg = f"ddb load failed! 
Expected = {new_data}; Actual = {actual_data}" + errors.append(msg) + + report_errors(test=self, errors=errors) + + def test_recovery_ddb_dump_value(self): + """Test ddb dump_value. + + 1. Create a pool and a container. + 2. Insert one object with one dkey with API. + 3. Stop the server to use ddb. + 4. Find the vos file name. e.g., /mnt/daos0//vos-0. + 5. Dump the two akeys to files. + 6. Verify the content of the files. + 7. Restart the server for the cleanup. + 8. Reset the object, container, and pool to prepare for the cleanup. + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=recovery + :avocado: tags=DdbPMEMTest,ddb_cmd,test_recovery_ddb_dump_value + """ + self.log_step("Create a pool and a container.") + pool = self.get_pool(connect=True) + container = self.get_container(pool) + + self.log_step("Insert one object with one dkey with API.") + obj_dataset = insert_objects( + context=self.context, container=container, object_count=1, dkey_count=1, akey_count=2, + base_dkey=self.random_dkey, base_akey=self.random_akey, base_data=self.random_data) + ioreqs = obj_dataset[0] + data_list = obj_dataset[3] + + self.log_step("Stop the server to use ddb.") + dmg_command = self.get_dmg_command() + dmg_command.system_stop() + + self.log_step("Find the vos file name.") + vos_paths = self.server_managers[0].get_vos_files(pool) + if not vos_paths: + self.fail("vos file wasn't found!") + ddb_command = DdbCommand(self.server_managers[0].hosts[0:1], self.bin, vos_paths[0]) + + self.log_step("Dump the two akeys to files.") + akey1_file_path = os.path.join(self.test_dir, "akey1.txt") + ddb_command.value_dump( + component_path="[0]/[0]/[0]/[0]", out_file_path=akey1_file_path) + akey2_file_path = os.path.join(self.test_dir, "akey2.txt") + ddb_command.value_dump( + component_path="[0]/[0]/[0]/[1]", out_file_path=akey2_file_path) + + self.log_step("Copy them from remote server node to local test node.") + copy_remote_to_local( + 
remote_file_path=akey1_file_path, test_dir=self.test_dir, + remote=self.hostlist_servers[0]) + copy_remote_to_local( + remote_file_path=akey2_file_path, test_dir=self.test_dir, + remote=self.hostlist_servers[0]) + + self.log_step("Verify the content of the files.") + actual_akey1_data = None + with open(akey1_file_path, "r", encoding="utf-8") as file: + actual_akey1_data = file.readlines()[0] + actual_akey2_data = None + with open(akey2_file_path, "r", encoding="utf-8") as file: + actual_akey2_data = file.readlines()[0] + + errors = [] + str_data_list = [] + # Convert the data to string. + for data in data_list: + str_data_list.append(data.value.decode("utf-8")) + # Verify that we were able to obtain the data and akey1 and akey2 aren't the same. + if actual_akey1_data is None or actual_akey2_data is None or \ + actual_akey1_data == actual_akey2_data: + msg = (f"Invalid dumped value! Dumped akey1 data = {actual_akey1_data}; " + f"Dumped akey2 data = {actual_akey2_data}") + errors.append(msg) + # Verify that the data we obtained with ddb are the ones we wrote. The order isn't + # deterministic, so check with "in". + if actual_akey1_data not in str_data_list or \ + actual_akey2_data not in str_data_list: + msg = (f"Unexpected dumped value! 
Dumped akey data 1 = {actual_akey1_data}; Dumped " + f"akey data 2 = {actual_akey2_data}; Expected data list = {str_data_list}") + errors.append(msg) + + self.log_step("Restart the server for the cleanup.") + dmg_command.system_start() + + self.log_step("Reset the object, container, and pool to prepare for the cleanup.") + ioreqs[0].obj.close() + container.close() + pool.disconnect() + pool.connect() + container.open() + ioreqs[0].obj.open() + + report_errors(test=self, errors=errors) diff --git a/src/tests/ftest/recovery/ddb_pmem.yaml b/src/tests/ftest/recovery/ddb_pmem.yaml new file mode 100644 index 00000000000..d1591f5eee8 --- /dev/null +++ b/src/tests/ftest/recovery/ddb_pmem.yaml @@ -0,0 +1,27 @@ +hosts: + test_servers: 1 + test_clients: 1 + +timeout: 7M + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 1 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 + +# In CI, all tests in ddb.py are ran in a single launch.py execution. In that case, the +# test_dir (/var/tmp/daos_testing/) in the server node will not be created +# for each test if "start_servers_once: False" isn't set. test_load() needs this +# directory, so we need to set it. 
+setup: + start_servers_once: False + +pool: + scm_size: 1G diff --git a/src/tests/ftest/recovery/ms_membership.yaml b/src/tests/ftest/recovery/ms_membership.yaml index db567fddca8..017a461e06b 100644 --- a/src/tests/ftest/recovery/ms_membership.yaml +++ b/src/tests/ftest/recovery/ms_membership.yaml @@ -7,16 +7,17 @@ server_config: engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + +pool: + size: 80G diff --git a/src/tests/ftest/recovery/pool_cleanup.yaml b/src/tests/ftest/recovery/pool_cleanup.yaml index cbefae8b3f4..3ca798f822b 100644 --- a/src/tests/ftest/recovery/pool_cleanup.yaml +++ b/src/tests/ftest/recovery/pool_cleanup.yaml @@ -13,4 +13,4 @@ server_config: storage: auto pool: - size: 5G + size: 15G diff --git a/src/tests/ftest/recovery/pool_list_consolidation.py b/src/tests/ftest/recovery/pool_list_consolidation.py index 41be76bd5cf..6b11a9cc386 100644 --- a/src/tests/ftest/recovery/pool_list_consolidation.py +++ b/src/tests/ftest/recovery/pool_list_consolidation.py @@ -1,6 +1,6 @@ """ (C) Copyright 2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -192,7 +192,7 @@ def verify_pool_dir_removed(self, pool, errors): list: Error list. """ - pool_path = self.server_managers[0].get_vos_path(pool) + pool_path = self.server_managers[0].get_vos_paths(pool)[0] check_out = check_file_exists( hosts=self.hostlist_servers, filename=pool_path, directory=True) if check_out[0]: @@ -265,13 +265,12 @@ def test_orphan_pool_trust_ms(self): def test_lost_majority_ps_replicas(self): """Test lost the majority of PS replicas. - 1. Create a pool with --nsvc=3. 
Rank 0, 1, and 2 will be pool service replicas. + 1. Create a pool with --nsvc=3. There will be three ranks with rdb-pool. 2. Stop servers. - 3. Remove //rdb-pool from rank 0 and 2. - 4. Start servers. - 5. Run DAOS checker under kinds of mode. - 6. Try creating a container. The pool can be started now, so create should succeed. - 7. Show that rdb-pool are recovered. i.e., at least three out of four ranks + 3. Remove //rdb-pool from two ranks. + 4. Run DAOS checker under kinds of mode. + 5. Try creating a container. The pool can be started now, so create should succeed. + 6. Show that rdb-pool are recovered. i.e., at least three out of four ranks should have rdb-pool. Jira ID: DAOS-12029 @@ -281,7 +280,17 @@ def test_lost_majority_ps_replicas(self): :avocado: tags=recovery,cat_recov,pool_list_consolidation :avocado: tags=PoolListConsolidationTest,test_lost_majority_ps_replicas """ + if self.server_managers[0].manager.job.using_control_metadata: + self.log.info("MD-on-SSD cluster. It will be supported later.") + self.cancelForTicket('DAOS-18395') + self.log_step("Create a pool with --nsvc=3.") + # We can generalize this test more. For example, use + # svcn = self.server_managers[0].engines - 1 + # Then remove (svcn / 2 + 1) count of rdb-pool, etc. However, I don't think it's + # necessary to increase the number of servers for this test. Also, I'm not sure + # if --nsvc > 3 will work. Thus, we keep the numbers hard-coded to make the code + # simple. 
pool = self.get_pool(svcn=3) self.log_step("Stop servers") @@ -289,31 +298,31 @@ def test_lost_majority_ps_replicas(self): dmg_command.system_stop() self.log_step("Remove //rdb-pool from two ranks.") - rdb_pool_path = f"{self.server_managers[0].get_vos_path(pool)}/rdb-pool" - command = f"sudo rm {rdb_pool_path}" + rdb_pool_paths = [] + for engine_params in self.server_managers[0].manager.job.yaml.engine_params: + scm_mount = engine_params.get_value('scm_mount') + rdb_pool_path = f"{scm_mount}/{pool.uuid.lower()}/rdb-pool" + rdb_pool_paths.append(rdb_pool_path) + self.log.info("rdb_pool_paths = %s", rdb_pool_paths) hosts = list(set(self.server_managers[0].ranks.values())) count = 0 + # Iterate both pool mount points of both ranks. I.e., 4 ranks total. for host in hosts: - node = NodeSet(host) - check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True) - if check_out[0]: - if not run_remote(log=self.log, hosts=node, command=command).passed: - self.fail(f'Failed to remove {rdb_pool_path} on {host}') - self.log.info("rm rdb-pool from %s", str(node)) - count += 1 - if count > 1: - break - using_control_metadata = self.server_managers[0].manager.job.using_control_metadata - if count == 0 or using_control_metadata: - msg = ("MD-on-SSD cluster. Contents under mount point are removed by control plane " - "after system stop.") - self.log.info(msg) - dmg_command.system_start() - # return results in PASS. 
- return - - self.log_step("Start servers.") - dmg_command.system_start() + for rdb_pool_path in rdb_pool_paths: + node = NodeSet(host) + check_out = check_file_exists( + hosts=node, filename=rdb_pool_path, sudo=True) + if check_out[0]: + command = f"rm {rdb_pool_path}" + command_root = command_as_user(command=command, user="root") + if not run_remote(log=self.log, hosts=node, command=command_root).passed: + self.fail(f'Failed to remove {rdb_pool_path} on {host}') + self.log.info("Remove %s from %s", rdb_pool_path, str(node)) + count += 1 + if count == 2: + break + if count == 2: + break self.log_step("Run DAOS checker under kinds of mode.") errors = [] @@ -329,27 +338,29 @@ def test_lost_majority_ps_replicas(self): cont_create_success = True break except TestFail as error: - msg = f"## Container create failed after running checker! error = {error}" + msg = f"Container create failed after running checker! error = {error}" self.log.debug(msg) if not cont_create_success: errors.append("Container create failed after running checker!") - msg = ("Show that rdb-pool are recovered. i.e., at least three out of four ranks should " + msg = ("Show that rdb-pool are recovered. i.e., three out of four ranks should " "have rdb-pool.") self.log_step(msg) hosts = list(set(self.server_managers[0].ranks.values())) count = 0 for host in hosts: - node = NodeSet(host) - check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True) - if check_out[0]: - count += 1 - self.log.info("rdb-pool found at %s", str(node)) + for rdb_pool_path in rdb_pool_paths: + node = NodeSet(host) + check_out = check_file_exists( + hosts=node, filename=rdb_pool_path, sudo=True) + if check_out[0]: + count += 1 + self.log.info("rdb-pool found at %s: %s", str(node), rdb_pool_path) self.log.info("rdb-pool count = %d", count) - if count < len(hosts) - 1: - errors.append(f"Not enough rdb-pool has been recovered! 
- {count} ranks") + if count != 3: + errors.append(f"Unexpected number of rdb-pool after repair! - {count} ranks") report_errors(test=self, errors=errors) diff --git a/src/tests/ftest/recovery/pool_list_consolidation.yaml b/src/tests/ftest/recovery/pool_list_consolidation.yaml index 0e37358b66c..2ea68fd69e4 100644 --- a/src/tests/ftest/recovery/pool_list_consolidation.yaml +++ b/src/tests/ftest/recovery/pool_list_consolidation.yaml @@ -9,14 +9,10 @@ server_config: engines_per_host: 2 engines: 0: - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log nr_xs_helpers: 1 storage: auto 1: - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log nr_xs_helpers: 1 storage: auto @@ -27,7 +23,7 @@ setup: start_servers_once: False pool: - size: 60G + size: 100G container: control_method: daos diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index 67b5dff96bc..3bf3a9a9769 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -1,6 +1,6 @@ """ (C) Copyright 2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -251,7 +251,7 @@ def test_dangling_pool_map(self): self.log_step("Manually remove ///vos-0 from rank 0 node.") rank_0_host = NodeSet(self.server_managers[0].get_host(0)) - vos_0_path = f"{self.server_managers[0].get_vos_path(pool)}/vos-0" + vos_0_path = f"{self.server_managers[0].get_vos_paths(pool)[0]}/vos-0" vos_0_result = check_file_exists(hosts=self.hostlist_servers, filename=vos_0_path) if not vos_0_result[0]: msg = ("MD-on-SSD cluster. 
Contents under mount point are removed by control plane " @@ -315,6 +315,11 @@ def test_dangling_rank_entry(self): :avocado: tags=recovery,cat_recov,pool_membership :avocado: tags=PoolMembershipTest,test_dangling_rank_entry """ + if self.server_managers[0].manager.job.using_control_metadata: + self.log.info("MD-on-SSD cluster. Will be supported later.") + # return results in PASS. + return + targets = self.params.get("targets", "/run/server_config/engines/0/*") exp_msg = "dangling rank entry" @@ -332,7 +337,7 @@ def test_dangling_rank_entry(self): self.log_step("Remove pool directory from one of the mount points.") rank_1_host = NodeSet(self.server_managers[0].get_host(1)) - pool_directory = self.server_managers[0].get_vos_path(self.pool) + pool_directory = self.server_managers[0].get_vos_paths(self.pool)[0] pool_directory_result = check_file_exists( hosts=self.hostlist_servers, filename=pool_directory, directory=True) if not pool_directory_result[0]: diff --git a/src/tests/ftest/recovery/pool_membership.yaml b/src/tests/ftest/recovery/pool_membership.yaml index e51e12f24b1..39a990dddb3 100644 --- a/src/tests/ftest/recovery/pool_membership.yaml +++ b/src/tests/ftest/recovery/pool_membership.yaml @@ -9,17 +9,13 @@ server_config: engines: 0: storage: auto - fabric_iface_port: 31416 log_file: daos_server_0.log pinned_numa_node: 0 - fabric_iface: ib0 targets: 8 1: storage: auto - fabric_iface_port: 31516 log_file: daos_server_1.log pinned_numa_node: 1 - fabric_iface: ib1 targets: 8 pool: diff --git a/src/tests/ftest/scripts/main.sh b/src/tests/ftest/scripts/main.sh index 80bf68070cf..3f3e2c7cb69 100755 --- a/src/tests/ftest/scripts/main.sh +++ b/src/tests/ftest/scripts/main.sh @@ -1,8 +1,8 @@ #!/bin/bash # shellcheck disable=SC1113 # /* -# * (C) Copyright 2016-2024 Intel Corporation. -# * Copyright 2025 Hewlett Packard Enterprise Development LP +# * Copyright 2016-2024 Intel Corporation. 
+# * Copyright 2025-2026 Hewlett Packard Enterprise Development LP # * # * SPDX-License-Identifier: BSD-2-Clause-Patent # */ @@ -27,7 +27,15 @@ python3 -m venv venv # shellcheck disable=SC1091 source venv/bin/activate +cat < venv/pip.conf +[global] + progress_bar = off + no_color = true + quiet = 1 +EOF + pip install --upgrade pip + pip install -r "$PREFIX"/lib/daos/TESTING/ftest/requirements-ftest.txt if $TEST_RPMS; then @@ -58,7 +66,7 @@ unset D_PROVIDER # Disable D_INTERFACE to allow launch.py to pick the fastest interface unset D_INTERFACE -# At Oct2018 Longmond F2F it was decided that per-server logs are preferred +# At Oct2018 Longmont F2F it was decided that per-server logs are preferred # But now we need to collect them! Avoid using 'client_daos.log' due to # conflicts with the daos_test log renaming. # shellcheck disable=SC2153 @@ -74,7 +82,7 @@ if ${SETUP_ONLY:-false}; then exit 0 fi -# need to increase the number of oopen files (on EL8 at least) +# need to increase the number of open files (on EL8 at least) ulimit -n 4096 # Clean stale job results diff --git a/src/tests/ftest/scrubber/aggregation.yaml b/src/tests/ftest/scrubber/aggregation.yaml index b4d061357a2..cfee3de68f8 100644 --- a/src/tests/ftest/scrubber/aggregation.yaml +++ b/src/tests/ftest/scrubber/aggregation.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 660 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,22 +24,23 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: size: 80% svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 + container: type: POSIX control_method: 
daos properties: "cksum:sha512,rd_fac:1" + ior: &ior_base ior_timeout: 60 test_file: daos:testFile @@ -46,16 +48,19 @@ ior: &ior_base flags: "-v -W -w -r -R -k" api: DFS dfs_oclass: RP_2GX + ior_small_block_size: <<: *ior_base transfer_size: 5M block_size: 250M + ior_large_block_size: <<: *ior_base transfer_size: 1M block_size: 20G client_processes: np: 6 + faults: fault_list: - DAOS_DELAYED_CSUM_CORRUPT_DISK diff --git a/src/tests/ftest/scrubber/basic.py b/src/tests/ftest/scrubber/basic.py index 13ec352c360..fe6445cea60 100644 --- a/src/tests/ftest/scrubber/basic.py +++ b/src/tests/ftest/scrubber/basic.py @@ -1,5 +1,6 @@ """ (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -13,14 +14,8 @@ class TestWithScrubberBasic(TestWithScrubber): :avocado: recursive """ - def run_scrubber_basic(self, pool_prop=None, cont_prop=None): - """JIRA ID: DAOS-7371 - Scrubber basic main method which runs the basic testing. - - Args: - pool_prop(str) : Test pool properties string. 
- cont_prop(str) : Test container properties string - """ + def run_scrubber_basic(self): + """Runs the basic scrubber testing.""" flags = self.params.get("ior_flags", '/run/ior/iorflags/*') apis = self.params.get("ior_api", '/run/ior/iorflags/*') transfer_block_size = self.params.get("transfer_block_size", @@ -30,7 +25,6 @@ def run_scrubber_basic(self, pool_prop=None, cont_prop=None): self.ior_cmd.flags.update(flags[0], "ior.flags") self.ior_cmd.dfs_oclass.update(obj_class[0]) self.ior_cmd.dfs_dir_oclass.update(obj_class[0]) - self.create_pool_cont_with_scrubber(pool_prop=pool_prop, cont_prop=cont_prop) for test in transfer_block_size: self.ior_cmd.transfer_size.update(test[0]) self.ior_cmd.block_size.update(test[1]) @@ -57,7 +51,16 @@ def test_scrubber_disabled_during_pool_creation(self): :avocado: tags=TestWithScrubberBasic,test_scrubber_disabled_during_pool_creation """ - self.run_scrubber_basic(None, None) + other_properties = self.params.get("other_properties", '/run/pool/*') + + self.add_pool() + for prop_val in other_properties.split(","): + if prop_val is not None: + value = prop_val.split(":") + self.pool.set_property(value[0], value[1]) + self.add_container(pool=self.pool) + + self.run_scrubber_basic() def test_scrubber_enabled_during_pool_creation(self): """JIRA ID: DAOS-7371 @@ -74,6 +77,10 @@ def test_scrubber_enabled_during_pool_creation(self): :avocado: tags=TestWithScrubberBasic,test_scrubber_enabled_during_pool_creation """ - pool_prop = self.params.get("properties", '/run/pool/*') - cont_prop = self.params.get("properties", '/run/container/*') - self.run_scrubber_basic(pool_prop, cont_prop) + pool_properties = self.params.get("properties", '/run/pool/*') + other_properties = self.params.get("other_properties", '/run/pool/*') + + self.add_pool(properties=f"{pool_properties},{other_properties}") + self.add_container(pool=self.pool) + + self.run_scrubber_basic() diff --git a/src/tests/ftest/scrubber/basic.yaml b/src/tests/ftest/scrubber/basic.yaml 
index 35ceff204b4..60a98eb3142 100644 --- a/src/tests/ftest/scrubber/basic.yaml +++ b/src/tests/ftest/scrubber/basic.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 150 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -11,8 +14,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -21,8 +22,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -30,23 +29,30 @@ server_config: storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:1" + properties: rd_fac:0,space_rb:0 + other_properties: scrub:timed,scrub_freq:1 + container: type: POSIX control_method: daos oclass: RP_2G1 + properties: cksum:crc16 + ior: ior_timeout: 60 client_processes: diff --git a/src/tests/ftest/scrubber/check_csum_metrics_mdtest.yaml b/src/tests/ftest/scrubber/check_csum_metrics_mdtest.yaml index 25e87b233a5..befb63d1942 100644 --- a/src/tests/ftest/scrubber/check_csum_metrics_mdtest.yaml +++ b/src/tests/ftest/scrubber/check_csum_metrics_mdtest.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 2 + timeout: 360 + server_config: name: daos_server engines_per_host: 2 @@ -10,8 +12,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -20,23 +20,24 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=csum,mgmt,md,rebuild 
storage: auto + pool: size: 50% svcn: 4 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:1" + properties: "rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1" + container: type: POSIX control_method: daos oclass: RP_2G1 properties: cksum:crc16,cksum_size:16384,srv_cksum:on + mdtest: client_processes: ppn: 4 diff --git a/src/tests/ftest/scrubber/csum_fault.yaml b/src/tests/ftest/scrubber/csum_fault.yaml index 4f2c294fa95..407d8b7b414 100644 --- a/src/tests/ftest/scrubber/csum_fault.yaml +++ b/src/tests/ftest/scrubber/csum_fault.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 150 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,8 +24,6 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -32,24 +31,29 @@ server_config: storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 6G nvme_size: 54G svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:1" + properties: "rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1" + container: type: POSIX control_method: daos oclass: RP_2GX properties: "cksum:crc16" + ior: ior_timeout: 60 client_processes: @@ -68,6 +72,7 @@ ior: - [1M, 2G] obj_class: - RP_2GX + faults: fault_list: - DAOS_CSUM_CORRUPT_DISK diff --git a/src/tests/ftest/scrubber/frequency.yaml b/src/tests/ftest/scrubber/frequency.yaml index 14cc19321f0..befbe818106 100644 --- a/src/tests/ftest/scrubber/frequency.yaml +++ b/src/tests/ftest/scrubber/frequency.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 1800 + setup: 
start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -11,8 +14,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -21,24 +22,25 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6000000000 nvme_size: 54000000000 svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed" + properties: rd_fac:0,space_rb:0,scrub:timed + container: type: POSIX control_method: daos oclass: RP_2G1 + ior: ior_timeout: 60 client_processes: diff --git a/src/tests/ftest/scrubber/rebuild.yaml b/src/tests/ftest/scrubber/rebuild.yaml index ab6cff21835..41775934891 100644 --- a/src/tests/ftest/scrubber/rebuild.yaml +++ b/src/tests/ftest/scrubber/rebuild.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 400 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,25 +24,26 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6G nvme_size: 54G svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:1,scrub_thresh:2" + properties: "rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1,scrub_thresh:2" + container: type: POSIX control_method: daos oclass: RP_2GX properties: "cksum:sha256,rd_fac:1" + ior: ior_timeout: 60 client_processes: @@ -55,6 +57,7 @@ ior: block_size: 2G dfs_oclass: RP_2GX dfs_dir_oclass: 
RP_2GX + faults: fault_list: - DAOS_CSUM_CORRUPT_DISK diff --git a/src/tests/ftest/scrubber/snapshot.yaml b/src/tests/ftest/scrubber/snapshot.yaml index fe332df4be9..156feef76d1 100644 --- a/src/tests/ftest/scrubber/snapshot.yaml +++ b/src/tests/ftest/scrubber/snapshot.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 400 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,25 +24,26 @@ server_config: targets: 8 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: - DD_MASK=mgmt,md,rebuild storage: auto + pool: scm_size: 6G nvme_size: 54G svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:2" + properties: "rd_fac:0,space_rb:0,scrub:timed,scrub_freq:2" + container: type: POSIX control_method: daos oclass: RP_2GX properties: "cksum:crc64,rd_fac:1" + ior: ior_timeout: 60 client_processes: @@ -55,6 +57,7 @@ ior: block_size: 2G dfs_oclass: RP_2GX dfs_dir_oclass: RP_2GX + faults: fault_list: - DAOS_CSUM_CORRUPT_DISK diff --git a/src/tests/ftest/scrubber/target_auto_eviction.yaml b/src/tests/ftest/scrubber/target_auto_eviction.yaml index 2638f410d07..9ab153651f6 100644 --- a/src/tests/ftest/scrubber/target_auto_eviction.yaml +++ b/src/tests/ftest/scrubber/target_auto_eviction.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 400 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -12,8 +15,6 @@ server_config: targets: 8 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -23,8 +24,6 @@ server_config: targets: 8 pinned_numa_node: 1 
nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR env_vars: @@ -32,24 +31,29 @@ server_config: storage: auto transport_config: allow_insecure: true + agent_config: transport_config: allow_insecure: true + dmg: transport_config: allow_insecure: true + pool: scm_size: 6G nvme_size: 54G svcn: 4 rebuild_timeout: 120 pool_query_timeout: 30 - properties: "scrub:timed,scrub_freq:1,scrub_thresh:2" + properties: rd_fac:0,space_rb:0,scrub:timed,scrub_freq:1,scrub_thresh:2 + container: type: POSIX control_method: daos oclass: RP_2GX properties: "cksum:crc16" + ior: ior_timeout: 60 client_processes: @@ -63,6 +67,7 @@ ior: block_size: 2G dfs_oclass: RP_2GX dfs_dir_oclass: RP_2GX + faults: fault_list: - DAOS_CSUM_CORRUPT_DISK diff --git a/src/tests/ftest/server/cpu_usage.yaml b/src/tests/ftest/server/cpu_usage.yaml index 4bf4a7713ec..ec11f001e14 100644 --- a/src/tests/ftest/server/cpu_usage.yaml +++ b/src/tests/ftest/server/cpu_usage.yaml @@ -1,25 +1,30 @@ hosts: test_servers: 1 test_clients: 1 + timeout: 130 + server_config: engines_per_host: 1 engines: 0: targets: 8 nr_xs_helpers: 8 - fabric_iface: ib0 storage: auto + ior: client_processes: np: 1 flags: "-v -D 60 -w -r" transfer_size: 1M block_size: 1G + pool: scm_size: 10G nvme_size: 100G + container: type: POSIX control_method: daos + usage_limit: 200 diff --git a/src/tests/ftest/server/daos_server_config.py b/src/tests/ftest/server/daos_server_config.py index b5383793cec..db0d1d043b1 100644 --- a/src/tests/ftest/server/daos_server_config.py +++ b/src/tests/ftest/server/daos_server_config.py @@ -1,8 +1,10 @@ """ (C) Copyright 2020-2023 Intel Corporation. 
+ (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ +import os from apricot import TestWithServers from server_utils import ServerFailed @@ -44,7 +46,15 @@ def test_daos_server_config_basic(self): self.hostfile_servers_slots) # Get the input to verify - c_val = self.params.get("config_val", "/run/server_config_val/*/") + c_val = list(self.params.get("config_val", "/run/server_config_val/*/")) + + # Handle "auto" value for control_iface - use DAOS_TEST_CONTROL_IFACE + if c_val[0] == "control_iface" and c_val[1] == "auto": + control_iface = os.environ.get("DAOS_TEST_CONTROL_IFACE") + if not control_iface: + self.skipTest("DAOS_TEST_CONTROL_IFACE not set; cannot test control_iface") + c_val[1] = control_iface + self.log.info("Resolved control_iface 'auto' to '%s'", control_iface) if c_val[0] == "name": # Set the dmg system name to match the server in order to avoid diff --git a/src/tests/ftest/server/daos_server_config.yaml b/src/tests/ftest/server/daos_server_config.yaml index 889417aa74d..b11fe6fee03 100644 --- a/src/tests/ftest/server/daos_server_config.yaml +++ b/src/tests/ftest/server/daos_server_config.yaml @@ -171,3 +171,23 @@ server_config_val: !mux - "targets" - -1 - "FAIL" + control_iface_invalid: + config_val: + - "control_iface" + - "nonexistent_interface_12345" + - "FAIL" + # Loopback interface is valid but its address (127.0.0.1) won't match the + # MS replica address derived from the test node's hostname, causing startup + # to fail with an address mismatch error. + control_iface_loopback_mismatch: + config_val: + - "control_iface" + - "lo" + - "FAIL" + # Use the auto-detected control interface (from DAOS_TEST_CONTROL_IFACE) which + # has an IP matching the hostname, so it should work with the MS replica address. 
+ control_iface_valid: + config_val: + - "control_iface" + - "auto" + - "PASS" diff --git a/src/tests/ftest/server/daos_server_restart.yaml b/src/tests/ftest/server/daos_server_restart.yaml index 3fa1cd0a742..99263715abf 100644 --- a/src/tests/ftest/server/daos_server_restart.yaml +++ b/src/tests/ftest/server/daos_server_restart.yaml @@ -15,15 +15,11 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto diff --git a/src/tests/ftest/server/metadata.py b/src/tests/ftest/server/metadata.py index 6d86273c5ce..958686c2f12 100644 --- a/src/tests/ftest/server/metadata.py +++ b/src/tests/ftest/server/metadata.py @@ -1,6 +1,6 @@ """ (C) Copyright 2019-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -81,7 +81,7 @@ def create_pool(self, svc_ops_enabled=True): self.add_pool() else: params = {} - params['properties'] = "svc_ops_enabled:0" + params['properties'] = "rd_fac:0,space_rb:0,svc_ops_enabled:0" self.add_pool(**params) self.log.info("Created %s: svc ranks:", str(self.pool)) for index, rank in enumerate(self.pool.svc_ranks): diff --git a/src/tests/ftest/server/metadata.yaml b/src/tests/ftest/server/metadata.yaml index 48a6c84ef61..b9ce987770e 100644 --- a/src/tests/ftest/server/metadata.yaml +++ b/src/tests/ftest/server/metadata.yaml @@ -17,8 +17,6 @@ server_config: nr_xs_helpers: 4 first_core: 0 pinned_numa_node: 0 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR env_vars: @@ -34,8 +32,6 @@ server_config: nr_xs_helpers: 4 first_core: 0 pinned_numa_node: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: DEBUG,MEM=ERR 
env_vars: @@ -49,7 +45,7 @@ server_config: pool: svcn: 5 scm_size: 1G - properties: svc_ops_entry_age:60 + properties: rd_fac:0,space_rb:0,svc_ops_entry_age:60 # Uncomment the following for manual test with different svc_ops_entry_age value # properties: svc_ops_entry_age:150 # properties: svc_ops_entry_age:300 diff --git a/src/tests/ftest/server/replay.py b/src/tests/ftest/server/replay.py index 28ed9ea7486..fb4fd23685f 100644 --- a/src/tests/ftest/server/replay.py +++ b/src/tests/ftest/server/replay.py @@ -1,6 +1,6 @@ """ (C) Copyright 2023 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -359,7 +359,7 @@ def test_replay_check_pointing(self): """ frequency = 5 container = self.create_container( - properties=f'checkpoint:timed,checkpoint_freq:{frequency}') + properties=f'rd_fac:0,space_rb:0,checkpoint:timed,checkpoint_freq:{frequency}') self.log.info('%s check point frequency: %s seconds', container.pool, frequency) self.log_step('Write data to the container (ior)') diff --git a/src/tests/ftest/server/storage_tiers.yaml b/src/tests/ftest/server/storage_tiers.yaml index 6edced47786..6529446c22d 100644 --- a/src/tests/ftest/server/storage_tiers.yaml +++ b/src/tests/ftest/server/storage_tiers.yaml @@ -14,15 +14,11 @@ timeout: 30 engine_0: &engine_0_base pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log engine_1: &engine_1_base pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage_0: &storage_dcpm diff --git a/src/tests/ftest/soak/faults.yaml b/src/tests/ftest/soak/faults.yaml index 165cb573031..4ce3c4880d6 100644 --- a/src/tests/ftest/soak/faults.yaml +++ b/src/tests/ftest/soak/faults.yaml @@ -22,7 +22,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 2 - fabric_iface_port: 31317 
log_file: daos_server0.log log_mask: ERR env_vars: @@ -31,7 +30,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 2 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR env_vars: diff --git a/src/tests/ftest/soak/harassers.yaml b/src/tests/ftest/soak/harassers.yaml index af7065fb3d8..a2032e85d61 100644 --- a/src/tests/ftest/soak/harassers.yaml +++ b/src/tests/ftest/soak/harassers.yaml @@ -22,7 +22,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 2 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: INFO env_vars: @@ -32,7 +31,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 2 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: INFO env_vars: diff --git a/src/tests/ftest/soak/smoke.yaml b/src/tests/ftest/soak/smoke.yaml index ca1d4fb7a4c..948fe1f8802 100644 --- a/src/tests/ftest/soak/smoke.yaml +++ b/src/tests/ftest/soak/smoke.yaml @@ -24,7 +24,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 2 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR env_vars: @@ -33,7 +32,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 2 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR env_vars: diff --git a/src/tests/ftest/soak/stress.yaml b/src/tests/ftest/soak/stress.yaml index 15a6a3033a3..b3e9b87dd91 100644 --- a/src/tests/ftest/soak/stress.yaml +++ b/src/tests/ftest/soak/stress.yaml @@ -27,7 +27,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 2 - fabric_iface_port: 31317 log_file: daos_server0.log log_mask: ERR env_vars: @@ -37,7 +36,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 2 - fabric_iface_port: 31417 log_file: daos_server1.log log_mask: ERR env_vars: diff --git a/src/tests/ftest/telemetry/engine_events.yaml b/src/tests/ftest/telemetry/engine_events.yaml index 713eac94e9f..4fa3be16d3d 100644 --- a/src/tests/ftest/telemetry/engine_events.yaml +++ b/src/tests/ftest/telemetry/engine_events.yaml @@ -9,14 +9,10 @@ server_config: 0: targets: 
4 nr_xs_helpers: 0 - fabric_iface: ib0 - fabric_iface_port: 31416 log_file: daos_server_0.log storage: auto 1: targets: 4 nr_xs_helpers: 0 - fabric_iface: ib1 - fabric_iface_port: 31516 log_file: daos_server_1.log storage: auto diff --git a/src/tests/ftest/telemetry/pool_space_metrics.yaml b/src/tests/ftest/telemetry/pool_space_metrics.yaml index 9a2e261f6b2..a7c2632cb3b 100644 --- a/src/tests/ftest/telemetry/pool_space_metrics.yaml +++ b/src/tests/ftest/telemetry/pool_space_metrics.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 180 + server_config: name: daos_server engines_per_host: 2 @@ -10,28 +12,28 @@ server_config: targets: 4 pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_engine_0.log storage: auto 1: targets: 4 pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 32317 log_file: daos_engine_1.log storage: auto + pool_scm: scm_size: 1G nvme_size: 0 + pool_scm_nvme: size: 80% + container: type: POSIX control_method: daos properties: rd_fac:0 oclass: SX + ior: api: DFS transfer_size: 1048576 # 1MiB @@ -43,6 +45,7 @@ ior: dfs_destroy: false env_vars: - D_LOG_MASK=INFO + mpirun: args: "--bind-to socket" diff --git a/src/tests/ftest/telemetry/wal_metrics.py b/src/tests/ftest/telemetry/wal_metrics.py index 19bd605704a..748ef30cc56 100644 --- a/src/tests/ftest/telemetry/wal_metrics.py +++ b/src/tests/ftest/telemetry/wal_metrics.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -143,7 +143,7 @@ def test_wal_checkpoint_metrics(self): wal_metrics = list(self.telemetry.ENGINE_POOL_CHECKPOINT_METRICS) self.log_step('Creating a pool with check pointing disabled (dmg pool create)') - add_pool(self, properties='checkpoint:disabled') + add_pool(self, properties='rd_fac:0,space_rb:0,checkpoint:disabled') self.log_step( 'Collect WAL checkpoint metrics after creating a pool w/o check pointing ' @@ -160,7 +160,8 @@ def test_wal_checkpoint_metrics(self): self.fail('WAL check point metrics not zero after creating a pool w/o check pointing') self.log_step('Creating a pool with timed check pointing (dmg pool create)') - pool = add_pool(self, properties=f'checkpoint:timed,checkpoint_freq:{frequency}') + pool = add_pool( + self, properties=f'rd_fac:0,space_rb:0,checkpoint:timed,checkpoint_freq:{frequency}') self.log_step( 'Collect WAL checkpoint metrics after creating a pool w/ check pointing ' diff --git a/src/tests/ftest/telemetry/wal_metrics.yaml b/src/tests/ftest/telemetry/wal_metrics.yaml index 71ba8cbc17b..1f991444383 100644 --- a/src/tests/ftest/telemetry/wal_metrics.yaml +++ b/src/tests/ftest/telemetry/wal_metrics.yaml @@ -11,7 +11,7 @@ server_config: storage: auto pool: - size: 20G + size: 40G container: control_method: daos diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 092b0e9a31e..829ee0f7bfa 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -851,6 +851,7 @@ def write_string_to_logfile(self, message): if self.server_managers and self.agent_managers: # Compose and run cart_ctl command cart_ctl = CartCtl() + cart_ctl.get_params(self) cart_ctl.add_log_msg.value = "add_log_msg" cart_ctl.rank.value = "all" cart_ctl.log_message.value = message 
diff --git a/src/tests/ftest/util/container_rf_test_base.py b/src/tests/ftest/util/container_rf_test_base.py index f9af2673faa..045f77d3ef4 100644 --- a/src/tests/ftest/util/container_rf_test_base.py +++ b/src/tests/ftest/util/container_rf_test_base.py @@ -166,9 +166,9 @@ def execute_cont_rf_test(self, create_container=True, mode=None): # Verify the rank to be excluded has at least one object self.verify_rank_has_objects() # Start the rebuild process - self.start_rebuild_cont_rf(rd_fac) + self.start_rebuild_cont_rf(rf_match.group(1)) # Execute the test steps during rebuild - self.execute_during_rebuild_cont_rf(rd_fac, expect_cont_status) + self.execute_during_rebuild_cont_rf(rf_match.group(1), expect_cont_status) # Refresh local pool and container self.log.info("==>(6)Check for pool and container info after rebuild.") self.pool.check_pool_info() diff --git a/src/tests/ftest/util/data_utils.py b/src/tests/ftest/util/data_utils.py index 74ead097879..005e3626526 100644 --- a/src/tests/ftest/util/data_utils.py +++ b/src/tests/ftest/util/data_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -150,3 +151,83 @@ def dict_subtract(dict1, dict2): except TypeError as error: raise TypeError('Invalid type for key {}'.format(key)) from error return dict3 + + +def assert_val_in_list(val, allowed_list): + """Assert whether a value is in the allowed list. + + Args: + val (object): value to check + allowed_list (list): list of allowed values + + Returns: + bool: True if val is in allowed_list + + Raises: + AssertionError: if val is not in allowed_list + """ + if val not in allowed_list: + raise AssertionError(f'Expected one of {allowed_list}') + return True + + +def assert_dict_subset(subset, full): + """Assert that a dictionary is a subset of another dictionary. 
+ + For example: + assert_dict_subset({'a': 1}, {'a': 1, 'b': 2}) -> passes + assert_dict_subset({'a': 2}, {'a': 1, 'b': 2}) -> AssertionError + assert_dict_subset({'c': 2}, {'a': 1, 'b': 2}) -> AssertionError + + Args: + subset (dict): Expected subset dictionary, where only keys in the subset are verified + in the full dictionary. + Expected values can be type callable(actual_value) -> bool for custom verification. + full (dict): Full dictionary to verify against the subset + + Raises: + AssertionError: if subset is not contained in full + """ + + def _format_keys(keys): + """Convert list of keys to ["key1"]["key2"] format.""" + return ''.join(map(lambda k: f'["{k}"]', keys)) + + def _assert_subset(prev_keys, expected, actual): + """Recursively verify expected dict matches actual dict.""" + for key, expected_value in expected.items(): + cur_keys = prev_keys + [key] + try: + actual_value = actual[key] + except KeyError as error: + raise AssertionError( + f'Missing expected key {_format_keys(cur_keys)}') from error + + if callable(expected_value): + # Use custom callable to verify value + try: + if not expected_value(actual_value): + raise AssertionError( + f'{_format_keys(cur_keys)} = {actual_value} ; ' + f'expected to satisfy {expected_value.__name__}') + except AssertionError as error: + # If the custom callable raised an AssertionError, use its error message + raise AssertionError( + f'{_format_keys(cur_keys)} = {actual_value} ; {str(error)}') from error + + elif type(expected_value) is not type(actual_value): + # Types must match + raise AssertionError( + f'type({_format_keys(cur_keys)}) = {type(actual_value)} ; ' + f'expected {type(expected_value)}') + + elif isinstance(expected_value, dict): + # Recursively verify nested dict + _assert_subset(cur_keys, expected_value, actual_value) + + elif expected_value != actual_value: + # Compare leaf values + raise AssertionError( + f'{_format_keys(cur_keys)} = {actual_value} ; expected "{expected_value}"') + + 
_assert_subset([], subset, full) diff --git a/src/tests/ftest/util/ddb_utils.py b/src/tests/ftest/util/ddb_utils.py index 53f9601653e..beea3092606 100644 --- a/src/tests/ftest/util/ddb_utils.py +++ b/src/tests/ftest/util/ddb_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -32,13 +32,16 @@ def __init__(self, server_host, path, verbose=True, timeout=None, sudo=True): self.host = server_host # Write mode that's necessary for the commands that alters the data such as load. - self.write_mode = FormattedParameter("-w", default=False) + self.write_mode = FormattedParameter("-w", default=False, position=1) - # Command to run on the VOS file that contains container, object info, etc. - self.single_command = BasicParameter(None, position=2) + # Path to the system database. Used for MD-on-SSD. + self.db_path = BasicParameter(None, position=2) # VOS file path. - self.vos_path = BasicParameter(None, position=1) + self.vos_path = FormattedParameter("--vos_path {}", position=3) + + # Command to run on the VOS file that contains container, object info, etc. + self.single_command = BasicParameter(None, position=4) # Members needed for run(). self.verbose = verbose @@ -92,7 +95,7 @@ def __init__(self, server_host, path, vos_path): self.vos_path.update(vos_path, "vos_path") def list_component(self, component_path=None): - """Call ddb -R "ls " + """Call ddb ls ls is similar to the Linux ls command. It lists objects inside the container, dkeys inside the object, and so on. @@ -160,11 +163,11 @@ def value_load(self, component_path, load_file_path): return self.run() def remove_component(self, component_path): - """Call ddb -w -R "rm " + """Call ddb -w rm Args: - component_path (str): Component that comes after rm. e.g., [0]/[1] for first - container, second object. 
+ component_path (str): Component that comes after rm. e.g., [0]/[1] for first container, + second object. Returns: CommandResult: groups of command results from the same hosts with the same return status @@ -282,3 +285,21 @@ def dtx_cmt_clear(self, component_path="[0]"): self.single_command.value = " ".join(["dtx_cmt_clear", component_path]) return self.run() + + def prov_mem(self, db_path, tmpfs_mount): + """Call ddb --vos_path "" prov_mem . + + Args: + db_path (str): Path to the system database. e.g., + /var/tmp/daos_testing/control_metadata/daos_control/engine0 + tmpfs_mount (str): Path to the tmpfs mount point. Directory that needs to be created + beforehand. e.g., /mnt/daos_load + + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + self.vos_path.value = '""' + cmd = ["prov_mem", db_path, tmpfs_mount] + self.single_command.value = " ".join(cmd) + + return self.run() diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 95febc8173c..10e56a616a1 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -639,78 +639,23 @@ def pool_create(self, scm_size, uid=None, gid=None, nvme_size=None, return data - def pool_query(self, pool, show_enabled=False, health_only=False): - """Query a pool with the dmg command. + def pool_delete_acl(self, pool, principal): + """Delete the acl for a given pool. Args: - pool (str): Pool UUID or label to query. - show_enabled (bool, optional): Display enabled ranks. - health_only (bool, optional): Only perform pool health related queries. - - Raises: - CommandFailure: if the dmg pool query command fails. + pool (str): Pool for which to delete the ACL. 
+ principal (str): principal to be deleted Returns: - dict: the dmg json command output converted to a python dictionary - - """ - # Sample JSON output - # { - # "response": { - # "status": 0, - # "uuid": "EDAE0965-7A6E-48BD-A71C-A29F199C679F", - # "total_targets": 8, - # "active_targets": 8, - # "total_engines": 1, - # "disabled_targets": 0, - # "version": 1, - # "svc_ldr": 0, - # "rebuild": { - # "status": 0, - # "state": "idle", - # "objects": 0, - # "records": 0 - # }, - # "scm": { - # "total": 16000000000, - # "free": 15999992320, - # "min": 1999999040, - # "max": 1999999040, - # "mean": 1999999040 - # }, - # "nvme": { - # "total": 32000000000, - # "free": 31999950848, - # "min": 3999993856, - # "max": 3999993856, - # "mean": 3999993856 - # }, - # "enabled_ranks": [0,1,3], - # "disabled_ranks": [2] - # }, - # "error": null, - # "status": 0 - # } - return self._get_json_result(("pool", "query"), pool=pool, - show_enabled=show_enabled, health_only=health_only) - - def pool_query_targets(self, pool, rank=None, target_idx=None): - """Call dmg pool query-targets. - - Args: - pool (str): Pool UUID or label - rank (str, optional): Engine rank of the targets to be queried - target_idx (str, optional): Comma-separated list of target idx(s) to be queried + CmdResult: Object that contains exit status, stdout, and other + information. Raises: - CommandFailure: if the command fails. - - Returns: - dict: the dmg json command output converted to a python dictionary + CommandFailure: if the dmg pool delete-acl command fails. """ - return self._get_json_result(("pool", "query-targets"), pool=pool, - rank=rank, target_idx=target_idx) + return self._get_result( + ("pool", "delete-acl"), pool=pool, principal=principal) def pool_destroy(self, pool, force=True, recursive=True): """Destroy a pool with the dmg command. 
@@ -729,91 +674,116 @@ def pool_destroy(self, pool, force=True, recursive=True): """ return self._get_result(("pool", "destroy"), pool=pool, force=force, recursive=recursive) - def pool_get_acl(self, pool): - """Get the ACL for a given pool. + def pool_drain(self, pool, ranks, tgt_idx=None): + """Drain a daos_server from the pool. Args: - pool (str): Pool for which to get the ACL. + pool (str): Pool uuid. + ranks (str): Comma separated daos_server-rank ranges to drain e.g. + "0,2-5". + tgt_idx (list, optional): targets to drain on ranks e.g. "1,2". + Defaults to None. Returns: CmdResult: Object that contains exit status, stdout, and other - information. + information. Raises: - CommandFailure: if the dmg pool get-acl command fails. + CommandFailure: if the dmg pool drain command fails. """ - return self._get_result(("pool", "get-acl"), pool=pool) + return self._get_result( + ("pool", "drain"), pool=pool, ranks=ranks, tgt_idx=tgt_idx) - def pool_update_acl(self, pool, acl_file=None, entry=None): - """Update the acl for a given pool. + def pool_evict(self, pool): + """Evict a pool. Args: - pool (str): Pool for which to update the ACL. - acl_file (str, optional): ACL file to update - entry (str, optional): entry to be updated + pool (str): UUID of DAOS pool to evict connection to Returns: CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool update-acl command fails. + CommandFailure: if the dmg pool evict command fails. + + """ + return self._get_result(("pool", "evict"), pool=pool) + + def pool_exclude(self, pool, ranks, tgt_idx=None, force=False): + """Exclude a daos_server from the pool. + + Args: + pool (str): Pool uuid. + ranks (str): Comma separated daos_server-rank ranges to exclude e.g. + "0,2-5". + tgt_idx (list, optional): targets to exclude on ranks e.g. "1,2". + Defaults to None. + force (bool, optional): force exclusion regardless of data loss. 
Defaults to False + + Returns: + CmdResult: Object that contains exit status, stdout, and other + information. + + Raises: + CommandFailure: if the dmg pool exclude command fails. """ return self._get_result( - ("pool", "update-acl"), pool=pool, acl_file=acl_file, entry=entry) + ("pool", "exclude"), pool=pool, ranks=ranks, tgt_idx=tgt_idx, force=force) - def pool_upgrade(self, pool): - """Call dmg pool upgrade. + def pool_extend(self, pool, ranks): + """Extend the daos_server pool. Args: - pool (str): pool to upgrade + pool (str): Pool uuid. + ranks (str): Comma separated daos_server-rank ranges to extend e.g. + "0,2-5". Returns: - dict: the dmg json command output converted to a python dictionary + CmdResult: Object that contains exit status, stdout, and other + information. Raises: - CommandFailure: if the command fails. + CommandFailure: if the dmg pool extend command fails. """ - return self._get_json_result(("pool", "upgrade"), pool=pool) + return self._get_result( + ("pool", "extend"), pool=pool, ranks=ranks) - def pool_overwrite_acl(self, pool, acl_file): - """Overwrite the acl for a given pool. + def pool_get_acl(self, pool): + """Get the ACL for a given pool. Args: - pool (str): Pool for which to overwrite the ACL. - acl_file (str): ACL file to update + pool (str): Pool for which to get the ACL. Returns: CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool overwrite-acl command fails. + CommandFailure: if the dmg pool get-acl command fails. """ - return self._get_result( - ("pool", "overwrite-acl"), pool=pool, acl_file=acl_file) + return self._get_result(("pool", "get-acl"), pool=pool) - def pool_delete_acl(self, pool, principal): - """Delete the acl for a given pool. + def pool_get_prop(self, pool, name=None): + """Get the Property for a given pool. Args: - pool (str): Pool for which to delete the ACL. 
- principal (str): principal to be deleted + pool (str): Pool for which to get the property. + name (str, optional): Get the Property value based on name. Returns: CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool delete-acl command fails. + CommandFailure: if the dmg pool get-prop command fails. """ - return self._get_result( - ("pool", "delete-acl"), pool=pool, principal=principal) + return self._get_json_result(("pool", "get-prop"), pool=pool, name=name) def pool_list(self, no_query=False, verbose=False): """List pools. @@ -865,121 +835,198 @@ def pool_list(self, no_query=False, verbose=False): return self._get_json_result( ("pool", "list"), no_query=no_query, verbose=verbose) - def pool_set_prop(self, pool, properties): - """Set property for a given Pool. + def pool_overwrite_acl(self, pool, acl_file): + """Overwrite the acl for a given pool. Args: - pool (str): Pool uuid for which property is supposed to be set. - properties (str): Property in the form of key:val[,key:val...] + pool (str): Pool for which to overwrite the ACL. + acl_file (str): ACL file to update Returns: - CmdResult: Object that contains exit status, stdout, and other information. + CmdResult: Object that contains exit status, stdout, and other + information. Raises: - CommandFailure: if the dmg pool set-prop command fails. + CommandFailure: if the dmg pool overwrite-acl command fails. """ - return self._get_result(("pool", "set-prop"), pool=pool, properties=properties) + return self._get_result( + ("pool", "overwrite-acl"), pool=pool, acl_file=acl_file) - def pool_get_prop(self, pool, name=None): - """Get the Property for a given pool. + def pool_query(self, pool, show_enabled=False, health_only=False): + """Query a pool with the dmg command. Args: - pool (str): Pool for which to get the property. - name (str, optional): Get the Property value based on name. + pool (str): Pool UUID or label to query. 
+ show_enabled (bool, optional): Display enabled ranks. + health_only (bool, optional): Only perform pool health related queries. + + Raises: + CommandFailure: if the dmg pool query command fails. Returns: - CmdResult: Object that contains exit status, stdout, and other - information. + dict: the dmg json command output converted to a python dictionary + + """ + # Sample JSON output + # { + # "response": { + # "status": 0, + # "uuid": "EDAE0965-7A6E-48BD-A71C-A29F199C679F", + # "total_targets": 8, + # "active_targets": 8, + # "total_engines": 1, + # "disabled_targets": 0, + # "version": 1, + # "svc_ldr": 0, + # "rebuild": { + # "status": 0, + # "state": "idle", + # "objects": 0, + # "records": 0 + # }, + # "scm": { + # "total": 16000000000, + # "free": 15999992320, + # "min": 1999999040, + # "max": 1999999040, + # "mean": 1999999040 + # }, + # "nvme": { + # "total": 32000000000, + # "free": 31999950848, + # "min": 3999993856, + # "max": 3999993856, + # "mean": 3999993856 + # }, + # "enabled_ranks": [0,1,3], + # "disabled_ranks": [2] + # }, + # "error": null, + # "status": 0 + # } + return self._get_json_result(("pool", "query"), pool=pool, + show_enabled=show_enabled, health_only=health_only) + + def pool_query_targets(self, pool, rank=None, target_idx=None): + """Call dmg pool query-targets. + + Args: + pool (str): Pool UUID or label + rank (str, optional): Engine rank of the targets to be queried + target_idx (str, optional): Comma-separated list of target idx(s) to be queried Raises: - CommandFailure: if the dmg pool get-prop command fails. + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary """ - return self._get_json_result(("pool", "get-prop"), pool=pool, name=name) + return self._get_json_result(("pool", "query-targets"), pool=pool, + rank=rank, target_idx=target_idx) - def pool_exclude(self, pool, ranks, tgt_idx=None, force=False): - """Exclude a daos_server from the pool. 
+ def pool_reintegrate(self, pool, ranks, tgt_idx=None): + """Reintegrate a daos_server to the pool. Args: pool (str): Pool uuid. - ranks (str): Comma separated daos_server-rank ranges to exclude e.g. - "0,2-5". - tgt_idx (list, optional): targets to exclude on ranks e.g. "1,2". + ranks (str): Comma separated daos_server-rank ranges to reintegrate + e.g. "0,2-5". + tgt_idx (list, optional): targets to reintegrate on ranks e.g. "1,2". Defaults to None. - force (bool, optional): force exclusion regardless of data loss. Defaults to False Returns: CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool exclude command fails. + CommandFailure: if the dmg pool reintegrate command fails. """ return self._get_result( - ("pool", "exclude"), pool=pool, ranks=ranks, tgt_idx=tgt_idx, force=force) + ("pool", "reintegrate"), pool=pool, ranks=ranks, tgt_idx=tgt_idx) - def pool_extend(self, pool, ranks): - """Extend the daos_server pool. + def pool_rebuild_start(self, pool): + """Rebuild start request submitted to pool. Args: - pool (str): Pool uuid. - ranks (str): Comma separated daos_server-rank ranges to extend e.g. - "0,2-5". + pool (str): Pool label or uuid. Returns: - CmdResult: Object that contains exit status, stdout, and other - information. + CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool extend command fails. + CommandFailure: if the command fails. """ - return self._get_result( - ("pool", "extend"), pool=pool, ranks=ranks) + return self._get_result(("pool", "rebuild", "start"), pool=pool) - def pool_drain(self, pool, ranks, tgt_idx=None): - """Drain a daos_server from the pool. + def pool_rebuild_stop(self, pool, force=False): + """Rebuild stop request submitted to pool. Args: - pool (str): Pool uuid. - ranks (str): Comma separated daos_server-rank ranges to drain e.g. - "0,2-5". - tgt_idx (list, optional): targets to drain on ranks e.g. 
"1,2". - Defaults to None. + pool (str): Pool label or uuid. + force (bool): Force stop rebuild. Returns: - CmdResult: Object that contains exit status, stdout, and other - information. + CmdResult: Object that contains exit status, stdout, and other information. Raises: - CommandFailure: if the dmg pool drain command fails. + CommandFailure: if the command fails. """ - return self._get_result( - ("pool", "drain"), pool=pool, ranks=ranks, tgt_idx=tgt_idx) + return self._get_result(("pool", "rebuild", "stop"), pool=pool, force=force) - def pool_reintegrate(self, pool, ranks, tgt_idx=None): - """Reintegrate a daos_server to the pool. + def pool_set_prop(self, pool, properties): + """Set property for a given Pool. Args: - pool (str): Pool uuid. - ranks (str): Comma separated daos_server-rank ranges to reintegrate - e.g. "0,2-5". - tgt_idx (list, optional): targets to reintegrate on ranks e.g. "1,2". - Defaults to None. + pool (str): Pool uuid for which property is supposed to be set. + properties (str): Property in the form of key:val[,key:val...] + + Returns: + CmdResult: Object that contains exit status, stdout, and other information. + + Raises: + CommandFailure: if the dmg pool set-prop command fails. + + """ + return self._get_result(("pool", "set-prop"), pool=pool, properties=properties) + + def pool_update_acl(self, pool, acl_file=None, entry=None): + """Update the acl for a given pool. + + Args: + pool (str): Pool for which to update the ACL. + acl_file (str, optional): ACL file to update + entry (str, optional): entry to be updated Returns: CmdResult: Object that contains exit status, stdout, and other - information. + information. Raises: - CommandFailure: if the dmg pool reintegrate command fails. + CommandFailure: if the dmg pool update-acl command fails. 
""" return self._get_result( - ("pool", "reintegrate"), pool=pool, ranks=ranks, tgt_idx=tgt_idx) + ("pool", "update-acl"), pool=pool, acl_file=acl_file, entry=entry) + + def pool_upgrade(self, pool): + """Call dmg pool upgrade. + + Args: + pool (str): pool to upgrade + + Returns: + dict: the dmg json command output converted to a python dictionary + + Raises: + CommandFailure: if the command fails. + + """ + return self._get_json_result(("pool", "upgrade"), pool=pool) def cont_set_owner(self, pool, cont, user=None, group=None): """Dmg container set-owner to the specified new user/group. @@ -1038,18 +1085,18 @@ def system_cleanup(self, machinename=None, verbose=True): return self._get_json_result( ("system", "cleanup"), machinename=machinename, verbose=verbose) - def system_clear_exclude(self, ranks, rank_hosts): - """Clear exclude ranks from system. + def system_clear_exclude(self, ranks=None, rank_hosts=None): + """Call dmg system clear-exclude. - Either ranks or rank_hosts is necessary. Pass in None to one of them. + Either ranks or rank_hosts is required. Args: - ranks (str): Comma separated rank-ranges to exclude e.g. "0,2-5". - rank_hosts (str): hostlist representing hosts whose managed ranks are to be + ranks (str, optional): Comma separated rank-ranges to exclude e.g. "0,2-5". + rank_hosts (str, optional): hostlist representing hosts whose managed ranks are to be operated on. Raises: - CommandFailure: if the dmg system clear-exclude command fails. + CommandFailure: if the command fails. Returns: dict: the dmg json command output converted to a python dictionary @@ -1058,6 +1105,85 @@ def system_clear_exclude(self, ranks, rank_hosts): return self._get_json_result( ("system", "clear-exclude"), ranks=ranks, rank_hosts=rank_hosts) + def system_drain(self, ranks=None, rank_hosts=None): + """Call dmg system drain. + + Either ranks or rank_hosts is required. + + Args: + ranks (str, optional): Comma separated rank-ranges to exclude e.g. "0,2-5". 
+ rank_hosts (str, optional): hostlist representing hosts whose managed ranks are to be + operated on. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result( + ("system", "drain"), ranks=ranks, rank_hosts=rank_hosts) + + def system_erase(self): + """Erase system metadata prior to reformat. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "erase")) + + def system_exclude(self, ranks=None, rank_hosts=None): + """Call dmg system exclude. + + Either ranks or rank_hosts is required. + + Args: + ranks (str, optional): Comma separated rank-ranges to exclude e.g. "0,2-5". + rank_hosts (str, optional): hostlist representing hosts whose managed ranks are to be + operated on. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result( + ("system", "exclude"), ranks=ranks, rank_hosts=rank_hosts) + + def system_get_prop(self, properties=None): + """Call dmg system get-prop. + + Args: + properties (str, optional): Comma separated properties to get. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "get-prop"), properties=properties) + + def system_leader_query(self): + """Call dmg system leader-query. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "leader-query")) + def system_query(self, ranks=None, verbose=True): """Query system to obtain the status of the servers. 
@@ -1081,7 +1207,7 @@ def system_query(self, ranks=None, verbose=True): # { # "addr": "10.8.1.11:10001", # "state": "joined", - # "fault_domain": "/wolf-11.wolf.hpdd.intel.com", + # "fault_domain": "/wolf-11.wolf.example.com", # "rank": 0, # "uuid": "e7f2cb06-a111-4d55-a6a5-b494b70d62ab", # "fabric_uri": "ofi+sockets://192.168.100.11:31416", @@ -1091,7 +1217,7 @@ def system_query(self, ranks=None, verbose=True): # { # "addr": "10.8.1.74:10001", # "state": "excluded", - # "fault_domain": "/wolf-74.wolf.hpdd.intel.com", + # "fault_domain": "/wolf-74.wolf.example.com", # "rank": 1, # "uuid": "db36ab28-fdb0-4822-97e6-89547393ed03", # "fabric_uri": "ofi+sockets://192.168.100.74:31416", @@ -1106,60 +1232,83 @@ def system_query(self, ranks=None, verbose=True): return self._get_json_result( ("system", "query"), ranks=ranks, verbose=verbose) - def system_leader_query(self): - """Query system to obtain the MS leader and replica information. + def system_reintegrate(self, ranks=None, rank_hosts=None): + """Call dmg system reintegrate. + + Args: + ranks (str, optional): Comma separated rank-ranges to exclude e.g. "0,2-5". + rank_hosts (str, optional): hostlist representing hosts whose managed ranks are to be + operated on. Raises: - CommandFailure: if the dmg system query command fails. + CommandFailure: if the dmg system reintegrate command fails. Returns: dict: the dmg json command output converted to a python dictionary """ - # Example JSON output: - # { - # "response": { - # "current_leader": "127.0.0.1:10001", - # "replicas": [ - # "127.0.0.1:10001" - # ] - # }, - # "error": null, - # "status": 0 - # } - return self._get_json_result(("system", "leader-query")) + return self._get_json_result( + ("system", "reintegrate"), ranks=ranks, rank_hosts=rank_hosts) - def system_erase(self): - """Erase system metadata prior to reformat. + def system_rebuild_start(self, verbose=False): + """Call dmg system rebuild start. 
+ + Args: + verbose (str, optional): Print pool identifiers Raises: - CommandFailure: if the dmg system erase command fails. + CommandFailure: if the dmg system rebuild start command fails. Returns: dict: the dmg json command output converted to a python dictionary """ - return self._get_json_result(("system", "erase")) - - def system_exclude(self, ranks, rank_hosts): - """Exclude ranks from system. + return self._get_json_result( + ("system", "rebuild", "start"), verbose=verbose) - Either ranks or rank_hosts is necessary. Pass in None to one of them. + def system_rebuild_stop(self, verbose=False, force=False): + """Call dmg system rebuild stop. Args: - ranks (str): Comma separated rank-ranges to exclude e.g. "0,2-5". - rank_hosts (str): hostlist representing hosts whose managed ranks are to be - operated on. + verbose (str, optional): Print pool identifiers + force (str, optional): Forcibly stop interactive rebuild Raises: - CommandFailure: if the dmg system exclude command fails. + CommandFailure: if the dmg system rebuild start command fails. Returns: dict: the dmg json command output converted to a python dictionary """ return self._get_json_result( - ("system", "exclude"), ranks=ranks, rank_hosts=rank_hosts) + ("system", "rebuild", "stop"), verbose=verbose, force=force) + + def system_self_heal_eval(self): + """Call dmg system self-heal eval. + + Raises: + CommandFailure: if the command fails. + + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "self-heal", "eval")) + + def system_set_prop(self, properties=None): + """Call dmg system set-prop. + + Args: + properties (str): properties in the form of key:val[,key:val...] + + Raises: + CommandFailure: if the command fails. 
+ + Returns: + dict: the dmg json command output converted to a python dictionary + + """ + return self._get_json_result(("system", "set-prop"), properties=properties) def system_start(self, ranks=None, ignore_admin_excluded=False): """Start the system. @@ -1221,22 +1370,6 @@ def system_stop(self, force=False, ranks=None): data[rank] = info[1].strip() return data - def pool_evict(self, pool): - """Evict a pool. - - Args: - pool (str): UUID of DAOS pool to evict connection to - - Returns: - CmdResult: Object that contains exit status, stdout, and other - information. - - Raises: - CommandFailure: if the dmg pool evict command fails. - - """ - return self._get_result(("pool", "evict"), pool=pool) - def config_generate(self, mgmt_svc_replicas, num_engines=None, scm_only=False, net_class=None, net_provider=None, use_tmpfs_scm=False, control_metadata_path=None): diff --git a/src/tests/ftest/util/dmg_utils_base.py b/src/tests/ftest/util/dmg_utils_base.py index bbacbd19088..96ddc889625 100644 --- a/src/tests/ftest/util/dmg_utils_base.py +++ b/src/tests/ftest/util/dmg_utils_base.py @@ -1,6 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -413,6 +413,8 @@ def get_sub_command_class(self): self.sub_command_class = self.QuerySubCommand() elif self.sub_command.value == "query-targets": self.sub_command_class = self.QueryTargetsSubCommand() + elif self.sub_command.value == "rebuild": + self.sub_command_class = self.RebuildSubCommand() elif self.sub_command.value == "set-prop": self.sub_command_class = self.SetPropSubCommand() elif self.sub_command.value == "update-acl": @@ -560,6 +562,40 @@ def __init__(self): self.rank = FormattedParameter("--rank={}", None) self.target_idx = FormattedParameter("--target-idx={}", None) + class RebuildSubCommand(CommandWithSubCommand): + """Defines an object for the dmg pool rebuild command.""" + + def __init__(self): + """Create a dmg pool rebuild command object.""" + super().__init__("/run/dmg/pool/rebuild/*", "rebuild") + + def get_sub_command_class(self): + # pylint: disable=redefined-variable-type + """Get the dmg pool sub command object.""" + if self.sub_command.value == "start": + self.sub_command_class = self.StartSubCommand() + elif self.sub_command.value == "stop": + self.sub_command_class = self.StopSubCommand() + else: + self.sub_command_class = None + + class StartSubCommand(CommandWithParameters): + """Defines an object for dmg pool rebuild start command.""" + + def __init__(self): + """Create a dmg pool rebuild start command object.""" + super().__init__("/run/dmg/pool/rebuild/start/*", "start") + self.pool = BasicParameter(None, position=1) + + class StopSubCommand(CommandWithParameters): + """Defines an object for dmg pool rebuild stop command.""" + + def __init__(self): + """Create a dmg pool rebuild stop command object.""" + super().__init__("/run/dmg/pool/rebuild/stop/*", "stop") + self.pool = BasicParameter(None, position=1) + self.force = FormattedParameter("--force", 
False) + class ReintegrateSubCommand(CommandWithParameters): """Defines an object for dmg pool reintegrate command.""" @@ -843,16 +879,28 @@ def get_sub_command_class(self): self.sub_command_class = self.CleanupSubCommand() elif self.sub_command.value == "clear-exclude": self.sub_command_class = self.ClearExcludeSubCommand() + elif self.sub_command.value == "drain": + self.sub_command_class = self.DrainSubCommand() elif self.sub_command.value == "erase": self.sub_command_class = self.EraseSubCommand() elif self.sub_command.value == "exclude": self.sub_command_class = self.ExcludeSubCommand() + elif self.sub_command.value == "get-prop": + self.sub_command_class = self.GetPropSubCommand() elif self.sub_command.value == "leader-query": self.sub_command_class = self.LeaderQuerySubCommand() elif self.sub_command.value == "list-pools": self.sub_command_class = self.ListPoolsSubCommand() elif self.sub_command.value == "query": self.sub_command_class = self.QuerySubCommand() + elif self.sub_command.value == "rebuild": + self.sub_command_class = self.RebuildSubCommand() + elif self.sub_command.value == "reintegrate": + self.sub_command_class = self.ReintegrateSubCommand() + elif self.sub_command.value == "self-heal": + self.sub_command_class = self.SelfHealSubCommand() + elif self.sub_command.value == "set-prop": + self.sub_command_class = self.SetPropSubCommand() elif self.sub_command.value == "start": self.sub_command_class = self.StartSubCommand() elif self.sub_command.value == "stop": @@ -878,6 +926,15 @@ def __init__(self): self.ranks = FormattedParameter("--ranks={}") self.rank_hosts = FormattedParameter("--rank-hosts={}") + class DrainSubCommand(CommandWithParameters): + """Defines an object for the dmg system drain command.""" + + def __init__(self): + """Create a dmg system drain command object.""" + super().__init__("/run/dmg/system/drain/*", "drain") + self.ranks = FormattedParameter("--ranks={}") + self.rank_hosts = FormattedParameter("--rank-hosts={}") + class 
EraseSubCommand(CommandWithParameters): """Defines an object for the dmg system erase command.""" @@ -894,6 +951,14 @@ def __init__(self): self.ranks = FormattedParameter("--ranks={}") self.rank_hosts = FormattedParameter("--rank-hosts={}") + class GetPropSubCommand(CommandWithParameters): + """Defines an object for the dmg system get-prop command.""" + + def __init__(self): + """Create a dmg system get-prop command object.""" + super().__init__("/run/dmg/system/get-prop/*", "get-prop") + self.properties = BasicParameter(None, position=1) + class LeaderQuerySubCommand(CommandWithParameters): """Defines an object for the dmg system leader-query command.""" @@ -917,6 +982,79 @@ def __init__(self): self.ranks = FormattedParameter("--ranks={}") self.verbose = FormattedParameter("--verbose", False) + class ReintegrateSubCommand(CommandWithParameters): + """Defines an object for the dmg system reintegrate command.""" + + def __init__(self): + """Create a dmg system reintegrate command object.""" + super().__init__("/run/dmg/system/reintegrate/*", "reintegrate") + self.ranks = FormattedParameter("--ranks={}") + self.rank_hosts = FormattedParameter("--rank-hosts={}") + + class RebuildSubCommand(CommandWithSubCommand): + """Defines an object for the dmg system rebuild command.""" + + def __init__(self): + """Create a dmg system rebuild command object.""" + super().__init__("/run/dmg/system/rebuild/*", "rebuild") + + def get_sub_command_class(self): + # pylint: disable=redefined-variable-type + """Get the dmg system sub command object.""" + if self.sub_command.value == "start": + self.sub_command_class = self.StartSubCommand() + elif self.sub_command.value == "stop": + self.sub_command_class = self.StopSubCommand() + else: + self.sub_command_class = None + + class StartSubCommand(CommandWithParameters): + """Defines an object for the dmg system rebuild start command.""" + + def __init__(self): + """Create a dmg system rebuild start command object.""" + 
super().__init__("/run/dmg/system/rebuild/start/*", "start") + self.verbose = FormattedParameter("--verbose", False) + + class StopSubCommand(CommandWithParameters): + """Defines an object for the dmg system rebuild stop command.""" + + def __init__(self): + """Create a dmg system rebuild stop command object.""" + super().__init__("/run/dmg/system/rebuild/stop/*", "stop") + self.verbose = FormattedParameter("--verbose", False) + self.force = FormattedParameter("--force", False) + + class SelfHealSubCommand(CommandWithSubCommand): + """Defines an object for the dmg system self-heal command.""" + + def __init__(self): + """Create a dmg system self-heal command object.""" + super().__init__("/run/dmg/system/self-heal/*", "self-heal") + + def get_sub_command_class(self): + # pylint: disable=redefined-variable-type + """Get the dmg system sub command object.""" + if self.sub_command.value == "eval": + self.sub_command_class = self.EvalSubCommand() + else: + self.sub_command_class = None + + class EvalSubCommand(CommandWithParameters): + """Defines an object for the dmg system self-heal eval command.""" + + def __init__(self): + """Create a dmg system self-heal eval command object.""" + super().__init__("/run/dmg/system/self-heal/eval/*", "eval") + + class SetPropSubCommand(CommandWithParameters): + """Defines an object for the dmg system set-prop command.""" + + def __init__(self): + """Create a dmg system set-prop command object.""" + super().__init__("/run/dmg/system/set-prop/*", "set-prop") + self.properties = BasicParameter(None, position=1) + class StartSubCommand(CommandWithParameters): """Defines an object for the dmg system start command.""" diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index 10b4784f278..9243cbb9176 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -418,7 +418,7 @@ def setUp(self): """Set up each test case.""" super().setUp() # Create Pool - self.add_pool() + 
self.add_pool(connect=False) self.container = None self.out_queue = queue.Queue() diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index bed802347f3..30c00204b2a 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -10,9 +10,8 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module -from util.host_utils import get_local_host from util.network_utils import (PROVIDER_ALIAS, SUPPORTED_PROVIDERS, NetworkException, - get_common_provider, get_fastest_interface) + get_common_provider, get_fastest_interfaces) from util.run_utils import run_remote @@ -104,6 +103,7 @@ class TestEnvironment(): 'shared_dir': 'DAOS_TEST_SHARED_DIR', 'user_dir': 'DAOS_TEST_USER_DIR', 'interface': 'DAOS_TEST_FABRIC_IFACE', + 'control_interface': 'DAOS_TEST_CONTROL_IFACE', 'provider': 'D_PROVIDER', 'insecure_mode': 'DAOS_TEST_INSECURE_MODE', 'bullseye_src': 'DAOS_TEST_BULLSEYE_SRC', @@ -172,7 +172,9 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu if self.user_dir is None: self.user_dir = os.path.join(self.log_dir, "user") if self.interface is None: - self.interface = self._default_interface(logger, all_hosts) + self.interface = self._default_interface(logger, servers) + if self.control_interface is None: + self.control_interface = self._default_control_interface(logger, servers) if self.provider is None: self.provider = self._default_provider(logger, servers) if self.insecure_mode is None: @@ -327,15 +329,79 @@ def _default_interface(self, logger, hosts): Returns: str: the default interface; can be None """ - interface = os.environ.get("D_INTERFACE") - if interface is None and hosts: - 
# Find all the /sys/class/net interfaces on the launch node (excluding lo) - logger.debug("Detecting network devices - D_INTERFACE not set") - try: - interface = get_fastest_interface(logger, hosts | get_local_host()) - except NetworkException as error: - raise TestEnvironmentException("Error obtaining a default interface!") from error - return interface + if not hosts: + return None + + logger.debug( + "Detecting network devices on %s - %s not set", hosts, self.__ENV_VAR_MAP['interface']) + try: + interfaces = get_fastest_interfaces(logger, hosts) + except NetworkException as error: + raise TestEnvironmentException("Error obtaining a default interface!") from error + + logger.debug(" Found interface(s): %s", ",".join(interfaces)) + return ",".join(interfaces) + + @property + def control_interface(self): + """Get the control plane interface device. + + Returns: + str: the control plane interface device + """ + return os.environ.get(self.__ENV_VAR_MAP['control_interface']) + + @control_interface.setter + def control_interface(self, value): + """Set the control plane interface device. + + Args: + value (str): the control plane interface device + """ + self.__set_value('control_interface', value) + + def _default_control_interface(self, logger, hosts): + """Get the default control plane interface. + + Finds the network interface whose IP address matches the hostname resolution. + This is the interface that should be used for control plane traffic. 
+ + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to find the control interface + + Returns: + str: the default control interface; can be None + """ + if not hosts or not logger: + return None + + # Get the first host to query + first_host = NodeSet(str(list(hosts)[0])) + + logger.debug( + "Detecting control interface on %s - %s not set", + first_host, self.__ENV_VAR_MAP['control_interface']) + + # Find the interface whose IP matches the hostname resolution + command = ( + "python3 -c \"" + "import socket, subprocess; " + "ip = socket.gethostbyname(socket.gethostname()); " + "out = subprocess.check_output(['ip', '-o', 'addr', 'show']).decode(); " + "print(next((l.split()[1] for l in out.split(chr(10)) if f'inet {ip}/' in l), ''))\"" + ) + result = run_remote(logger, first_host, command) + if result.passed and result.output: + for data in result.output: + if data.stdout: + interface = data.stdout[0].strip() + if interface: + logger.debug(" Found control interface: %s", interface) + return interface + + logger.debug(" Could not detect control interface") + return None @property def provider(self): @@ -373,12 +439,13 @@ def _default_provider(self, logger, hosts): Returns: str: the default provider; can be None """ - if not hosts: + if not hosts or self.interface is None: return None + first_interface = self.interface.split(",", maxsplit=1)[0] logger.debug( - "Detecting provider for %s - %s not set", - self.interface, self.__ENV_VAR_MAP['provider']) + "Detecting provider for %s on %s - %s not set", + first_interface, hosts, self.__ENV_VAR_MAP['provider']) provider = None supported = list(SUPPORTED_PROVIDERS) @@ -392,7 +459,7 @@ def _default_provider(self, logger, hosts): supported = list(filter(lambda x: 'verbs' not in x, supported)) # Detect all supported providers for this interface that are common to all of the hosts - common_providers = get_common_provider(logger, hosts, self.interface, supported) + 
common_providers = get_common_provider(logger, hosts, first_interface, supported) if common_providers: # Select the preferred found provider based upon SUPPORTED_PROVIDERS order logger.debug("Supported providers detected: %s", common_providers) @@ -404,9 +471,9 @@ def _default_provider(self, logger, hosts): # Report an error if a provider cannot be found if not provider: raise TestEnvironmentException( - f"Error obtaining a supported provider for {self.interface} from: {supported}") + f"Error obtaining a supported provider for {first_interface} from: {supported}") - logger.debug(" Found %s provider for %s", provider, self.interface) + logger.debug(" Found %s provider for %s", provider, first_interface) return provider @property diff --git a/src/tests/ftest/util/ior_intercept_test_base.py b/src/tests/ftest/util/ior_intercept_test_base.py index fb28c7356ca..34fa3a550c5 100644 --- a/src/tests/ftest/util/ior_intercept_test_base.py +++ b/src/tests/ftest/util/ior_intercept_test_base.py @@ -1,5 +1,6 @@ """ (C) Copyright 2019-2023 Intel Corporation. 
+ (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -31,6 +32,7 @@ def run_il_perf_check(self, libname): # Write and read performance thresholds write_x = self.params.get("write_x", self.ior_cmd.namespace, None) read_x = self.params.get("read_x", self.ior_cmd.namespace, None) + enforce_performance = self.params.get("enforce_performance", self.ior_cmd.namespace, True) if write_x is None or read_x is None: self.fail("Failed to get write_x and read_x from config") @@ -81,5 +83,8 @@ def run_il_perf_check(self, libname): self.log.info("DFUSE IL Max Read: %.2f", dfuse_max_read) self.log.info("Percent Diff: %.2f%%", actual_read_x * 100) - self.assertLessEqual(abs(actual_write_x), write_x, "Max Write Diff too large") - self.assertLessEqual(abs(actual_read_x), read_x, "Max Read Diff too large") + if enforce_performance: + self.assertLessEqual(abs(actual_write_x), write_x, "Max Write Diff too large") + self.assertLessEqual(abs(actual_read_x), read_x, "Max Read Diff too large") + else: + self.log.info("Skipping performance enforcement checks") diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index e317558bdd0..cd8bf8eeed2 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -869,13 +869,12 @@ def _generate_certs(self, logger): logger.debug("Generating certificates") test_env = TestEnvironment() certs_dir = os.path.join(test_env.log_dir, "daosCA") - certgen_dir = os.path.abspath( - os.path.join("..", "..", "..", "..", "lib64", "daos", "certgen")) - command = os.path.join(certgen_dir, "gen_certificates.sh") if not run_local(logger, f"/usr/bin/rm -rf {certs_dir}").passed: message = "Error removing old certificates" self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) return False + command = os.path.abspath( + os.path.join(test_env.daos_prefix, "lib64", "daos", "certgen", "gen_certificates.sh")) if not 
run_local(logger, f"{command} {test_env.log_dir}").passed: message = "Error generating certificates" self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) diff --git a/src/tests/ftest/util/mdtest_utils.py b/src/tests/ftest/util/mdtest_utils.py index 97e5d75d088..0b5654ece43 100644 --- a/src/tests/ftest/util/mdtest_utils.py +++ b/src/tests/ftest/util/mdtest_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2019-2024 Intel Corporation. + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -8,20 +9,115 @@ import re from command_utils import ExecutableCommand -from command_utils_base import FormattedParameter, LogParameter +from command_utils_base import BasicParameter, FormattedParameter, LogParameter +from exception_utils import CommandFailure from general_utils import get_log_file +from job_manager_utils import get_job_manager + +MDTEST_NAMESPACE = "/run/mdtest/*" + + +def get_mdtest(test, hosts, manager=None, path=None, slots=None, namespace=MDTEST_NAMESPACE, + mdtest_params=None): + """Get a Mdtest object. + + Args: + test (Test): avocado Test object + hosts (NodeSet): hosts on which to run the mdtest command + manager (JobManager, optional): command to manage the multi-host execution of mdtest. + Defaults to None, which will get a default job manager. + path (str, optional): hostfile path. Defaults to None. + slots (int, optional): hostfile number of slots per host. Defaults to None. + namespace (str, optional): path to yaml parameters. Defaults to MDTEST_NAMESPACE. + mdtest_params (dict, optional): parameters to update the mdtest command. Defaults to None. 
+ + Returns: + Mdtest: the Mdtest object requested + """ + mdtest = Mdtest(test, hosts, manager, path, slots, namespace) + if mdtest_params: + for name, value in mdtest_params.items(): + mdtest.update(name, value) + return mdtest + + +def run_mdtest(test, hosts, path, slots, container, processes, ppn=None, manager=None, + log_file=None, intercept=None, display_space=True, namespace=MDTEST_NAMESPACE, + mdtest_params=None): + # pylint: disable=too-many-arguments + """Run Mdtest on multiple hosts. + + Args: + test (Test): avocado Test object + hosts (NodeSet): hosts on which to run the mdtest command + path (str): hostfile path. + slots (int): hostfile number of slots per host. + container (TestContainer): DAOS test container object. + processes (int): number of processes to run + ppn (int, optional): number of processes per node to run. If specified it will override + the processes input. Defaults to None. + manager (JobManager, optional): command to manage the multi-host execution of mdtest. + Defaults to None, which will get a default job manager. + log_file (str, optional): log file name. Defaults to None, which will result in a log file + name containing the test, pool, and container IDs. + intercept (str, optional): path to interception library. Defaults to None. + display_space (bool, optional): Whether to display the pool space. Defaults to True. + namespace (str, optional): path to yaml parameters. Defaults to MDTEST_NAMESPACE. + mdtest_params (dict, optional): dictionary of MdtestCommand attributes to override from + get_params(). Defaults to None. 
+ + Raises: + CommandFailure: if there is an error running the mdtest command + + Returns: + CmdResult: result of the ior command + + """ + mdtest = get_mdtest(test, hosts, manager, path, slots, namespace, mdtest_params) + if log_file is None: + log_file = mdtest.get_unique_log(container) + mdtest.update_log_file(log_file) + return mdtest.run(container, processes, ppn, intercept, display_space) + + +def write_mdtest_data(test, container, namespace=MDTEST_NAMESPACE, **mdtest_run_params): + """Write data to the container using mdtest. + + Simple method for test classes to use to write data with mdtest. While not required, this is + setup by default to pull in mdtest parameters from the test yaml. + + Args: + test (Test): avocado Test object + container (TestContainer): the container to populate + namespace (str, optional): path to mdtest yaml parameters. Defaults to MDTEST_NAMESPACE. + mdtest_run_params (dict): optional params for the Mdtest.run() command. + + Returns: + Mdtest: the Mdtest object used to populate the container + """ + mdtest = get_mdtest(test, test.hostlist_clients, None, test.workdir, None, namespace) + mdtest.update_log_file(mdtest.get_unique_log(container)) + + if 'processes' not in mdtest_run_params: + mdtest_run_params['processes'] = test.params.get('processes', namespace, None) + elif 'ppn' not in mdtest_run_params: + mdtest_run_params['ppn'] = test.params.get('ppn', namespace, None) + + mdtest.run(container, **mdtest_run_params) + return mdtest class MdtestCommand(ExecutableCommand): """Defines a object representing a mdtest command.""" - def __init__(self, log_dir): + def __init__(self, log_dir, namespace="/run/mdtest/*"): """Create an MdtestCommand object. Args: log_dir (str): directory in which to put log files + namespace (str, optional): path to yaml parameters. Defaults to "/run/mdtest/*". 
""" - super().__init__("/run/mdtest/*", "mdtest") + super().__init__(namespace, "mdtest") self._log_dir = log_dir @@ -137,6 +233,145 @@ def get_default_env(self, manager_cmd, log_file=None): return env +class Mdtest: + """Defines a class that runs the mdtest command through a job manager, e.g. mpirun.""" + + def __init__(self, test, hosts, manager=None, path=None, slots=None, + namespace=MDTEST_NAMESPACE): + """Initialize an Mdtest object. + + Args: + test (Test): avocado Test object + hosts (NodeSet): hosts on which to run the mdtest command + manager (JobManager, optional): command to manage the multi-host execution of mdtest. + Defaults to None, which will get a default job manager. + path (str, optional): hostfile path. Defaults to None. + slots (int, optional): hostfile number of slots per host. Defaults to None. + namespace (str, optional): path to yaml parameters. Defaults to MDTEST_NAMESPACE. + """ + if manager is None: + manager = get_job_manager(test, subprocess=False, timeout=60) + self.manager = manager + self.manager.assign_hosts(hosts, path, slots) + self.manager.job = MdtestCommand(test.test_env.log_dir, namespace) + self.manager.job.get_params(test) + self.manager.output_check = "both" + self.timeout = test.params.get("timeout", namespace, None) + self.label_generator = test.label_generator + self.test_id = test.test_id + self.env = self.command.get_default_env(str(self.manager)) + + @property + def command(self): + """Get the MdtestCommand object. + + Returns: + MdtestCommand: the MdtestCommand object managed by the JobManager + + """ + return self.manager.job + + def update(self, name, value): + """Update a MdtestCommand BasicParameter with a new value. 
+ + Args: + name (str): name of the MdtestCommand BasicParameter to update + value (str): value to assign to the MdtestCommand BasicParameter + """ + param = getattr(self.command, name, None) + if param: + if isinstance(param, BasicParameter): + param.update(value, ".".join([self.command.command, name])) + + def update_log_file(self, log_file): + """Update the log file for the mdtest command. + + Args: + log_file (str): new mdtest log file + """ + self.command.env["D_LOG_FILE"] = get_log_file( + log_file or f"{self.command.command}_daos.log") + + def get_unique_log(self, container): + """Get a unique mdtest log file name. + + Args: + container (TestContainer): container involved with the command + + Returns: + str: a log file name + """ + label = self.label_generator.get_label("mdtest") + parts = [self.test_id, container.pool.identifier, container.identifier, label] + return '.'.join(['_'.join(parts), 'log']) + + def update_daos_params(self, pool, container): + """Set the mdtest parameters for the pool and container. + + Optionally also set the DAOS pool and container environment variables for mdtest. + + Args: + pool (TestPool): the pool to use with the mdtest command + container (TestContainer): the container to use with the mdtest command + """ + self.command.update_params(dfs_pool=pool.identifier, dfs_cont=container.identifier) + + if "mpirun" in str(self.manager) or "srun" in str(self.manager): + self.env["DAOS_POOL"] = self.command.dfs_pool.value + self.env["DAOS_CONT"] = self.command.dfs_cont.value + self.env["IOR_HINT__MPI__romio_daos_obj_class"] = self.command.dfs_oclass.value + + def run(self, container, processes, ppn=None, intercept=None, display_space=True): + """Run mdtest. + + Args: + container (TestContainer): DAOS test container object. + processes (int): number of processes to run + ppn (int, optional): number of processes per node to run. If specified it will override + the processes input. Defaults to None. 
+ intercept (str, optional): path to interception library. Defaults to None. + display_space (bool, optional): Whether to display the pool space. Defaults to True. + + Raises: + CommandFailure: if there is an error running the mdtest command + + Returns: + CmdResult: result of the mdtest command + """ + result = None + error_message = None + + self.update_daos_params(container.pool, container) + + if intercept: + self.env["LD_PRELOAD"] = intercept + + # Pass only processes or ppn to be compatible with previous behavior + if ppn is not None: + self.manager.assign_processes(ppn=ppn) + else: + self.manager.assign_processes(processes=processes) + + self.manager.assign_environment(self.env) + + try: + if display_space: + container.pool.display_space() + result = self.manager.run() + + except CommandFailure as error: + error_message = "Mdtest Failed:\n {}".format("\n ".join(str(error).split("\n"))) + + finally: + if not self.manager.run_as_subprocess and display_space: + container.pool.display_space() + + if error_message: + raise CommandFailure(error_message) + + return result + + class MdtestMetrics(): # pylint: disable=too-few-public-methods """Represents metrics from mdtest output. diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index e3802364d8f..3f8757bdd3d 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -384,8 +385,8 @@ def get_interface_providers(interface, provider_data): return providers -def get_fastest_interface(logger, hosts, verbose=True): - """Get the fastest active interface common to all hosts. +def get_fastest_interfaces(logger, hosts, verbose=True): + """Get the fastest active interfaces common to all hosts. 
Args: logger (Logger): logger for the messages produced by this method @@ -396,12 +397,12 @@ def get_fastest_interface(logger, hosts, verbose=True): NetworkException: if there is an error detecting the fastest active interface Returns: - str: the fastest active interface common to all hosts specified + list: the fastest active interfaces common to all hosts specified """ common_interfaces = get_common_interfaces(logger, hosts, verbose) # Find the speed of each common active interface in order to be able to choose the fastest - interface_speeds = {} + interfaces_at_speed = {} for interface in common_interfaces: detected_speeds = get_interface_speeds(logger, hosts, interface, verbose) speed_list = [] @@ -411,26 +412,22 @@ def get_fastest_interface(logger, hosts, verbose=True): speed_hosts.add(node_set) if speed_list and speed_hosts == hosts: # Only include interface speeds if a speed is detected on all the hosts - interface_speeds[interface] = min(speed_list) + min_speed = min(speed_list) + if min_speed not in interfaces_at_speed: + interfaces_at_speed[min_speed] = [] + interfaces_at_speed[min_speed].append(interface) + fastest_interfaces = None logger.info("Active network interface speeds on %s:", hosts) - available_interfaces = {} - for interface in sorted(interface_speeds): - logger.info(" - %-8s (speed: %6s)", interface, interface_speeds[interface]) - - # Only include the first active interface (as determined by alphabetic sort) for each speed - if interface_speeds[interface] not in available_interfaces: - available_interfaces[interface_speeds[interface]] = interface - - logger.info("Available interfaces on %s: %s", hosts, available_interfaces) - try: - # Select the fastest active interface available by sorting the speed - interface = available_interfaces[sorted(available_interfaces)[-1]] - except IndexError as error: - raise NetworkException("Error obtaining a default interface!") from error - - logger.info("Fastest interface detected on %s: %s", hosts, 
interface) - return interface + for speed in sorted(interfaces_at_speed): + fastest_interfaces = sorted(interfaces_at_speed[speed]) + logger.info(" - speed: %7s => %s", speed, fastest_interfaces) + + if fastest_interfaces is None: + raise NetworkException(f"Error obtaining default interfaces w/ equal speed on {hosts}!") + + logger.info("Fastest interfaces detected on %s: %s", hosts, fastest_interfaces) + return fastest_interfaces def get_common_provider(logger, hosts, interface, supported=None, verbose=True): diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 410b6ce46a2..90a78fa898a 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -290,6 +291,24 @@ def set_cont_class_properties(self, oclass="S1"): self.ior_cmd.dfs_dir_oclass.update(None, "ior.dfs_dir_oclass") self.container.oclass.update(None) + def get_random_test_ranks(self, total_ranks=2, join_ranks=True): + """Get random list of ranks for OSA tests. + + Args: + total_ranks (list): Random rank list for testing. Defaults to 2. + join_ranks (bool): Stop ranks individual ranks. Defaults to True. + + Returns: + list: a list of random ranks either as individual strings, + or one comma-separated string. + + """ + # Get a random rank(s) based on num_ranks input. + ranklist = list(self.server_managers[0].ranks.keys()) + if join_ranks is True: + return list(map(str, self.random.sample(ranklist, k=total_ranks))) + return [",".join(map(str, self.random.sample(ranklist, k=total_ranks)))] + def assert_on_exception(self, out_queue=None): """Assert on exception while executing an application. 
diff --git a/src/tests/ftest/util/package_utils.py b/src/tests/ftest/util/package_utils.py index af18623e332..634311855e6 100644 --- a/src/tests/ftest/util/package_utils.py +++ b/src/tests/ftest/util/package_utils.py @@ -32,7 +32,7 @@ def find_packages(log, hosts, pattern, user=None): return installed -def install_packages(log, hosts, packages, user=None, timeout=600): +def install_packages(log, hosts, packages, user=None, timeout=600, allowerasing=False): """Install the packages on the hosts. Args: @@ -41,12 +41,17 @@ def install_packages(log, hosts, packages, user=None, timeout=600): packages (list): a list of packages to install user (str, optional): user to use when installing the packages. Defaults to None. timeout (int, optional): timeout for the dnf install command. Defaults to 600. + allowerasing (bool, optional): whether to use dnf --allowerasing. Defaults to False. Returns: CommandResult: the 'dnf install' command results """ log.info('Installing packages on %s: %s', hosts, ', '.join(packages)) - command = command_as_user(' '.join(['dnf', 'install', '-y'] + packages), user) + command = ['dnf', 'install', '-y'] + if allowerasing: + command.append('--allowerasing') + command.extend(packages) + command = command_as_user(' '.join(command), user) return run_remote(log, hosts, command, timeout=timeout) diff --git a/src/tests/ftest/util/pool_create_all_base.py b/src/tests/ftest/util/pool_create_all_base.py index ff72e2f077f..660626c2c8e 100644 --- a/src/tests/ftest/util/pool_create_all_base.py +++ b/src/tests/ftest/util/pool_create_all_base.py @@ -1,5 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. 
+(C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -109,6 +110,7 @@ def check_pool_full_storage(self, scm_delta_bytes, nvme_delta_bytes=None, ranks= self.add_pool_qty(pool_count, create=False) pool_idx = len(self.pool) - pool_count + # pylint: disable-next=logging-format-truncated self.log.info("Creating a pool with all the available storage: size=100%") self.pool[pool_idx].size.update("100%", "pool[{}].size".format(pool_idx)) if ranks is not None: @@ -362,6 +364,7 @@ def check_pool_half_storage(self, scm_delta_bytes, nvme_delta_bytes=None): usable_bytes = self.get_usable_bytes() self.log.info("Usable bytes: scm_size=%d, nvme_size=%d", *usable_bytes) + # pylint: disable-next=logging-format-truncated self.log.info("Creating pool with half of the available storage: size=50%") self.pool[0].size.update("50%") self.pool[0].create() diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index f6d3223257a..d36900dc54f 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -1164,16 +1164,20 @@ def get_daos_metrics(self, verbose=False, timeout=60): engines.append(result) return engines - def get_vos_path(self, pool): - """Get the VOS file path. + def get_vos_paths(self, pool): + """Get the VOS file paths. 
Args: pool (TestPool): the pool containing the vos file Returns: - str: the full path to the vos file + list: the full path list to the vos file """ - return os.path.join(self.get_config_value("scm_mount"), pool.uuid.lower()) + vos_paths = [] + for engine_params in self.manager.job.yaml.engine_params: + scm_mount = engine_params.get_value("scm_mount") + vos_paths.append(os.path.join(scm_mount, pool.uuid.lower())) + return vos_paths def get_vos_files(self, pool, pattern="vos"): """Get all the VOS file paths containing the pattern. @@ -1187,7 +1191,7 @@ def get_vos_files(self, pool, pattern="vos"): /mnt/daos0//vos-0. If no matches are found the list will be empty. """ vos_files = [] - vos_path = self.get_vos_path(pool) + vos_path = self.get_vos_paths(pool)[0] command = command_as_user(f"ls {vos_path}", "root") result = run_remote(self.log, self.hosts[0:1], command) if result.passed: @@ -1198,18 +1202,40 @@ def get_vos_files(self, pool, pattern="vos"): return vos_files def search_engine_logs(self, pattern): - """Search the server logs for a specific pattern. + """Search the server log files for a specific pattern. Args: - pattern (str): The pattern to search for in the logs. + pattern (str): The pattern to search for in the log files. Returns: - CommandResult: Result of the grep command run against each server log. + CommandResult: Result of the grep command run against each server log file. """ - # Get the path of one of the server log files log_dir = os.path.dirname(self.get_config_value("log_file")) - command = (f"find {log_dir} -type f -regextype egrep " - r"-regex '.*/daos_server[[:digit:]]?\.log\.[[:digit:]]+' -print0 " - f"| xargs -0 -r grep -E -e '{pattern}'") + find_args = (f"{log_dir} -type f -regextype egrep -regex " + r"'.*/daos_server[[:digit:]]?\.log\.[[:digit:]]+'") + return self._search_logs(find_args, pattern) + + def search_control_logs(self, pattern): + """Search the control log files for a specific pattern. 
+ + Args: + pattern (str): The pattern to search for in the log files + + Returns: + CommandResult: Result of the grep command run against each control log file. + """ + return self._search_logs(f"{self.get_config_value('control_log_file')}", pattern) + + def _search_logs(self, find_args, pattern): + """Search the log files for a specific pattern. + + Args: + find_args (str): arguments used with the find command to locate the log files + pattern (str): The pattern to search for in the log files + + Returns: + CommandResult: Result of the grep command run against each log file. + """ + command = f"find {find_args} -print0 | xargs -0 -r grep -E -e '{pattern}'" result = run_remote(self.log, self.hosts, command_as_user(command, "root")) return result diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index c6fba0bbf5d..36c9f5eb946 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -1,6 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -141,6 +141,7 @@ def __init__(self, filename, common_yaml, version=None): self.control_log_file = LogParameter(log_dir, None, "daos_control.log") self.helper_log_file = LogParameter(log_dir, None, "daos_server_helper.log") self.telemetry_port = BasicParameter(None, 9191) + self.control_iface = BasicParameter(None) self.client_env_vars = BasicParameter(None) # access_points was changed to mgmt_svc_replicas in 2.7 @@ -486,9 +487,19 @@ def __init__(self, base_namespace, index, provider=None, max_storage_tiers=MAX_S self._max_storage_tiers = max_storage_tiers super().__init__(os.path.join(*namespace)) - # Use environment variables to get default parameters - default_interface = os.environ.get("DAOS_TEST_FABRIC_IFACE", "eth0") - default_port = int(os.environ.get("D_PORT", 31416)) + # Use environment variables to get default parameters. Supports lists to define values for + # multiple engines through comma-separated strings. If the index exceeds the list length + # then values are reused round-robin style. 
+ try: + _defaults = os.environ.get("DAOS_TEST_FABRIC_IFACE").split(",") + default_interface = list(filter(None, _defaults))[index % len(_defaults)] + except (AttributeError, IndexError): + default_interface = f"eth{index}" + try: + _defaults = [int(port) for port in os.environ.get("D_PORT").split(",")] + default_port = list(filter(None, _defaults))[index % len(_defaults)] + except (AttributeError, ValueError, IndexError): + default_port = 31317 + (100 * index) # All log files should be placed in the same directory on each host # to enable easy log file archiving by launch.py diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 8937db87788..5230fba5a46 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -159,6 +159,12 @@ class TelemetryUtils(): "engine_pool_vos_wal_replay_size", "engine_pool_vos_wal_replay_time", "engine_pool_vos_wal_replay_transactions"] + ENGINE_POOL_VOS_CACHE_METRICS = [ + "engine_pool_vos_cache_page_evict", + "engine_pool_vos_cache_page_flush", + "engine_pool_vos_cache_page_hit", + "engine_pool_vos_cache_page_miss", + "engine_pool_vos_cache_page_ne"] ENGINE_POOL_SVC_METRICS = [ "engine_pool_svc_degraded_ranks", "engine_pool_svc_disabled_targets", @@ -179,6 +185,7 @@ class TelemetryUtils(): ENGINE_POOL_VOS_SPACE_METRICS + \ ENGINE_POOL_VOS_WAL_METRICS + \ ENGINE_POOL_VOS_WAL_REPLAY_METRICS +\ + ENGINE_POOL_VOS_CACHE_METRICS +\ ENGINE_POOL_SVC_METRICS ENGINE_EVENT_METRICS = [ "engine_events_dead_ranks", diff --git a/src/tests/ftest/util/test_utils_container.py b/src/tests/ftest/util/test_utils_container.py index 5fff6c88bd1..a8be58785a3 100644 --- a/src/tests/ftest/util/test_utils_container.py +++ b/src/tests/ftest/util/test_utils_container.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -367,7 +367,7 @@ def __init__(self, pool, daos_command, label_generator=None, namespace=CONT_NAME self.dir_oclass = BasicParameter(None) self.file_oclass = BasicParameter(None) self.chunk_size = BasicParameter(None) - self.properties = BasicParameter(None) + self.properties = BasicParameter(None, "cksum:off,srv_cksum:off") self.acl_file = BasicParameter(None) self.daos_timeout = BasicParameter(None) self.label = BasicParameter(None, "TestContainer") diff --git a/src/tests/ftest/util/test_utils_pool.py b/src/tests/ftest/util/test_utils_pool.py index 4895f858873..36539943abc 100644 --- a/src/tests/ftest/util/test_utils_pool.py +++ b/src/tests/ftest/util/test_utils_pool.py @@ -1,6 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. - (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -12,6 +12,7 @@ from avocado import TestFail, fail_on from command_utils import BasicParameter +from data_utils import assert_dict_subset from dmg_utils import DmgCommand, DmgJsonCommandFailure from exception_utils import CommandFailure from general_utils import DaosTestError, check_file_exists @@ -95,8 +96,9 @@ def add_pool(test, namespace=POOL_NAMESPACE, create=True, connect=True, dmg=None # Add a step to remove this pool when the test completes and ensure their is enough time for the # pool destroy to be attempted - accounting for a possible dmg command timeout - test.increment_timeout(POOL_TIMEOUT_INCREMENT) - test.register_cleanup(remove_pool, test=test, pool=pool) + if pool.register_cleanup.value is True: + test.increment_timeout(POOL_TIMEOUT_INCREMENT) + test.register_cleanup(remove_pool, test=test, pool=pool) return pool @@ -271,7 +273,7 @@ def __init__(self, context, 
dmg_command, label_generator=None, namespace=POOL_NA self.gid = os.getegid() self.mode = BasicParameter(None) - self.name = BasicParameter(None) # server group name + self.name = BasicParameter(None) # server group name self.svcn = BasicParameter(None) self.target_list = BasicParameter(None) self.nranks = BasicParameter(None) @@ -280,9 +282,9 @@ def __init__(self, context, dmg_command, label_generator=None, namespace=POOL_NA self.mem_ratio = BasicParameter(None) self.scm_size = BasicParameter(None) self.nvme_size = BasicParameter(None) - self.prop_name = BasicParameter(None) # name of property to be set - self.prop_value = BasicParameter(None) # value of property - self.properties = BasicParameter(None) # string of cs name:value + self.prop_name = BasicParameter(None) # name of property to be set + self.prop_value = BasicParameter(None) # value of property + self.properties = BasicParameter(None, "rd_fac:0,space_rb:0") # string of cs name:value self.rebuild_timeout = BasicParameter(None) self.pool_query_timeout = BasicParameter(None) self.pool_query_delay = BasicParameter(None) @@ -305,6 +307,8 @@ def __init__(self, context, dmg_command, label_generator=None, namespace=POOL_NA # Parameter to control running 'dmg storage query usage --show_usable' if pool create fails self.query_on_create_error = BasicParameter(None, False) + self.register_cleanup = BasicParameter(True, True) # call register_cleanup by default + self.pool = None self.info = None self.svc_ranks = None @@ -723,7 +727,7 @@ def set_prop(self, *args, **kwargs): dict: json output of dmg pool set-prop command """ - return self.dmg.pool_set_prop(pool=self.identifier, *args, **kwargs) + return self.dmg.pool_set_prop(self.identifier, *args, **kwargs) @fail_on(CommandFailure) def get_prop(self, *args, **kwargs): @@ -847,6 +851,24 @@ def reintegrate(self, ranks, tgt_idx=None): """ return self.dmg.pool_reintegrate(self.identifier, ranks, tgt_idx) + def rebuild_start(self, *args, **kwargs): + """Use dmg to start 
rebuild on this pool. + + Returns: + CmdResult: Object that contains exit status, stdout, and other information. + + """ + return self.dmg.pool_rebuild_start(self.identifier, *args, **kwargs) + + def rebuild_stop(self, *args, **kwargs): + """Use dmg to stop rebuild on this pool. + + Returns: + CmdResult: Object that contains exit status, stdout, and other information. + + """ + return self.dmg.pool_rebuild_stop(self.identifier, *args, **kwargs) + @fail_on(CommandFailure) def set_property(self, prop_name, prop_value): """Set Property. @@ -1030,7 +1052,7 @@ def check_free_space(self, expected_scm=None, expected_nvme=None, timeout=30, in return False def check_rebuild_status(self, rs_version=None, rs_seconds=None, - rs_errno=None, rs_state=None, rs_padding16=None, + rs_errno=None, rs_state=None, rs_flags=None, rs_fail_rank=None, rs_toberb_obj_nr=None, rs_obj_nr=None, rs_rec_nr=None, rs_size=None): # pylint: disable=unused-argument @@ -1047,7 +1069,7 @@ def check_rebuild_status(self, rs_version=None, rs_seconds=None, rs_seconds (int, optional): rebuild seconds. Defaults to None. rs_errno (int, optional): rebuild error number. Defaults to None. rs_state (int, optional): rebuild state flag. Defaults to None. - rs_padding16 (int, optional): padding. Defaults to None. + rs_flags (int, optional): rebuild status flags. Defaults to None. rs_fail_rank (int, optional): rebuild fail target. Defaults to None. rs_toberb_obj_nr (int, optional): number of objects to be rebuilt. Defaults to None. 
@@ -1387,6 +1409,9 @@ def _update_rebuild_data(self, verbose=True): # If the current state is busy or idle w/o a version increase after previously being # busy then rebuild is running self._rebuild_data["check"] = "running" + elif self._rebuild_data["state"] == "idle" and self._rebuild_data["status"] == -2027: + # Rebuild was explicitly stopped + self._rebuild_data["check"] = "stopped" elif self._rebuild_data["check"] is None: # Otherwise rebuild has yet to start self._rebuild_data["check"] = "not yet started" @@ -1398,8 +1423,8 @@ def _wait_for_rebuild(self, expected, interval=1): """Wait for the rebuild to start or end. Args: - expected (str): which rebuild data check to wait for: 'running' or 'completed' - interval (int): number of seconds to wait in between rebuild completion checks + expected (str): which rebuild data check to wait for: 'running', 'completed', 'stopped' + interval (int, optional): number of seconds to wait between checks. Defaults to 1. Raises: DaosTestError: if waiting for rebuild times out. @@ -1461,7 +1486,7 @@ def wait_for_rebuild_to_start(self, interval=1): """Wait for the rebuild to start. Args: - interval (int): number of seconds to wait in between rebuild completion checks + interval (int, optional): number of seconds to wait between checks. Defaults to 1. Raises: DaosTestError: if waiting for rebuild times out. @@ -1473,7 +1498,7 @@ def wait_for_rebuild_to_end(self, interval=1): """Wait for the rebuild to end. Args: - interval (int): number of seconds to wait in between rebuild completion checks + interval (int, optional): number of seconds to wait between checks. Defaults to 1. Raises: DaosTestError: if waiting for rebuild times out. @@ -1481,6 +1506,18 @@ def wait_for_rebuild_to_end(self, interval=1): """ self._wait_for_rebuild("completed", interval) + def wait_for_rebuild_to_stop(self, interval=1): + """Wait for the rebuild to stop without completing. + + Args: + interval (int, optional): number of seconds to wait between checks. 
Defaults to 1. + + Raises: + DaosTestError: if waiting for rebuild times out. + + """ + self._wait_for_rebuild("stopped", interval) + def measure_rebuild_time(self, operation, interval=1): """Measure rebuild time. @@ -1568,3 +1605,46 @@ def verify_uuid_directory(self, host, scm_mount): else: self.log.info("%s does not exist on %s", pool_dir, host) return result[0] + + def verify_query(self, expected_response, use_cached_query=False): + """Verify dmg pool query returns expected values. + + Args: + expected_response (dict): Expected key/value pairs from dmg pool query. + Can be a subset of the full response, where only expected keys are verified. + Expected value can be type callable(actual_value) -> bool for custom verification. + use_cached_query (bool, optional): Whether to use the last cached query. + Defaults to False, which issues a new query. + + Raises: + AssertionError: if the pool query response does not match expected values + + """ + # Only refresh the cache if requested or not yet cached + if not use_cached_query or 'response' not in self.query_data: + self.set_query_data() + response = self.query_data['response'] + + assert_dict_subset(expected_response, response) + + def verify_query_targets_state(self, ranks, expected_target_state): + """Verify all targets are in the expected state with dmg pool query-targets. + + Args: + ranks (list): The list of ranks to verify. + expected_target_state (str): The expected target state. 
+ + Raises: + AssertionError: if the targets are not in the expected state + + """ + for rank in ranks: + self.log.info( + 'Verifying targets on rank %s are in state %s', rank, expected_target_state) + response = self.query_targets(rank=rank)['response'] + infos = response['Infos'] + for target, info in enumerate(infos): + if info['target_state'] != expected_target_state: + raise AssertionError( + f'Expected target {target} to be in state {expected_target_state}, ' + f'but current state is {info["target_state"]}') diff --git a/src/tests/ftest/vmd/fault_reintegration.yaml b/src/tests/ftest/vmd/fault_reintegration.yaml index 735e059937b..ed5bbd1a0c3 100644 --- a/src/tests/ftest/vmd/fault_reintegration.yaml +++ b/src/tests/ftest/vmd/fault_reintegration.yaml @@ -1,9 +1,12 @@ hosts: test_servers: 3 test_clients: 1 + timeout: 360 + setup: start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -11,28 +14,27 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: auto 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 90% svcn: 3 rebuild_timeout: 120 pool_query_timeout: 30 + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:2 file_oclass: RP_3G1 dir_oclass: RP_3G1 + ior: client_processes: np: 48 @@ -49,7 +51,9 @@ ior: # - [transfersize, blocksize] # The values are set to be in the multiples of 10. 
- [4000, 5000000] # [4K, 5M] + aggregation: test_with_aggregation: true + rebuild: test_with_rebuild: true diff --git a/src/tests/ftest/vmd/led.yaml b/src/tests/ftest/vmd/led.yaml index 1dec111e091..e77437e6a35 100644 --- a/src/tests/ftest/vmd/led.yaml +++ b/src/tests/ftest/vmd/led.yaml @@ -1,7 +1,9 @@ hosts: test_servers: 2 test_clients: 1 + timeout: 300 + server_config: name: daos_server engines_per_host: 2 @@ -9,8 +11,6 @@ server_config: 0: pinned_numa_node: 0 nr_xs_helpers: 1 - fabric_iface: ib0 - fabric_iface_port: 31317 log_file: daos_server0.log storage: 0: @@ -24,8 +24,6 @@ server_config: 1: pinned_numa_node: 1 nr_xs_helpers: 1 - fabric_iface: ib1 - fabric_iface_port: 31417 log_file: daos_server1.log storage: 0: @@ -36,14 +34,17 @@ server_config: class: nvme bdev_list: ["bbbb:bb:bb.b"] bdev_class: nvme + pool: mode: 146 name: daos_server size: 50% properties: ec_cell_sz:128KiB + container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos + dfuse: disable_caching: True diff --git a/src/tests/suite/SConscript b/src/tests/suite/SConscript index ab3a3fee0e6..e780a0ae7cd 100644 --- a/src/tests/suite/SConscript +++ b/src/tests/suite/SConscript @@ -33,16 +33,15 @@ def scons(): newenv = denv.Clone() - c_files = Split("""daos_array.c daos_base_tx.c daos_capa.c daos_checksum.c - daos_container.c daos_dedup.c daos_degraded.c - daos_dist_tx.c daos_drain_simple.c daos_epoch.c - daos_epoch_io.c daos_epoch_recovery.c daos_kv.c - daos_md_replication.c daos_mgmt.c daos_nvme_recovery.c - daos_obj_array.c daos_obj.c daos_oid_alloc.c daos_pool.c - daos_rebuild.c daos_rebuild_common.c daos_rebuild_ec.c - daos_rebuild_simple.c daos_test.c daos_verify_consistency.c - daos_aggregate_ec.c daos_degrade_ec.c daos_cr.c daos_inc_reint.c - daos_extend_simple.c daos_obj_ec.c daos_upgrade.c daos_pipeline.c""") + c_files = Split("""daos_aggregate_ec.c daos_array.c daos_base_tx.c daos_capa.c daos_checksum.c + daos_container.c daos_cr.c daos_dedup.c 
daos_degraded.c daos_degrade_ec.c + daos_dist_tx.c daos_drain_common.c daos_drain_simple.c daos_epoch.c + daos_epoch_io.c daos_epoch_recovery.c daos_extend_common.c + daos_extend_simple.c daos_inc_reint.c daos_kv.c daos_md_replication.c + daos_mgmt.c daos_nvme_recovery.c daos_obj.c daos_obj_array.c daos_obj_ec.c + daos_oid_alloc.c daos_pipeline.c daos_pool.c daos_rebuild.c + daos_rebuild_common.c daos_rebuild_ec.c daos_rebuild_interactive.c + daos_rebuild_simple.c daos_test.c daos_upgrade.c daos_verify_consistency.c""") daostest = newenv.d_program('daos_test', c_files + daos_test_tgt, LIBS=['daos_common'] + libraries) diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index cdec5e4233c..1c31158cb17 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1350,7 +1350,7 @@ cr_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. */ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -1732,7 +1732,7 @@ cr_stop_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. 
*/ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -2344,14 +2344,16 @@ cr_engine_resume(void **state) static void cr_reset_specified(void **state) { - test_arg_t *arg = *state; - struct test_pool pools[2] = { 0 }; - struct test_cont conts[2] = { 0 }; - struct daos_check_info dcis[2] = { 0 }; - uint32_t classes[3]; - uint32_t actions[3]; - int rc; - int i; + test_arg_t *arg = *state; + struct test_pool pools[2] = {0}; + struct test_cont conts[2] = {0}; + struct daos_check_info dcis[2] = {0}; + const int NR_REPORTS = 3; + uint32_t classes[NR_REPORTS]; + uint32_t actions[NR_REPORTS]; + uint32_t stale_actions[NR_REPORTS]; + int rc; + int i; FAULT_INJECTION_REQUIRED(); @@ -2368,6 +2370,12 @@ cr_reset_specified(void **state) actions[1] = TCA_INTERACT; actions[2] = TCA_INTERACT; + for (i = 0; i < NR_REPORTS; i++) { + stale_actions[i] = actions[i]; + if (stale_actions[i] == TCA_INTERACT) + stale_actions[i] = TCA_STALE; + } + for (i = 0; i < 2; i++) { rc = cr_pool_create(state, &pools[i], true, classes[0]); assert_rc_equal(rc, 0); @@ -2418,7 +2426,8 @@ cr_reset_specified(void **state) assert_rc_equal(rc, 0); /* Pool2's (old) report should be still there. */ - rc = cr_pool_verify(&dcis[1], pools[1].pool_uuid, TCPS_STOPPED, 2, classes, actions, NULL); + rc = cr_pool_verify(&dcis[1], pools[1].pool_uuid, TCPS_STOPPED, 2, classes, stale_actions, + NULL); assert_rc_equal(rc, 0); rc = cr_check_stop(0, NULL); @@ -2433,8 +2442,8 @@ cr_reset_specified(void **state) rc = cr_ins_verify(&dcis[1], TCIS_RUNNING); assert_rc_equal(rc, 0); - /* There are 3 reports for pool2: two are old (since not reset), another one is new. 
*/ - rc = cr_pool_verify(&dcis[1], pools[1].pool_uuid, TCPS_PENDING, 3, classes, actions, NULL); + /* Pool2's stale report is re-generated */ + rc = cr_pool_verify(&dcis[1], pools[1].pool_uuid, TCPS_PENDING, 2, classes, actions, NULL); assert_rc_equal(rc, 0); rc = cr_check_stop(0, NULL); @@ -3427,12 +3436,6 @@ cr_fail_sync_orphan(void **state) rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); assert_rc_equal(rc, 0); - /* Check leader may be completed earlier than check engines in this case, double check. */ - cr_ins_wait(0, NULL, &dci); - - rc = cr_ins_verify(&dci, TCIS_COMPLETED); - assert_rc_equal(rc, 0); - cr_debug_set_params(arg, 0); rc = cr_mode_switch(false); @@ -3845,6 +3848,202 @@ cr_maintenance_mode(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Exclude rank 0. + * 2. Create pool without inconsistency. + * 3. Start checker without options. + * 4. Query checker, it should be completed instead of being blocked. + * 5. Switch to normal mode and cleanup. + */ +static void +cr_lost_rank0(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + int rc; + + print_message("CR29: CR with rank 0 excluded at the beginning\n"); + + print_message("CR: excluding the rank 0 ...\n"); + rc = dmg_system_exclude_rank(dmg_config_file, 0); + assert_rc_equal(rc, 0); + + rc = cr_pool_create(state, &pool, false, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + /* Reint the rank for subsequent test. 
*/ + rc = cr_rank_reint(0, true); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + +/* + * 1. Create pool. + * 2. Fault injection to generate inconsistent pool label. + * 3. Set fail_loc to fail interaction report. + * 4. Start checker with option "--failout=on" and "POOL_BAD_LABEL:CIA_INTERACT". Should not crash. + * 5. Query checker, instance should failed, pool should be "failed". + * 6. Reset fail_loc. + * 7. Switch to normal mode to verify the pool label. + * 8. Cleanup. + */ +static void +cr_leader_report_fail(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + char *label = NULL; + int rc; + + FAULT_INJECTION_REQUIRED(); + + print_message("CR30: Leader handle report failure\n"); + + rc = cr_pool_create(state, &pool, false, TCC_POOL_BAD_LABEL); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + /* Inject fail_loc to fail interaction report. 
*/ + rc = cr_debug_set_params(arg, DAOS_CHK_REPORT_FAILURE | DAOS_FAIL_ALWAYS); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_FAILOUT | TCSF_RESET, 0, NULL, "POOL_BAD_LABEL:CIA_INTERACT"); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_FAILED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_debug_set_params(arg, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + print_message("CR: getting label for pool " DF_UUID " after check\n", + DP_UUID(pool.pool_uuid)); + rc = dmg_pool_get_prop(dmg_config_file, pool.label, pool.pool_uuid, "label", &label); + assert_rc_equal(rc, 0); + + D_ASSERTF(strcmp(label, pool.label) != 0, + "Pool (" DF_UUID ") label should not be repaired: %s\n", DP_UUID(pool.pool_uuid), + label); + + D_FREE(label); + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + +/* + * 1. Create pool and container. + * 2. Fault injection to make container label inconsistent. + * 3. Set fail_loc to fail interaction report. + * 4. Start checker with option "--failout=on" and "CONT_BAD_LABEL:CIA_INTERACT". Should not crash. + * 5. Query checker, instance should failed, pool should be "failed". + * 6. Reset fail_loc. + * 7. Switch to normal mode to verify the container label. + * 8. Cleanup. 
+ */ +static void +cr_engine_report_fail(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct test_cont cont = {0}; + struct daos_check_info dci = {0}; + char *label = NULL; + int rc; + + FAULT_INJECTION_REQUIRED(); + + print_message("CR31: Engine handle report failure\n"); + + rc = cr_pool_create(state, &pool, true, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_cont_create(state, &pool, &cont, 1); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + /* Inject fail_loc to fail interaction report. */ + rc = cr_debug_set_params(arg, DAOS_CHK_REPORT_FAILURE | DAOS_FAIL_ALWAYS); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_FAILOUT | TCSF_RESET, 0, NULL, "CONT_BAD_LABEL:CIA_INTERACT"); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_FAILED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_debug_set_params(arg, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. 
*/ + rc = cr_cont_get_label(state, &pool, &cont, true, &label); + assert_rc_equal(rc, 0); + + D_ASSERTF(strcmp(label, cont.label) != 0, + "Cont (" DF_UUID ") label should not be repaired: %s\n", DP_UUID(cont.uuid), + label); + + D_FREE(label); + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + /* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", @@ -3903,6 +4102,12 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool2, async_disable, test_case_teardown}, { "CR28: maintenance mode after dry-run check", cr_maintenance_mode, async_disable, test_case_teardown}, + { "CR29: CR with rank 0 excluded at the beginning", + cr_lost_rank0, async_disable, test_case_teardown}, + { "CR30: Leader handle report failure", + cr_leader_report_fail, async_disable, test_case_teardown}, + { "CR31: Engine handle report failure", + cr_engine_report_fail, async_disable, test_case_teardown}, }; /* clang-format on */ diff --git a/src/tests/suite/daos_degrade_ec.c b/src/tests/suite/daos_degrade_ec.c index c22204a62d5..09a68609f64 100644 --- a/src/tests/suite/daos_degrade_ec.c +++ b/src/tests/suite/daos_degrade_ec.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -38,6 +39,11 @@ degrade_small_sub_setup(void **state) arg = *state; arg->no_rebuild = 1; + + /* Disable manual rebuilds */ + test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS); + + /* Disable automatic rebuilds */ rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "exclude"); return rc; @@ -56,6 +62,11 @@ degrade_sub_setup(void **state) arg = *state; arg->no_rebuild = 1; + + /* Disable manual rebuilds */ + test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS); + + /* Disable automatic rebuilds */ rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "exclude"); return rc; @@ -74,6 +85,11 @@ degrade_sub_rf1_setup(void **state) arg = *state; arg->no_rebuild = 1; + + /* Disable manual rebuilds */ + test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS); + + /* Disable automatic rebuilds */ rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "exclude"); return rc; diff --git a/src/tests/suite/daos_degraded.c b/src/tests/suite/daos_degraded.c index c1328553abb..c26ae442b1f 100644 --- a/src/tests/suite/daos_degraded.c +++ b/src/tests/suite/daos_degraded.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -258,6 +259,11 @@ degraded_setup(void **state) arg = *state; arg->no_rebuild = 1; + + /* Disable manual rebuilds */ + test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS); + + /* Disable automatic rebuilds */ rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "exclude"); return rc; diff --git a/src/tests/suite/daos_drain_common.c b/src/tests/suite/daos_drain_common.c new file mode 100644 index 00000000000..b1d317a1a0e --- /dev/null +++ b/src/tests/suite/daos_drain_common.c @@ -0,0 +1,227 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is for common functions used between daos_drain_simple.c and + * daos_rebuild_interactive.c tests. + * + * tests/suite/daos_drain_common.c + * + */ +#define D_LOGFAC DD_FAC(tests) + +#include "daos_test.h" +#include "daos_iotest.h" +#include "dfs_test.h" +#include +#include +#include + +/* clang-format off */ +const char *extend_drain_opstrs[] = { + "EXTEND_DRAIN_PUNCH", + "EXTEND_DRAIN_STAT", + "EXTEND_DRAIN_ENUMERATE", + "EXTEND_DRAIN_FETCH", + "EXTEND_DRAIN_UPDATE", + "EXTEND_DRAIN_OVERWRITE", + "EXTEND_DRAIN_WRITELOOP" +}; +/* clang-format on */ + +void +extend_drain_read_check(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, + daos_size_t total_size, char start_char) +{ + char *buf = NULL; + char *verify_buf = NULL; + daos_size_t buf_size = 512 * 1024; + d_sg_list_t sgl; + d_iov_t iov; + d_iov_t verify_iov; + int i; + + buf = malloc(buf_size); + verify_buf = malloc(buf_size); + assert_non_null(buf); + assert_non_null(verify_buf); + d_iov_set(&iov, buf, buf_size); + d_iov_set(&verify_iov, buf, buf_size); + sgl.sg_nr = 1; + sgl.sg_iovs = &iov; + + for (i = 0; i < objcnt; i++) { + char filename[32]; + daos_size_t read_size = buf_size; + dfs_obj_t *obj; + 
daos_off_t offset = 0; + daos_size_t total = total_size; + int rc; + + sprintf(filename, "file%d", i); + rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, O_RDWR, objclass, + 1048576, NULL, &obj); + assert_int_equal(rc, 0); + + memset(verify_buf, start_char + i, buf_size); + + while (total > 0) { + memset(buf, 0, buf_size); + rc = dfs_read(dfs_mt, obj, &sgl, offset, &read_size, NULL); + assert_int_equal(rc, 0); + assert_memory_equal(buf, verify_buf, read_size); + offset += read_size; + total -= read_size; + } + + rc = dfs_release(obj); + assert_int_equal(rc, 0); + } + free(buf); + free(verify_buf); +} + +void +extend_drain_write(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, + daos_size_t total_size, char write_char, daos_obj_id_t *oids) +{ + char *buf = NULL; + daos_size_t buf_size = 512 * 1024; + d_sg_list_t sgl; + d_iov_t iov; + int i; + + buf = malloc(buf_size); + assert_non_null(buf); + d_iov_set(&iov, buf, buf_size); + sgl.sg_nr = 1; + sgl.sg_iovs = &iov; + + for (i = 0; i < objcnt; i++) { + char filename[32]; + dfs_obj_t *obj; + daos_size_t total = total_size; + daos_off_t offset = 0; + int rc; + + sprintf(filename, "file%d", i); + rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, + OC_EC_2P1GX, 1048576, NULL, &obj); + assert_int_equal(rc, 0); + if (oids != NULL) + dfs_obj2id(obj, &oids[i]); + + memset(buf, write_char + i, buf_size); + while (total > 0) { + rc = dfs_write(dfs_mt, obj, &sgl, offset, NULL); + assert_int_equal(rc, 0); + offset += buf_size; + total -= buf_size; + } + rc = dfs_release(obj); + assert_int_equal(rc, 0); + } + free(buf); +} + +void +extend_drain_check(dfs_t *dfs_mt, dfs_obj_t *dir, int objclass, int opc) +{ + switch (opc) { + case EXTEND_DRAIN_PUNCH: + break; + case EXTEND_DRAIN_OVERWRITE: + extend_drain_read_check(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, + 'b'); + break; + case EXTEND_DRAIN_WRITELOOP: + extend_drain_read_check(dfs_mt, dir, objclass, 
1, 512 * 1048576, 'a'); + break; + default: + extend_drain_read_check(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, + 'a'); + break; + } +} + +void +dfs_extend_drain_common(void **state, int opc, uint32_t objclass, + test_rebuild_cb_t extend_drain_cb_fn) +{ + test_arg_t *arg = *state; + dfs_t *dfs_mt; + daos_handle_t co_hdl; + dfs_obj_t *dir; + uuid_t co_uuid; + char str[37]; + daos_obj_id_t oids[EXTEND_DRAIN_OBJ_NR]; + struct extend_drain_cb_arg cb_arg; + dfs_attr_t attr = {}; + int rc; + + FAULT_INJECTION_REQUIRED(); + + if (!test_runable(arg, 4)) + return; + + attr.da_props = daos_prop_alloc(2); + assert_non_null(attr.da_props); + attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; + attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; + attr.da_props->dpp_entries[1].dpe_type = DAOS_PROP_CO_REDUN_FAC; + attr.da_props->dpp_entries[1].dpe_val = DAOS_PROP_CO_REDUN_RF1; + rc = dfs_cont_create(arg->pool.poh, &co_uuid, &attr, &co_hdl, &dfs_mt); + daos_prop_free(attr.da_props); + assert_int_equal(rc, 0); + print_message("Created DFS Container " DF_UUIDF "\n", DP_UUID(co_uuid)); + + rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, objclass, + 0, NULL, &dir); + assert_int_equal(rc, 0); + + /* Create 10 files */ + if (opc != EXTEND_DRAIN_UPDATE) + extend_drain_write(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, 'a', + oids); + + cb_arg.oids = oids; + cb_arg.dfs_mt = dfs_mt; + cb_arg.dir = dir; + cb_arg.opc = opc; + cb_arg.objclass = objclass; + arg->rebuild_cb = extend_drain_cb_fn; + arg->rebuild_cb_arg = &cb_arg; + + /* HOLD rebuild ULT */ + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS, 0, NULL); + drain_single_pool_rank(arg, ranks_to_kill[0], false); + + extend_drain_check(dfs_mt, dir, objclass, opc); + + /* Unclear if kill engine is necessary for a drain / reintegrate test. 
+ * Consider instead test_rebuild_wait() and reintegrate_single_pool_rank(restart=false). + */ + daos_kill_server(arg, arg->pool.pool_uuid, arg->group, arg->pool.alive_svc, + ranks_to_kill[0]); + + arg->rebuild_cb = NULL; + arg->rebuild_cb_arg = NULL; + reintegrate_single_pool_rank(arg, ranks_to_kill[0], true); + + extend_drain_check(dfs_mt, dir, objclass, opc); + + rc = dfs_release(dir); + assert_int_equal(rc, 0); + rc = dfs_umount(dfs_mt); + assert_int_equal(rc, 0); + + rc = daos_cont_close(co_hdl, NULL); + assert_rc_equal(rc, 0); + + uuid_unparse(co_uuid, str); + rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); + assert_rc_equal(rc, 0); +} diff --git a/src/tests/suite/daos_drain_simple.c b/src/tests/suite/daos_drain_simple.c index d5ce764789c..2227a3bcbc4 100644 --- a/src/tests/suite/daos_drain_simple.c +++ b/src/tests/suite/daos_drain_simple.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -43,6 +44,7 @@ drain_dkeys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -80,6 +82,7 @@ drain_dkeys(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static int @@ -110,6 +113,7 @@ cont_open_in_drain(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -129,7 +133,7 @@ cont_open_in_drain(void **state) ioreq_fini(&req); test_teardown_cont_hdl(arg); - arg->rebuild_cb = cont_open_and_inflight_io; + arg->rebuild_cb = cont_open_and_inflight_io; arg->rebuild_cb_arg = &oid; drain_single_pool_target(arg, ranks_to_kill[0], tgt, false); @@ -150,6 +154,7 @@ cont_open_in_drain(void **state) reintegrate_inflight_io_verify(arg); 
ioreq_fini(&req); + T_END(); } static void @@ -166,6 +171,7 @@ drain_akeys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -202,6 +208,7 @@ drain_akeys(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -219,6 +226,7 @@ drain_indexes(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -258,6 +266,7 @@ drain_indexes(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -280,6 +289,7 @@ drain_snap_update_keys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -298,7 +308,7 @@ drain_snap_update_keys(void **state) insert_single("dkey", akey, 0, "data", 1, DAOS_TX_NONE, &req); } - arg->rebuild_cb = reintegrate_inflight_io; + arg->rebuild_cb = reintegrate_inflight_io; arg->rebuild_cb_arg = &oid; drain_single_pool_target(arg, ranks_to_kill[0], tgt, false); @@ -335,6 +345,7 @@ drain_snap_update_keys(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -357,6 +368,7 @@ drain_snap_punch_keys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R3S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -390,7 +402,7 @@ drain_snap_punch_keys(void **state) punch_akey("dkey", akey, DAOS_TX_NONE, &req); } - arg->rebuild_cb = reintegrate_inflight_io; + arg->rebuild_cb = reintegrate_inflight_io; arg->rebuild_cb_arg = &oid; drain_single_pool_target(arg, ranks_to_kill[0], tgt, false); @@ -427,6 +439,7 @@ drain_snap_punch_keys(void **state) 
reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -445,6 +458,7 @@ drain_multiple(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -495,6 +509,7 @@ drain_multiple(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -513,6 +528,7 @@ drain_large_rec(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -548,6 +564,7 @@ drain_large_rec(void **state) reintegrate_inflight_io_verify(arg); ioreq_fini(&req); + T_END(); } static void @@ -563,6 +580,7 @@ drain_objects(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); for (i = 0; i < OBJ_NR; i++) { oids[i] = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); @@ -577,20 +595,22 @@ drain_objects(void **state) rebuild_io_validate(arg, oids, OBJ_NR); reintegrate_inflight_io_verify(arg); + T_END(); } static void drain_fail_and_retry_objects(void **state) { - test_arg_t *arg = *state; - daos_obj_id_t oids[OBJ_NR]; - int i; + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + int i; FAULT_INJECTION_REQUIRED(); if (!test_runable(arg, 4)) return; + T_BEGIN(); for (i = 0; i < OBJ_NR; i++) { oids[i] = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); @@ -611,8 +631,12 @@ drain_fail_and_retry_objects(void **state) daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); rebuild_io_validate(arg, oids, OBJ_NR); + arg->rebuild_cb = reintegrate_inflight_io; + arg->rebuild_cb_arg = &oids[OBJ_NR - 1]; drain_single_pool_rank(arg, ranks_to_kill[0], false); rebuild_io_validate(arg, oids, OBJ_NR); + reintegrate_inflight_io_verify(arg); + T_END(); } static void @@ -626,6 +650,7 @@ drain_then_exclude(void **state) if 
(!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, OC_EC_2P1GX, 0, 0, arg->myrank); rebuild_io(arg, &oid, 1); @@ -639,123 +664,10 @@ drain_then_exclude(void **state) reintegrate_single_pool_rank(arg, ranks_to_kill[0], true); rebuild_io_validate(arg, &oid, 1); + T_END(); } -#define EXTEND_DRAIN_OBJ_NR 5 -#define WRITE_SIZE (1048576 * 5) -struct extend_drain_cb_arg{ - daos_obj_id_t *oids; - dfs_t *dfs_mt; - dfs_obj_t *dir; - d_rank_t rank; - uint32_t objclass; - int opc; -}; - -enum extend_drain_opc { - EXTEND_DRAIN_PUNCH, - EXTEND_DRAIN_STAT, - EXTEND_DRAIN_ENUMERATE, - EXTEND_DRAIN_FETCH, - EXTEND_DRAIN_UPDATE, - EXTEND_DRAIN_OVERWRITE, - EXTEND_DRAIN_WRITELOOP, -}; - -static void -extend_drain_read_check(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, - daos_size_t total_size, char start_char) -{ - char *buf = NULL; - char *verify_buf = NULL; - daos_size_t buf_size = 512 * 1024; - d_sg_list_t sgl; - d_iov_t iov; - d_iov_t verify_iov; - int i; - - buf = malloc(buf_size); - verify_buf = malloc(buf_size); - assert_non_null(buf); - assert_non_null(verify_buf); - d_iov_set(&iov, buf, buf_size); - d_iov_set(&verify_iov, buf, buf_size); - sgl.sg_nr = 1; - sgl.sg_iovs = &iov; - - for (i = 0; i < objcnt; i++) { - char filename[32]; - daos_size_t read_size = buf_size; - dfs_obj_t *obj; - daos_off_t offset = 0; - daos_size_t total = total_size; - int rc; - - sprintf(filename, "file%d", i); - rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, - O_RDWR, objclass, 1048576, NULL, &obj); - assert_int_equal(rc, 0); - - memset(verify_buf, start_char + i, buf_size); - - while (total > 0) { - memset(buf, 0, buf_size); - rc = dfs_read(dfs_mt, obj, &sgl, offset, &read_size, NULL); - assert_int_equal(rc, 0); - assert_memory_equal(buf, verify_buf, read_size); - offset += read_size; - total -= read_size; - } - - rc = dfs_release(obj); - assert_int_equal(rc, 0); - } - free(buf); - free(verify_buf); -} - -static void 
-extend_drain_write(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, - daos_size_t total_size, char write_char, daos_obj_id_t *oids) -{ - char *buf = NULL; - daos_size_t buf_size = 512 * 1024; - d_sg_list_t sgl; - d_iov_t iov; - int i; - - buf = malloc(buf_size); - assert_non_null(buf); - d_iov_set(&iov, buf, buf_size); - sgl.sg_nr = 1; - sgl.sg_iovs = &iov; - - for (i = 0; i < objcnt; i++) { - char filename[32]; - dfs_obj_t *obj; - daos_size_t total = total_size; - daos_off_t offset = 0; - int rc; - - sprintf(filename, "file%d", i); - rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, OC_EC_2P1GX, 1048576, NULL, &obj); - assert_int_equal(rc, 0); - if (oids != NULL) - dfs_obj2id(obj, &oids[i]); - - memset(buf, write_char + i, buf_size); - while (total > 0) { - rc = dfs_write(dfs_mt, obj, &sgl, offset, NULL); - assert_int_equal(rc, 0); - offset += buf_size; - total -= buf_size; - } - rc = dfs_release(obj); - assert_int_equal(rc, 0); - } - free(buf); -} +/* FIXME: rename a few things - most of this code is performing drain + kill/exclude, NOT extend */ static int extend_drain_cb_internal(void *arg) @@ -775,10 +687,12 @@ extend_drain_cb_internal(void *arg) int i; if (opc != EXTEND_DRAIN_WRITELOOP) { - print_message("sleep 5 seconds then start op %d\n", opc); + print_message("sleep 5 seconds first\n"); sleep(5); } + print_message("start op %d (%s)\n", opc, extend_drain_opstrs[opc]); + /* Kill another rank during extend */ switch(opc) { case EXTEND_DRAIN_PUNCH: @@ -837,151 +751,85 @@ extend_drain_cb_internal(void *arg) daos_debug_set_params(test_arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); - return 0; -} - -static void -extend_drain_check(dfs_t *dfs_mt, dfs_obj_t *dir, int objclass, int opc) -{ - switch (opc) { - case EXTEND_DRAIN_PUNCH: - break; - case EXTEND_DRAIN_OVERWRITE: - extend_drain_read_check(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, - WRITE_SIZE, 'b'); - break; - case EXTEND_DRAIN_WRITELOOP: - 
extend_drain_read_check(dfs_mt, dir, objclass, 1, 512 * 1048576, 'a'); - break; - default: - extend_drain_read_check(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, - WRITE_SIZE, 'a'); - break; - } -} - -void -dfs_extend_drain_common(void **state, int opc, uint32_t objclass) -{ - test_arg_t *arg = *state; - dfs_t *dfs_mt; - daos_handle_t co_hdl; - dfs_obj_t *dir; - uuid_t co_uuid; - char str[37]; - daos_obj_id_t oids[EXTEND_DRAIN_OBJ_NR]; - struct extend_drain_cb_arg cb_arg; - dfs_attr_t attr = {}; - int rc; - - FAULT_INJECTION_REQUIRED(); + print_message("done op %d (%s)\n", opc, extend_drain_opstrs[opc]); - if (!test_runable(arg, 4)) - return; - - attr.da_props = daos_prop_alloc(2); - assert_non_null(attr.da_props); - attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; - attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; - attr.da_props->dpp_entries[1].dpe_type = DAOS_PROP_CO_REDUN_FAC; - attr.da_props->dpp_entries[1].dpe_val = DAOS_PROP_CO_REDUN_RF1; - rc = dfs_cont_create(arg->pool.poh, &co_uuid, &attr, &co_hdl, &dfs_mt); - daos_prop_free(attr.da_props); - assert_int_equal(rc, 0); - print_message("Created DFS Container "DF_UUIDF"\n", DP_UUID(co_uuid)); - - rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, objclass, 0, NULL, &dir); - assert_int_equal(rc, 0); - - /* Create 10 files */ - if (opc != EXTEND_DRAIN_UPDATE) - extend_drain_write(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, - 'a', oids); - - cb_arg.oids = oids; - cb_arg.dfs_mt = dfs_mt; - cb_arg.dir = dir; - cb_arg.opc = opc; - cb_arg.objclass = objclass; - arg->rebuild_cb = extend_drain_cb_internal; - arg->rebuild_cb_arg = &cb_arg; - - /* HOLD rebuild ULT */ - daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, - DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS, 0, NULL); - drain_single_pool_rank(arg, ranks_to_kill[0], false); - - extend_drain_check(dfs_mt, dir, objclass, opc); - - daos_kill_server(arg, arg->pool.pool_uuid, arg->group, 
arg->pool.alive_svc, - ranks_to_kill[0]); - arg->rebuild_cb = NULL; - arg->rebuild_cb_arg = NULL; - reintegrate_single_pool_rank(arg, ranks_to_kill[0], true); - - extend_drain_check(dfs_mt, dir, objclass, opc); - - rc = dfs_release(dir); - assert_int_equal(rc, 0); - rc = dfs_umount(dfs_mt); - assert_int_equal(rc, 0); - - rc = daos_cont_close(co_hdl, NULL); - assert_rc_equal(rc, 0); - - uuid_unparse(co_uuid, str); - rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); - assert_rc_equal(rc, 0); + return 0; } void dfs_drain_punch(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_PUNCH, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_PUNCH, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_PUNCH, oclass OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_PUNCH, OC_EC_2P1GX, extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_PUNCH, oclass OC_EC_4P2GX, rebuild stop|start\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_PUNCH, OC_EC_4P2GX, extend_drain_cb_internal); + T_END(); } void dfs_drain_stat(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_STAT, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_STAT, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_STAT, oclass OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_STAT, OC_EC_2P1GX, extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_STAT, oclass OC_EC_4P2GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_STAT, OC_EC_4P2GX, extend_drain_cb_internal); + T_END(); } void dfs_drain_enumerate(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_ENUMERATE, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_ENUMERATE, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_ENUMERATE, oclass OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_ENUMERATE, OC_EC_2P1GX, + extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_ENUMERATE, oclass OC_EC_4P2GX\n"); + 
dfs_extend_drain_common(state, EXTEND_DRAIN_ENUMERATE, OC_EC_4P2GX, + extend_drain_cb_internal); + T_END(); } void dfs_drain_fetch(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_FETCH, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_FETCH, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_FETCH, oclass OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_FETCH, OC_EC_2P1GX, extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_FETCH, oclass OC_EC_4P2GX, rebuild stop|start\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_FETCH, OC_EC_4P2GX, extend_drain_cb_internal); + T_END(); } void dfs_drain_update(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_UPDATE, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_UPDATE, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_UPDATE, oclass OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_UPDATE, OC_EC_2P1GX, extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_UPDATE, oclass OC_EC_4P2GX, rebuild stop|start\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_UPDATE, OC_EC_4P2GX, extend_drain_cb_internal); + T_END(); } void dfs_drain_overwrite(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_OVERWRITE, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_OVERWRITE, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_OVERWRITE, oclass OC_EC_2P1GX, rebuild stop|start\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_OVERWRITE, OC_EC_2P1GX, + extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_OVERWRITE, oclass OC_EC_4P2GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_OVERWRITE, OC_EC_4P2GX, + extend_drain_cb_internal); + T_END(); } void dfs_drain_writeloop(void **state) { - dfs_extend_drain_common(state, EXTEND_DRAIN_WRITELOOP, OC_EC_2P1GX); - dfs_extend_drain_common(state, EXTEND_DRAIN_WRITELOOP, OC_EC_4P2GX); + print_message("=== Begin EXTEND_DRAIN_WRITELOOP, oclass 
OC_EC_2P1GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_WRITELOOP, OC_EC_2P1GX, + extend_drain_cb_internal); + print_message("=== Begin EXTEND_DRAIN_WRITELOOP, oclass OC_EC_4P2GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_WRITELOOP, OC_EC_4P2GX, + extend_drain_cb_internal); + T_END(); } void @@ -1002,6 +850,7 @@ dfs_drain_extend(void **state) if (!test_runable(arg, 3)) return; + T_BEGIN(); attr.da_props = daos_prop_alloc(2); assert_non_null(attr.da_props); attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; @@ -1049,6 +898,7 @@ dfs_drain_extend(void **state) uuid_unparse(co_uuid, str); rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); assert_rc_equal(rc, 0); + T_END(); } /** create a new pool/container for each test */ diff --git a/src/tests/suite/daos_extend_common.c b/src/tests/suite/daos_extend_common.c new file mode 100644 index 00000000000..86a8a2e2009 --- /dev/null +++ b/src/tests/suite/daos_extend_common.c @@ -0,0 +1,223 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is for common functions used between daos_extend_simple.c and + * daos_rebuild_interactive.c tests. 
+ * + * tests/suite/daos_extend_common.c + * + */ +#define D_LOGFAC DD_FAC(tests) + +#include "daos_test.h" +#include "daos_iotest.h" +#include "dfs_test.h" +#include +#include +#include + +/* clang-format off */ +const char *extend_opstrs[] = { + "EXTEND_PUNCH", + "EXTEND_STAT", + "EXTEND_ENUMERATE", + "EXTEND_FETCH", + "EXTEND_UPDATE" +}; +/* clang-format on */ + +void +extend_read_check(dfs_t *dfs_mt, dfs_obj_t *dir) +{ + char *buf = NULL; + char *verify_buf = NULL; + daos_size_t buf_size = 512 * 1024; + d_sg_list_t sgl; + d_iov_t iov; + d_iov_t verify_iov; + int i; + + buf = malloc(buf_size); + verify_buf = malloc(buf_size); + print_message("%s(): allocations buf_size=" DF_U64 ", buf=%p, verify_buf=%p\n", + __FUNCTION__, buf_size, buf, verify_buf); + assert_non_null(buf); + assert_non_null(verify_buf); + d_iov_set(&iov, buf, buf_size); + d_iov_set(&verify_iov, buf, buf_size); + sgl.sg_nr = 1; + sgl.sg_iovs = &iov; + + for (i = 0; i < 20; i++) { + char filename[32]; + daos_size_t read_size = buf_size; + dfs_obj_t *obj; + int rc; + + sprintf(filename, "file%d", i); + rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, O_RDWR, + OC_EC_2P1GX, 1048576, NULL, &obj); + print_message("%s(): dfs_open(filename=%s) rc=%d\n", __FUNCTION__, filename, rc); + assert_int_equal(rc, 0); + + memset(verify_buf, 'a' + i, buf_size); + rc = dfs_read(dfs_mt, obj, &sgl, 0, &read_size, NULL); + print_message("%s(): dfs_read() read_size=" DF_U64 ", rc=%d\n", __FUNCTION__, + read_size, rc); + assert_int_equal(rc, 0); + assert_int_equal((int)read_size, buf_size); + assert_memory_equal(buf, verify_buf, read_size); + rc = dfs_release(obj); + print_message("%s(): dfs_release() rc=%d\n", __FUNCTION__, rc); + assert_int_equal(rc, 0); + } + free(buf); + free(verify_buf); + print_message("%s(): done, freed buf and verify_buf\n", __FUNCTION__); +} + +void +extend_write(dfs_t *dfs_mt, dfs_obj_t *dir) +{ + char *buf = NULL; + daos_size_t buf_size = 512 * 1024; + d_sg_list_t sgl; + 
d_iov_t iov; + int i; + + buf = malloc(buf_size); + assert_non_null(buf); + d_iov_set(&iov, buf, buf_size); + sgl.sg_nr = 1; + sgl.sg_iovs = &iov; + + for (i = 0; i < 20; i++) { + char filename[32]; + dfs_obj_t *obj; + int rc; + + sprintf(filename, "file%d", i); + rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, + OC_EC_2P1GX, 1048576, NULL, &obj); + assert_int_equal(rc, 0); + + memset(buf, 'a' + i, buf_size); + rc = dfs_write(dfs_mt, obj, &sgl, 0, NULL); + assert_int_equal(rc, 0); + rc = dfs_release(obj); + assert_int_equal(rc, 0); + } + free(buf); +} + +void +dfs_extend_internal(void **state, int opc, test_rebuild_cb_t extend_cb, bool kill) +{ + test_arg_t *arg = *state; + dfs_t *dfs_mt; + daos_handle_t co_hdl; + dfs_obj_t *obj; + dfs_obj_t *dir; + uuid_t co_uuid; + int i; + d_rank_t extend_rank = 3; + char str[37]; + daos_obj_id_t oids[EXTEND_OBJ_NR]; + struct extend_cb_arg cb_arg; + dfs_attr_t attr = {}; + int rc; + + attr.da_props = daos_prop_alloc(2); + assert_non_null(attr.da_props); + attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; + attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; + attr.da_props->dpp_entries[1].dpe_type = DAOS_PROP_CO_REDUN_FAC; + attr.da_props->dpp_entries[1].dpe_val = DAOS_PROP_CO_REDUN_RF1; + rc = dfs_cont_create(arg->pool.poh, &co_uuid, &attr, &co_hdl, &dfs_mt); + daos_prop_free(attr.da_props); + assert_int_equal(rc, 0); + print_message("Created DFS Container " DF_UUIDF "\n", DP_UUID(co_uuid)); + + rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, + OC_EC_2P1GX, 0, NULL, &dir); + assert_int_equal(rc, 0); + + /* Create 1000 files */ + if (opc == EXTEND_FETCH) { + extend_write(dfs_mt, dir); + } else { + for (i = 0; i < EXTEND_OBJ_NR; i++) { + char filename[32]; + + sprintf(filename, "file%d", i); + rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, + O_RDWR | O_CREAT, OC_EC_2P1GX, 1048576, NULL, &obj); + 
assert_int_equal(rc, 0); + dfs_obj2id(obj, &oids[i]); + rc = dfs_release(obj); + assert_int_equal(rc, 0); + } + } + + cb_arg.oids = oids; + cb_arg.dfs_mt = dfs_mt; + cb_arg.dir = dir; + cb_arg.opc = opc; + cb_arg.kill = kill; + if (kill) + cb_arg.rank = 2; + else + cb_arg.rank = 4; + + arg->rebuild_cb = extend_cb; + arg->rebuild_cb_arg = &cb_arg; + + /* HOLD rebuild ULT. FIXME: maybe change to use test_set_engine_fail_loc()? */ + print_message("inject DAOS_REBUILD_TGT_SCAN_HANG fault on engines\n"); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS, 0, NULL); + + arg->no_rebuild = + 1; /* This has no effect for RB_OP_TYPE_ADD - so can this be removed here? */ + extend_single_pool_rank(arg, extend_rank); + arg->no_rebuild = 0; + + print_message("sleep 30 secs for rank %u %s\n", cb_arg.rank, + cb_arg.kill ? "kill/exclude" : "extend"); + sleep(30); + print_message("wait for rebuild due to rank %u extend and rank %u %s\n", extend_rank, + cb_arg.rank, cb_arg.kill ? 
"kill/exclude" : "extend"); + test_rebuild_wait(&arg, 1); + + if (opc == EXTEND_UPDATE) { + print_message("First extend update read check\n"); + extend_read_check(dfs_mt, dir); + } + + arg->rebuild_cb = NULL; + arg->rebuild_cb_arg = NULL; + if (kill) { + print_message("reintegrate rank %u\n", cb_arg.rank); + reintegrate_single_pool_rank(arg, cb_arg.rank, true); + } + + if (opc == EXTEND_UPDATE) { + print_message("Second extend update read check\n"); + extend_read_check(dfs_mt, dir); + } + + rc = dfs_release(dir); + assert_int_equal(rc, 0); + rc = dfs_umount(dfs_mt); + assert_int_equal(rc, 0); + + rc = daos_cont_close(co_hdl, NULL); + assert_rc_equal(rc, 0); + + uuid_unparse(co_uuid, str); + rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); + assert_rc_equal(rc, 0); +} diff --git a/src/tests/suite/daos_extend_simple.c b/src/tests/suite/daos_extend_simple.c index 98be6290844..f2f5af8bbca 100644 --- a/src/tests/suite/daos_extend_simple.c +++ b/src/tests/suite/daos_extend_simple.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -12,47 +13,46 @@ * * */ -#define D_LOGFAC DD_FAC(tests) +#define D_LOGFAC DD_FAC(tests) #include "daos_iotest.h" +#include "daos_test.h" #include "dfs_test.h" #include #include #include -#define KEY_NR 10 -#define OBJ_NR 10 +#define KEY_NR 10 +#define OBJ_NR 10 static void extend_dkeys(void **state) { - test_arg_t *arg = *state; - daos_obj_id_t oids[OBJ_NR]; - struct ioreq req; - int i; - int j; - int rc; + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + struct ioreq req; + int i; + int j; + int rc; - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); if (!test_runable(arg, 3)) return; for (i = 0; i < OBJ_NR; i++) { - oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, - arg->myrank); + oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); /** Insert 10 records */ - print_message("Insert %d kv record in object "DF_OID"\n", - KEY_NR, DP_OID(oids[i])); + print_message("Insert %d kv record in object " DF_OID "\n", KEY_NR, + DP_OID(oids[i])); for (j = 0; j < KEY_NR; j++) { - char key[32] = {0}; + char key[32] = {0}; sprintf(key, "dkey_0_%d", j); - insert_single(key, "a_key", 0, "data", - strlen("data") + 1, - DAOS_TX_NONE, &req); + insert_single(key, "a_key", 0, "data", strlen("data") + 1, DAOS_TX_NONE, + &req); } ioreq_fini(&req); } @@ -63,38 +63,38 @@ extend_dkeys(void **state) rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); assert_rc_equal(rc, 0); } + + T_END(); } static void extend_akeys(void **state) { - test_arg_t *arg = *state; - daos_obj_id_t oids[OBJ_NR]; - struct ioreq req; - int i; - int j; - int rc; + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + struct ioreq req; + int i; + int j; + int rc; - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); if (!test_runable(arg, 3)) return; for (i = 0; i < OBJ_NR; i++) { - oids[i] = 
daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, - arg->myrank); + oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); /** Insert 10 records */ - print_message("Insert %d kv record in object "DF_OID"\n", - KEY_NR, DP_OID(oids[i])); + print_message("Insert %d kv record in object " DF_OID "\n", KEY_NR, + DP_OID(oids[i])); for (j = 0; j < KEY_NR; j++) { - char akey[16]; + char akey[16]; sprintf(akey, "%d", j); - insert_single("dkey_1_0", akey, 0, "data", - strlen("data") + 1, - DAOS_TX_NONE, &req); + insert_single("dkey_1_0", akey, 0, "data", strlen("data") + 1, DAOS_TX_NONE, + &req); } ioreq_fini(&req); } @@ -104,41 +104,41 @@ extend_akeys(void **state) rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); assert_rc_equal(rc, 0); } + + T_END(); } static void extend_indexes(void **state) { - test_arg_t *arg = *state; - daos_obj_id_t oids[OBJ_NR]; - struct ioreq req; - int i; - int j; - int k; - int rc; + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + struct ioreq req; + int i; + int j; + int k; + int rc; - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); if (!test_runable(arg, 3)) return; for (i = 0; i < OBJ_NR; i++) { - oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, - arg->myrank); + oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); /** Insert 10 records */ - print_message("Insert %d kv record in object "DF_OID"\n", - KEY_NR, DP_OID(oids[i])); + print_message("Insert %d kv record in object " DF_OID "\n", KEY_NR, + DP_OID(oids[i])); for (j = 0; j < KEY_NR; j++) { - char key[32] = {0}; + char key[32] = {0}; sprintf(key, "dkey_2_%d", j); for (k = 0; k < 20; k++) - insert_single(key, "a_key", k, "data", - strlen("data") + 1, DAOS_TX_NONE, - &req); + insert_single(key, "a_key", k, "data", strlen("data") + 1, + DAOS_TX_NONE, &req); } ioreq_fini(&req); } @@ -148,39 +148,39 @@ extend_indexes(void 
**state) rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); assert_rc_equal(rc, 0); } + + T_END(); } static void extend_large_rec(void **state) { - test_arg_t *arg = *state; - daos_obj_id_t oids[OBJ_NR]; - struct ioreq req; - char buffer[5000]; - int i; - int j; - int rc; + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + struct ioreq req; + char buffer[5000]; + int i; + int j; + int rc; - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); if (!test_runable(arg, 3)) return; memset(buffer, 'a', 5000); for (i = 0; i < OBJ_NR; i++) { - oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, - arg->myrank); + oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); /** Insert 10 records */ - print_message("Insert %d kv record in object "DF_OID"\n", - KEY_NR, DP_OID(oids[i])); + print_message("Insert %d kv record in object " DF_OID "\n", KEY_NR, + DP_OID(oids[i])); for (j = 0; j < KEY_NR; j++) { - char key[32] = {0}; + char key[32] = {0}; sprintf(key, "dkey_3_%d", j); - insert_single(key, "a_key", 0, buffer, 5000, - DAOS_TX_NONE, &req); + insert_single(key, "a_key", 0, buffer, 5000, DAOS_TX_NONE, &req); } ioreq_fini(&req); } @@ -190,28 +190,28 @@ extend_large_rec(void **state) rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); assert_rc_equal(rc, 0); } + + T_END(); } static void extend_objects(void **state) { - test_arg_t *arg = *state; - struct ioreq req; - daos_obj_id_t oids[OBJ_NR]; - int i; + test_arg_t *arg = *state; + struct ioreq req; + daos_obj_id_t oids[OBJ_NR]; + int i; - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); if (!test_runable(arg, 3)) return; for (i = 0; i < OBJ_NR; i++) { - oids[i] = daos_test_oid_gen(arg->coh, OC_S1, 0, - 0, arg->myrank); + oids[i] = daos_test_oid_gen(arg->coh, OC_S1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); - insert_single("dkey", "akey", 0, "data", strlen("data") + 1, - DAOS_TX_NONE, &req); 
+ insert_single("dkey", "akey", 0, "data", strlen("data") + 1, DAOS_TX_NONE, &req); ioreq_fini(&req); } @@ -222,151 +222,53 @@ extend_objects(void **state) ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); memset(buffer, 0, 16); - lookup_single("dkey", "akey", 0, buffer, 16, - DAOS_TX_NONE, &req); + lookup_single("dkey", "akey", 0, buffer, 16, DAOS_TX_NONE, &req); assert_string_equal(buffer, "data"); ioreq_fini(&req); } -} - -#define EXTEND_OBJ_NR 1000 -struct extend_cb_arg{ - daos_obj_id_t *oids; - dfs_t *dfs_mt; - dfs_obj_t *dir; - d_rank_t rank; - int opc; - bool kill; -}; - -enum extend_opc { - EXTEND_PUNCH, - EXTEND_STAT, - EXTEND_ENUMERATE, - EXTEND_FETCH, - EXTEND_UPDATE, -}; -static void -extend_read_check(dfs_t *dfs_mt, dfs_obj_t *dir) -{ - char *buf = NULL; - char *verify_buf = NULL; - daos_size_t buf_size = 512 * 1024; - d_sg_list_t sgl; - d_iov_t iov; - d_iov_t verify_iov; - int i; - - buf = malloc(buf_size); - verify_buf = malloc(buf_size); - print_message("%s(): allocations buf_size=" DF_U64 ", buf=%p, verify_buf=%p\n", - __FUNCTION__, buf_size, buf, verify_buf); - assert_non_null(buf); - assert_non_null(verify_buf); - d_iov_set(&iov, buf, buf_size); - d_iov_set(&verify_iov, buf, buf_size); - sgl.sg_nr = 1; - sgl.sg_iovs = &iov; - - for (i = 0; i < 20; i++) { - char filename[32]; - daos_size_t read_size = buf_size; - dfs_obj_t *obj; - int rc; - - sprintf(filename, "file%d", i); - rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, - O_RDWR, OC_EC_2P1GX, 1048576, NULL, &obj); - print_message("%s(): dfs_open(filename=%s) rc=%d\n", __FUNCTION__, filename, rc); - assert_int_equal(rc, 0); - - memset(verify_buf, 'a' + i, buf_size); - rc = dfs_read(dfs_mt, obj, &sgl, 0, &read_size, NULL); - print_message("%s(): dfs_read() read_size=" DF_U64 ", rc=%d\n", __FUNCTION__, - read_size, rc); - assert_int_equal(rc, 0); - assert_int_equal((int)read_size, buf_size); - assert_memory_equal(buf, verify_buf, read_size); - rc = dfs_release(obj); - 
print_message("%s(): dfs_release() rc=%d\n", __FUNCTION__, rc); - assert_int_equal(rc, 0); - } - free(buf); - free(verify_buf); - print_message("%s(): done, freed buf and verify_buf\n", __FUNCTION__); -} - -static void -extend_write(dfs_t *dfs_mt, dfs_obj_t *dir) -{ - char *buf = NULL; - daos_size_t buf_size = 512 * 1024; - d_sg_list_t sgl; - d_iov_t iov; - int i; - - buf = malloc(buf_size); - assert_non_null(buf); - d_iov_set(&iov, buf, buf_size); - sgl.sg_nr = 1; - sgl.sg_iovs = &iov; - - for (i = 0; i < 20; i++) { - char filename[32]; - dfs_obj_t *obj; - int rc; - - sprintf(filename, "file%d", i); - rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, OC_EC_2P1GX, 1048576, NULL, &obj); - assert_int_equal(rc, 0); - - memset(buf, 'a' + i, buf_size); - rc = dfs_write(dfs_mt, obj, &sgl, 0, NULL); - assert_int_equal(rc, 0); - rc = dfs_release(obj); - assert_int_equal(rc, 0); - } - free(buf); + T_END(); } static int extend_cb_internal(void *arg) { - test_arg_t *test_arg = arg; - struct extend_cb_arg *cb_arg = test_arg->rebuild_cb_arg; - dfs_t *dfs_mt = cb_arg->dfs_mt; - daos_obj_id_t *oids = cb_arg->oids; - dfs_obj_t *dir = cb_arg->dir; - struct dirent ents[10]; - int opc = cb_arg->opc; - int total_entries = 0; - uint32_t num_ents = 10; - daos_anchor_t anchor = { 0 }; - const char *pre_op = (cb_arg->kill ? "kill" : "extend"); - int rc; - int i; - - print_message("sleep 10 seconds then %s %u and start op %d\n", pre_op, - cb_arg->rank, opc); + test_arg_t *test_arg = arg; + struct extend_cb_arg *cb_arg = test_arg->rebuild_cb_arg; + dfs_t *dfs_mt = cb_arg->dfs_mt; + daos_obj_id_t *oids = cb_arg->oids; + dfs_obj_t *dir = cb_arg->dir; + struct dirent ents[10]; + int opc = cb_arg->opc; + int total_entries = 0; + uint32_t num_ents = 10; + daos_anchor_t anchor = {0}; + const char *pre_op = (cb_arg->kill ? 
"kill" : "extend"); + int rc; + int i; + + print_message("Extending, sleep 10, %s another rank %u, and start op %d (%s)\n", pre_op, + cb_arg->rank, opc, extend_opstrs[opc]); + sleep(10); if (cb_arg->kill) { + /* Kill another rank during extend */ daos_kill_server(test_arg, test_arg->pool.pool_uuid, test_arg->group, test_arg->pool.alive_svc, cb_arg->rank); } else { - /* it should fail with -DER_BUSY */ + /* Extend another rank during extend */ print_message("extend pool " DF_UUID " rank %u\n", DP_UUID(test_arg->pool.pool_uuid), cb_arg->rank); rc = dmg_pool_extend(test_arg->dmg_config, test_arg->pool.pool_uuid, test_arg->group, &cb_arg->rank, 1); assert_int_equal(rc, 0); } - /* Kill another rank during extend */ - switch(opc) { + + switch (opc) { case EXTEND_PUNCH: - print_message("punch objects during %s\n", pre_op); + print_message("punch objects during extend one rank, %s rank %u\n", pre_op, + cb_arg->rank); for (i = 0; i < EXTEND_OBJ_NR; i++) { char filename[32]; @@ -376,10 +278,11 @@ extend_cb_internal(void *arg) } break; case EXTEND_STAT: - print_message("stat objects during %s\n", pre_op); + print_message("stat objects during extend one rank, %s rank %u\n", pre_op, + cb_arg->rank); for (i = 0; i < EXTEND_OBJ_NR; i++) { - char filename[32]; - struct stat stbuf; + char filename[32]; + struct stat stbuf; sprintf(filename, "file%d", i); rc = dfs_stat(dfs_mt, dir, filename, &stbuf); @@ -387,21 +290,24 @@ extend_cb_internal(void *arg) } break; case EXTEND_ENUMERATE: - print_message("enumerate objects during %s\n", pre_op); + print_message("enumerate objects during extend one rank, %s rank %u\n", pre_op, + cb_arg->rank); while (!daos_anchor_is_eof(&anchor)) { num_ents = 10; - rc = dfs_readdir(dfs_mt, dir, &anchor, &num_ents, ents); + rc = dfs_readdir(dfs_mt, dir, &anchor, &num_ents, ents); assert_int_equal(rc, 0); total_entries += num_ents; } assert_int_equal(total_entries, 1000); break; case EXTEND_FETCH: - print_message("fetch objects during %s\n", pre_op); + 
print_message("fetch objects during extend one rank, %s rank %u\n", pre_op, + cb_arg->rank); extend_read_check(dfs_mt, dir); break; case EXTEND_UPDATE: - print_message("update objects during %s\n", pre_op); + print_message("update objects during extend one rank, %s rank %u\n", pre_op, + cb_arg->rank); extend_write(dfs_mt, dir); break; default: @@ -413,121 +319,14 @@ extend_cb_internal(void *arg) return 0; } -void -dfs_extend_internal(void **state, int opc, test_rebuild_cb_t extend_cb, bool kill) -{ - test_arg_t *arg = *state; - dfs_t *dfs_mt; - daos_handle_t co_hdl; - dfs_obj_t *obj; - dfs_obj_t *dir; - uuid_t co_uuid; - int i; - d_rank_t extend_rank = 3; - char str[37]; - daos_obj_id_t oids[EXTEND_OBJ_NR]; - struct extend_cb_arg cb_arg; - dfs_attr_t attr = {}; - int rc; - - attr.da_props = daos_prop_alloc(2); - assert_non_null(attr.da_props); - attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; - attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; - attr.da_props->dpp_entries[1].dpe_type = DAOS_PROP_CO_REDUN_FAC; - attr.da_props->dpp_entries[1].dpe_val = DAOS_PROP_CO_REDUN_RF1; - rc = dfs_cont_create(arg->pool.poh, &co_uuid, &attr, &co_hdl, &dfs_mt); - daos_prop_free(attr.da_props); - assert_int_equal(rc, 0); - print_message("Created DFS Container "DF_UUIDF"\n", DP_UUID(co_uuid)); - - rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, OC_EC_2P1GX, 0, NULL, &dir); - assert_int_equal(rc, 0); - - /* Create 1000 files */ - if (opc == EXTEND_FETCH) { - extend_write(dfs_mt, dir); - } else { - for (i = 0; i < EXTEND_OBJ_NR; i++) { - char filename[32]; - - sprintf(filename, "file%d", i); - rc = dfs_open(dfs_mt, dir, filename, S_IFREG | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, OC_EC_2P1GX, 1048576, NULL, &obj); - assert_int_equal(rc, 0); - dfs_obj2id(obj, &oids[i]); - rc = dfs_release(obj); - assert_int_equal(rc, 0); - } - } - - cb_arg.oids = oids; - cb_arg.dfs_mt = dfs_mt; - cb_arg.dir = dir; - cb_arg.opc = opc; - 
cb_arg.kill = kill; - if (kill) - cb_arg.rank = 2; - else - cb_arg.rank = 4; - - arg->rebuild_cb = extend_cb; - arg->rebuild_cb_arg = &cb_arg; - - /* HOLD rebuild ULT. FIXME: maybe change to use test_set_engine_fail_loc()? */ - print_message("inject DAOS_REBUILD_TGT_SCAN_HANG fault on engines\n"); - daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, - DAOS_REBUILD_TGT_SCAN_HANG | DAOS_FAIL_ALWAYS, 0, NULL); - - arg->no_rebuild=1; - extend_single_pool_rank(arg, extend_rank); - arg->no_rebuild=0; - - print_message("sleep 30 secs for rank %u %s\n", cb_arg.rank, - cb_arg.kill ? "exclude" : "extend"); - sleep(30); - print_message("wait for rebuild due to rank %u extend and rank %u %s\n", extend_rank, - cb_arg.rank, cb_arg.kill ? "exclude" : "extend"); - test_rebuild_wait(&arg, 1); - - if (opc == EXTEND_UPDATE) { - print_message("First extend update read check\n"); - extend_read_check(dfs_mt, dir); - } - - arg->rebuild_cb = NULL; - arg->rebuild_cb_arg = NULL; - if (kill) { - print_message("reintegrate rank %u\n", cb_arg.rank); - reintegrate_single_pool_rank(arg, cb_arg.rank, true); - } - - if (opc == EXTEND_UPDATE) { - print_message("Second extend update read check\n"); - extend_read_check(dfs_mt, dir); - } - - rc = dfs_release(dir); - assert_int_equal(rc, 0); - rc = dfs_umount(dfs_mt); - assert_int_equal(rc, 0); - - rc = daos_cont_close(co_hdl, NULL); - assert_rc_equal(rc, 0); - - uuid_unparse(co_uuid, str); - rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); - assert_rc_equal(rc, 0); -} - void dfs_extend_punch_kill(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_PUNCH, extend_cb_internal, true); + T_END(); } void @@ -535,8 +334,9 @@ dfs_extend_punch_extend(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_PUNCH, extend_cb_internal, false); + T_END(); } void @@ -544,8 +344,9 @@ 
dfs_extend_stat_kill(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_STAT, extend_cb_internal, true); + T_END(); } void @@ -553,8 +354,9 @@ dfs_extend_stat_extend(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_STAT, extend_cb_internal, false); + T_END(); } void @@ -562,8 +364,9 @@ dfs_extend_enumerate_kill(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_ENUMERATE, extend_cb_internal, true); + T_END(); } void @@ -571,8 +374,9 @@ dfs_extend_enumerate_extend(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_ENUMERATE, extend_cb_internal, false); + T_END(); } void @@ -580,8 +384,9 @@ dfs_extend_fetch_kill(void **state) { FAULT_INJECTION_REQUIRED(); - print_message("BEGIN %s\n", __FUNCTION__); + T_BEGIN(); dfs_extend_internal(state, EXTEND_FETCH, extend_cb_internal, true); + T_END(); } void @@ -614,14 +419,14 @@ dfs_extend_write_extend(void **state) void dfs_extend_fail_retry(void **state) { - test_arg_t *arg = *state; - dfs_t *dfs_mt; - daos_handle_t co_hdl; - dfs_obj_t *dir; - uuid_t co_uuid; - char str[37]; - dfs_attr_t attr = {}; - int rc; + test_arg_t *arg = *state; + dfs_t *dfs_mt; + daos_handle_t co_hdl; + dfs_obj_t *dir; + uuid_t co_uuid; + char str[37]; + dfs_attr_t attr = {}; + int rc; FAULT_INJECTION_REQUIRED(); @@ -630,14 +435,14 @@ dfs_extend_fail_retry(void **state) attr.da_props = daos_prop_alloc(1); assert_non_null(attr.da_props); attr.da_props->dpp_entries[0].dpe_type = DAOS_PROP_CO_REDUN_LVL; - attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; + attr.da_props->dpp_entries[0].dpe_val = DAOS_PROP_CO_REDUN_RANK; rc = dfs_cont_create(arg->pool.poh, &co_uuid, &attr, &co_hdl, &dfs_mt); 
daos_prop_free(attr.da_props); assert_int_equal(rc, 0); - print_message("Created DFS Container "DF_UUIDF"\n", DP_UUID(co_uuid)); + print_message("Created DFS Container " DF_UUIDF "\n", DP_UUID(co_uuid)); - rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, - O_RDWR | O_CREAT, OC_EC_2P1GX, 0, NULL, &dir); + rc = dfs_open(dfs_mt, NULL, "dir", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, + OC_EC_2P1GX, 0, NULL, &dir); assert_int_equal(rc, 0); extend_write(dfs_mt, dir); @@ -673,55 +478,53 @@ dfs_extend_fail_retry(void **state) /** create a new pool/container for each test */ static const struct CMUnitTest extend_tests[] = { - {"EXTEND1: extend small rec multiple dkeys", - extend_dkeys, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND2: extend small rec multiple akeys", - extend_akeys, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND3: extend small rec multiple indexes", - extend_indexes, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND4: extend large rec single index", - extend_large_rec, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND5: extend multiple objects", - extend_objects, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND6: punch object during extend and kill", - dfs_extend_punch_kill, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND7: punch object during extend and extend", - dfs_extend_punch_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND8: stat object during extend and kill", - dfs_extend_stat_kill, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND9: stat object during extend and extend", - dfs_extend_stat_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND10: enumerate object during extend and kill", - dfs_extend_enumerate_kill, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND11: enumerate object during extend and extend", - dfs_extend_enumerate_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND12: read object during extend and kill", - 
dfs_extend_fetch_kill, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND13: read object during extend and extend", - dfs_extend_fetch_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND14: write object during extend and kill", - dfs_extend_write_kill, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND15: write object during extend and extend", - dfs_extend_write_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, - {"EXTEND16: extend fail then retry", - dfs_extend_fail_retry, rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND1: extend small rec multiple dkeys", extend_dkeys, rebuild_sub_3nodes_rf0_setup, + test_teardown}, + {"EXTEND2: extend small rec multiple akeys", extend_akeys, rebuild_sub_3nodes_rf0_setup, + test_teardown}, + {"EXTEND3: extend small rec multiple indexes", extend_indexes, rebuild_sub_3nodes_rf0_setup, + test_teardown}, + {"EXTEND4: extend large rec single index", extend_large_rec, rebuild_sub_3nodes_rf0_setup, + test_teardown}, + {"EXTEND5: extend multiple objects", extend_objects, rebuild_sub_3nodes_rf0_setup, + test_teardown}, + {"EXTEND6: punch object during extend and kill", dfs_extend_punch_kill, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND7: punch object during extend and extend", dfs_extend_punch_extend, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND8: stat object during extend and kill", dfs_extend_stat_kill, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND9: stat object during extend and extend", dfs_extend_stat_extend, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND10: enumerate object during extend and kill", dfs_extend_enumerate_kill, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND11: enumerate object during extend and extend", dfs_extend_enumerate_extend, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND12: read object during extend and kill", dfs_extend_fetch_kill, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND13: read 
object during extend and extend", dfs_extend_fetch_extend, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND14: write object during extend and kill", dfs_extend_write_kill, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND15: write object during extend and extend", dfs_extend_write_extend, + rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"EXTEND16: extend fail then retry", dfs_extend_fail_retry, rebuild_sub_3nodes_rf0_setup, + test_teardown}, }; int -run_daos_extend_simple_test(int rank, int size, int *sub_tests, - int sub_tests_size) +run_daos_extend_simple_test(int rank, int size, int *sub_tests, int sub_tests_size) { int rc = 0; par_barrier(PAR_COMM_WORLD); if (sub_tests_size == 0) { sub_tests_size = ARRAY_SIZE(extend_tests); - sub_tests = NULL; + sub_tests = NULL; } - run_daos_sub_tests_only("DAOS_Extend_Simple", extend_tests, - ARRAY_SIZE(extend_tests), sub_tests, - sub_tests_size); + run_daos_sub_tests_only("DAOS_Extend_Simple", extend_tests, ARRAY_SIZE(extend_tests), + sub_tests, sub_tests_size); par_barrier(PAR_COMM_WORLD); diff --git a/src/tests/suite/daos_md_replication.c b/src/tests/suite/daos_md_replication.c index 1c10bae3d4b..a1d1ceb6692 100644 --- a/src/tests/suite/daos_md_replication.c +++ b/src/tests/suite/daos_md_replication.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2022 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -11,6 +12,12 @@ #include #include "daos_test.h" +/* + * Given the 128MB default blobstore cluster size, the minimal pool scm_size for + * an 8 targets engine would be 128MB * 8 = 1GB. + */ +#define MIN_SCM_SIZE (1ULL << 30) + static void mdr_stop_pool_svc(void **argv) { @@ -24,10 +31,8 @@ mdr_stop_pool_svc(void **argv) /* Create the pool. 
*/ if (arg->myrank == 0) { print_message("creating pool\n"); - rc = dmg_pool_create(dmg_config_file, - geteuid(), getegid(), arg->group, - NULL, 256 * 1024 * 1024, 0, - NULL, arg->pool.svc, uuid); + rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL, + MIN_SCM_SIZE, 0, NULL, arg->pool.svc, uuid); } par_bcast(PAR_COMM_WORLD, &rc, 1, PAR_INT, 0); assert_rc_equal(rc, 0); @@ -134,10 +139,8 @@ mdr_stop_cont_svc(void **argv) int rc; print_message("creating pool\n"); - rc = dmg_pool_create(dmg_config_file, - geteuid(), getegid(), arg->group, - NULL, 256 * 1024 * 1024, 0, - NULL, arg->pool.svc, pool_uuid); + rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL, MIN_SCM_SIZE, + 0, NULL, arg->pool.svc, pool_uuid); assert_rc_equal(rc, 0); if (arg->pool.svc->rl_nr < 3) { diff --git a/src/tests/suite/daos_mgmt.c b/src/tests/suite/daos_mgmt.c index 5f0cf5d8079..6866a8d5376 100644 --- a/src/tests/suite/daos_mgmt.c +++ b/src/tests/suite/daos_mgmt.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -19,6 +20,12 @@ #include #include +/* + * Given the 128MB default blobstore cluster size, the minimal pool scm_size for + * an 8 targets engine would be 128MB * 8 = 1GB. + */ +#define MIN_SCM_SIZE (1ULL << 30) + /** create/destroy pool on all tgts */ static void pool_create_all(void **state) @@ -35,11 +42,8 @@ pool_create_all(void **state) /** create container */ print_message("creating pool synchronously ... 
"); - rc = dmg_pool_create(dmg_config_file, - geteuid(), getegid(), - arg->group, NULL /* tgts */, - 256 * 1024 * 1024 /* minimal size */, - 0 /* nvme size */, NULL /* prop */, + rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, arg->pool.svc /* svc */, uuid); assert_rc_equal(rc, 0); @@ -340,11 +344,8 @@ pool_create_and_destroy_retry(void **state) test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_POOL_CREATE_FAIL_CORPC | DAOS_FAIL_ONCE); print_message("creating pool synchronously ... "); - rc = dmg_pool_create(dmg_config_file, - geteuid(), getegid(), - arg->group, NULL /* tgts */, - 256 * 1024 * 1024 /* minimal size */, - 0 /* nvme size */, NULL /* prop */, + rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, arg->pool.svc /* svc */, uuid); assert_rc_equal(rc, 0); print_message("success uuid = "DF_UUIDF"\n", DP_UUID(uuid)); @@ -434,8 +435,7 @@ pool_create_steps_down_from_up_empty(void **state) svc.rl_ranks = &rank; svc.rl_nr = 1; rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, - 256 * 1024 * 1024 /* minimal size */, 0 /* nvme size */, - NULL /* prop */, &svc, uuid); + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, &svc, uuid); assert_rc_equal(rc, 0); print_message("success uuid = "DF_UUIDF"\n", DP_UUID(uuid)); @@ -465,8 +465,7 @@ pool_destroy_disconnect_all(void **state) print_message("creating pool synchronously ... 
"); rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, - 256 * 1024 * 1024 /* minimal size */, 0 /* nvme size */, - NULL /* prop */, arg->pool.svc, uuid); + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, arg->pool.svc, uuid); assert_rc_equal(rc, 0); print_message("success uuid = "DF_UUIDF"\n", DP_UUID(uuid)); @@ -514,8 +513,7 @@ pool_destroy_cancel_rfcheck(void **state) print_message("creating pool synchronously ... "); rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, - 256 * 1024 * 1024 /* minimal size */, 0 /* nvme size */, - NULL /* prop */, arg->pool.svc, uuid); + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, arg->pool.svc, uuid); assert_rc_equal(rc, 0); print_message("success uuid = "DF_UUIDF"\n", DP_UUID(uuid)); @@ -527,26 +525,51 @@ pool_destroy_cancel_rfcheck(void **state) test_set_engine_fail_loc(arg, CRT_NO_RANK, 0); } +static void +pool_create_query_fail(void **state) +{ + test_arg_t *arg = *state; + uuid_t uuid; + int rc; + + FAULT_INJECTION_REQUIRED(); + + if (arg->myrank != 0) + return; + + test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MGMT_FAIL_CREATE_QUERY | DAOS_FAIL_ONCE); + + print_message("creating pool synchronously ... "); + rc = dmg_pool_create(dmg_config_file, geteuid(), getegid(), arg->group, NULL /* tgts */, + MIN_SCM_SIZE, 0 /* nvme size */, NULL /* prop */, arg->pool.svc, uuid); + assert_rc_equal(rc, 0); + print_message("success uuid = " DF_UUIDF "\n", DP_UUID(uuid)); + + test_set_engine_fail_loc(arg, CRT_NO_RANK, 0); + + print_message("destroying pool synchronously ... 
"); + rc = dmg_pool_destroy(dmg_config_file, uuid, arg->group, 1); + assert_rc_equal(rc, 0); + print_message("success\n"); +} + static const struct CMUnitTest tests[] = { - { "MGMT1: create/destroy pool on all tgts", - pool_create_all, async_disable, test_case_teardown}, - { "MGMT2: create/destroy pool on all tgts (async)", - pool_create_all, async_enable, test_case_teardown}, - { "MGMT3: list-pools with no pools in sys", - list_pools_test, setup_zeropools, teardown_pools}, - { "MGMT4: list-pools with multiple pools in sys", - list_pools_test, setup_manypools, teardown_pools}, - { "MGMT5: retry MGMT_POOL_{CREATE,DESETROY} upon errors", - pool_create_and_destroy_retry, async_disable, test_case_teardown}, - { "MGMT6: daos_mgmt_get_sys_info", - get_sys_info_test, async_disable, test_case_teardown}, - { "MGMT7: create: PS steps down from UP_EMPTY", - pool_create_steps_down_from_up_empty, async_disable, test_case_teardown}, - { "MGMT8: pool destroy disconnect all", - pool_destroy_disconnect_all, async_disable, test_case_teardown}, - { "MGMT9: pool destroy cancels rfcheck", - pool_destroy_cancel_rfcheck, NULL, test_case_teardown} -}; + {"MGMT1: create/destroy pool on all tgts", pool_create_all, async_disable, test_case_teardown}, + {"MGMT2: create/destroy pool on all tgts (async)", pool_create_all, async_enable, + test_case_teardown}, + {"MGMT3: list-pools with no pools in sys", list_pools_test, setup_zeropools, teardown_pools}, + {"MGMT4: list-pools with multiple pools in sys", list_pools_test, setup_manypools, + teardown_pools}, + {"MGMT5: retry MGMT_POOL_{CREATE,DESETROY} upon errors", pool_create_and_destroy_retry, + async_disable, test_case_teardown}, + {"MGMT6: daos_mgmt_get_sys_info", get_sys_info_test, async_disable, test_case_teardown}, + {"MGMT7: create: PS steps down from UP_EMPTY", pool_create_steps_down_from_up_empty, + async_disable, test_case_teardown}, + {"MGMT8: pool destroy disconnect all", pool_destroy_disconnect_all, async_disable, + 
test_case_teardown}, + {"MGMT9: pool destroy cancels rfcheck", pool_destroy_cancel_rfcheck, NULL, test_case_teardown}, + {"MGMT10: query in pool create fails", pool_create_query_fail, async_disable, + test_case_teardown}}; static int setup(void **state) diff --git a/src/tests/suite/daos_pool.c b/src/tests/suite/daos_pool.c index a15b40bdda8..31582d78921 100644 --- a/src/tests/suite/daos_pool.c +++ b/src/tests/suite/daos_pool.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -734,11 +734,20 @@ pool_op_retry(void **state) print_message("success\n"); /* pool set prop success committed, "lost" reply - duplicate RPC retry */ + char *orig_self_heal = NULL; + rc = daos_pool_get_prop(arg->pool.pool_uuid, "self_heal", &orig_self_heal); + assert_rc_equal(rc, 0); test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); - print_message("set pool prop (retry / dup rpc detection)... "); + print_message("set pool prop self_heal from %s to none (retry / dup rpc detection)... 
", + orig_self_heal); rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "none"); assert_rc_equal(rc, 0); - print_message("success\n"); + char *orig_self_heal_escaped = test_escape_self_heal(orig_self_heal); + rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", orig_self_heal_escaped); + free(orig_self_heal_escaped); + assert_rc_equal(rc, 0); + print_message("success (restored self_heal to %s)\n", orig_self_heal); + free(orig_self_heal); /* pool evict success committed, "lost" reply - duplicate RPC retry */ test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); diff --git a/src/tests/suite/daos_rebuild_common.c b/src/tests/suite/daos_rebuild_common.c index 529894af7a7..b0817efdb15 100644 --- a/src/tests/suite/daos_rebuild_common.c +++ b/src/tests/suite/daos_rebuild_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -975,8 +975,10 @@ reintegrate_inflight_io(void *data) daos_obj_id_t oid = *(daos_obj_id_t *)arg->rebuild_cb_arg; char single_data[LARGE_SINGLE_VALUE_SIZE]; struct ioreq req; + bool interactive_rebuild = arg->interactive_rebuild && !arg->no_rebuild; int i; + print_message("%s(): begin\n", __FUNCTION__); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); for (i = 0; i < 5; i++) { char key[64]; @@ -996,6 +998,14 @@ reintegrate_inflight_io(void *data) insert_recxs(key, "a_key_1M", 1, DAOS_TX_NONE, &recx, 1, buf, DATA_SIZE, &req); + /* Stop the rebuild */ + if (i == 3 && interactive_rebuild) { + print_message("%s(): stop rebuild in middle of inflight IO\n", + __FUNCTION__); + rebuild_stop_with_dmg(arg); + test_rebuild_wait(&arg, 1); /* rebuild is stopped here */ + } + req.iod_type = DAOS_IOD_SINGLE; memset(single_data, 'a' + i, LARGE_SINGLE_VALUE_SIZE); sprintf(key, "d_inflight_single_small_%d", i); @@ 
-1007,7 +1017,16 @@ reintegrate_inflight_io(void *data) &req); } ioreq_fini(&req); - print_message("sleep 12 seconds to wait for the stable epoch update.\n"); + + /* Resume the rebuild */ + if (interactive_rebuild) { + print_message("%s(): restart rebuild after remaining inflight IO done\n", + __FUNCTION__); + rebuild_resume_wait_to_start(arg); + } + + print_message("%s() sleep 12 seconds to wait for the stable epoch update and return.\n", + __FUNCTION__); sleep(12); if (arg->myrank == 0) daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, @@ -1227,8 +1246,7 @@ rebuild_stop_with_dmg_internal(const char *cfg, const uuid_t uuid, const char *g rc = dmg_pool_rebuild_stop(cfg, uuid, grp, force); print_message("dmg pool rebuild stop " DF_UUID ", force=%d, rc=%d\n", DP_UUID(uuid), force, rc); - assert_rc_equal(rc, 0); - return 0; + return rc; } /* stop an in-progress rebuild with dmg pool rebuild stop command */ @@ -1236,14 +1254,18 @@ int rebuild_stop_with_dmg(void *data) { test_arg_t *arg = data; + int rc; - print_message("wait for rebuild to start for pool " DF_UUID "\n", - DP_UUID(arg->pool.pool_uuid)); - test_rebuild_wait_to_start(&arg, 1); - sleep(5); - - return rebuild_stop_with_dmg_internal(arg->dmg_config, arg->pool.pool_uuid, arg->group, - false); + /* Rebuild might be only queued (not yet launched) */ + while (true) { + rc = rebuild_stop_with_dmg_internal(arg->dmg_config, arg->pool.pool_uuid, + arg->group, false); + if (rc != -DER_NONEXIST) + break; + print_message("waiting for stop command to run during active rebuild ...\n"); + sleep(1); + } + return rc; } /* stop an in-progress rebuild with dmg pool rebuild stop command (force stop option) */ @@ -1251,14 +1273,18 @@ int rebuild_force_stop_with_dmg(void *data) { test_arg_t *arg = data; + int rc; - print_message("wait for rebuild to start for pool " DF_UUID "\n", - DP_UUID(arg->pool.pool_uuid)); - test_rebuild_wait_to_start(&arg, 1); - sleep(5); - - return 
rebuild_stop_with_dmg_internal(arg->dmg_config, arg->pool.pool_uuid, arg->group, - true); + /* Rebuild might be only queued (not yet launched) */ + while (true) { + rc = rebuild_stop_with_dmg_internal(arg->dmg_config, arg->pool.pool_uuid, + arg->group, true); + if (rc != -DER_NONEXIST) + break; + print_message("waiting for force-stop command to run during active rebuild ...\n"); + sleep(1); + } + return rc; } /* start/reesume a stopped rebuild with dmg pool rebuild start command */ @@ -1275,12 +1301,51 @@ rebuild_start_with_dmg(void *data) return 0; } +/* wait for previously-issued dmg pool rebuild stop to finish; + * invoke rebuild start, and make sure it got started before returning. + */ +int +rebuild_resume_wait_to_start(void *data) +{ + test_arg_t *arg = data; + struct daos_rebuild_status *rst = &arg->pool.pool_info.pi_rebuild_st; + bool state_match; + int rc; + + /* Verify that the stop resulted in the correct rebuild status. + * NB: you have to be sure the rebuild stop was issued while rebuild was running + * (e.g., when a fault was injected to hang the rebuild, or with carefully-timed sleeps). + */ + print_message( + "(before starting) wait for stopped rebuild and check: rs_errno=%d (expect %d), " + "rs_state=%d (expect %d)\n", + rst->rs_errno, -DER_OP_CANCELED, rst->rs_state, DRS_NOT_STARTED); + test_rebuild_wait(&arg, 1); + state_match = (rst->rs_errno == -DER_OP_CANCELED && rst->rs_state == DRS_NOT_STARTED); + print_message("%sMATCHED check: rs_errno=%d, rs_state=%d\n", state_match ? "" : "NOT-", + rst->rs_errno, rst->rs_state); + assert_int_equal(rst->rs_errno, -DER_OP_CANCELED); + assert_int_equal(rst->rs_state, DRS_NOT_STARTED); + + rc = rebuild_start_with_dmg(data); + assert_rc_equal(rc, 0); + + /* Verify that the current rebuild is no longer stopped (has been restarted). 
*/ + test_rebuild_wait_to_start(&arg, 1); + + return 0; +} + +/* Check rebuild state from previously-stopped rebuild; + * invoke rebuild start and wait for it to completely finish before returning. + */ int rebuild_resume_wait(void *data) { test_arg_t *arg = data; struct daos_rebuild_status *rst = &arg->pool.pool_info.pi_rebuild_st; bool skip_restart = false; + bool state_match; int rc; if (arg->rebuild_cb == rebuild_resume_wait && arg->rebuild_cb_arg) @@ -1288,12 +1353,19 @@ rebuild_resume_wait(void *data) if (arg->rebuild_post_cb == rebuild_resume_wait && arg->rebuild_post_cb_arg) skip_restart = *((bool *)arg->rebuild_post_cb_arg); - /* Verify that the stop resulted in the correct rebuild status */ - print_message("check: stopped rebuild rs_errno=%d (expect %d), rs_state=%d (expect %d)\n", + /* Check whether the stop resulted in the expected rebuild status. + * NB: the stop is already done; the "wait" is just for the pool query rebuild state. + * NB: if the rebuild stop occurred after rebuild completed, we will not see the + * -DER_OP_CANCELED rebuild state. Warn in these instances, since it's all up + * to some variable test timing conditions. + */ + print_message("(before starting) check: stopped rebuild rs_errno=%d (want %d), rs_state=%d " + "(want %d)\n", rst->rs_errno, -DER_OP_CANCELED, rst->rs_state, DRS_NOT_STARTED); - assert_int_equal(rst->rs_errno, -DER_OP_CANCELED); - assert_int_equal(rst->rs_state, DRS_NOT_STARTED); - print_message("check passed\n"); + test_rebuild_wait(&arg, 1); + state_match = (rst->rs_errno == -DER_OP_CANCELED && rst->rs_state == DRS_NOT_STARTED); + print_message("%sMATCHED check: rs_errno=%d, rs_state=%d\n", + state_match ? 
"" : "WARN: NOT-", rst->rs_errno, rst->rs_state); if (skip_restart) return 0; @@ -1308,15 +1380,15 @@ rebuild_resume_wait(void *data) sleep(2); test_rebuild_wait(&arg, 1); print_message( - "current rebuild state: rs_errno=%d (expect %d), rs_state=%d (expect %d)\n", + "waiting rebuild state: rs_errno=%d (wait for %d), rs_state=%d (wait for %d)\n", rst->rs_errno, 0, rst->rs_state, DRS_COMPLETED); } while (rst->rs_errno == -DER_OP_CANCELED); + state_match = (rst->rs_errno == 0 && rst->rs_state == DRS_COMPLETED); print_message( - "check: resumed rebuild done: rs_errno=%d (expect %d), rs_state=%d (expect %d)\n", - rst->rs_errno, 0, rst->rs_state, DRS_COMPLETED); + "check %s: resumed rebuild rs_errno=%d (expect %d), rs_state=%d (expect %d)\n", + state_match ? "passed" : "FAILED", rst->rs_errno, 0, rst->rs_state, DRS_COMPLETED); assert_int_equal(rst->rs_errno, 0); assert_int_equal(rst->rs_state, DRS_COMPLETED); - print_message("check passed\n"); return 0; } diff --git a/src/tests/suite/daos_rebuild_interactive.c b/src/tests/suite/daos_rebuild_interactive.c new file mode 100644 index 00000000000..dd2607eb583 --- /dev/null +++ b/src/tests/suite/daos_rebuild_interactive.c @@ -0,0 +1,771 @@ +/** + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is for interactive rebuild stop|start testing based on pool exclude, drain, extend, + * and reintegrate. 
+ * + * tests/suite/daos_rebuild_interactive.c + * + */ +#define D_LOGFAC DD_FAC(tests) + +#include "daos_test.h" +#include "daos_iotest.h" +#include "dfs_test.h" +#include +#include +#include +#include + +#define DEFAULT_FAIL_TGT 0 +#define DRAIN_KEY_NR 50 +#define KEY_NR 10 +#define OBJ_NR 10 +#define DATA_SIZE (1048576 * 2 + 512) + +static void +reintegrate_with_inflight_io(test_arg_t *arg, daos_obj_id_t *oid, d_rank_t rank, int tgt) +{ + daos_obj_id_t inflight_oid; + + if (oid != NULL) { + inflight_oid = *oid; + } else { + inflight_oid = + daos_test_oid_gen(arg->coh, DAOS_OC_R3S_SPEC_RANK, 0, 0, arg->myrank); + inflight_oid = dts_oid_set_rank(inflight_oid, rank); + } + + arg->rebuild_cb = reintegrate_inflight_io; + arg->rebuild_cb_arg = &inflight_oid; + + /* To make sure the IO will be done before reintegration is done */ + if (arg->myrank == 0) + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_TGT_REBUILD_HANG, 0, NULL); + reintegrate_single_pool_target(arg, rank, tgt); + arg->rebuild_cb = NULL; + arg->rebuild_cb_arg = NULL; + + if (oid == NULL) { + int rc; + + rc = daos_obj_verify(arg->coh, inflight_oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + } +} + +#define SNAP_CNT 5 +static void +int_rebuild_snap_update_recs(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oid; + struct ioreq req; + daos_recx_t recx; + int tgt = DEFAULT_FAIL_TGT; + char string[100 * SNAP_CNT] = {0}; + daos_epoch_t snap_epoch[SNAP_CNT]; + int i; + int rc; + + if (!test_runable(arg, 4)) + return; + + T_BEGIN(); + oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); + oid = dts_oid_set_rank(oid, ranks_to_kill[0]); + oid = dts_oid_set_tgt(oid, tgt); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + for (i = 0; i < SNAP_CNT; i++) + sprintf(string + strlen(string), "old-snap%d", i); + + recx.rx_idx = 0; + recx.rx_nr = strlen(string); + insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, string, strlen(string) + 1, &req); + + 
for (i = 0; i < SNAP_CNT; i++) { + char data[100] = {0}; + + /* Update string for each snapshot */ + daos_cont_create_snap(arg->coh, &snap_epoch[i], NULL, NULL); + sprintf(data, "new-snap%d", i); + recx.rx_idx = i * strlen(data); + recx.rx_nr = strlen(data); + insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, data, strlen(data) + 1, + &req); + } + ioreq_fini(&req); + + /* insert rebuild stop|start into the exclude rebuild execution */ + arg->rebuild_cb = rebuild_stop_with_dmg; + arg->rebuild_post_cb = rebuild_resume_wait; + rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); + arg->rebuild_cb = NULL; + arg->rebuild_post_cb = NULL; + + for (i = 0; i < SNAP_CNT; i++) { + rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); + assert_rc_equal(rc, 0); + } + rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + + arg->interactive_rebuild = 0; + reintegrate_with_inflight_io(arg, &oid, ranks_to_kill[0], tgt); + for (i = 0; i < SNAP_CNT; i++) { + rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); + assert_rc_equal(rc, 0); + } + rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + T_END(); +} + +static void +int_rebuild_snap_punch_recs(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oid; + struct ioreq req; + daos_recx_t recx; + int tgt = DEFAULT_FAIL_TGT; + char string[200]; + daos_epoch_t snap_epoch[SNAP_CNT]; + int i; + int rc; + + if (!test_runable(arg, 4)) + return; + + T_BEGIN(); + oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); + oid = dts_oid_set_rank(oid, ranks_to_kill[0]); + oid = dts_oid_set_tgt(oid, tgt); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + for (i = 0; i < SNAP_CNT; i++) + sprintf(string + strlen(string), "old-snap%d", i); + + recx.rx_idx = 0; + recx.rx_nr = strlen(string); + insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, string, strlen(string) + 1, &req); + + for (i = 0; i < SNAP_CNT; i++) { + /* punch string */ + 
daos_cont_create_snap(arg->coh, &snap_epoch[i], NULL, NULL); + recx.rx_idx = i * 9; /* strlen("old-snap%d") */ + recx.rx_nr = 9; + punch_recxs("d_key", "a_key", &recx, 1, DAOS_TX_NONE, &req); + } + ioreq_fini(&req); + + rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); + + for (i = 0; i < SNAP_CNT; i++) { + rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); + assert_rc_equal(rc, 0); + } + rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + + /* insert rebuild stop|start into the reintegrate rebuild execution */ + arg->interactive_rebuild = 1; + reintegrate_with_inflight_io(arg, &oid, ranks_to_kill[0], tgt); + for (i = 0; i < SNAP_CNT; i++) { + rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); + assert_rc_equal(rc, 0); + } + rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + T_END(); +} + +static int +rebuild_wait_error_reset_fail_cb(void *data) +{ + test_arg_t *arg = data; + int rc; + + print_message("wait until rebuild starts erroring\n"); + test_rebuild_wait_to_error(&arg, 1); + print_message("rebuild version %u erroring, check rs_errno=%d (expecting -DER_IO=%d)\n", + arg->pool.pool_info.pi_rebuild_st.rs_version, + arg->pool.pool_info.pi_rebuild_st.rs_errno, -DER_IO); + assert_int_equal(arg->pool.pool_info.pi_rebuild_st.rs_errno, -DER_IO); + + print_message("clearing fault injection on all engines\n"); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 0, 0, NULL); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_NUM, 0, 0, NULL); + + print_message("wait until Fail_reclaim starts\n"); + test_rebuild_wait_to_start_lower(&arg, 1); + + print_message( + "send rebuild stop --force request during first/only Fail_reclaim operation\n"); + rc = rebuild_force_stop_with_dmg(data); + assert_rc_equal(rc, 0); + + /* Wait for stop, verify rs_state/rs_errno happens in rebuild_post_cb rebuild_resume_wait() + */ + + return 
rc; +} + +static void +int_rebuild_many_objects_with_failure(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t *oids; + const int NUM_OBJS = 500; + int rc; + int i; + + if (!test_runable(arg, 6)) + return; + + T_BEGIN(); + D_ALLOC_ARRAY(oids, NUM_OBJS); + for (i = 0; i < NUM_OBJS; i++) { + char buffer[256]; + daos_recx_t recx; + struct ioreq req; + + oids[i] = daos_test_oid_gen(arg->coh, OC_RP_3G1, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oids[i], DAOS_IOD_ARRAY, arg); + memset(buffer, 'a', 256); + recx.rx_idx = 0; + recx.rx_nr = 256; + insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, buffer, 256, &req); + + ioreq_fini(&req); + } + + /* Inject faults on engines. Special handling for interactive_rebuild case */ + if (arg->myrank == 0) { + print_message("inject fault DAOS_REBUILD_OBJ_FAIL on all engines\n"); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_OBJ_FAIL | DAOS_FAIL_ALWAYS, 0, NULL); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 3, 0, NULL); + } + + /* For interactive rebuild, we need: + * 1. trigger rebuild (which will fail), wait until op:Fail_reclaim begins. + * 2. During op:Fail_reclaim, issue dmg system stop (test that stop does not interrupt + * reclaim, but takes effect by not retrying the rebuild. 
+ */ + arg->rebuild_cb = rebuild_wait_error_reset_fail_cb; + arg->rebuild_post_cb = rebuild_resume_wait; + rebuild_single_pool_target(arg, 3, -1, false); + + for (i = 0; i < NUM_OBJS; i++) { + rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + } + D_FREE(oids); + T_END(); +} + +static int +cont_open_and_inflight_io(void *data) +{ + test_arg_t *arg = data; + int rc; + + assert_int_equal(arg->setup_state, SETUP_CONT_CREATE); + rc = test_setup_next_step((void **)&arg, NULL, NULL, NULL); + assert_success(rc); + assert_int_equal(arg->setup_state, SETUP_CONT_CONNECT); + + return reintegrate_inflight_io(data); +} + +static void +int_cont_open_in_drain(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oid; + struct ioreq req; + int tgt = DEFAULT_FAIL_TGT; + int i; + + FAULT_INJECTION_REQUIRED(); + + if (!test_runable(arg, 4)) + return; + + T_BEGIN(); + oid = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); + oid = dts_oid_set_rank(oid, ranks_to_kill[0]); + oid = dts_oid_set_tgt(oid, tgt); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + /** Insert records */ + print_message("Insert %d kv record in object " DF_OID "\n", DRAIN_KEY_NR, DP_OID(oid)); + for (i = 0; i < DRAIN_KEY_NR; i++) { + char key[32] = {0}; + + sprintf(key, "dkey_0_%d", i); + insert_single(key, "a_key", 0, "data", strlen("data") + 1, DAOS_TX_NONE, &req); + } + ioreq_fini(&req); + + test_teardown_cont_hdl(arg); + arg->interactive_rebuild = 1; + arg->rebuild_cb = cont_open_and_inflight_io; + arg->rebuild_cb_arg = &oid; + drain_single_pool_target(arg, ranks_to_kill[0], tgt, false); + + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + for (i = 0; i < DRAIN_KEY_NR; i++) { + char key[32] = {0}; + char buf[16] = {0}; + + sprintf(key, "dkey_0_%d", i); + /** Lookup */ + memset(buf, 0, 10); + lookup_single(key, "a_key", 0, buf, 10, DAOS_TX_NONE, &req); + assert_int_equal(req.iod[0].iod_size, strlen("data") + 1); + + /** Verify data 
consistency */ + assert_string_equal(buf, "data"); + } + + reintegrate_inflight_io_verify(arg); + ioreq_fini(&req); + T_END(); +} + +static void +int_drain_fail_and_retry_objects(void **state) +{ + test_arg_t *arg = *state; + daos_obj_id_t oids[OBJ_NR]; + int i; + + FAULT_INJECTION_REQUIRED(); + + if (!test_runable(arg, 4)) + return; + + T_BEGIN(); + for (i = 0; i < OBJ_NR; i++) { + oids[i] = daos_test_oid_gen(arg->coh, DAOS_OC_R1S_SPEC_RANK, 0, 0, arg->myrank); + oids[i] = dts_oid_set_rank(oids[i], ranks_to_kill[0]); + oids[i] = dts_oid_set_tgt(oids[i], DEFAULT_FAIL_TGT); + } + + rebuild_io(arg, oids, OBJ_NR); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_OBJ_FAIL | DAOS_FAIL_ALWAYS, 0, NULL); + + arg->no_rebuild = 1; + drain_single_pool_rank(arg, ranks_to_kill[0], false); + arg->no_rebuild = 0; + print_message("wait for drain to fail and exit\n"); + /* NB: could be better to wait (in drain_single_pool_rank or test_rebuild_wait), but that + * requires new logic in rebuild_task_complete_schedule() to update state after + * Fail_reclaim + */ + print_message("wait for drain rebuild to get -DER_IO\n"); + test_rebuild_wait_to_error(&arg, 1); + print_message("wait for op:Fail_reclaim to start\n"); + test_rebuild_wait_to_start_lower(&arg, 1); + + print_message("clear fault injection on all engines and wait for retry rebuild\n"); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + test_rebuild_wait_to_start_next(&arg, 1); + print_message("drain rebuild retry started, version=%u\n", + arg->pool.pool_info.pi_rebuild_st.rs_version); + rebuild_io_validate(arg, oids, OBJ_NR); + + arg->interactive_rebuild = 1; + arg->rebuild_cb = reintegrate_inflight_io; + arg->rebuild_cb_arg = &oids[OBJ_NR - 1]; + print_message("inflight IO during drain (that will be stopped/restarted)\n"); + drain_single_pool_rank(arg, ranks_to_kill[0], false); + print_message("final data verification\n"); + rebuild_io_validate(arg, oids, OBJ_NR); + 
reintegrate_inflight_io_verify(arg); + T_END(); +} + +/* FIXME: rename a few things - most of this code is performing drain + kill/exclude, NOT extend */ + +static int +int_extend_drain_cb_internal(void *arg) +{ + test_arg_t *test_arg = arg; + struct extend_drain_cb_arg *cb_arg = test_arg->rebuild_cb_arg; + dfs_t *dfs_mt = cb_arg->dfs_mt; + daos_obj_id_t *oids = cb_arg->oids; + dfs_obj_t *dir = cb_arg->dir; + uint32_t objclass = cb_arg->objclass; + struct dirent ents[10]; + int opc = cb_arg->opc; + int total_entries = 0; + uint32_t num_ents = 10; + daos_anchor_t anchor = {0}; + int rc; + int i; + + if (opc != EXTEND_DRAIN_WRITELOOP) { + print_message("sleep 5 seconds first\n"); + sleep(5); + } + + print_message("%sstart op %d (%s)\n", + test_arg->interactive_rebuild ? "stop rebuild before " : "", opc, + extend_drain_opstrs[opc]); + + if (test_arg->interactive_rebuild) { + rc = rebuild_stop_with_dmg(arg); + assert_rc_equal(rc, 0); + } + + /* Kill another rank during extend */ + switch (opc) { + case EXTEND_DRAIN_PUNCH: + print_message("punch objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + for (i = 0; i < EXTEND_DRAIN_OBJ_NR; i++) { + char filename[32]; + + sprintf(filename, "file%d", i); + rc = dfs_remove(dfs_mt, dir, filename, true, &oids[i]); + assert_int_equal(rc, 0); + } + break; + case EXTEND_DRAIN_STAT: + print_message("stat objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + for (i = 0; i < EXTEND_DRAIN_OBJ_NR; i++) { + char filename[32]; + struct stat stbuf; + + sprintf(filename, "file%d", i); + rc = dfs_stat(dfs_mt, dir, filename, &stbuf); + assert_int_equal(rc, 0); + } + break; + case EXTEND_DRAIN_ENUMERATE: + print_message("enumerate objects during extend & drain%s\n", + test_arg->interactive_rebuild ? 
" during stopped rebuild" : ""); + while (!daos_anchor_is_eof(&anchor)) { + num_ents = 10; + rc = dfs_readdir(dfs_mt, dir, &anchor, &num_ents, ents); + assert_int_equal(rc, 0); + total_entries += num_ents; + } + assert_int_equal(total_entries, EXTEND_DRAIN_OBJ_NR); + break; + case EXTEND_DRAIN_FETCH: + print_message("fetch objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + extend_drain_read_check(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, + 'a'); + break; + case EXTEND_DRAIN_UPDATE: + print_message("update objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + extend_drain_write(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, 'a', + NULL); + break; + case EXTEND_DRAIN_OVERWRITE: + print_message("overwrite objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + extend_drain_write(dfs_mt, dir, objclass, EXTEND_DRAIN_OBJ_NR, WRITE_SIZE, 'b', + NULL); + break; + case EXTEND_DRAIN_WRITELOOP: + print_message("keepwrite objects during extend & drain%s\n", + test_arg->interactive_rebuild ? " during stopped rebuild" : ""); + extend_drain_write(dfs_mt, dir, objclass, 1, 512 * 1048576, 'a', NULL); + break; + default: + break; + } + + daos_debug_set_params(test_arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + + print_message("%sdone op %d (%s)\n", + test_arg->interactive_rebuild ? 
"resume rebuild after " : "", opc, + extend_drain_opstrs[opc]); + + if (test_arg->interactive_rebuild) + rebuild_resume_wait_to_start(arg); + + return 0; +} + +static void +int_dfs_drain_overwrite(void **state) +{ + test_arg_t *arg = *state; + + arg->interactive_rebuild = 1; + print_message("=== Begin EXTEND_DRAIN_OVERWRITE, oclass OC_EC_4P2GX\n"); + dfs_extend_drain_common(state, EXTEND_DRAIN_OVERWRITE, OC_EC_4P2GX, + int_extend_drain_cb_internal); + T_END(); +} + +static int +int_extend_cb_internal(void *arg) +{ + test_arg_t *test_arg = arg; + struct extend_cb_arg *cb_arg = test_arg->rebuild_cb_arg; + dfs_t *dfs_mt = cb_arg->dfs_mt; + daos_obj_id_t *oids = cb_arg->oids; + dfs_obj_t *dir = cb_arg->dir; + struct dirent ents[10]; + int opc = cb_arg->opc; + int total_entries = 0; + uint32_t num_ents = 10; + daos_anchor_t anchor = {0}; + bool do_stop = (!cb_arg->kill && test_arg->interactive_rebuild); + const char *pre_op = (cb_arg->kill ? "kill" : "extend"); + int rc; + int i; + + /* wait for first extend, and (as post-effect) get rebuild version so we can wait for + * the second rebuild to start (by waiting for a rebuild with version > first rs_version) + */ + print_message("before waiting for rebuild to start, pmap_ver=%u, rs_version=%u\n", + test_arg->pool.pool_info.pi_map_ver, + test_arg->pool.pool_info.pi_rebuild_st.rs_version); + test_rebuild_wait_to_start_next(&test_arg, 1); + print_message("Extending (rs_version=%u), sleep 10, %s rank %u, %sand start op %d (%s)\n", + test_arg->pool.pool_info.pi_rebuild_st.rs_version, pre_op, cb_arg->rank, + do_stop ? 
"stop rebuild, " : "", opc, extend_opstrs[opc]); + + sleep(10); + + if (cb_arg->kill) { + /* Kill another rank during extend */ + daos_kill_server(test_arg, test_arg->pool.pool_uuid, test_arg->group, + test_arg->pool.alive_svc, cb_arg->rank); + } else { + /* Extend another rank during extend */ + print_message("extend pool " DF_UUID " rank %u\n", + DP_UUID(test_arg->pool.pool_uuid), cb_arg->rank); + rc = dmg_pool_extend(test_arg->dmg_config, test_arg->pool.pool_uuid, + test_arg->group, &cb_arg->rank, 1); + assert_int_equal(rc, 0); + } + + if (do_stop) { + daos_debug_set_params(test_arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + print_message("before waiting for rebuild to start, pmap_ver=%u, rs_version=%u\n", + test_arg->pool.pool_info.pi_map_ver, + test_arg->pool.pool_info.pi_rebuild_st.rs_version); + test_rebuild_wait_to_start_next(&test_arg, 1); + print_message("rebuild version=%u running\n", + test_arg->pool.pool_info.pi_rebuild_st.rs_version); + rc = rebuild_stop_with_dmg(arg); + assert_rc_equal(rc, 0); + test_rebuild_wait_to_error(&test_arg, 1); + } + + switch (opc) { + case EXTEND_PUNCH: + print_message("punch objects during extend one rank%s, %s rank %u\n", + do_stop ? ", stop rebuild" : "", pre_op, cb_arg->rank); + for (i = 0; i < EXTEND_OBJ_NR; i++) { + char filename[32]; + + sprintf(filename, "file%d", i); + rc = dfs_remove(dfs_mt, dir, filename, true, &oids[i]); + assert_int_equal(rc, 0); + } + break; + case EXTEND_STAT: + print_message("stat objects during extend one rank%s, %s rank %u\n", + do_stop ? ", stop rebuild" : "", pre_op, cb_arg->rank); + for (i = 0; i < EXTEND_OBJ_NR; i++) { + char filename[32]; + struct stat stbuf; + + sprintf(filename, "file%d", i); + rc = dfs_stat(dfs_mt, dir, filename, &stbuf); + assert_int_equal(rc, 0); + } + break; + case EXTEND_ENUMERATE: + print_message("enumerate objects during extend one rank%s, %s rank %u\n", + do_stop ? 
", stop rebuild" : "", pre_op, cb_arg->rank); + while (!daos_anchor_is_eof(&anchor)) { + num_ents = 10; + rc = dfs_readdir(dfs_mt, dir, &anchor, &num_ents, ents); + assert_int_equal(rc, 0); + total_entries += num_ents; + } + assert_int_equal(total_entries, 1000); + break; + case EXTEND_FETCH: + print_message("fetch objects during extend one rank%s, %s rank %u\n", + do_stop ? ", stop rebuild" : "", pre_op, cb_arg->rank); + extend_read_check(dfs_mt, dir); + break; + case EXTEND_UPDATE: + print_message("update objects during extend one rank%s, %s rank %u\n", + do_stop ? ", stop rebuild" : "", pre_op, cb_arg->rank); + extend_write(dfs_mt, dir); + break; + default: + break; + } + + daos_debug_set_params(test_arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + + if (do_stop) + rebuild_resume_wait_to_start(arg); + + return 0; +} + +static void +int_dfs_extend_enumerate_extend(void **state) +{ + test_arg_t *arg = *state; + + FAULT_INJECTION_REQUIRED(); + + T_BEGIN(); + arg->interactive_rebuild = 1; + dfs_extend_internal(state, EXTEND_ENUMERATE, int_extend_cb_internal, false); + T_END(); +} + +static void +int_rebuild_dkeys_stop_failing(void **state) +{ + test_arg_t *arg = *state; + d_rank_t kill_rank = 0; + int kill_rank_nr; + daos_obj_id_t oid; + struct ioreq req; + int i; + int rc; + + FAULT_INJECTION_REQUIRED(); + + if (!test_runable(arg, 4)) + return; + + T_BEGIN(); + + oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + /** Insert records */ + print_message("Insert %d kv record in object " DF_OID "\n", KEY_NR, DP_OID(oid)); + for (i = 0; i < KEY_NR; i++) { + char key[32] = {0}; + daos_recx_t recx; + char data[DATA_SIZE]; + + sprintf(key, "dkey_0_%d", i); + insert_single(key, "a_key", 0, "data", strlen("data") + 1, DAOS_TX_NONE, &req); + + sprintf(key, "dkey_0_1M_%d", i); + recx.rx_idx = 0; + recx.rx_nr = DATA_SIZE; + + memset(data, 'a', DATA_SIZE); + insert_recxs(key, "a_key_1M", 1, 
DAOS_TX_NONE, &recx, 1, data, DATA_SIZE, &req); + } + + /* Quick check that rebuild stop will return -DER_NONEXIST if nothing is rebuilding */ + rc = dmg_pool_rebuild_stop(arg->dmg_config, arg->pool.pool_uuid, arg->group, + false /* force */); + assert_int_equal(rc, -DER_NONEXIST); + + get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); + ioreq_fini(&req); + + /* Cause first (and subsequent) rebuild attempts to fail with -DER_IO */ + if (arg->myrank == 0) { + print_message("inject fault DAOS_REBUILD_OBJ_FAIL on all engines\n"); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_REBUILD_OBJ_FAIL | DAOS_FAIL_ALWAYS, 0, NULL); + } + + /* Trigger exclude and rebuild, fail twice, force-stop command during second Fail_reclaim + * NB: stop will be deferred until after Fail_reclaim (since it did not fail). + */ + arg->no_rebuild = 1; + rebuild_single_pool_target(arg, kill_rank, -1, false); + arg->no_rebuild = 0; + print_message("before waiting for rebuild to start, pmap_ver=%u, rs_version=%u\n", + arg->pool.pool_info.pi_map_ver, arg->pool.pool_info.pi_rebuild_st.rs_version); + test_rebuild_wait_to_start(&arg, 1); + + print_message("Wait for exclude rebuild ver %u to fail (and start Fail_reclaim)\n", + arg->pool.pool_info.pi_rebuild_st.rs_version); + test_rebuild_wait_to_start_lower(&arg, 1); + print_message("Wait for Fail_reclaim to finish (and start retry of exclude rebuild)\n"); + test_rebuild_wait_to_start_next(&arg, 1); + print_message("Wait for second exclude rebuild to fail (and start Fail_reclaim)\n"); + test_rebuild_wait_to_start_lower(&arg, 1); + + print_message("Force-stop runaway failing exclude rebuild retries\n"); + rc = rebuild_force_stop_with_dmg(arg); + assert_rc_equal(rc, 0); + print_message("Waiting for exclude rebuild to stop\n"); + test_rebuild_wait(&arg, 1); + assert_int_equal(arg->pool.pool_info.pi_rebuild_st.rs_state, DRS_NOT_STARTED); + assert_int_equal(arg->pool.pool_info.pi_rebuild_st.rs_errno, -DER_OP_CANCELED); + 
print_message("Exclude rebuild stopped\n"); + + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + + /* Do not restart the rebuild ; instead, go directly to reintegrate the rank */ + reintegrate_with_inflight_io(arg, &oid, kill_rank, -1); + rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); + assert_rc_equal(rc, 0); + T_END(); +} + +/** create a new pool/container for each test */ +static const struct CMUnitTest rebuild_interactive_tests[] = { + {"IREBUILD1: interactive exclude: records with multiple snapshots", + int_rebuild_snap_update_recs, rebuild_small_sub_setup, test_teardown}, + {"IREBUILD2: interactive exclude: punch/records with multiple snapshots", + int_rebuild_snap_punch_recs, rebuild_small_sub_setup, test_teardown}, + {"IREBUILD3: interactive exclude: lot of objects with failure", + int_rebuild_many_objects_with_failure, rebuild_sub_setup, test_teardown}, + {"IREBUILD4: interactive drain: cont open and update during rebuild", int_cont_open_in_drain, + rebuild_small_sub_rf0_setup, test_teardown}, + {"IREBUILD5: drain fail and retry", int_drain_fail_and_retry_objects, rebuild_sub_rf0_setup, + test_teardown}, + {"IREBUILD6: interactive drain: overwrite during rebuild", int_dfs_drain_overwrite, + rebuild_sub_rf0_setup, test_teardown}, + {"IREBUILD7: interactive extend: enumerate object during two rebuilds", + int_dfs_extend_enumerate_extend, rebuild_sub_3nodes_rf0_setup, test_teardown}, + {"IREBUILD8: interactive exclude: stop repeatedly-failing rebuild", + int_rebuild_dkeys_stop_failing, rebuild_small_sub_setup, test_teardown}, +}; + +int +run_daos_int_rebuild_test(int rank, int size, int *sub_tests, int sub_tests_size) +{ + int rc = 0; + + par_barrier(PAR_COMM_WORLD); + if (sub_tests_size == 0) { + sub_tests_size = ARRAY_SIZE(rebuild_interactive_tests); + sub_tests = NULL; + } + + rc = run_daos_sub_tests_only("DAOS_Rebuild_Interactive", rebuild_interactive_tests, + ARRAY_SIZE(rebuild_interactive_tests), sub_tests, + sub_tests_size); 
+ + par_barrier(PAR_COMM_WORLD); + + return rc; +} diff --git a/src/tests/suite/daos_rebuild_simple.c b/src/tests/suite/daos_rebuild_simple.c index 185deda338a..adc1677a5ce 100644 --- a/src/tests/suite/daos_rebuild_simple.c +++ b/src/tests/suite/daos_rebuild_simple.c @@ -80,6 +80,7 @@ rebuild_dkeys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); @@ -107,13 +108,7 @@ rebuild_dkeys(void **state) get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, kill_rank, -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -123,7 +118,7 @@ rebuild_dkeys(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ -141,6 +136,7 @@ rebuild_akeys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); @@ -169,13 +165,7 @@ rebuild_akeys(void **state) get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, kill_rank, tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -185,7 +175,7 @@ rebuild_akeys(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ 
-202,6 +192,7 @@ rebuild_indexes(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -221,13 +212,7 @@ rebuild_indexes(void **state) ioreq_fini(&req); /* Rebuild rank 1 */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -237,7 +222,7 @@ rebuild_indexes(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } #define SNAP_CNT 20 @@ -257,6 +242,7 @@ rebuild_snap_update_recs(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -282,13 +268,7 @@ rebuild_snap_update_recs(void **state) } ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; for (i = 0; i < SNAP_CNT; i++) { rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); @@ -308,7 +288,7 @@ rebuild_snap_update_recs(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ -327,6 +307,7 @@ rebuild_snap_punch_recs(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); 
@@ -348,13 +329,7 @@ rebuild_snap_punch_recs(void **state) } ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; for (i = 0; i < SNAP_CNT; i++) { rc = daos_obj_verify(arg->coh, oid, snap_epoch[i]); @@ -374,7 +349,7 @@ rebuild_snap_punch_recs(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ -391,6 +366,7 @@ rebuild_snap_update_keys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -408,13 +384,7 @@ rebuild_snap_update_keys(void **state) insert_single("dkey", akey, 0, "data", 1, DAOS_TX_NONE, &req); } - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; daos_fail_loc_set(DAOS_OBJ_SPECIAL_SHARD); for (i = 0; i < OBJ_REPLICAS; i++) { @@ -464,7 +434,7 @@ rebuild_snap_update_keys(void **state) if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); ioreq_fini(&req); - print_message("success\n"); + T_END(); } static void @@ -481,6 +451,7 @@ rebuild_snap_punch_keys(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -516,13 +487,7 @@ rebuild_snap_punch_keys(void **state) punch_akey("dkey", akey, DAOS_TX_NONE, &req); } - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } 
rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; daos_fail_loc_set(DAOS_OBJ_SPECIAL_SHARD); for (i = 0; i < OBJ_REPLICAS; i++) { @@ -573,7 +538,7 @@ rebuild_snap_punch_keys(void **state) if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); ioreq_fini(&req); - print_message("success\n"); + T_END(); } static void @@ -590,6 +555,7 @@ rebuild_snap_punch_empty(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, DAOS_OC_R3S_SPEC_RANK, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); @@ -603,17 +569,7 @@ rebuild_snap_punch_empty(void **state) punch_obj(DAOS_TX_NONE, &req); - /* stop exclude rebuild, but skip the start, then directly reintegrate */ - if (arg->interactive_rebuild) { - bool skip_restart = true; - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - arg->rebuild_post_cb_arg = &skip_restart; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; - arg->rebuild_post_cb_arg = NULL; daos_fail_loc_set(DAOS_OBJ_SPECIAL_SHARD); for (i = 0; i < OBJ_REPLICAS; i++) { @@ -653,17 +609,12 @@ rebuild_snap_punch_empty(void **state) assert_int_equal(number, 0); } - /* from a stopped (not restarted) exclude, directly reintegrate (stop+start this rebuild) */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } reintegrate_single_pool_target(arg, ranks_to_kill[0], tgt); rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); ioreq_fini(&req); - print_message("success\n"); + T_END(); } static void @@ -681,6 +632,7 @@ rebuild_multiple(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, 
tgt); @@ -704,19 +656,13 @@ rebuild_multiple(void **state) } } - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); ioreq_fini(&req); - print_message("success\n"); + T_END(); } #define LARGE_BUFFER_SIZE (32 * 1024 * 4) @@ -734,6 +680,7 @@ rebuild_large_rec(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -762,13 +709,7 @@ rebuild_large_rec(void **state) ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -778,7 +719,7 @@ rebuild_large_rec(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ -793,6 +734,7 @@ rebuild_objects(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); for (i = 0; i < OBJ_NR; i++) { oids[i] = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); @@ -802,13 +744,7 @@ rebuild_objects(void **state) rebuild_io(arg, oids, OBJ_NR); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; for (i = 0; i < OBJ_NR; i++) { rc = daos_obj_verify(arg->coh, oids[i], DAOS_EPOCH_MAX); @@ -822,7 +758,7 @@ 
rebuild_objects(void **state) if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); } - print_message("success\n"); + T_END(); } static void @@ -844,12 +780,6 @@ rebuild_sx_object_internal(void **state, daos_oclass_id_t oclass, if (!test_runable(arg, 4)) return; - if (arg->interactive_rebuild && !wait_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - return; - } - oid = daos_test_oid_gen(arg->coh, oclass, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); print_message("insert 100 dkeys\n"); @@ -885,15 +815,9 @@ rebuild_sx_object_internal(void **state, daos_oclass_id_t oclass, arg->group, rank, -1); assert_success(rc); - if (arg->interactive_rebuild) - rebuild_stop_with_dmg(arg); - /* wait until exclude rebuild done */ - if (wait_rebuild) { + if (wait_rebuild) test_rebuild_wait(&arg, 1); - if (arg->interactive_rebuild) - rebuild_resume_wait(arg); - } print_message("dmg pool reintegrate rank %u " DF_UUID "\n", rank, DP_UUID(arg->pool.pool_uuid)); @@ -901,15 +825,9 @@ rebuild_sx_object_internal(void **state, daos_oclass_id_t oclass, rank, -1); assert_success(rc); - if (arg->interactive_rebuild) - rebuild_stop_with_dmg(arg); - /* wait until reintegration rebuild is done */ - if (wait_rebuild) { + if (wait_rebuild) test_rebuild_wait(&arg, 1); - if (arg->interactive_rebuild) - rebuild_resume_wait(arg); - } print_message("lookup 100 dkeys\n"); for (i = 0; i < 100 && verify; i++) { @@ -928,22 +846,25 @@ rebuild_sx_object_internal(void **state, daos_oclass_id_t oclass, static void rebuild_sx_object(void **state) { + T_BEGIN(); rebuild_sx_object_internal(state, OC_SX, false, true); - print_message("success\n"); + T_END(); } static void rebuild_xsf_object(void **state) { + T_BEGIN(); rebuild_sx_object_internal(state, OC_RP_XSF, true, true); - print_message("success\n"); + T_END(); } static void rebuild_sx_object_no_data_sync(void **state) { + T_BEGIN(); rebuild_sx_object_internal(state, OC_SX, false, 
false); - print_message("success\n"); + T_END(); } static int @@ -978,6 +899,7 @@ rebuild_large_object(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); for (i = 0; i < 5; i++) { oid = daos_test_oid_gen(arg->coh, OC_RP_2G8, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); @@ -994,13 +916,8 @@ rebuild_large_object(void **state) rank, -1); assert_success(rc); - if (arg->interactive_rebuild) - rebuild_stop_with_dmg(arg); - /* wait until exclude rebuild done */ test_rebuild_wait(&arg, 1); - if (arg->interactive_rebuild) - rebuild_resume_wait(arg); print_message("dmg pool reintegrate rank %u " DF_UUID "\n", rank, DP_UUID(arg->pool.pool_uuid)); @@ -1008,16 +925,10 @@ rebuild_large_object(void **state) rank, -1); assert_success(rc); - if (arg->interactive_rebuild) { - rebuild_stop_with_dmg(arg); - } - /* wait until reintegration rebuild is done */ test_rebuild_wait(&arg, 1); - if (arg->interactive_rebuild) - rebuild_resume_wait(arg); - print_message("success\n"); + T_END(); } int @@ -1092,6 +1003,7 @@ rebuild_large_snap(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); oid = dts_oid_set_tgt(oid, tgt); @@ -1109,19 +1021,11 @@ rebuild_large_snap(void **state) insert_single("dkey", akey, 0, "data", 1, DAOS_TX_NONE, &req); } - /* stop and resume the exclude rebuild */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], tgt, false); ioreq_fini(&req); - /* stop and resume the reintegration rebuild (cb functions are still set in arg) */ reintegrate_single_pool_target(arg, ranks_to_kill[0], tgt); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; - print_message("success\n"); + T_END(); } static void @@ -1135,6 +1039,7 @@ rebuild_full_shards(void **state) if (!test_runable(arg, 4)) 
return; + T_BEGIN(); /* require 4 nodes and 8 targets per node */ if (arg->myrank == 0 && arg->srv_ntgts / arg->srv_nnodes != 8) { print_message("skip - require 4 nodes and 8 targets/node\n"); @@ -1158,26 +1063,12 @@ rebuild_full_shards(void **state) ioreq_fini(&req); /* rebuild and reintegration to use full shards */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } - /* stop and resume the first exclude rebuild (but not the second one) */ rebuild_single_pool_target(arg, 0, -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rebuild_single_pool_target(arg, 3, -1, false); - /* stop and resume the first reintegrate rebuild (but not the second one) */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } reintegrate_single_pool_target(arg, 0, -1); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; reintegrate_single_pool_target(arg, 3, -1); - print_message("success\n"); + T_END(); } static void @@ -1194,6 +1085,7 @@ rebuild_punch_recs(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); oid = dts_oid_set_rank(oid, ranks_to_kill[0]); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); @@ -1212,18 +1104,12 @@ rebuild_punch_recs(void **state) } ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, ranks_to_kill[0], -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } static void @@ -1240,6 +1126,7 @@ rebuild_multiple_group(void **state) if (!test_runable(arg, 7)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, OC_RP_2G4, 0, 0, 
arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); @@ -1269,13 +1156,7 @@ rebuild_multiple_group(void **state) get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, kill_rank, -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -1286,7 +1167,7 @@ rebuild_multiple_group(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } /** i/o to variable idx offset */ @@ -1307,6 +1188,7 @@ rebuild_with_large_offset(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); memset(data, 'a', 128); @@ -1323,13 +1205,7 @@ rebuild_with_large_offset(void **state) get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, kill_rank, -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -1340,7 +1216,7 @@ rebuild_with_large_offset(void **state) rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) assert_rc_equal(rc, -DER_NOSYS); - print_message("success\n"); + T_END(); } #define LARGE_KEY_SIZE 1048576 @@ -1361,6 +1237,7 @@ rebuild_with_large_key(void **state) if (!test_runable(arg, 4)) return; + T_BEGIN(); oid = daos_test_oid_gen(arg->coh, arg->obj_class, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); dkey = calloc(LARGE_KEY_SIZE, 1); @@ -1375,13 +1252,7 @@ 
rebuild_with_large_key(void **state) get_killing_rank_by_oid(arg, oid, 1, 0, &kill_rank, &kill_rank_nr); ioreq_fini(&req); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rebuild_single_pool_target(arg, kill_rank, -1, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; rc = daos_obj_verify(arg->coh, oid, DAOS_EPOCH_MAX); if (rc != 0) @@ -1393,7 +1264,7 @@ rebuild_with_large_key(void **state) assert_rc_equal(rc, -DER_NOSYS); free(dkey); free(akey); - print_message("success\n"); + T_END(); } void @@ -1416,6 +1287,7 @@ rebuild_with_dfs_open_create_punch(void **state) if (!test_runable(arg, 6)) return; + T_BEGIN(); dfs_attr_t attr = {}; attr.da_props = daos_prop_alloc(1); @@ -1444,16 +1316,10 @@ rebuild_with_dfs_open_create_punch(void **state) dfs_obj2id(dir, &oid); - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_stop_with_dmg; - arg->rebuild_post_cb = rebuild_resume_wait; - } rank = get_rank_by_oid_shard(arg, oid, 0); rebuild_single_pool_rank(arg, rank, false); reintegrate_single_pool_rank(arg, rank, false); - arg->rebuild_cb = NULL; - arg->rebuild_post_cb = NULL; daos_cont_status_clear(co_hdl, NULL); for (i = 0; i < 20; i++) { @@ -1480,7 +1346,7 @@ rebuild_with_dfs_open_create_punch(void **state) uuid_unparse(co_uuid, str); rc = daos_cont_destroy(arg->pool.poh, str, 1, NULL); assert_rc_equal(rc, 0); - print_message("success\n"); + T_END(); } static int @@ -1488,8 +1354,7 @@ rebuild_wait_reset_fail_cb(void *data) { test_arg_t *arg = data; - print_message("wait 60 seconds for rebuild/reclaim%s\n", - arg->interactive_rebuild ? 
"" : "/retry"); + print_message("wait 60 seconds for rebuild/reclaim\n"); sleep(60); print_message("clearing fault injection on all engines\n"); @@ -1500,40 +1365,6 @@ rebuild_wait_reset_fail_cb(void *data) return 0; } -static int -rebuild_wait_error_reset_fail_cb(void *data) -{ - test_arg_t *arg = data; - int rc; - - print_message("wait until rebuild errors (and starts Fail_reclaim)\n"); - test_rebuild_wait_to_error(&arg, 1); - print_message("check rebuild errored, rs_errno=%d (expecting -DER_IO=%d)\n", - arg->pool.pool_info.pi_rebuild_st.rs_errno, -DER_IO); - assert_int_equal(arg->pool.pool_info.pi_rebuild_st.rs_errno, -DER_IO); - print_message("rebuild error code check passed\n"); - - print_message("clearing fault injection on all engines\n"); - daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); - daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 0, 0, NULL); - daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_NUM, 0, 0, NULL); - - /* Give time for transition from op:Rebuild into op:Fail_reclaim */ - sleep(2); - - print_message( - "send rebuild stop --force request during first/only Fail_reclaim operation\n"); - rc = rebuild_force_stop_with_dmg(data); - if (rc != 0) - print_message("rebuild_force_stop_with_dmg failed, rc=%d\n", rc); - - print_message("wait for rebuild to be stopped\n"); - test_rebuild_wait(&arg, 1); - /* Verifying rs_state/rs_errno will happen in post_cb rebuild_resume_wait() */ - - return rc; -} - static void rebuild_many_objects_with_failure(void **state) { @@ -1545,6 +1376,7 @@ rebuild_many_objects_with_failure(void **state) if (!test_runable(arg, 6)) return; + T_BEGIN(); D_ALLOC_ARRAY(oids, 8000); for (i = 0; i < 8000; i++) { char buffer[256]; @@ -1561,7 +1393,7 @@ rebuild_many_objects_with_failure(void **state) ioreq_fini(&req); } - /* Inject faults on engines. 
Special handling for interactive_rebuild case */ + /* Inject faults on engines */ if (arg->myrank == 0) { print_message("inject fault DAOS_REBUILD_OBJ_FAIL on all engines\n"); daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, @@ -1569,19 +1401,7 @@ rebuild_many_objects_with_failure(void **state) daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 50, 0, NULL); } - /* For interactive rebuild, we need: - * 1. trigger rebuild (which will fail), query pool reubild state until op:Rebuild fails - * and op:Fail_reclaim begins. See test_rebuild_wait_to_error(). - * 2. Then, while rebuild is in op:Fail_reclaim, issue dmg system stop to test that you - * can't stop during Fail_reclaim (though the command will take effect by not retrying - * rebuild). - */ - if (arg->interactive_rebuild) { - arg->rebuild_cb = rebuild_wait_error_reset_fail_cb; - arg->rebuild_post_cb = rebuild_resume_wait; - } else { - arg->rebuild_cb = rebuild_wait_reset_fail_cb; - } + arg->rebuild_cb = rebuild_wait_reset_fail_cb; rebuild_single_pool_target(arg, 3, -1, false); for (i = 0; i < 8000; i++) { @@ -1590,7 +1410,7 @@ rebuild_many_objects_with_failure(void **state) assert_rc_equal(rc, -DER_NOSYS); } D_FREE(oids); - print_message("success\n"); + T_END(); } #define KB 1024 @@ -1649,10 +1469,7 @@ rebuild_object_with_csum_error(void **state) skip(); } - if (arg->interactive_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - } + T_BEGIN(); /* setup pool to have scrubbing turned on */ assert_success(dmg_pool_set_prop(dmg_config_file, "scrub", "timed", pool_uuid)); @@ -1729,7 +1546,7 @@ rebuild_object_with_csum_error(void **state) assert_success(daos_cont_close(coh, NULL)); assert_success(daos_cont_destroy(poh, uuid_cont_str, false, NULL)); assert_success(dmg_pool_set_prop(dmg_config_file, "scrub", "off", arg->pool.pool_uuid)); - print_message("success\n"); + T_END(); } struct rebuild_cb_arg { @@ -1889,12 +1706,14 @@ 
rebuild_dfs_append_cb(void *data) rebuild_dfs_write(cb_arg->dfs_mt, cb_arg->dir, cb_arg->offset, cb_arg->size, O_RDWR | O_EXCL); + print_message("%s(): completed rebuild_dfs_write()\n", __FUNCTION__); if (arg->myrank == 0) { daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 0, 0, NULL); } + return 0; } @@ -1905,12 +1724,14 @@ rebuild_dfs_punch_cb(void *data) struct rebuild_cb_arg *cb_arg = arg->rebuild_cb_arg; rebuild_dfs_remove(cb_arg->dfs_mt, cb_arg->dir); + print_message("%s(): completed rebuild_dfs_remove()\n", __FUNCTION__); if (arg->myrank == 0) { daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 0, 0, NULL); } + return 0; } @@ -1929,11 +1750,7 @@ rebuild_with_dfs_inflight_append(void **state) if (!test_runable(arg, 6)) return; - if (arg->interactive_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - } - + T_BEGIN(); daos_pool_set_prop(arg->pool.pool_uuid, "reclaim", "disabled"); rebuild_dfs_prep(arg, &dfs_mt, &dir, &co_hdl, &co_uuid); @@ -1972,7 +1789,7 @@ rebuild_with_dfs_inflight_append(void **state) rebuild_dfs_read_check(dfs_mt, dir, 0, 1048576 * 3); rebuild_dfs_fini(arg, dfs_mt, dir, co_hdl, co_uuid); - print_message("success\n"); + T_END(); } void @@ -1994,11 +1811,7 @@ rebuild_with_dfs_inflight_punch(void **state) if (!test_runable(arg, 6)) return; - if (arg->interactive_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - } - + T_BEGIN(); daos_pool_set_prop(arg->pool.pool_uuid, "reclaim", "disabled"); rebuild_dfs_prep(arg, &dfs_mt, &dir, &co_hdl, &co_uuid); @@ -2041,7 +1854,7 @@ rebuild_with_dfs_inflight_punch(void **state) } rebuild_dfs_fini(arg, dfs_mt, dir, co_hdl, co_uuid); - print_message("success\n"); + T_END(); } static int @@ -2052,13 +1865,14 @@ rebuild_dfs_create_append_cb(void 
*data) rebuild_dfs_write(cb_arg->dfs_mt, cb_arg->dir, cb_arg->offset, cb_arg->size, O_RDWR | O_CREAT | O_EXCL); - + print_message("%s(): completed rebuild_dfs_write()\n", __FUNCTION__); if (arg->myrank == 0) { daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_VALUE, 0, 0, NULL); } + return 0; } @@ -2081,11 +1895,7 @@ rebuild_with_dfs_inflight_append_punch(void **state) if (!test_runable(arg, 6)) return; - if (arg->interactive_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - } - + T_BEGIN(); daos_pool_set_prop(arg->pool.pool_uuid, "reclaim", "disabled"); rebuild_dfs_prep(arg, &dfs_mt, &dir, &co_hdl, &co_uuid); @@ -2124,7 +1934,7 @@ rebuild_with_dfs_inflight_append_punch(void **state) rebuild_dfs_read_check(dfs_mt, dir, 0, 1048576 + 10); rebuild_dfs_fini(arg, dfs_mt, dir, co_hdl, co_uuid); - print_message("success\n"); + T_END(); } static int @@ -2134,13 +1944,14 @@ rebuild_dfs_punch_create_cb(void *data) struct rebuild_cb_arg *cb_arg = arg->rebuild_cb_arg; int i; - print_message("start remove/update loop\n"); + print_message("%s(): start remove/update loop\n", __FUNCTION__); for (i = 0; i < 100; i++) { rebuild_dfs_remove(cb_arg->dfs_mt, cb_arg->dir); rebuild_dfs_write(cb_arg->dfs_mt, cb_arg->dir, cb_arg->offset, cb_arg->size, O_RDWR | O_CREAT | O_EXCL); } - print_message("end remove/update loop\n"); + print_message("%s() end remove/update loop\n", __FUNCTION__); + return 0; } @@ -2163,11 +1974,7 @@ rebuild_with_dfs_inflight_punch_create(void **state) if (!test_runable(arg, 6)) return; - if (arg->interactive_rebuild) { - print_message("SKIP due to interactive_rebuild enabled, but not tested here\n"); - skip(); - } - + T_BEGIN(); daos_pool_set_prop(arg->pool.pool_uuid, "reclaim", "disabled"); rebuild_dfs_prep(arg, &dfs_mt, &dir, &co_hdl, &co_uuid); @@ -2191,7 +1998,7 @@ rebuild_with_dfs_inflight_punch_create(void **state) 
rebuild_dfs_read_check(dfs_mt, dir, 0, 1048576 + 10); rebuild_dfs_fini(arg, dfs_mt, dir, co_hdl, co_uuid); - print_message("success\n"); + T_END(); } /** create a new pool/container for each test */ diff --git a/src/tests/suite/daos_test.c b/src/tests/suite/daos_test.c index d4f789c05cb..18c6fcbab7d 100644 --- a/src/tests/suite/daos_test.c +++ b/src/tests/suite/daos_test.c @@ -24,7 +24,7 @@ * These tests will only be run if explicitly specified. They don't get * run if no test is specified. */ -#define EXPLICIT_TESTS "x" +#define EXPLICIT_TESTS "xj" static const char *all_tests = TESTS; static const char *all_tests_defined = TESTS EXPLICIT_TESTS; @@ -32,7 +32,6 @@ enum { CHECKSUM_ARG_VAL_TYPE = 0x2713, CHECKSUM_ARG_VAL_CHUNKSIZE = 0x2714, CHECKSUM_ARG_VAL_SERVERVERIFY = 0x2715, - REBUILD_INTERACTIVE = 0x2716, }; static void @@ -89,7 +88,6 @@ print_usage(int rank) print_message("daos_test --csum_type CSUM_TYPE\n"); print_message("daos_test --csum_cs CHUNKSIZE\n"); print_message("daos_test --csum_sv\n"); - print_message("daos_test --rebuild_interactive\n"); print_message("\n=============================\n"); } @@ -318,6 +316,13 @@ run_specified_tests(const char *tests, int rank, int size, daos_test_print(rank, "================="); nr_failed += run_daos_inc_reint_test(rank, size, sub_tests, sub_tests_size); break; + case 'j': + daos_test_print(rank, "\n\n================="); + daos_test_print(rank, "DAOS interactive rebuild tests.."); + daos_test_print(rank, "================="); + nr_failed += + run_daos_int_rebuild_test(rank, size, sub_tests, sub_tests_size); + break; default: D_ASSERT(0); } @@ -402,7 +407,6 @@ main(int argc, char **argv) {"work_dir", required_argument, NULL, 'W'}, {"workload_file", required_argument, NULL, 'w'}, {"obj_class", required_argument, NULL, 'l'}, - {"rebuild_interactive", no_argument, NULL, REBUILD_INTERACTIVE}, {"help", no_argument, NULL, 'h'}, {NULL, 0, NULL, 0}}; @@ -415,7 +419,7 @@ main(int argc, char **argv) memset(tests, 0, 
sizeof(tests)); while ( - (opt = getopt_long(argc, argv, "amFpcCdtTViIzUZxADKeoROg:n:s:u:E:f:w:W:hrNvbBSXl:GPY", + (opt = getopt_long(argc, argv, "amFpcCdtTViIzUZxADKeoROg:n:s:u:E:f:w:W:hrNvbBSXl:GPYj", long_options, &index)) != -1) { if (strchr(all_tests_defined, opt) != NULL) { tests[ntests] = opt; @@ -477,9 +481,6 @@ main(int argc, char **argv) case CHECKSUM_ARG_VAL_SERVERVERIFY: dt_csum_server_verify = true; break; - case REBUILD_INTERACTIVE: - dt_rb_interactive = true; - break; default: daos_test_print(rank, "Unknown Option\n"); print_usage(rank); diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index 23de8a21501..501a943c942 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -42,11 +42,22 @@ #include #include #include +#include #if D_HAS_WARNING(4, "-Wframe-larger-than=") #pragma GCC diagnostic ignored "-Wframe-larger-than=" #endif +#define T_BEGIN() \ + do { \ + printf("BEGIN %s()\n", __FUNCTION__); \ + } while (0) + +#define T_END() \ + do { \ + printf("END %s() success\n", __FUNCTION__); \ + } while (0) + /** Server crt group ID */ extern const char *server_group; @@ -54,9 +65,6 @@ extern const char *server_group; extern int dt_incr_reint; extern bool dt_no_punch; -/** pool interactive rebuild */ -extern bool dt_rb_interactive; - /** Pool service replicas */ extern unsigned int svc_nreplicas; extern const char *dmg_config_file; @@ -379,6 +387,7 @@ int run_daos_nvme_recov_test(int rank, int size, int *sub_tests, int run_daos_rebuild_simple_test(int rank, int size, int *tests, int test_size); int run_daos_drain_simple_test(int rank, int size, int *tests, int test_size); int run_daos_extend_simple_test(int rank, int size, int *tests, int test_size); +int 
run_daos_int_rebuild_test(int rank, int size, int *tests, int test_size); int run_daos_inc_reint_test(int rank, int size, int *tests, int test_size); int run_daos_rebuild_simple_ec_test(int rank, int size, int *tests, int test_size); @@ -411,10 +420,16 @@ bool test_rebuild_query(test_arg_t **args, int args_cnt); void test_rebuild_wait(test_arg_t **args, int args_cnt); void test_rebuild_wait_to_start(test_arg_t **args, int args_cnt); +void +test_rebuild_wait_to_start_next(test_arg_t **args, int args_cnt); +void +test_rebuild_wait_to_start_lower(test_arg_t **args, int args_cnt); void test_rebuild_wait_to_error(test_arg_t **args, int args_cnt); int daos_pool_set_prop(const uuid_t pool_uuid, const char *name, const char *value); +int + daos_pool_get_prop(const uuid_t pool_uuid, const char *name, char **value_out); int daos_pool_upgrade(const uuid_t pool_uuid); int ec_data_nr_get(daos_obj_id_t oid); @@ -500,6 +515,8 @@ int rebuild_start_with_dmg(void *data); int rebuild_resume_wait(void *data); +int + rebuild_resume_wait_to_start(void *data); int get_server_config(char *host, char *server_config_file); int get_log_file(char *host, char *server_config_file, @@ -539,8 +556,10 @@ void make_buffer(char *buffer, char start, int total); bool oid_is_ec(daos_obj_id_t oid, struct daos_oclass_attr **attr); uint32_t test_ec_get_parity_off(daos_key_t *dkey, struct daos_oclass_attr *oca); + int reintegrate_inflight_io(void *data); -int reintegrate_inflight_io_verify(void *data); +int +reintegrate_inflight_io_verify(void *data); static inline void daos_test_print(int rank, char *message) @@ -747,7 +766,78 @@ void void test_set_engine_fail_value(test_arg_t *arg, d_rank_t engine_rank, uint64_t fail_value); void test_set_engine_fail_num(test_arg_t *arg, d_rank_t engine_rank, uint64_t fail_num); +char * +test_escape_self_heal(const char *value); + void test_verify_cont(test_arg_t *arg, struct test_pool *pool, struct test_cont *conts, int cont_nr); +/* Common types and functions for drain 
rebuild tests */ + +#define EXTEND_DRAIN_OBJ_NR 5 +#define WRITE_SIZE (1048576 * 5) + +struct extend_drain_cb_arg { + daos_obj_id_t *oids; + dfs_t *dfs_mt; + dfs_obj_t *dir; + d_rank_t rank; + uint32_t objclass; + int opc; +}; + +enum extend_drain_opc { + EXTEND_DRAIN_PUNCH, + EXTEND_DRAIN_STAT, + EXTEND_DRAIN_ENUMERATE, + EXTEND_DRAIN_FETCH, + EXTEND_DRAIN_UPDATE, + EXTEND_DRAIN_OVERWRITE, + EXTEND_DRAIN_WRITELOOP, +}; + +extern const char *extend_drain_opstrs[]; + +void +extend_drain_read_check(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, + daos_size_t total_size, char start_char); +void +extend_drain_write(dfs_t *dfs_mt, dfs_obj_t *dir, uint32_t objclass, uint32_t objcnt, + daos_size_t total_size, char write_char, daos_obj_id_t *oids); +void +extend_drain_check(dfs_t *dfs_mt, dfs_obj_t *dir, int objclass, int opc); +void +dfs_extend_drain_common(void **state, int opc, uint32_t objclass, + test_rebuild_cb_t extend_drain_cb_fn); + +/* Common types and functions for extend rebuild tests */ + +#define EXTEND_OBJ_NR 1000 + +struct extend_cb_arg { + daos_obj_id_t *oids; + dfs_t *dfs_mt; + dfs_obj_t *dir; + d_rank_t rank; + int opc; + bool kill; +}; + +enum extend_opc { + EXTEND_PUNCH, + EXTEND_STAT, + EXTEND_ENUMERATE, + EXTEND_FETCH, + EXTEND_UPDATE, +}; + +extern const char *extend_opstrs[]; + +void +dfs_extend_internal(void **state, int opc, test_rebuild_cb_t extend_cb, bool kill); +void +extend_read_check(dfs_t *dfs_mt, dfs_obj_t *dir); +void +extend_write(dfs_t *dfs_mt, dfs_obj_t *dir); + #endif diff --git a/src/tests/suite/daos_test_common.c b/src/tests/suite/daos_test_common.c index 31d6ce922a8..4716fb646d8 100644 --- a/src/tests/suite/daos_test_common.c +++ b/src/tests/suite/daos_test_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2023 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -38,9 +38,6 @@ int dt_redun_fac; int dt_incr_reint; bool dt_no_punch; /* will remove later */ -/** rebuild test variants */ -bool dt_rb_interactive = false; - /* Create or import a single pool with option to store info in arg->pool * or an alternate caller-specified test_pool structure. * ipool (optional): import pool: store info for an existing pool to arg->pool. @@ -247,6 +244,70 @@ test_setup_cont_create(void **state, daos_prop_t *co_prop) } } + /* Temporarily use old container property defaults due to DAOS-17946 */ + /* Set DAOS_PROP_CO_CSUM to off if not already defined */ + if (daos_prop_entry_get(co_prop, DAOS_PROP_CO_CSUM) == NULL) { + daos_prop_t *csum_prop = daos_prop_alloc(1); + if (csum_prop == NULL) { + D_ERROR("failed to allocate csum prop\n"); + daos_prop_free(redun_lvl_prop); + daos_prop_free(merged_props); + return -DER_NOMEM; + } + csum_prop->dpp_entries[0].dpe_type = DAOS_PROP_CO_CSUM; + csum_prop->dpp_entries[0].dpe_val = DAOS_PROP_CO_CSUM_OFF; + + daos_prop_t *new_merged_props = daos_prop_merge(co_prop, csum_prop); + if (new_merged_props == NULL) { + D_ERROR("failed to merge co_prop and csum_prop\n"); + daos_prop_free(redun_lvl_prop); + daos_prop_free(merged_props); + daos_prop_free(csum_prop); + return -DER_NOMEM; + } + + /* Update co_prop to point to the newly merged properties */ + if (merged_props) { + daos_prop_free(merged_props); + merged_props = new_merged_props; + } else { + merged_props = new_merged_props; + } + co_prop = merged_props; + daos_prop_free(csum_prop); + } + /* Set DAOS_PROP_CO_CSUM_SERVER_VERIFY to off if not already defined */ + if (daos_prop_entry_get(co_prop, DAOS_PROP_CO_CSUM_SERVER_VERIFY) == NULL) { + daos_prop_t *csum_sv_prop = daos_prop_alloc(1); + if (csum_sv_prop == NULL) { + D_ERROR("failed to allocate csum_sv_prop\n"); + 
daos_prop_free(redun_lvl_prop); + daos_prop_free(merged_props); + return -DER_NOMEM; + } + csum_sv_prop->dpp_entries[0].dpe_type = DAOS_PROP_CO_CSUM_SERVER_VERIFY; + csum_sv_prop->dpp_entries[0].dpe_val = DAOS_PROP_CO_CSUM_SV_OFF; + + daos_prop_t *new_merged_props = daos_prop_merge(co_prop, csum_sv_prop); + if (new_merged_props == NULL) { + D_ERROR("failed to merge co_prop and csum_sv_prop\n"); + daos_prop_free(redun_lvl_prop); + daos_prop_free(merged_props); + daos_prop_free(csum_sv_prop); + return -DER_NOMEM; + } + + /* Update co_prop to point to the newly merged properties */ + if (merged_props) { + daos_prop_free(merged_props); + merged_props = new_merged_props; + } else { + merged_props = new_merged_props; + } + co_prop = merged_props; + daos_prop_free(csum_sv_prop); + } + D_ASSERT(co_prop != NULL); if (daos_prop_entry_get(co_prop, DAOS_PROP_CO_LABEL) == NULL) { char cont_label[32]; @@ -407,12 +468,6 @@ test_setup(void **state, unsigned int step, bool multi_rank, } /** Look at variables set by test arguments and configure testing */ - if (dt_rb_interactive) { - print_message("\n-------\n" - "Interactive rebuild (stop|start) is enabled in some tests!" 
- "\n-------\n"); - arg->interactive_rebuild = 1; - } /** Look at variables set by test arguments and setup pool props */ if (dt_incr_reint) { @@ -790,8 +845,41 @@ test_pool_get_info(test_arg_t *arg, daos_pool_info_t *pinfo, d_rank_list_t **eng return rc; } +/* Determine if pool rebuild is busy, and the rebuild version is > rs_version */ +static bool +rebuild_pool_started_after_ver(test_arg_t *arg, uint32_t rs_version) +{ + daos_pool_info_t pinfo = {0}; + struct daos_rebuild_status *rst; + int rc; + + pinfo.pi_bits = DPI_REBUILD_STATUS; + rc = test_pool_get_info(arg, &pinfo, NULL /* engine_ranks */); + rst = &pinfo.pi_rebuild_st; + + if (rc != 0) { + print_message("pool query for rebuild status failed, rc=%d, pool " DF_UUIDF "\n", + rc, DP_UUID(arg->pool.pool_uuid)); + return false; + } else { + bool in_progress = (rst->rs_state == DRS_IN_PROGRESS); + print_message("rebuild for pool " DF_UUIDF "has %sstarted, rs_version=%u " + "(waiting for > %d)\n", + DP_UUID(arg->pool.pool_uuid), in_progress ? "" : "not yet ", + rst->rs_version, rs_version); + if (in_progress && (rst->rs_version > rs_version)) { + /* save final pool query info to be able to inspect rebuild status */ + memcpy(&arg->pool.pool_info, &pinfo, sizeof(pinfo)); + + return true; + } + return false; + } +} + +/* Determine if pool rebuild is busy, and the rebuild version is < rs_version */ static bool -rebuild_pool_started(test_arg_t *arg) +rebuild_pool_started_before_ver(test_arg_t *arg, uint32_t rs_version) { daos_pool_info_t pinfo = {0}; struct daos_rebuild_status *rst; @@ -806,10 +894,18 @@ rebuild_pool_started(test_arg_t *arg) rc, DP_UUID(arg->pool.pool_uuid)); return false; } else { - bool started = (rst->rs_state == DRS_IN_PROGRESS); - print_message("rebuild for pool " DF_UUIDF "has %sstarted\n", - DP_UUID(arg->pool.pool_uuid), started ? 
"" : "not yet "); - return started; + bool in_progress = (rst->rs_state == DRS_IN_PROGRESS); + + print_message("rebuild for pool " DF_UUIDF "has %sstarted, rs_version=%u " + "(waiting for < %d)\n", + DP_UUID(arg->pool.pool_uuid), in_progress ? "" : "not yet ", + rst->rs_version, rs_version); + if (in_progress && (rst->rs_version < rs_version)) { + /* save final pool query info to be able to inspect rebuild status */ + memcpy(&arg->pool.pool_info, &pinfo, sizeof(pinfo)); + return true; + } + return false; } } @@ -830,11 +926,10 @@ rebuild_pool_erroring(test_arg_t *arg) return false; } else { bool started = (rst->rs_state == DRS_IN_PROGRESS); - bool erroring = started && (rst->rs_errno != 0); + bool erroring = (rst->rs_errno != 0); - print_message("rebuild for pool " DF_UUIDF "has %sstarted, rs_errno=%d\n", - DP_UUID(arg->pool.pool_uuid), started ? "" : "not yet ", - rst->rs_errno); + print_message("rebuild for pool " DF_UUIDF " is %scurrently running, rs_errno=%d\n", + DP_UUID(arg->pool.pool_uuid), started ? 
"" : "not ", rst->rs_errno); /* save final pool query info to be able to inspect rebuild status */ if (erroring) @@ -916,8 +1011,26 @@ test_get_last_svr_rank(test_arg_t *arg) return arg->srv_nnodes - disable_nodes - 1; } -bool -test_rebuild_started(test_arg_t **args, int args_cnt) +static bool +test_rebuild_started_before(test_arg_t **args, int args_cnt, uint32_t *cur_versions) +{ + bool all_started = true; + int i; + + for (i = 0; i < args_cnt; i++) { + bool started = true; + + if (!args[i]->pool.destroyed) + started = rebuild_pool_started_before_ver(args[i], cur_versions[i]); + + if (!started) + all_started = false; + } + return all_started; +} + +static bool +test_rebuild_started_after(test_arg_t **args, int args_cnt, uint32_t *cur_versions) { bool all_started = true; int i; @@ -926,7 +1039,7 @@ test_rebuild_started(test_arg_t **args, int args_cnt) bool started = true; if (!args[i]->pool.destroyed) - started = rebuild_pool_started(args[i]); + started = rebuild_pool_started_after_ver(args[i], cur_versions[i]); if (!started) all_started = false; @@ -934,11 +1047,67 @@ test_rebuild_started(test_arg_t **args, int args_cnt) return all_started; } +/* wait until pools start rebuilds with rs_version < current (e.g.,. expecting op:Fail_reclaim) */ +void +test_rebuild_wait_to_start_lower(test_arg_t **args, int args_cnt) +{ + uint32_t *cur_versions; + int i; + + D_ALLOC_ARRAY(cur_versions, args_cnt); + assert_true(cur_versions != NULL); + for (i = 0; i < args_cnt; i++) + cur_versions[i] = args[i]->pool.pool_info.pi_rebuild_st.rs_version; + + while (!test_rebuild_started_before(args, args_cnt, cur_versions)) + sleep(2); + + /* NB: when control reaches here, each pool's current rs_version has been updated + * (for subsequent calls that will rely on it as a baseline) + */ + D_FREE(cur_versions); +} + +/* wait until pools start rebuilds with rs_version > current (e.g.,. 
expecting op:Rebuild) */ +void +test_rebuild_wait_to_start_next(test_arg_t **args, int args_cnt) +{ + uint32_t *cur_versions; + int i; + + D_ALLOC_ARRAY(cur_versions, args_cnt); + assert_true(cur_versions != NULL); + for (i = 0; i < args_cnt; i++) + cur_versions[i] = args[i]->pool.pool_info.pi_rebuild_st.rs_version; + + while (!test_rebuild_started_after(args, args_cnt, cur_versions)) + sleep(2); + + /* NB: when control reaches here, each pool's current rs_version has been updated + * (for subsequent calls that will rely on it as a baseline) + */ + D_FREE(cur_versions); +} + +/* wait until pools start rebuilds with any rs_version > 0 (whatever is current) */ void test_rebuild_wait_to_start(test_arg_t **args, int args_cnt) { - while (!test_rebuild_started(args, args_cnt)) + uint32_t *cur_versions; + int i; + + D_ALLOC_ARRAY(cur_versions, args_cnt); + assert_true(cur_versions != NULL); + for (i = 0; i < args_cnt; i++) + cur_versions[i] = 0; + + while (!test_rebuild_started_after(args, args_cnt, cur_versions)) sleep(2); + + /* NB: when control reaches here, each pool's current rs_version has been updated + * (for subsequent calls that will rely on it as a baseline) + */ + D_FREE(cur_versions); } bool @@ -1070,6 +1239,12 @@ daos_pool_set_prop(const uuid_t pool_uuid, const char *name, return dmg_pool_set_prop(dmg_config_file, name, value, pool_uuid); } +int +daos_pool_get_prop(const uuid_t pool_uuid, const char *name, char **value_out) +{ + return dmg_pool_get_prop(dmg_config_file, NULL, pool_uuid, name, value_out); +} + void daos_start_server(test_arg_t *arg, const uuid_t pool_uuid, const char *grp, d_rank_list_t *svc, d_rank_t rank) @@ -1652,3 +1827,41 @@ test_set_engine_fail_num(test_arg_t *arg, d_rank_t engine_rank, uint64_t fail_nu rc = daos_debug_set_params(arg->group, engine_rank, DMG_KEY_FAIL_NUM, fail_num, 0, NULL); assert_rc_equal(rc, 0); } + +/** + * Duplicate unescaped \a value, escaping every ';' with '\\'. 
The caller is + * responsible for freeing the returned string. + * + * \param[in] value self_heal value to escape + */ +char * +test_escape_self_heal(const char *value) +{ + size_t len = 0; + char *new_value; + const char *src; + char *dst; + + for (src = value; *src != '\0'; src++) { + D_ASSERT(*src != '\\'); + len++; + if (*src == ';') + len++; /* for '\\' */ + } + + D_ALLOC(new_value, len + 1 /* '\0' */); + D_ASSERT(new_value != NULL); + + dst = new_value; + for (src = value; *src != '\0'; src++) { + if (*src == ';') { + *dst++ = '\\'; + *dst++ = ';'; + } else { + *dst++ = *src; + } + } + *dst = '\0'; + + return new_value; +} diff --git a/src/tests/suite/dfs_unit_test.c b/src/tests/suite/dfs_unit_test.c index cb2ad322f89..6a537ffa048 100644 --- a/src/tests/suite/dfs_unit_test.c +++ b/src/tests/suite/dfs_unit_test.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2256,11 +2256,12 @@ dfs_test_oclass_hints(void **state) daos_oclass_id_t cid; daos_handle_t coh; dfs_t *dfs_l; - dfs_obj_t *obj; + dfs_obj_t *obj, *dir; daos_obj_id_t oid; daos_oclass_id_t ecidx; daos_prop_t *prop = NULL; dfs_attr_t dattr = {0}; + dfs_obj_info_t oinfo = {0}; struct pl_map_attr attr = {0}; int rc; @@ -2416,6 +2417,21 @@ dfs_test_oclass_hints(void **state) rc = compare_oclass(coh, cid, OC_RP_2GX); assert_rc_equal(rc, 0); + /** create a directory and set EC to be used on the directory */ + rc = dfs_open(dfs_l, NULL, "d1", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, 0, 0, NULL, + &dir); + assert_int_equal(rc, 0); + rc = dfs_obj_set_oclass(dfs_l, dir, 0, ecidx); + assert_int_equal(rc, 0); + /** get the dir info to query what oclass will be used */ + rc = dfs_obj_get_info(dfs_l, dir, &oinfo); + assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_dir_oclass_id, OC_RP_2G1); + 
assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_file_oclass_id, ecidx); + assert_int_equal(rc, 0); + dfs_release(dir); + rc = dfs_umount(dfs_l); assert_int_equal(rc, 0); rc = daos_cont_close(coh, NULL); @@ -2468,6 +2484,21 @@ dfs_test_oclass_hints(void **state) rc = compare_oclass(coh, cid, OC_RP_3GX); assert_rc_equal(rc, 0); + /** create a directory and set EC to be used on the directory */ + rc = dfs_open(dfs_l, NULL, "d1", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, 0, 0, NULL, + &dir); + assert_int_equal(rc, 0); + rc = dfs_obj_set_oclass(dfs_l, dir, 0, ecidx); + assert_int_equal(rc, 0); + /** get the dir info to query what oclass will be used */ + rc = dfs_obj_get_info(dfs_l, dir, &oinfo); + assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_dir_oclass_id, OC_RP_3G1); + assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_file_oclass_id, ecidx); + assert_int_equal(rc, 0); + dfs_release(dir); + rc = dfs_umount(dfs_l); assert_int_equal(rc, 0); rc = daos_cont_close(coh, NULL); @@ -2520,6 +2551,22 @@ dfs_test_oclass_hints(void **state) rc = compare_oclass(coh, cid, OC_RP_4GX); assert_rc_equal(rc, 0); + /** create a directory and set EC to be used on the directory */ + rc = dfs_open(dfs_l, NULL, "d1", S_IFDIR | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT, 0, 0, NULL, + &dir); + assert_int_equal(rc, 0); + rc = dfs_obj_set_oclass(dfs_l, dir, 0, ecidx); + assert_int_equal(rc, 0); + /** get the dir info to query what oclass will be used */ + rc = dfs_obj_get_info(dfs_l, dir, &oinfo); + assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_dir_oclass_id, OC_RP_4G1); + assert_int_equal(rc, 0); + rc = compare_oclass(coh, oinfo.doi_file_oclass_id, ecidx); + assert_int_equal(rc, 0); + dfs_release(dir); + + assert_int_equal(rc, 0); rc = dfs_umount(dfs_l); assert_int_equal(rc, 0); rc = daos_cont_close(coh, NULL); diff --git a/src/tests/suite/dfuse_test.c b/src/tests/suite/dfuse_test.c index 24c9d38aa1c..bf078145746 100644 --- 
a/src/tests/suite/dfuse_test.c +++ b/src/tests/suite/dfuse_test.c @@ -586,6 +586,15 @@ do_mtime(void **state) rc = close(fd); assert_return_code(rc, errno); + usleep(10000); + prev_ts.tv_sec = stbuf.st_mtim.tv_sec; + prev_ts.tv_nsec = stbuf.st_mtim.tv_nsec; + rc = utimensat(root, "mtime_file", NULL, 0); + assert_return_code(rc, errno); + rc = fstatat(root, "mtime_file", &stbuf, 0); + assert_return_code(rc, errno); + assert_true(timespec_gt(stbuf.st_mtim, prev_ts)); + rc = unlinkat(root, "mtime_file", 0); assert_return_code(rc, errno); @@ -858,8 +867,10 @@ do_fdcallscheck(void **state) char path_old[512]; char path_new[512]; char *env_ldpreload; - bool use_dfuse = true; - bool with_pil4dfs = false; + char *env_compatible; + bool use_dfuse = true; + bool with_pil4dfs = false; + bool compatible_mode = false; /* "/tmp/dfuse-test" is assigned in src/tests/ftest/daos_test/dfuse.py */ char native_mount_dir[] = "/tmp/dfuse-test"; @@ -871,6 +882,11 @@ do_fdcallscheck(void **state) /* libioil cannot pass this test since low fds are only temporarily blocked */ with_pil4dfs = true; + env_compatible = getenv("D_IL_COMPATIBLE"); + if ((env_compatible != NULL) && (strcmp(env_compatible, "1") == 0)) + /* libioil cannot pass this test since low fds are only temporarily blocked */ + compatible_mode = true; + root = open(test_dir, O_PATH | O_DIRECTORY); assert_return_code(root, errno); @@ -984,7 +1000,7 @@ do_fdcallscheck(void **state) fd = openat(root, "test_file", O_RDWR | O_CREAT, S_IWUSR | S_IRUSR); assert_return_code(fd, errno); - if (with_pil4dfs && use_dfuse) + if (with_pil4dfs && use_dfuse && !compatible_mode) assert_true(is_fd_large(fd)); fd_new = 10000; @@ -1004,12 +1020,62 @@ do_fdcallscheck(void **state) rc = close(fd); assert_return_code(rc, errno); + rc = close(root); + assert_return_code(rc, errno); + /* end testing dup3() */ + + /* start testing dup3() - closing old fd first */ + root = open(test_dir, O_PATH | O_DIRECTORY); + assert_return_code(root, errno); + + fd = 
openat(root, "test_file", O_RDWR | O_CREAT, S_IWUSR | S_IRUSR); + assert_return_code(fd, errno); + + fd_new = 10000; + flag = O_CLOEXEC; + rc = dup3(fd, fd_new, flag); + assert_true(rc == fd_new); + + rc = close(fd); + assert_return_code(rc, errno); + + rc = close(fd_new); + assert_return_code(rc, errno); + /* end testing dup3() - closing old fd first */ + + /* start testing dup() */ + fd = openat(root, "test_file", O_RDWR | O_CREAT, S_IWUSR | S_IRUSR); + assert_return_code(fd, errno); + + fd_new = dup(fd); + assert_true(fd_new > 0); + + /* close the new fd first */ + rc = close(fd_new); + assert_return_code(rc, errno); + + rc = close(fd); + assert_return_code(rc, errno); + + fd = openat(root, "test_file", O_RDWR | O_CREAT, S_IWUSR | S_IRUSR); + assert_return_code(fd, errno); + + fd_new = dup(fd); + assert_true(fd_new > 0); + + /* close the old fd first */ + rc = close(fd); + assert_return_code(rc, errno); + + rc = close(fd_new); + assert_return_code(rc, errno); + /* end testing dup3() - closing old fd first */ + rc = unlinkat(root, "test_file", 0); assert_return_code(rc, errno); rc = close(root); assert_return_code(rc, errno); - /* end testing dup3() */ } /* diff --git a/src/utils/ctl/cart_ctl.c b/src/utils/ctl/cart_ctl.c index 3bdf65b2bbb..e135fe365f1 100644 --- a/src/utils/ctl/cart_ctl.c +++ b/src/utils/ctl/cart_ctl.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2018-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -54,6 +55,7 @@ enum cmd_t { CMD_SET_FI_ATTR, CMD_LOG_SET, CMD_LOG_ADD_MSG, + CMD_DUMP_COUNTERS, }; struct cmd_info { @@ -78,6 +80,7 @@ struct cmd_info cmds[] = { DEF_CMD(CMD_SET_FI_ATTR, CRT_OPC_CTL_FI_SET_ATTR), DEF_CMD(CMD_LOG_SET, CRT_OPC_CTL_LOG_SET), DEF_CMD(CMD_LOG_ADD_MSG, CRT_OPC_CTL_LOG_ADD_MSG), + DEF_CMD(CMD_DUMP_COUNTERS, CRT_OPC_CTL_DUMP_COUNTERS), }; static char * @@ -270,7 +273,7 @@ print_usage_msg(const char *msg) msg("Usage: cart_ctl --group-name name --rank " "start-end,start-end,rank,rank\n"); msg("\ncmds: get_uri_cache, list_ctx, get_hostname, get_pid, "); - msg("set_log, set_fi_attr, add_log_msg\n"); + msg("set_log, set_fi_attr, add_log_msg, dump_counters\n"); msg("\nset_log:\n"); msg("\tSet log to mask passed via -l argument\n"); msg("\nget_uri_cache:\n"); @@ -281,6 +284,8 @@ print_usage_msg(const char *msg) msg("\tPrint hostnames of specified ranks\n"); msg("\nget_pid:\n"); msg("\tReturn pids of the specified ranks\n"); + msg("\ndump_counters:\n"); + msg("\tDump mercury counters into the server log\n"); msg("\nset_fi_attr\n"); msg("\tset fault injection attributes for a fault ID. 
This command\n" "\tmust be accompanied by the option\n" @@ -337,6 +342,8 @@ parse_args(int argc, char **argv) ctl_gdata.cg_cmd_code = CMD_LOG_ADD_MSG; else if (strcmp(argv[1], "use_daos_agent_env") == 0) ctl_gdata.cg_use_daos_agent_env = true; + else if (strcmp(argv[1], "dump_counters") == 0) + ctl_gdata.cg_cmd_code = CMD_DUMP_COUNTERS; else { print_usage_msg("Invalid command\n"); D_GOTO(out, rc = -DER_INVAL); @@ -523,6 +530,10 @@ ctl_cli_cb(const struct crt_cb_info *cb_info) msg("pid: %d\n", out->cgp_pid); } break; + case CMD_DUMP_COUNTERS: { + msg("counters dumped into a server log\n"); + } break; + default: break; } diff --git a/src/utils/ddb/README.md b/src/utils/ddb/README.md index 58ceabf9098..8fbabf37cfa 100644 --- a/src/utils/ddb/README.md +++ b/src/utils/ddb/README.md @@ -24,11 +24,11 @@ The primary layers for the application are: The golang interface which handles parsing most of the user input. The github.com/jessevdk/go-flags module handles the user input from the command -line. This includes determining if the -R and -f options are passed and if a +line. This includes determining if the -f option is passed and if a path to a vos file was supplied. The github.com/desertbit/grumble module handles the execution of the commands, -whether from interactive mode or from the values of -R or -f. It also supplies +whether from interactive mode or from the -f value. It also supplies the interactive mode, managing history, input keys, etc. The golang code also calls the c code functions to initialize daos and vos. @@ -45,63 +45,141 @@ VOS api. This layer will adapt the needs of the ddb commands to the current VOS API implementation, making the VOS interaction a bit nicer for ddb. -# Help and Usage +## Help and Usage ``` $ ddb -h Usage: - ddb [OPTIONS] [] + ddb [OPTIONS] [vos_file_path] [ddb_command] [ddb_command_args...] The DAOS Debug Tool (ddb) allows a user to navigate through and modify a file in the VOS format. 
It offers both a command line and interactive -shell mode. If the '-R' or '-f' options are not provided, then it will -run in interactive mode. In order to modify the file, the '-w' option -must be included. The optional will be opened before running -commands supplied by '-R' or '-f' or entering interactive mode. +shell mode. If neither a single command or '-f' option is provided, then +the tool will run in interactive mode. In order to modify the VOS file, +the '-w' option must be included. If supplied, the VOS file supplied in +the first positional parameter will be opened before commands are executed. + +Many of the commands take a vos tree path. The format for this path +is [cont]/[obj]/[dkey]/[akey]/[extent]. +- cont - the full container uuid. +- obj - the object id. +- keys (akey, dkey) - there are multiple types of keys +-- string keys are simply the string value. If the size of the +key is greater than strlen(key), then the size is included at +the end of the string value. Example: 'akey{5}' is the key: akey +with a null terminator at the end. +-- number keys are formatted as '{[type]: NNN}' where type is +'uint8, uint16, uint32, or uint64'. NNN can be a decimal or +hex number. Example: '{uint32: 123456}' +-- binary keys are formatted as '{bin: 0xHHH}' where HHH is the hex +representation of the binary key. Example: '{bin: 0x1a2b}' +- extent for array values - in the format {lo-hi}. + +To make it easier to navigate the tree, indexes can be +used instead of the path part. The index is in the format [i]. Indexes +and actual path values can be used together + +Example Paths: +/3550f5df-e6b1-4415-947e-82e15cf769af/939000573846355970.0.13.1/dkey/akey/[0-1023] +[0]/[1]/[2]/[1]/[9] +/[0]/939000573846355970.0.13.1/[2]/akey{5}/[0-1023] + Application Options: - -R, --run_cmd= Execute the single command , then exit - -f, --file_cmd= Path to a file container a list of ddb commands, one - command per line, then exit. - -w, --write_mode Open the vos file in write mode. 
+ --debug enable debug output + -w, --write_mode Open the vos file in write mode. + -f, --cmd_file= Path to a file containing a sequence of ddb commands to execute. + -p, --db_path= Path to the sys db. + -v, --version Show version Help Options: - -h, --help Show this help message + -h, --help Show this help message ``` -Interactive mode help +### Interactive mode help + ``` $ help -The DAOS Debug Tool (ddb) allows a user to navigate through and modify -a file in the VOS format. In order to modify the file, the '-w' option must -be included when opening the vos file. +Commands: + clear clear the screen + exit exit the shell + help use 'help [command]' for command help + quit, q exit the shell + version Print ddb version + +smd + smd_sync Restore the SMD file with backup from blob + +vos + close Close the currently opened vos pool shard + dev_list List all devices + dev_replace Replace an old device with a new unused device + dtx_act_abort Mark the active dtx entry as aborted + dtx_act_commit Mark the active dtx entry as committed + dtx_act_discard_invalid Discard the active DTX entry's records if invalid. + dtx_aggr Aggregate DTX entries + dtx_cmt_clear Clear the dtx committed table + dtx_dump Dump the dtx tables + dtx_stat Stat on DTX entries + feature Manage vos pool features + ilog_clear Remove all the ilog entries + ilog_commit Process the ilog + ilog_dump Dump the ilog + ls List containers, objects, dkeys, akeys, and values + open Opens the vos file at + prov_mem Prepare the memory environment for md-on-ssd mode + rm Remove a branch of the VOS tree. + rm_pool Remove a vos pool. + superblock_dump Dump the pool superblock information + value_dump Dump a value + value_load Load a value to a vos path. + vea_dump Dump information from the vea about free regions + vea_update Alter the VEA tree to mark a region as free. +``` -Many of the commands take a vos tree path. The format for this path -is 'cont_uuid/obj_id/dkey/akey/recx'. 
The keys currently only support string -keys. The recx for array values is the format {lo-hi}. To make it easier to -navigate the tree, indexes can be used instead of the path part. The index -is in the format '[i]', for example '[0]/[0]/[0]' +## `prov_mem` command -Commands: - clear clear the screen - clear_cmt_dtx Clear the dtx committed table - close Close the currently opened vos pool shard - commit_ilog Process the ilog - dtx_abort Mark the active dtx entry as aborted - dtx_commit Mark the active dtx entry as committed - dump_dtx Dump the dtx tables - dump_ilog Dump the ilog - dump_superblock Dump the pool superblock information - dump_value Dump a value to a file - dump_vea Dump information from the vea about free regions - exit exit the shell - help use 'help [command]' for command help - load Load a value to a vos path. - ls List containers, objects, dkeys, akeys, and values - open Opens the vos file at - rm Remove a branch of the VOS tree. - rm_ilog Remove all the ilog entries - smd_sync Restore the SMD file with backup from blob - update_vea Alter the VEA tree to mark a region as free. -``` \ No newline at end of file +``` +Prepare the memory environment for md-on-ssd mode + +Usage: + prov_mem [flags] db_path tmpfs_mount + +Args: + db_path string Path to the sys db. + tmpfs_mount string Path to the tmpfs mountpoint. + +Flags: + -h, --help display help + -s, --tmpfs_size uint Specify tmpfs size(GiB) for mount. By default, the total size of all VOS files will be used. +``` + +### Description + +This command is used when working with DAOS in md-on-ssd (metadata-on-SSD) mode. It: + +1. Verifies the system is running in MD-on-SSD mode. +2. Creates a tmpfs mount at the specified path (if not already mounted). +3. Sets up the necessary directory structure. +4. Recreates VOS pool target files on the tmpfs mount. + +### Examples + +**Note**: Please do not omit the first empty argument. 
+ +**Note**: The user you use have to have access to specified resources and be able to mount(2). + +```bash +# Prepare memory environment with auto-calculated tmpfs size +ddb "" prov_mem /path/to/sys/db /mnt/tmpfs + +# Prepare memory environment with specific tmpfs size of 16 GiB +ddb "" prov_mem -s 16 /path/to/sys/db /mnt/tmpfs +``` + +### Notes + +- The `tmpfs_mount` path must not already be a mount point; otherwise, the command will fail with a "busy" error. +- If `tmpfs_size` is not specified, the size will be automatically calculated based on the total size of all VOS files. +- This command requires the system to be configured for MD-on-SSD mode. diff --git a/src/utils/ddb/SConscript b/src/utils/ddb/SConscript index 52861eca39c..df4ce966d14 100644 --- a/src/utils/ddb/SConscript +++ b/src/utils/ddb/SConscript @@ -8,7 +8,7 @@ def scons(): denv = env.Clone() denv.Append(OBJPREFIX="ddb_") - libs = ['vos', 'daos_common_pmem', 'abt', 'gurt', 'uuid', 'bio', 'cart'] + libs = ['vos', 'daos_common_pmem', 'abt', 'gurt', 'uuid', 'bio', 'ssl', 'cart'] # spdk libraries libs += ['spdk_event', 'spdk_log'] libs += ['spdk_bdev', 'spdk_blob', 'spdk_blob_bdev', 'spdk_json'] diff --git a/src/utils/ddb/ddb.c b/src/utils/ddb/ddb.c index a0e975ad97d..fdedeab4132 100644 --- a/src/utils/ddb/ddb.c +++ b/src/utils/ddb/ddb.c @@ -1,20 +1,24 @@ /** * (C) Copyright 2022-2024 Intel Corporation. * (C) Copyright 2025 Vdura Inc. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) + #include -#include -#include -#include #include +#include +#include #include +#include + +#include +#include +#include #include "ddb.h" -#include "daos/common.h" -#include "daos_errno.h" #include "ddb_common.h" #include "ddb_parse.h" diff --git a/src/utils/ddb/ddb.h b/src/utils/ddb/ddb.h index 3bc63c8f40f..818881cd94a 100644 --- a/src/utils/ddb/ddb.h +++ b/src/utils/ddb/ddb.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * (C) Copyright 2025 Vdura Inc. * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -204,6 +204,7 @@ struct feature_options { struct rm_pool_options { const char *path; + const char *db_path; }; struct dev_list_options { diff --git a/src/utils/ddb/ddb_commands.c b/src/utils/ddb/ddb_commands.c index 0e26686a24c..77553fe2bd2 100644 --- a/src/utils/ddb/ddb_commands.c +++ b/src/utils/ddb/ddb_commands.c @@ -1,32 +1,50 @@ /** * (C) Copyright 2022-2024 Intel Corporation. * (C) Copyright 2025 Vdura Inc. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) -#include -#include #include #include -#include "daos_errno.h" -#include "daos_srv/vos_types.h" -#include "daos_types.h" +#include +#include +#include +#include +#include +#include + #include "ddb_common.h" #include "ddb_parse.h" #include "ddb.h" #include "ddb_vos.h" #include "ddb_printer.h" -#include "daos.h" #include "ddb_tree_path.h" -#include "gurt/common.h" -#include "gurt/debug.h" #define ilog_path_required_error_message "Path to object, dkey, or akey required\n" #define error_msg_write_mode_only "Can only modify the VOS tree in 'write mode'\n" +/* clang-format off */ +#define DDB_POOL_SHOULD_OPEN(ctx) \ + do { \ + if (daos_handle_is_inval((ctx)->dc_poh)) { \ + ddb_error(ctx, "Cannot operate on a closed pool. Open it firstly.\n"); \ + return -DER_NO_HDL; \ + } \ + } while (0) + +#define DDB_POOL_SHOULD_CLOSE(ctx) \ + do { \ + if (daos_handle_is_valid((ctx)->dc_poh)) { \ + ddb_error(ctx, "Cannot operate on an opened pool. 
Close it firstly.\n"); \ + return -DER_BUSY; \ + } \ + } while (0) +/* clang-format on */ + int ddb_run_version(struct ddb_ctx *ctx) { @@ -62,12 +80,10 @@ ddb_pool_is_open(struct ddb_ctx *ctx) int ddb_run_open(struct ddb_ctx *ctx, struct open_options *opt) { - if (ddb_pool_is_open(ctx)) { - ddb_error(ctx, "Must close pool before can open another\n"); - return -DER_EXIST; - } + DDB_POOL_SHOULD_CLOSE(ctx); + ctx->dc_write_mode = opt->write_mode; - return dv_pool_open(opt->path, opt->db_path, &ctx->dc_poh, 0); + return dv_pool_open(opt->path, opt->db_path, &ctx->dc_poh, 0, ctx->dc_write_mode); } int @@ -75,10 +91,8 @@ ddb_run_close(struct ddb_ctx *ctx) { int rc; - if (!ddb_pool_is_open(ctx)) { - ddb_error(ctx, "No pool open to close\n"); + if (!ddb_pool_is_open(ctx)) return 0; - } rc = dv_pool_close(ctx->dc_poh); ctx->dc_poh = DAOS_HDL_INVAL; @@ -217,12 +231,9 @@ ddb_run_ls(struct ddb_ctx *ctx, struct ls_options *opt) struct dv_tree_path vtp; struct ls_ctx lsctx = {0}; - if (daos_handle_is_inval(ctx->dc_poh)) { - ddb_error(ctx, "Not connected to a pool. 
Use 'open' to connect to a pool.\n"); - return -DER_NONEXIST; - } - rc = init_path(ctx, opt->path, &itp); + DDB_POOL_SHOULD_OPEN(ctx); + rc = init_path(ctx, opt->path, &itp); if (!SUCCESS(rc)) return rc; @@ -266,8 +277,9 @@ ddb_run_superblock_dump(struct ddb_ctx *ctx) { int rc; - rc = dv_superblock(ctx->dc_poh, print_superblock_cb, ctx); + DDB_POOL_SHOULD_OPEN(ctx); + rc = dv_superblock(ctx->dc_poh, print_superblock_cb, ctx); if (rc == -DER_DF_INVAL) ddb_error(ctx, "Error with pool superblock"); @@ -331,6 +343,8 @@ ddb_run_value_dump(struct ddb_ctx *ctx, struct value_dump_options *opt) dv_dump_value_cb cb = NULL; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!opt->path) { ddb_error(ctx, "A VOS path to dump is required.\n"); return -DER_INVAL; @@ -383,6 +397,8 @@ ddb_run_ilog_dump(struct ddb_ctx *ctx, struct ilog_dump_options *opt) daos_handle_t coh; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!opt->path) { ddb_error(ctx, ilog_path_required_error_message); return -DER_INVAL; @@ -460,6 +476,8 @@ ddb_run_dtx_dump(struct ddb_ctx *ctx, struct dtx_dump_options *opt) bool both = !(opt->committed ^ opt->active); struct dtx_cb_args args = {.ctx = ctx, .entry_count = 0}; + DDB_POOL_SHOULD_OPEN(ctx); + rc = init_path(ctx, opt->path, &itp); if (!SUCCESS(rc)) return rc; @@ -512,6 +530,8 @@ ddb_run_rm(struct ddb_ctx *ctx, struct rm_options *opt) struct dv_tree_path vtp; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -549,6 +569,8 @@ ddb_run_value_load(struct ddb_ctx *ctx, struct value_load_options *opt) size_t file_size; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -616,6 +638,8 @@ process_ilog_op(struct ddb_ctx *ctx, char *path, enum ddb_ilog_op op) daos_handle_t coh = {0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -686,6 
+710,8 @@ ddb_run_dtx_cmt_clear(struct ddb_ctx *ctx, struct dtx_cmt_clear_options *opt) daos_handle_t coh = {0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -764,10 +790,7 @@ ddb_run_smd_sync(struct ddb_ctx *ctx, struct smd_sync_options *opt) char db_path[DDB_PATH_MAX] = DEFAULT_DB_PATH; int rc; - if (daos_handle_is_valid(ctx->dc_poh)) { - ddb_print(ctx, "Close pool connection before attempting to sync smd\n"); - return -DER_INVAL; - } + DDB_POOL_SHOULD_CLOSE(ctx); if (opt->nvme_conf != NULL) { if (strlen(opt->nvme_conf) == 0 || strlen(opt->nvme_conf) >= DDB_PATH_MAX) { @@ -816,6 +839,8 @@ ddb_run_vea_dump(struct ddb_ctx *ctx) struct dump_vea_cb_args args = {.dva_ctx = ctx, .dva_count = 0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + rc = dv_enumerate_vea(ctx->dc_poh, dump_vea_cb, &args); ddb_printf(ctx, "Total Free Regions: %d\n", args.dva_count); @@ -894,6 +919,8 @@ ddb_run_vea_update(struct ddb_ctx *ctx, struct vea_update_options *opt) uint32_t blk_cnt; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -983,6 +1010,8 @@ ddb_run_dtx_act_commit(struct ddb_ctx *ctx, struct dtx_act_options *opt) struct dtx_modify_args args = {0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -1013,6 +1042,8 @@ ddb_run_dtx_act_abort(struct ddb_ctx *ctx, struct dtx_act_options *opt) struct dtx_modify_args args = {0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -1070,7 +1101,8 @@ ddb_run_feature(struct ddb_ctx *ctx, struct feature_options *opt) if (!opt->db_path || strnlen(opt->db_path, PATH_MAX) == 0) opt->db_path = ctx->dc_db_path; - rc = dv_pool_open(opt->path, opt->db_path, &ctx->dc_poh, VOS_POF_FOR_FEATURE_FLAG); + rc = dv_pool_open(opt->path, 
opt->db_path, &ctx->dc_poh, VOS_POF_FOR_FEATURE_FLAG, + ctx->dc_write_mode); if (rc) return rc; close = true; @@ -1115,12 +1147,9 @@ ddb_run_feature(struct ddb_ctx *ctx, struct feature_options *opt) int ddb_run_rm_pool(struct ddb_ctx *ctx, struct rm_pool_options *opt) { - if (ddb_pool_is_open(ctx)) { - ddb_error(ctx, "Must close pool before can open another\n"); - return -DER_BUSY; - } + DDB_POOL_SHOULD_CLOSE(ctx); - return dv_pool_destroy(opt->path); + return dv_pool_destroy(opt->path, opt->db_path); } #define DTI_ALL "all" @@ -1161,6 +1190,8 @@ ddb_run_dtx_act_discard_invalid(struct ddb_ctx *ctx, struct dtx_act_options *opt struct dtx_active_entry_discard_invalid_cb_arg bundle = {.ctx = ctx, .args = &args}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); return -DER_INVAL; @@ -1197,10 +1228,7 @@ ddb_run_dev_list(struct ddb_ctx *ctx, struct dev_list_options *opt) d_list_t dev_list; int rc, dev_cnt = 0; - if (daos_handle_is_valid(ctx->dc_poh)) { - ddb_print(ctx, "Close pool connection before attempting to list devices\n"); - return -DER_INVAL; - } + DDB_POOL_SHOULD_CLOSE(ctx); if (opt->db_path != NULL) { if (strlen(opt->db_path) == 0 || strlen(opt->db_path) >= DDB_PATH_MAX) { @@ -1240,10 +1268,7 @@ ddb_run_dev_replace(struct ddb_ctx *ctx, struct dev_replace_options *opt) uuid_t old_devid, new_devid; int rc; - if (daos_handle_is_valid(ctx->dc_poh)) { - ddb_print(ctx, "Close pool connection before attempting to replace device\n"); - return -DER_INVAL; - } + DDB_POOL_SHOULD_CLOSE(ctx); if (opt->db_path != NULL) { if (strlen(opt->db_path) == 0 || strlen(opt->db_path) >= DDB_PATH_MAX) { @@ -1506,35 +1531,48 @@ dtx_stat_cont_cb(daos_handle_t ih, vos_iter_entry_t *entry, vos_iter_type_t type if (args->opt->details) { if (args->aggr_epoch < args_tmp.aggr_epoch) args->aggr_epoch = args_tmp.aggr_epoch; - if (args->time_stat.dts_cmt_time[0] > args_tmp.time_stat.dts_cmt_time[0]) - args->time_stat.dts_cmt_time[0] = 
args_tmp.time_stat.dts_cmt_time[0]; - if (args->time_stat.dts_cmt_time[1] < args_tmp.time_stat.dts_cmt_time[1]) - args->time_stat.dts_cmt_time[1] = args_tmp.time_stat.dts_cmt_time[1]; - if (args->time_stat.dts_cmt_time[2] == 0) - args->time_stat.dts_cmt_time[2] = args_tmp.time_stat.dts_cmt_time[2]; + if (args->time_stat.dts_cmt_time[DTX_TIME_STAT_MIN] > + args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MIN]) + args->time_stat.dts_cmt_time[DTX_TIME_STAT_MIN] = + args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MIN]; + if (args->time_stat.dts_cmt_time[DTX_TIME_STAT_MAX] < + args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MAX]) + args->time_stat.dts_cmt_time[DTX_TIME_STAT_MAX] = + args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MAX]; + if (args->time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN] == 0) + args->time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN] = + args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN]; else { long double tmp_mean; - tmp_mean = args->time_stat.dts_cmt_time[2] * (long double)args->cmt_cnt; - tmp_mean += (long double)args_tmp.time_stat.dts_cmt_time[2] * - (long double)args_tmp.cmt_cnt; + tmp_mean = args->time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN] * + (long double)args->cmt_cnt; + tmp_mean += + (long double)args_tmp.time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN] * + (long double)args_tmp.cmt_cnt; tmp_mean /= (long double)(args->cmt_cnt + args_tmp.cmt_cnt); - args->time_stat.dts_cmt_time[2] = tmp_mean; + args->time_stat.dts_cmt_time[DTX_TIME_STAT_MEAN] = tmp_mean; } - if (args->time_stat.dts_epoch[0] > args_tmp.time_stat.dts_epoch[0]) - args->time_stat.dts_epoch[0] = args_tmp.time_stat.dts_epoch[0]; - if (args->time_stat.dts_epoch[1] < args_tmp.time_stat.dts_epoch[1]) - args->time_stat.dts_epoch[1] = args_tmp.time_stat.dts_epoch[1]; - if (args->time_stat.dts_epoch[2] == 0) - args->time_stat.dts_epoch[2] = args_tmp.time_stat.dts_epoch[2]; + if (args->time_stat.dts_epoch[DTX_TIME_STAT_MIN] > + args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MIN]) + 
args->time_stat.dts_epoch[DTX_TIME_STAT_MIN] = + args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MIN]; + if (args->time_stat.dts_epoch[DTX_TIME_STAT_MAX] < + args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MAX]) + args->time_stat.dts_epoch[DTX_TIME_STAT_MAX] = + args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MAX]; + if (args->time_stat.dts_epoch[DTX_TIME_STAT_MEAN] == 0) + args->time_stat.dts_epoch[DTX_TIME_STAT_MEAN] = + args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MEAN]; else { long double tmp_mean; - tmp_mean = args->time_stat.dts_epoch[2] * (long double)args->cmt_cnt; - tmp_mean += (long double)args_tmp.time_stat.dts_epoch[2] * + tmp_mean = args->time_stat.dts_epoch[DTX_TIME_STAT_MEAN] * + (long double)args->cmt_cnt; + tmp_mean += (long double)args_tmp.time_stat.dts_epoch[DTX_TIME_STAT_MEAN] * (long double)args_tmp.cmt_cnt; tmp_mean /= (long double)(args->cmt_cnt + args_tmp.cmt_cnt); - args->time_stat.dts_epoch[2] = tmp_mean; + args->time_stat.dts_epoch[DTX_TIME_STAT_MEAN] = tmp_mean; } } @@ -1578,11 +1616,7 @@ ddb_run_dtx_stat(struct ddb_ctx *ctx, struct dtx_stat_options *opt) struct vos_iter_anchors anchors = {0}; int rc; - if (daos_handle_is_inval(ctx->dc_poh)) { - ddb_error(ctx, "Not connected to a pool. 
Use 'open' to connect to a pool.\n"); - rc = -DER_NONEXIST; - goto done; - } + DDB_POOL_SHOULD_OPEN(ctx); args.ctx = ctx; args.opt = opt; @@ -1592,8 +1626,8 @@ ddb_run_dtx_stat(struct ddb_ctx *ctx, struct dtx_stat_options *opt) } args.cmt_cnt = 0; - args.time_stat.dts_cmt_time[0] = UINT64_MAX; - args.time_stat.dts_epoch[0] = DAOS_EPOCH_MAX; + args.time_stat.dts_cmt_time[DTX_TIME_STAT_MIN] = UINT64_MAX; + args.time_stat.dts_epoch[DTX_TIME_STAT_MIN] = DAOS_EPOCH_MAX; param.ip_hdl = ctx->dc_poh; param.ip_epr.epr_hi = DAOS_EPOCH_MAX; do { @@ -1710,18 +1744,14 @@ ddb_run_dtx_aggr(struct ddb_ctx *ctx, struct dtx_aggr_options *opt) struct vos_iter_anchors anchors = {0}; int rc; + DDB_POOL_SHOULD_OPEN(ctx); + if (!ctx->dc_write_mode) { ddb_error(ctx, error_msg_write_mode_only); rc = -DER_INVAL; goto done; } - if (daos_handle_is_inval(ctx->dc_poh)) { - ddb_error(ctx, "Not connected to a pool. Use 'open' to connect to a pool.\n"); - rc = -DER_NONEXIST; - goto done; - } - switch (opt->format) { case DDB_DTX_AGGR_NOW: args.cmt_time = NULL; @@ -1761,6 +1791,8 @@ ddb_run_prov_mem(struct ddb_ctx *ctx, struct prov_mem_options *opt) { int rc = 0; + DDB_POOL_SHOULD_CLOSE(ctx); + if (opt->db_path == NULL || strlen(opt->db_path) == 0 || strlen(opt->db_path) >= DDB_PATH_MAX) { ddb_errorf(ctx, "db_path '%s' either too short (==0) or too long (>=%d).\n", diff --git a/src/utils/ddb/ddb_main.c b/src/utils/ddb/ddb_main.c index 1db110961e4..050ae9d9176 100644 --- a/src/utils/ddb/ddb_main.c +++ b/src/utils/ddb/ddb_main.c @@ -1,19 +1,23 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Vdura Inc. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) + +#include +#include #include #include + +#include "ddb.h" #include "ddb_main.h" #include "ddb_common.h" #include "ddb_parse.h" #include "ddb_vos.h" -#include "ddb.h" -#include -#include int ddb_init() @@ -248,7 +252,8 @@ ddb_main(struct ddb_io_ft *io_ft, int argc, char *argv[]) if (!SUCCESS(rc)) D_GOTO(done, rc); if (open) { - rc = dv_pool_open(pa.pa_pool_path, pa.pa_db_path, &ctx.dc_poh, 0); + rc = + dv_pool_open(pa.pa_pool_path, pa.pa_db_path, &ctx.dc_poh, 0, ctx.dc_write_mode); if (!SUCCESS(rc)) D_GOTO(done, rc); } diff --git a/src/utils/ddb/ddb_mgmt.c b/src/utils/ddb/ddb_mgmt.c index e9ec2fe7436..3941168eb48 100644 --- a/src/utils/ddb/ddb_mgmt.c +++ b/src/utils/ddb/ddb_mgmt.c @@ -1,9 +1,10 @@ /** * (C) Copyright 2025 Vdura Inc. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) #include #include @@ -13,7 +14,6 @@ #include #include -#include #include #include #include @@ -32,6 +32,7 @@ ddb_auto_calculate_tmpfs_mount_size(unsigned int *tmpfs_mount_size) int rc = 0; int pool_list_cnt; uint64_t pool_size; + uint64_t rdb_size; uint64_t total_size; const unsigned long GiB = (1ul << 30); @@ -47,17 +48,32 @@ ddb_auto_calculate_tmpfs_mount_size(unsigned int *tmpfs_mount_size) total_size = 0; d_list_for_each_entry(pool_info, &pool_list, spi_link) { if ((pool_info->spi_blob_sz[SMD_DEV_TYPE_META] == 0) || - (pool_info->spi_flags[SMD_DEV_TYPE_META] & SMD_POOL_IN_CREATION)) { + (pool_info->spi_flags[SMD_DEV_TYPE_META] & SMD_POOL_IN_CREATION)) continue; + + rdb_size = 0; + rc = smd_rdb_get_blob_sz(pool_info->spi_id, &rdb_size); + if (rc == 0) { + /** Align to 4K */ + rdb_size = D_ALIGNUP(rdb_size, 1ULL << 12); + total_size += rdb_size; + } else if (rc == -DER_NONEXIST) { + rc = 0; + } else { + D_ERROR("Failed to extract the size 
of rdb for " DF_UUID ": " DF_RC "\n", + DP_UUID(pool_info->spi_id), DP_RC(rc)); + break; } + D_ASSERT(pool_info->spi_scm_sz > 0); /** Align to 4K */ pool_size = (D_ALIGNUP(pool_info->spi_scm_sz, 1ULL << 12)) * pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META]; total_size += pool_size; - D_INFO("Pool " DF_UUID " required scm size: " DF_U64 "", DP_UUID(pool_info->spi_id), - pool_size); + + D_INFO("Pool " DF_UUID " required scm size " DF_U64 ", rdb size " DF_U64 "\n", + DP_UUID(pool_info->spi_id), pool_size, rdb_size); } d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { @@ -140,6 +156,7 @@ ddb_recreate_pooltgts(const char *storage_path) struct smd_pool_info *pool_info = NULL; struct smd_pool_info *tmp; d_list_t pool_list; + daos_size_t rdb_size; int rc = 0; int pool_list_cnt; @@ -152,20 +169,27 @@ ddb_recreate_pooltgts(const char *storage_path) } d_list_for_each_entry(pool_info, &pool_list, spi_link) { - if ((pool_info->spi_blob_sz[SMD_DEV_TYPE_META] == 0) || - (pool_info->spi_flags[SMD_DEV_TYPE_META] & SMD_POOL_IN_CREATION)) { + if (pool_info->spi_blob_sz[SMD_DEV_TYPE_META] == 0 || + pool_info->spi_flags[SMD_DEV_TYPE_META] & SMD_POOL_IN_CREATION) continue; + + rdb_size = 0; + rc = smd_rdb_get_blob_sz(pool_info->spi_id, &rdb_size); + if (rc != 0 && rc != -DER_NONEXIST) { + D_ERROR("Failed to extract the size of rdb for " DF_UUID ": " DF_RC "\n", + DP_UUID(pool_info->spi_id), DP_RC(rc)); + break; } - D_INFO("Recreating files for the pool " DF_UUID "", DP_UUID(pool_info->spi_id)); + D_INFO("Recreating files for the pool " DF_UUID "\n", DP_UUID(pool_info->spi_id)); D_ASSERT(pool_info->spi_scm_sz > 0); - /* specify rdb_blob_sz as zero to skip rdb file creation */ + rc = ds_mgmt_tgt_recreate(pool_info->spi_id, pool_info->spi_scm_sz, - pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], 0, + pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], + pool_info->spi_tgts[SMD_DEV_TYPE_META], rdb_size, storage_path, NULL); - if (rc) { + if (rc != 0) break; - } } 
d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { diff --git a/src/utils/ddb/ddb_parse.c b/src/utils/ddb/ddb_parse.c index e1bf64272a7..8586427e6ad 100644 --- a/src/utils/ddb/ddb_parse.c +++ b/src/utils/ddb/ddb_parse.c @@ -1,14 +1,17 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) #include #include -#include -#include "daos_errno.h" + +#include +#include + #include "ddb_common.h" #include "ddb_parse.h" @@ -52,6 +55,11 @@ vos_path_parse(const char *path, struct vos_file_parts *vos_file_parts) strncpy(vos_file_parts->vf_vos_file, tok, ARRAY_SIZE(vos_file_parts->vf_vos_file) - 1); + if (strcmp(vos_file_parts->vf_vos_file, "rdb-pool") == 0) { + vos_file_parts->vf_target_idx = BIO_SYS_TGT_ID; + goto done; + } + /* * file name should be vos-N ... split on "-" * If not, might be test, just assume target of 0 @@ -446,8 +454,12 @@ key_parse_str(const char *input, daos_key_t *key) key_len++; } } - if (size == 0) + if (size == 0) { + if (key_len == 0) { + return -DER_INVAL; + } size = key_len; + } if (size < key_len) return -DER_INVAL; diff --git a/src/utils/ddb/ddb_parse.h b/src/utils/ddb/ddb_parse.h index df5d43771db..439791823a2 100644 --- a/src/utils/ddb/ddb_parse.h +++ b/src/utils/ddb/ddb_parse.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -23,7 +23,7 @@ struct program_args { bool pa_write_mode; bool pa_get_help; }; -#define DB_PATH_LEN 64 +#define DB_PATH_LEN 256 struct vos_file_parts { char vf_db_path[DB_PATH_LEN]; uuid_t vf_pool_uuid; diff --git a/src/utils/ddb/ddb_printer.c b/src/utils/ddb/ddb_printer.c index aadd7cd4b48..dd78efbb481 100644 --- a/src/utils/ddb/ddb_printer.c +++ b/src/utils/ddb/ddb_printer.c @@ -1,8 +1,10 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) #include "ddb_printer.h" @@ -135,22 +137,20 @@ void ddb_print_sv(struct ddb_ctx *ctx, struct ddb_sv *sv, uint32_t indent) { print_indent(ctx, indent); - ddb_printf(ctx, DF_IDX" Single Value (Length: "DF_U64" bytes)\n", - sv->ddbs_idx, - sv->ddbs_record_size); + ddb_printf(ctx, DF_IDX " Single Value (Length: " DF_U64 " bytes, Epoch: " DF_U64 ")\n", + sv->ddbs_idx, sv->ddbs_record_size, sv->ddbs_epoch); } void ddb_print_array(struct ddb_ctx *ctx, struct ddb_array *array, uint32_t indent) { print_indent(ctx, indent); - ddb_printf(ctx, DF_IDX" Array Value (Length: "DF_U64" records, Record Indexes: " - "{"DF_U64"-"DF_U64"}, Record Size: "DF_U64")\n", - array->ddba_idx, - array->ddba_recx.rx_nr, - array->ddba_recx.rx_idx, - array->ddba_recx.rx_idx + array->ddba_recx.rx_nr - 1, - array->ddba_record_size); + ddb_printf(ctx, + DF_IDX " Array Value (Length: " DF_U64 " records, Record Indexes: " + "{" DF_U64 "-" DF_U64 "}, Record Size: " DF_U64 ", Epoch: " DF_U64 ")\n", + array->ddba_idx, array->ddba_recx.rx_nr, array->ddba_recx.rx_idx, + array->ddba_recx.rx_idx + array->ddba_recx.rx_nr - 1, array->ddba_record_size, + array->ddba_epoch); } void diff --git a/src/utils/ddb/ddb_spdk.c 
b/src/utils/ddb/ddb_spdk.c index fe1dff9a822..aa55ce8e782 100644 --- a/src/utils/ddb/ddb_spdk.c +++ b/src/utils/ddb/ddb_spdk.c @@ -1,12 +1,12 @@ /** * (C) Copyright 2022 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) -#include -#include +#include #include #include #include @@ -14,7 +14,9 @@ #include #include #include -#include + +#include +#include #include "ddb_common.h" #include "ddb_spdk.h" diff --git a/src/utils/ddb/ddb_tree_path.c b/src/utils/ddb/ddb_tree_path.c index e57bff9b245..3bfc104424e 100644 --- a/src/utils/ddb/ddb_tree_path.c +++ b/src/utils/ddb/ddb_tree_path.c @@ -1,8 +1,10 @@ /** * (C) Copyright 2023-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) #include "ddb_tree_path.h" #include "ddb_printer.h" diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index 5d53e8b3609..4ee22fbe099 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1,32 +1,38 @@ /** * (C) Copyright 2022-2025 Intel Corporation. * (C) Copyright 2025 Vdura Inc. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(ddb) #include #include #include + +#include #include -#include -#include #include +#include #include + #include "ddb_common.h" #include "ddb_parse.h" #include "ddb_mgmt.h" #include "ddb_vos.h" #include "ddb_spdk.h" + #define ddb_vos_iterate(param, iter_type, recursive, anchors, cb, args) \ vos_iterate(param, iter_type, recursive, \ anchors, cb, NULL, args, NULL) int -dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t flags) +dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t flags, + bool write_mode) { struct vos_file_parts path_parts = {0}; + int cow_val; int rc; /* @@ -44,11 +50,34 @@ dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t strncpy(path_parts.vf_db_path, db_path, sizeof(path_parts.vf_db_path) - 1); } + /** + * When the user requests read‑only mode (write_mode == false), DDB itself will not attempt + * to modify the pool. However, PMEMOBJ performs several operations that do modify the pool + * during open and/or close, for example: + * - Internal bookkeeping required to ensure resilience in case of an ADR failure (SDS). + * - ULOG replay, which restores the pool to a consistent state. + * These mechanisms cannot be disabled because they are essential for PMEMOBJ to maintain + * the consistency of the pool. + * + * However, since none of these changes need to be persisted when the pool is opened in + * read‑only mode (write_mode == false), we can work around this by mapping the pool using + * copy‑on‑write. Copy‑on‑write allows pages to be read normally, but when a page is + * modified, a new private copy is allocated. As a result, any changes made to + * the mapped memory do not propagate to the persistent medium. 
+ */ + if (!write_mode) { + cow_val = 1; + rc = pmemobj_ctl_set(NULL, "copy_on_write.at_open", &cow_val); + if (rc != 0) { + return daos_errno2der(errno); + } + } + rc = vos_self_init(path_parts.vf_db_path, true, path_parts.vf_target_idx); if (!SUCCESS(rc)) { D_ERROR("Failed to initialize VOS with path '%s': "DF_RC"\n", path_parts.vf_db_path, DP_RC(rc)); - return rc; + goto exit; } rc = vos_pool_open(path, path_parts.vf_pool_uuid, flags, poh); @@ -57,11 +86,18 @@ dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t vos_self_fini(); } +exit: + if (!write_mode) { + /** Restore the default value. */ + cow_val = 0; + pmemobj_ctl_set(NULL, "copy_on_write.at_open", &cow_val); + } + return rc; } int -dv_pool_destroy(const char *path) +dv_pool_destroy(const char *path, const char *db_path) { struct vos_file_parts path_parts = {0}; int rc, flags = 0; @@ -70,6 +106,11 @@ dv_pool_destroy(const char *path) if (!SUCCESS(rc)) return rc; + if (db_path != NULL && strnlen(db_path, PATH_MAX) != 0) { + memset(path_parts.vf_db_path, 0, sizeof(path_parts.vf_db_path)); + strncpy(path_parts.vf_db_path, db_path, sizeof(path_parts.vf_db_path) - 1); + } + rc = vos_self_init(path_parts.vf_db_path, true, path_parts.vf_target_idx); if (!SUCCESS(rc)) { D_ERROR("Failed to initialize VOS with path '%s': " DF_RC "\n", @@ -778,8 +819,9 @@ handle_sv(struct ddb_iter_ctx *ctx, vos_iter_entry_t *entry) D_ASSERT(ctx && ctx->handlers && ctx->handlers->ddb_sv_handler); value.ddbs_record_size = entry->ie_rsize; - value.ddbs_idx = ctx->value_seen++; - value.ddbs_path = &ctx->itp; + value.ddbs_epoch = entry->ie_epoch; + value.ddbs_idx = ctx->value_seen++; + value.ddbs_path = &ctx->itp; return ctx->handlers->ddb_sv_handler(&value, ctx->handler_args); } @@ -791,10 +833,11 @@ handle_array(struct ddb_iter_ctx *ctx, vos_iter_entry_t *entry) D_ASSERT(ctx && ctx->handlers && ctx->handlers->ddb_array_handler); itp_set_recx(&ctx->itp, &entry->ie_orig_recx, ctx->value_seen); - 
value.ddba_path = &ctx->itp; + value.ddba_path = &ctx->itp; value.ddba_record_size = entry->ie_rsize; - value.ddba_recx = entry->ie_orig_recx; - value.ddba_idx = ctx->value_seen++; + value.ddba_recx = entry->ie_orig_recx; + value.ddba_epoch = entry->ie_epoch; + value.ddba_idx = ctx->value_seen++; return ctx->handlers->ddb_array_handler(&value, ctx->handler_args); } diff --git a/src/utils/ddb/ddb_vos.h b/src/utils/ddb/ddb_vos.h index 465ad12ccfc..675c4ceb33b 100644 --- a/src/utils/ddb/ddb_vos.h +++ b/src/utils/ddb/ddb_vos.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * (C) Copyright 2025 Vdura Inc. * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -36,25 +36,27 @@ struct ddb_key { }; struct ddb_sv { - uint64_t ddbs_record_size; - uint32_t ddbs_idx; - struct dv_indexed_tree_path *ddbs_path; + uint64_t ddbs_record_size; + daos_epoch_t ddbs_epoch; + uint32_t ddbs_idx; + struct dv_indexed_tree_path *ddbs_path; }; struct ddb_array { - uint64_t ddba_record_size; - daos_recx_t ddba_recx; - uint32_t ddba_idx; - struct dv_indexed_tree_path *ddba_path; - + uint64_t ddba_record_size; + daos_recx_t ddba_recx; + daos_epoch_t ddba_epoch; + uint32_t ddba_idx; + struct dv_indexed_tree_path *ddba_path; }; /* Open and close a pool for a ddb_ctx */ int - dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t flags); + dv_pool_open(const char *path, const char *db_path, daos_handle_t *poh, uint32_t flags, + bool write_mode); int dv_pool_close(daos_handle_t poh); int -dv_pool_destroy(const char *path); +dv_pool_destroy(const char *path, const char *db_path); /* Update vos pool flags */ int diff --git a/src/utils/ddb/tests/ddb_commands_print_tests.c b/src/utils/ddb/tests/ddb_commands_print_tests.c index 5b0d1673816..9b60b070dd5 100644 --- a/src/utils/ddb/tests/ddb_commands_print_tests.c +++ 
b/src/utils/ddb/tests/ddb_commands_print_tests.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -146,25 +147,26 @@ print_key_test(void **state) static void print_sv_test(void **state) { - struct ddb_sv sv = {.ddbs_record_size = 19089555}; + struct ddb_sv sv = {.ddbs_record_size = 19089555, .ddbs_epoch = 49126485506073}; ddb_print_sv(&g_ctx, &sv, 0); - assert_printed_exact("[0] Single Value (Length: 19089555 bytes)\n"); + assert_printed_exact("[0] Single Value (Length: 19089555 bytes, Epoch: 49126485506073)\n"); } static void print_array_test(void **state) { struct ddb_array array = { - .ddba_recx.rx_idx = 64, - .ddba_recx.rx_nr = 128, - .ddba_record_size = 3, - .ddba_idx = 8, + .ddba_recx.rx_idx = 64, + .ddba_recx.rx_nr = 128, + .ddba_record_size = 3, + .ddba_idx = 8, + .ddba_epoch = 49126485506073, }; ddb_print_array(&g_ctx, &array, 0); assert_printed_exact("[8] Array Value (Length: 128 records, " - "Record Indexes: {64-191}, Record Size: 3)\n"); + "Record Indexes: {64-191}, Record Size: 3, Epoch: 49126485506073)\n"); } #define assert_hr_bytes(expected_str, bytes) \ diff --git a/src/utils/ddb/tests/ddb_commands_tests.c b/src/utils/ddb/tests/ddb_commands_tests.c index acb920d9fce..81bc9673dfe 100644 --- a/src/utils/ddb/tests/ddb_commands_tests.c +++ b/src/utils/ddb/tests/ddb_commands_tests.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -181,12 +181,22 @@ dump_ilog_cmd_tests(void **state) assert_success(ddb_run_ilog_dump(&ctx, &opt)); assert_true(dvt_fake_print_called); + /* Dump dkey ilog - invalid */ + dvt_fake_print_called = 0; + opt.path = "[0]/[0]//"; + assert_rc_equal(ddb_run_ilog_dump(&ctx, &opt), -DER_INVAL); + assert_true(dvt_fake_print_called); + /* Dump dkey ilog */ dvt_fake_print_called = 0; opt.path = "[0]/[0]/[0]"; assert_success(ddb_run_ilog_dump(&ctx, &opt)); assert_true(dvt_fake_print_called); + /* Dump akey ilog - invalid */ + opt.path = "[0]/[0]/[0]//"; + assert_rc_equal(ddb_run_ilog_dump(&ctx, &opt), -DER_INVAL); + /* Dump akey ilog */ opt.path = "[0]/[0]/[0]/[0]"; assert_success(ddb_run_ilog_dump(&ctx, &opt)); @@ -573,7 +583,7 @@ dcv_suit_setup(void **state) /* test setup creates the pool, but doesn't open it ... leave it open for these tests */ tctx = *state; - assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0)); + assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0, true)); g_ctx.dc_poh = tctx->dvt_poh; diff --git a/src/utils/ddb/tests/ddb_main_tests.c b/src/utils/ddb/tests/ddb_main_tests.c index 533f64fb422..35a100cf063 100644 --- a/src/utils/ddb/tests/ddb_main_tests.c +++ b/src/utils/ddb/tests/ddb_main_tests.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Vdura Inc. * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -242,7 +243,7 @@ ddb_main_suit_setup(void **state) /* test setup creates the pool, but doesn't open it ... 
leave it open for these tests */ tctx = *state; - assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0)); + assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0, true)); return 0; } diff --git a/src/utils/ddb/tests/ddb_test_driver.c b/src/utils/ddb/tests/ddb_test_driver.c index 07e0b0c8694..89746bd43c5 100644 --- a/src/utils/ddb/tests/ddb_test_driver.c +++ b/src/utils/ddb/tests/ddb_test_driver.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -314,7 +314,7 @@ ddb_teardown_vos(void **state) } if (tctx->dvt_special_pool_destroy) { - rc = dv_pool_destroy(tctx->dvt_pmem_file); + rc = dv_pool_destroy(tctx->dvt_pmem_file, NULL); } else { vos_self_init("/mnt/daos", false, 0); assert_success(vos_pool_destroy(tctx->dvt_pmem_file, tctx->dvt_pool_uuid)); diff --git a/src/utils/ddb/tests/ddb_vos_tests.c b/src/utils/ddb/tests/ddb_vos_tests.c index 2812a775763..f5eb354642e 100644 --- a/src/utils/ddb/tests/ddb_vos_tests.c +++ b/src/utils/ddb/tests/ddb_vos_tests.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Vdura Inc. * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -13,6 +14,8 @@ #include "ddb_cmocka.h" #include "ddb_test_driver.h" +#include "../../placement/tests/place_obj_common.h" + /* * The tests in this file depend on a VOS instance with a bunch of data written. The tests will * verify that different parts of the VOS tree can be navigated/iterated. 
The way the
@@ -182,13 +185,13 @@ open_pool_test(void **state)
 	daos_handle_t            poh;
 	struct dt_vos_pool_ctx  *tctx = *state;

-	assert_rc_equal(-DER_INVAL, dv_pool_open("/bad/path", NULL, &poh, 0));
+	assert_rc_equal(-DER_INVAL, dv_pool_open("/bad/path", NULL, &poh, 0, false));

-	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, 0));
+	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, 0, false));
 	assert_success(dv_pool_close(poh));

 	/* should be able to open again after closing */
-	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, 0));
+	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, 0, false));
 	assert_success(dv_pool_close(poh));
 }

@@ -1087,7 +1090,7 @@ dv_test_setup(void **state)
 	active_entry_handler_called    = 0;
 	committed_entry_handler_called = 0;

-	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0));
+	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &tctx->dvt_poh, 0, true));

 	return 0;
 }

@@ -1108,7 +1111,8 @@ pool_flags_tests(void **state)
 	uint64_t compat_flags;
 	uint64_t incompat_flags;

-	assert_success(dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, VOS_POF_FOR_FEATURE_FLAG));
+	assert_success(
+	    dv_pool_open(tctx->dvt_pmem_file, NULL, &poh, VOS_POF_FOR_FEATURE_FLAG, true));
 	assert_success(dv_pool_get_flags(poh, &compat_flags, &incompat_flags));
 	assert(compat_flags == 0);
 	assert(incompat_flags == 0);
@@ -1120,6 +1124,84 @@ pool_flags_tests(void **state)
 	assert_success(dv_pool_close(poh));
 }

+#define SHA256_DIGEST_LEN 64
+
+struct file_state {
+	struct stat stat;
+	char        digest[SHA256_DIGEST_LEN + 1]; /** +1: fscanf("%64s") also writes a NUL */
+};
+
+#define FILE_STATE_PRE  0
+#define FILE_STATE_POST 1
+
+/**
+ * Use sha256sum utility to get the sha256 digest of the file.
+ *
+ * \note sha256sum was used to avoid introducing libcrypto dependency.
+ */
+static void
+sha256sum(const char *file, char digest[SHA256_DIGEST_LEN])
+{
+	char  cmd[1024];
+	snprintf(cmd, sizeof(cmd), "sha256sum \"%s\"", file);
+
+	FILE *fp = popen(cmd, "r");
+	assert_non_null(fp);
+
+	/** sha256sum prints: <64 hex chars> */
+	assert_int_equal(fscanf(fp, "%" STR(SHA256_DIGEST_LEN) "s", digest), 1);
+
+	pclose(fp);
+}
+
+/**
+ * Simple sequence of operations:
+ * - stat + sha256sum
+ * - open
+ * - update a single value
+ * - close
+ * - stat + sha256sum
+ *
+ * \param[in]	tctx		Test context to get the pool name and access to the pool handle.
+ * \param[out]	fs		[0] state of the pool file at the beginning and [1] at the end.
+ * \param[in]	write_mode	Whether to open the pool in the write mode.
+ */
+static void
+helper_stat_open_modify_close_stat(struct dt_vos_pool_ctx *tctx, struct file_state fs[2],
+				   bool write_mode)
+{
+	const char *path = tctx->dvt_pmem_file;
+
+	assert_int_equal(stat(path, &fs[FILE_STATE_PRE].stat), 0);
+	sha256sum(path, fs[FILE_STATE_PRE].digest);
+
+	assert_success(dv_pool_open(path, NULL, &tctx->dvt_poh, 0, write_mode));
+	update_value_to_modify_tests((void **)&tctx);
+	assert_success(dv_pool_close(tctx->dvt_poh));
+
+	assert_int_equal(stat(path, &fs[FILE_STATE_POST].stat), 0);
+	sha256sum(path, fs[FILE_STATE_POST].digest);
+}
+
+static void
+read_only_vs_write_mode_test(void **state)
+{
+	struct dt_vos_pool_ctx *tctx = *state;
+	struct file_state       fs[2];
+
+	/** In read-only mode, the pool contents remain unchanged, and its mtime stays the same. */
+	helper_stat_open_modify_close_stat(tctx, fs, false /** read-only */);
+	assert_int_equal(fs[FILE_STATE_PRE].stat.st_mtime, fs[FILE_STATE_POST].stat.st_mtime);
+	assert_memory_equal(fs[FILE_STATE_PRE].digest, fs[FILE_STATE_POST].digest,
+			    SHA256_DIGEST_LEN); /** was PRE vs PRE: trivially-true self-compare */
+
+	/** In write mode, the pool contents will change and its mtime will increase.
*/ + helper_stat_open_modify_close_stat(tctx, fs, true /** read-write */); + assert_true(fs[FILE_STATE_PRE].stat.st_mtime < fs[FILE_STATE_POST].stat.st_mtime); + assert_memory_not_equal(fs[FILE_STATE_PRE].digest, fs[FILE_STATE_POST].digest, + SHA256_DIGEST_LEN); +} + /* * All these tests use the same VOS tree that is created at suit_setup. Therefore, tests * that modify the state of the tree (delete, add, etc) should be run after all others. @@ -1149,6 +1231,8 @@ const struct CMUnitTest dv_test_cases[] = { TEST(dtx_abort_active_table), TEST(path_verify), {"pool_flag_update", pool_flags_tests, NULL, NULL}, + {"read_only_vs_write_mode", read_only_vs_write_mode_test, NULL, + NULL}, /* don't want this test to run with setup */ }; int diff --git a/src/utils/dlck/SConscript b/src/utils/dlck/SConscript index 33a4194fce7..34ebafd52ce 100644 --- a/src/utils/dlck/SConscript +++ b/src/utils/dlck/SConscript @@ -11,15 +11,18 @@ def scons(): libs = ['vos_srv', 'daos_common_pmem', 'bio', 'uuid', 'gurt', 'pmemobj', 'abt', 'pthread'] src = [ + 'cmds/dlck_cmd_check.c', 'dlck_abt.c', 'dlck_args_common.c', 'dlck_args_engine.c', 'dlck_args_files.c', 'dlck_args_parse.c', 'dlck_args.c', + 'dlck_checker.c', 'dlck_engine.c', 'dlck_main.c', 'dlck_pool.c', + 'dlck_report.c', 'dlck_sched.c', 'dlck_tls.c', '../../engine/util.c', diff --git a/src/utils/dlck/cmds/dlck_cmd_check.c b/src/utils/dlck/cmds/dlck_cmd_check.c new file mode 100644 index 00000000000..b48116b6dd6 --- /dev/null +++ b/src/utils/dlck/cmds/dlck_cmd_check.c @@ -0,0 +1,228 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include +#include + +#include "../dlck_args.h" +#include "../dlck_bitmap.h" +#include "../dlck_checker.h" +#include "../dlck_engine.h" +#include "../dlck_pool.h" +#include "../dlck_report.h" + +/** + * Target thread (worker). Check a single pool. + * + * \param[in] xa Target's arguments. 
+ * \param[in]	file	File to process.
+ * \param[in]	ck	Checker.
+ *
+ * \retval DER_SUCCESS	Success.
+ * \retval -DER_NOMEM	Out of memory.
+ * \retval -DER_*	Other errors.
+ */
+static int
+pool_process(struct xstream_arg *xa, struct dlck_file *file, struct checker *ck)
+{
+	char         *path;
+	daos_handle_t poh;
+	int           rc;
+
+	rc = dlck_pool_file_preallocate(xa->ctrl->engine.storage_path, file->po_uuid,
+					xa->xs->tgt_id);
+	CK_PRINTL_RC(ck, rc, "VOS file allocation"); /** report this call's rc, not stale xa->rc */
+	if (rc != DER_SUCCESS) {
+		return rc;
+	}
+
+	/** generate a VOS file path */
+	rc = ds_mgmt_file(xa->ctrl->engine.storage_path, file->po_uuid, VOS_FILE, &xa->xs->tgt_id,
+			  &path);
+	if (rc != DER_SUCCESS) {
+		CK_PRINTL_RC(ck, rc, "VOS file path allocation failed"); /** was stale xa->rc */
+		return rc;
+	}
+
+	rc = vos_pool_open_metrics(path, file->po_uuid, DLCK_POOL_OPEN_FLAGS, NULL, ck, &poh);
+	if (rc == DER_SUCCESS) {
+		(void)vos_pool_close(poh);
+	}
+	D_FREE(path);
+
+	/** check */
+	if (rc != DER_SUCCESS) {
+		/** ignore a possible error from the unlock */
+		return rc;
+	}
+
+	return DER_SUCCESS;
+}
+
+#define DLCK_POOL_CHECK_RESULT_PREFIX_FMT "[%d] pool " DF_UUIDF " check result"
+#define DLCK_WARNINGS_NUM_FMT             " (%u warning(s))"
+
+/**
+ * Target thread (worker).
+ */ +static void +exec_one(void *arg) +{ + struct xstream_arg *xa = arg; + struct dlck_file *file; + struct checker *main_ck = &xa->ctrl->checker; + struct checker ck; + int rc; + + /** initialize the daos_io_* thread */ + rc = dlck_engine_xstream_init(xa->xs); + if (rc != DER_SUCCESS) { + xa->rc = rc; + xa->progress = DLCK_XSTREAM_PROGRESS_END; + return; + } + + d_list_for_each_entry(file, &xa->ctrl->files.list, link) { + /** do not process the given file if the target is not requested */ + if (dlck_bitmap_isclr32(file->targets_bitmap, xa->xs->tgt_id)) { + /** report the progress to the main thread */ + ++xa->progress; + continue; + } + + /** initialize the logfile and its print utility */ + rc = dlck_checker_worker_init(&xa->ctrl->common.options, xa->ctrl->log_dir, + file->po_uuid, xa->xs->tgt_id, main_ck, &ck); + if (rc != DER_SUCCESS) { + /** There is no point continuing without a logfile. */ + dlck_xstream_set_rc(xa, rc); + xa->progress = DLCK_XSTREAM_PROGRESS_END; + break; + } + + /** check the pool */ + rc = pool_process(xa, file, &ck); + /** report the result */ + if (rc == DER_SUCCESS && ck.ck_warnings_num > 0) { + CK_PRINTF( + main_ck, + DLCK_POOL_CHECK_RESULT_PREFIX_FMT CHECKER_OK_INFIX DLCK_WARNINGS_NUM_FMT + ".\n", + xa->xs->tgt_id, DP_UUID(file->po_uuid), ck.ck_warnings_num); + } else { + CK_PRINTFL_RC(main_ck, rc, DLCK_POOL_CHECK_RESULT_PREFIX_FMT, + xa->xs->tgt_id, DP_UUID(file->po_uuid)); + } + dlck_xstream_set_rc(xa, rc); + dlck_uadd_no_overflow(xa->warnings_num, ck.ck_warnings_num, &xa->warnings_num); + /** Continue to the next pool regardless of the result. */ + + /** close the logfile */ + dlck_checker_worker_fini(&ck); + + /** report the progress to the main thread */ + ++xa->progress; + } + + if (xa->rc != DER_SUCCESS) { + (void)dlck_engine_xstream_fini(xa->xs); + return; + } + + rc = dlck_engine_xstream_fini(xa->xs); + dlck_xstream_set_rc(xa, rc); +} + +/** + * The main thread spawns and waits for other threads to complete their tasks. 
+ */ +int +dlck_cmd_check(struct dlck_control *ctrl) +{ + D_ASSERT(ctrl != NULL); + + struct checker *ck = &ctrl->checker; + char log_dir_template[] = "/tmp/dlck_check_XXXXXX"; + struct dlck_engine *engine = NULL; + int *rcs; + int rc; + + /** create a log directory */ + if (DAOS_FAIL_CHECK(DLCK_FAULT_CREATE_LOG_DIR)) { /** fault injection */ + ctrl->log_dir = NULL; + errno = daos_fail_value_get(); + } else { + ctrl->log_dir = mkdtemp(log_dir_template); + } + if (ctrl->log_dir == NULL) { + rc = daos_errno2der(errno); + CK_PRINTL_RC(ck, rc, "Cannot create log directory"); + return rc; + } + CK_PRINTF(ck, "Log directory: %s\n", ctrl->log_dir); + + CK_PRINT(ck, "Start the engine... "); + rc = dlck_engine_start(&ctrl->engine, &engine); + CK_APPENDL_RC(ck, rc); + if (rc != DER_SUCCESS) { + return rc; + } + + if (d_list_empty(&ctrl->files.list)) { + /** no files specified means all files are requested */ + CK_PRINT(ck, "Read the list of pools... "); + rc = dlck_pool_list(&ctrl->files.list); + CK_APPENDL_RC(ck, rc); + if (rc != DER_SUCCESS) { + goto err_stop_engine; + } + /** no files exist */ + if (d_list_empty(&ctrl->files.list)) { + CK_PRINT(ck, "No pools exist. Exiting...\n"); + goto err_stop_engine; + } + } + + CK_PRINT(ck, "Create pools directories... "); + rc = dlck_pool_mkdir_all(ctrl->engine.storage_path, &ctrl->files.list, ck); + CK_APPENDL_RC(ck, rc); + if (rc != DER_SUCCESS) { + goto err_stop_engine; + } + + /** allocate an array of return codes for targets */ + D_ALLOC_ARRAY(rcs, ctrl->engine.targets); + if (rcs == NULL) { + rc = -DER_NOMEM; + CK_PRINTL_RC(ck, rc, ""); + goto err_stop_engine; + } + + rc = dlck_engine_exec_all(engine, exec_one, dlck_engine_xstream_arg_alloc, ctrl, + dlck_engine_xstream_arg_free, ck); + if (rc != DER_SUCCESS) { + goto err_free_rcs; + } + + CK_PRINT(ck, "Stop the engine... "); + rc = dlck_engine_stop(engine); + CK_APPENDL_RC(ck, rc); + + /** Ignore an error for now to print the collected results. 
*/ + dlck_report_results(rcs, ctrl->engine.targets, ctrl->warnings_num, ck); + D_FREE(rcs); + + /** Return the first encountered error. */ + return rc; + +err_free_rcs: + D_FREE(rcs); +err_stop_engine: + (void)dlck_engine_stop(engine); + + return rc; +} diff --git a/src/utils/dlck/dlck_abt.c b/src/utils/dlck/dlck_abt.c index 5f1ba266d27..314f47e9d96 100644 --- a/src/utils/dlck/dlck_abt.c +++ b/src/utils/dlck/dlck_abt.c @@ -49,41 +49,6 @@ dlck_abt_attr_free(ABT_thread_attr *attr) return dss_abterr2der(ABT_thread_attr_free(attr)); } -int -dlck_abt_init(struct dlck_engine *engine) -{ - int rc; - - rc = ABT_init(0, NULL); - if (rc != ABT_SUCCESS) { - return dss_abterr2der(rc); - } - - rc = ABT_mutex_create(&engine->open_mtx); - if (rc != ABT_SUCCESS) { - (void)ABT_finalize(); - return dss_abterr2der(rc); - } - - return DER_SUCCESS; -} - -int -dlck_abt_fini(struct dlck_engine *engine) -{ - int rc; - - rc = ABT_mutex_free(&engine->open_mtx); - if (rc != ABT_SUCCESS) { - (void)ABT_finalize(); - return dss_abterr2der(rc); - } - - rc = ABT_finalize(); - - return dss_abterr2der(rc); -} - int dlck_xstream_create(struct dlck_xstream *xs) { diff --git a/src/utils/dlck/dlck_args.h b/src/utils/dlck/dlck_args.h index 2b719d87ecd..4cac4865659 100644 --- a/src/utils/dlck/dlck_args.h +++ b/src/utils/dlck/dlck_args.h @@ -11,10 +11,9 @@ #include #include #include +#include #include -#include "dlck_cmds.h" - #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) @@ -25,9 +24,9 @@ #define GROUP_AUTOMAGIC (-1) /** yes, -1 is the last group */ /** all short options */ -#define KEY_COMMON_CMD 'c' -#define KEY_COMMON_CO_UUID 'q' +#define KEY_COMMON_OPTIONS 'o' #define KEY_COMMON_WRITE_MODE 'w' +#define KEY_COMMON_VERBOSE 'v' #define KEY_FILES 'f' /** the options below follow the daos_engine options */ #define KEY_ENGINE_NUMA_NODE 'p' @@ -44,10 +43,12 @@ #define DLCK_TARGET_MAX 31 +#define MISSING_ARG_FMT "Missing argument for the '%s' option" + struct dlck_args_common { - enum dlck_cmd 
cmd; - uuid_t co_uuid; /** Container UUID. */ + struct checker_options options; bool write_mode; /** false by default (dry run) */ + bool verbose; /** false by default */ }; /** @@ -59,7 +60,6 @@ struct dlck_file { d_list_t link; uuid_t po_uuid; /** Pool UUID. */ uint32_t targets_bitmap; /** Bitmap of targets involved. */ - const char *desc; /** Argument provided by the user. */ }; /** @@ -85,17 +85,41 @@ struct dlck_args_files { d_list_t list; }; -struct dlck_print { - int (*dp_printf)(const char *fmt, ...); -}; +/** + * Count the number of files in the list. + * + * \param[in] files The list of files to count. + * + * \return The number of files on the list \p files. + */ +static inline unsigned +dlck_args_files_num(struct dlck_args_files *files) +{ + struct dlck_file *file; + unsigned num = 0; + + d_list_for_each_entry(file, &files->list, link) { + ++num; + } + return num; +} + +/** + * @struct dlck_control + * + * Bundle of input, output, and control arguments. + */ struct dlck_control { /** in */ struct dlck_args_common common; struct dlck_args_files files; struct dlck_args_engine engine; - /** print */ - struct dlck_print print; + /** checker */ + struct checker checker; + /** out */ + char *log_dir; + unsigned warnings_num; }; /** helper definitions */ @@ -113,7 +137,7 @@ struct dlck_control { #define FAIL(STATE, RC, ERRNUM, ...) \ do { \ argp_failure(STATE, ERRNUM, ERRNUM, __VA_ARGS__); \ - RC = ERRNUM; \ + (RC) = ERRNUM; \ } while (0) #define RETURN_FAIL(STATE, ERRNUM, ...) \ @@ -122,10 +146,6 @@ struct dlck_control { return ERRNUM; \ } while (0) -#define DLCK_PRINT(ctrl, fmt) (void)ctrl->print.dp_printf(fmt) - -#define DLCK_PRINTF(ctrl, fmt, ...) (void)ctrl->print.dp_printf(fmt, __VA_ARGS__) - /** dlck_args_parse.c */ /** @@ -160,15 +180,18 @@ int parse_file(const char *arg, struct argp_state *state, struct dlck_file **file_ptr); /** - * Extract a command from \p arg. + * Extract an event from \p arg. * - * \param[in] arg String value. 
+ * \param[in] option Name of the option. + * \param[in] value String value. + * \param[out] state State of the parser. + * \param[out] rc Return code. * - * \retval DLCK_CMD_UNKNOWN The provided command is unknown. - * \retval DLCK_CMD_* DLCK command. + * \retval CHECKER_EVENT_INVALID The provided event is invalid. + * \retval CHECKER_EVENT_* DLCK event. */ -enum dlck_cmd -parse_command(const char *arg); +enum checker_event +parse_event(const char *option, const char *value, struct argp_state *state, int *rc); /** dlck_args_files.c */ diff --git a/src/utils/dlck/dlck_args_common.c b/src/utils/dlck/dlck_args_common.c index 0ce3ce62bba..d11a56239c6 100644 --- a/src/utils/dlck/dlck_args_common.c +++ b/src/utils/dlck/dlck_args_common.c @@ -10,34 +10,63 @@ #include "dlck_args.h" +#define DLCK_OPT_NON_ZERO_PADDING_STR "non_zero_padding" + static struct argp_option args_common_options[] = { OPT_HEADER("Options:", GROUP_OPTIONS), /** entries below inherits the group number of the header entry */ {"write_mode", KEY_COMMON_WRITE_MODE, 0, 0, "Make changes persistent."}, - {"cmd", KEY_COMMON_CMD, "CMD", 0, "Command (Required). Please see available commands below."}, - {"co_uuid", KEY_COMMON_CO_UUID, "UUID", 0, - "UUID of a container to process. If not provided all containers are processed."}, - OPT_HEADER("Available commands:", GROUP_AVAILABLE_CMDS), + {"options", KEY_COMMON_OPTIONS, "OPTIONS", 0, + "Set options. Options are comma-separated and may include arguments using the equals sign " + "('='). Please see available options below."}, + OPT_HEADER("Available options:", GROUP_AVAILABLE_CMDS), /** entries below inherits the group number of the header entry */ - LIST_ENTRY("WIP", "No commands implemented yet."), + LIST_ENTRY(DLCK_OPT_NON_ZERO_PADDING_STR "=EVENT", + "Action to take when non-zero padding or reserved fields are detected. EVENT can be " + "either 'error' or 'warning'. 
It is 'warning' by default."),
+    /** this is expected to be necessary only while solving issues with the tool itself so it seems
+       to fit better with a different group */
+    {"verbose", KEY_COMMON_VERBOSE, 0, 0,
+     "Print DAOS log messages. All standard environment variables apply.", GROUP_AUTOMAGIC},
     {0}};

+enum dlck_options_values { DLCK_OPT_NON_ZERO_PADDING };
+
+static char *options_tokens[] = {
+    [DLCK_OPT_NON_ZERO_PADDING] = DLCK_OPT_NON_ZERO_PADDING_STR,
+};
+
 static void
 args_common_init(struct dlck_args_common *args)
 {
 	memset(args, 0, sizeof(*args));
 	/** set defaults */
 	args->write_mode = false; /** dry run */
-	args->cmd        = DLCK_CMD_NOT_SET;
-	uuid_clear(args->co_uuid);
+	args->verbose    = false;
+	args->options.cko_non_zero_padding = CHECKER_EVENT_WARNING;
 }

 static int
-args_common_check(struct argp_state *state, struct dlck_args_common *args)
+args_common_options_parse(char *options_str, struct checker_options *opts, struct argp_state *state)
 {
-	if (args->cmd == DLCK_CMD_NOT_SET) {
-		RETURN_FAIL(state, EINVAL, "Command not set");
+	char              *value;
+	enum checker_event tmp_event;
+	int                rc;
+
+	while (*options_str != '\0') {
+		switch (getsubopt(&options_str, options_tokens, &value)) {
+		case DLCK_OPT_NON_ZERO_PADDING:
+			tmp_event = parse_event(DLCK_OPT_NON_ZERO_PADDING_STR, value, state, &rc);
+			if (tmp_event == CHECKER_EVENT_INVALID) {
+				return rc;
+			}
+			opts->cko_non_zero_padding = tmp_event;
+			break;
+		default:
+			RETURN_FAIL(state, EINVAL, "Unknown option: '%s'", value);
+		}
 	}
+
 	return 0;
 }

@@ -45,7 +74,6 @@ static error_t
 args_common_parser(int key, char *arg, struct argp_state *state)
 {
 	struct dlck_args_common *args = state->input;
-	uuid_t                   tmp_uuid;
 	int                      rc   = 0;

 	/** state changes */
@@ -54,7 +82,6 @@ args_common_parser(int key, char *arg, struct argp_state *state)
 		args_common_init(args);
 		return 0;
 	case ARGP_KEY_END:
-		return args_common_check(state, args);
 	case ARGP_KEY_SUCCESS:
 	case ARGP_KEY_FINI:
 		return 0;
@@ -65,18 +92,11 @@ args_common_parser(int key,
char *arg, struct argp_state *state) case KEY_COMMON_WRITE_MODE: args->write_mode = true; break; - case KEY_COMMON_CMD: - args->cmd = parse_command(arg); - if (args->cmd == DLCK_CMD_UNKNOWN) { - RETURN_FAIL(state, EINVAL, "Unknown command: %s", arg); - } + case KEY_COMMON_VERBOSE: + args->verbose = true; break; - case KEY_COMMON_CO_UUID: - rc = uuid_parse(arg, tmp_uuid); - if (rc != 0) { - RETURN_FAIL(state, EINVAL, "Malformed uuid: %s", arg); - } - uuid_copy(args->co_uuid, tmp_uuid); + case KEY_COMMON_OPTIONS: + rc = args_common_options_parse(arg, &args->options, state); break; default: return ARGP_ERR_UNKNOWN; diff --git a/src/utils/dlck/dlck_args_engine.c b/src/utils/dlck/dlck_args_engine.c index f80cc9926ca..ea6d4d263e3 100644 --- a/src/utils/dlck/dlck_args_engine.c +++ b/src/utils/dlck/dlck_args_engine.c @@ -37,6 +37,10 @@ args_engine_init(struct dlck_args_engine *args) static int args_engine_check(struct argp_state *state, struct dlck_args_engine *args) { + if (args->storage_path == NULL) { + RETURN_FAIL(state, EINVAL, "Storage path not provided"); + } + return 0; } diff --git a/src/utils/dlck/dlck_args_files.c b/src/utils/dlck/dlck_args_files.c index 3bfe9455b0e..5e98e7a954e 100644 --- a/src/utils/dlck/dlck_args_files.c +++ b/src/utils/dlck/dlck_args_files.c @@ -14,9 +14,9 @@ #include "dlck_args.h" static struct argp_option args_files_options[] = { - {"file", KEY_FILES, "UUID,TARGET", 0, + {"file", KEY_FILES, "UUID[,TARGET]", 0, "Pool UUID and a set of targets. If no TARGET is provided, all targets are used. This option " - "can be specified multiple times.", + "can be specified multiple times. 
By default all pools are used.", GROUP_OPTIONS}, {0}}; @@ -28,15 +28,6 @@ args_files_init(struct dlck_args_files *args) D_INIT_LIST_HEAD(&args->list); } -static int -args_files_check(struct argp_state *state, struct dlck_args_files *args) -{ - if (d_list_empty(&args->list)) { - RETURN_FAIL(state, EINVAL, "No file chosen"); - } - return 0; -} - static error_t args_files_parser(int key, char *arg, struct argp_state *state) { @@ -50,7 +41,6 @@ args_files_parser(int key, char *arg, struct argp_state *state) args_files_init(args); return 0; case ARGP_KEY_END: - return args_files_check(state, args); case ARGP_KEY_SUCCESS: case ARGP_KEY_FINI: return 0; diff --git a/src/utils/dlck/dlck_args_parse.c b/src/utils/dlck/dlck_args_parse.c index 581ff9af106..17aa2ab9e2d 100644 --- a/src/utils/dlck/dlck_args_parse.c +++ b/src/utils/dlck/dlck_args_parse.c @@ -9,6 +9,7 @@ #include #include "dlck_args.h" +#include "dlck_bitmap.h" int parse_unsigned(const char *arg, unsigned *value, struct argp_state *state) @@ -49,8 +50,6 @@ parse_file(const char *arg, struct argp_state *state, struct dlck_file **file_pt RETURN_FAIL(state, ENOMEM, "Out of memory"); } - file->desc = arg; - D_STRNDUP(arg_copy, arg, FILE_STR_MAX); if (arg_copy == NULL) { FAIL(state, rc, ENOMEM, "Out of memory"); @@ -77,7 +76,7 @@ parse_file(const char *arg, struct argp_state *state, struct dlck_file **file_pt FAIL(state, rc, EINVAL, "Chosen target is too big: %" PRIu32 ">%" PRIu32, target, DLCK_TARGET_MAX); } - file->targets_bitmap |= (1 << target); + dlck_bitmap_setbit32(&file->targets_bitmap, target); } /** No target means all targets. 
*/ @@ -99,10 +98,23 @@ parse_file(const char *arg, struct argp_state *state, struct dlck_file **file_pt return rc; } -enum dlck_cmd -parse_command(const char *arg) +#define DLCK_EVENT_ERROR_STR "error" +#define DLCK_EVENT_WARNING_STR "warning" + +enum checker_event +parse_event(const char *option, const char *value, struct argp_state *state, int *rc) { - /** placeholder for future commands */ + if (value != NULL) { + if (strcmp(value, DLCK_EVENT_ERROR_STR) == 0) { + return CHECKER_EVENT_ERROR; + } else if (strcmp(value, DLCK_EVENT_WARNING_STR) == 0) { + return CHECKER_EVENT_WARNING; + } + + FAIL(state, *rc, EINVAL, "Invalid event '%s' for the '%s' option", value, option); + } else { + FAIL(state, *rc, EINVAL, MISSING_ARG_FMT, option); + } - return DLCK_CMD_UNKNOWN; + return CHECKER_EVENT_INVALID; } diff --git a/src/utils/dlck/dlck_bitmap.h b/src/utils/dlck/dlck_bitmap.h new file mode 100644 index 00000000000..9a204113c14 --- /dev/null +++ b/src/utils/dlck/dlck_bitmap.h @@ -0,0 +1,55 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DLCK_BITMAP__ +#define __DLCK_BITMAP__ + +#include +#include + +/** + * Typed variant of the setbit() macro. For 32-bit values. + * + * \param[in,out] bitmap Bitmap to modify. + * \param[in] bit Bit to set. + */ +static inline void +dlck_bitmap_setbit32(uint32_t *bitmap, int bit) +{ + setbit((uint8_t *)bitmap, bit); +} + +/** + * Typed variant of the isset() macro. For 32-bit values. + * + * \param[in] bitmap Bitmap to check. + * \param[in] bit Bit to check. + * + * \retval true if \p bit is set. + * \retval false otherwise. + */ +static inline bool +dlck_bitmap_isset32(uint32_t bitmap, int bit) +{ + return isset((uint8_t *)&bitmap, bit); +} + +/** + * Typed variant of the isclr() macro. For 32-bit values. + * + * \param[in] bitmap Bitmap to check. + * \param[in] bit Bit to check. + * + * \retval true if \p bit is NOT set. 
+ * \retval false otherwise. + */ +static inline bool +dlck_bitmap_isclr32(uint32_t bitmap, int bit) +{ + return isclr((uint8_t *)&bitmap, bit); +} + +#endif /** __DLCK_BITMAP__ */ diff --git a/src/utils/dlck/dlck_checker.c b/src/utils/dlck/dlck_checker.c new file mode 100644 index 00000000000..986cd852755 --- /dev/null +++ b/src/utils/dlck/dlck_checker.c @@ -0,0 +1,242 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +#define D_LOGFAC DD_FAC(dlck) + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "dlck_checker.h" + +/** + * Flush output immediately in case DLCK crashes unexpectedly. + * Intended to ensure no useful diagnostic information is lost due to not flushed buffers. + */ +static int +dlck_vprintf_internal(FILE *stream, const char *fmt, va_list args) +{ + int rc; + + rc = vfprintf(stream, fmt, args); + if (rc < 0) { + rc = daos_errno2der(errno); + D_ERROR("vfprintf() failed: " DF_RC "\n", DP_RC(rc)); + return rc; + } + + rc = fflush(stream); + if (rc == EOF) { + rc = daos_errno2der(errno); + D_ERROR("fflush() failed: " DF_RC "\n", DP_RC(rc)); + return rc; + } + + return rc; +} + +/** + * Wrap printing in a lock/unlock block to guarantee thread-safe output. 
+ */ +static int +dlck_checker_main_vprintf(struct checker *ck, const char *fmt, va_list args) +{ + struct dlck_checker_main *dcm = dlck_checker_main_get_custom(ck); + int rc_abt; + int rc; + + rc_abt = ABT_mutex_lock(dcm->stream_mutex); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + D_ERROR(DLCK_PRINT_MAIN_LOCK_FAIL_FMT, DP_RC(rc)); + return rc; + } + + rc = dlck_vprintf_internal(dcm->core.stream, fmt, args); + if (rc != DER_SUCCESS) { + (void)ABT_mutex_unlock(dcm->stream_mutex); + return rc; + } + + rc_abt = ABT_mutex_unlock(dcm->stream_mutex); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + D_ERROR(DLCK_PRINT_MAIN_UNLOCK_FAIL_FMT, DP_RC(rc)); + return rc; + } + + return rc; +} + +static int +dlck_checker_core_indent_set(struct dlck_checker_worker *dwc, int level) +{ + memset(dwc->prefix, DLCK_PRINT_INDENT, CHECKER_INDENT_MAX); + if (level > 0) { + dwc->prefix[level] = ' '; + dwc->prefix[level + 1] = '\0'; + } else { + dwc->prefix[0] = '\0'; + } + + return DER_SUCCESS; +} + +static int +dlck_checker_main_indent_set(struct checker *ck) +{ + struct dlck_checker_main *dcm = dlck_checker_main_get_custom(ck); + return dlck_checker_core_indent_set(&dcm->core, ck->ck_level); +} + +int +dlck_checker_main_init(struct checker *ck) +{ + struct dlck_checker_main *dcm; + int rc_abt; + int rc; + + D_ALLOC_PTR(dcm); + if (dcm == NULL) { + return -DER_NOMEM; + } + + dcm->core.magic = DLCK_CHECKER_MAIN_MAGIC; + dcm->core.stream = stdout; + + rc_abt = ABT_mutex_create(&dcm->stream_mutex); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + D_ERROR("Cannot create a stream synchronization mutex: " DF_RC "\n", DP_RC(rc)); + D_FREE(dcm); + return rc; + } + + ck->ck_private = dcm; + ck->ck_vprintf = dlck_checker_main_vprintf; + ck->ck_indent_set = dlck_checker_main_indent_set; + ck->ck_prefix = dcm->core.prefix; + + return DER_SUCCESS; +} + +int +dlck_checker_main_fini(struct checker *ck) +{ + struct dlck_checker_main *dcm = 
dlck_checker_main_get_custom(ck); + int rc_abt; + int rc = DER_SUCCESS; + + rc_abt = ABT_mutex_free(&dcm->stream_mutex); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + D_ERROR("Failed to free the stream synchronization mutex: " DF_RC "\n", DP_RC(rc)); + } + + D_FREE(dcm); + memset(ck, 0, sizeof(*ck)); + + return rc; +} + +/** + * Get the custom payload from the worker's checker. + * + * \param[in] ck Print utility (only the worker one will work). + * + * \return The custom payload. + */ +static inline struct dlck_checker_worker * +dlck_checker_worker_get_custom(struct checker *ck) +{ + struct dlck_checker_worker *dcw = ck->ck_private; + D_ASSERT(dcw->magic == DLCK_CHECKER_WORKER_MAGIC); + return dcw; +} + +static int +dlck_checker_worker_indent_set(struct checker *ck) +{ + struct dlck_checker_worker *dcw = dlck_checker_worker_get_custom(ck); + return dlck_checker_core_indent_set(dcw, ck->ck_level); +} + +/** + * Just print. + */ +static int +dlck_checker_worker_vprintf(struct checker *ck, const char *fmt, va_list args) +{ + struct dlck_checker_worker *dcw = dlck_checker_worker_get_custom(ck); + FILE *stream = dcw->stream; + + return dlck_vprintf_internal(stream, fmt, args); +} + +int +dlck_checker_worker_init(struct checker_options *options, const char *log_dir, uuid_t po_uuid, + int tgt_id, struct checker *main_ck, struct checker *ck) +{ + struct dlck_checker_worker *dcw; + char *log_file; + FILE *stream; + int rc; + + D_ALLOC_PTR(dcw); + if (dcw == NULL) { + return -DER_NOMEM; + } + + /** open the logfile */ + D_ASPRINTF(log_file, "%s/" DF_UUIDF "_%s%d", log_dir, DP_UUID(po_uuid), VOS_FILE, tgt_id); + if (log_file == NULL) { + rc = -DER_NOMEM; + CK_PRINTFL_RC(main_ck, rc, "[%d] Log file path allocation failed", tgt_id); + /** + * It is very unlikely we can continue work without an ability to allocate more + * memory. 
+ */ + D_FREE(dcw); + return rc; + } + + stream = fopen(log_file, "w"); + if (stream == NULL) { + rc = daos_errno2der(errno); + CK_PRINTFL_RC(main_ck, rc, "[%d] Log file open failed: %s", tgt_id, log_file); + D_FREE(log_file); + D_FREE(dcw); + return rc; + } + D_FREE(log_file); + + dcw->magic = DLCK_CHECKER_WORKER_MAGIC; + dcw->stream = stream; + + memset(ck, 0, sizeof(*ck)); + memcpy(&ck->ck_options, options, sizeof(*options)); + ck->ck_vprintf = dlck_checker_worker_vprintf; + ck->ck_indent_set = dlck_checker_worker_indent_set; + ck->ck_private = dcw; + ck->ck_prefix = dcw->prefix; + + return DER_SUCCESS; +} + +void +dlck_checker_worker_fini(struct checker *ck) +{ + struct dlck_checker_worker *dcw = dlck_checker_worker_get_custom(ck); + + (void)fclose(dcw->stream); + D_FREE(dcw); + memset(ck, 0, sizeof(*ck)); +} diff --git a/src/utils/dlck/dlck_checker.h b/src/utils/dlck/dlck_checker.h new file mode 100644 index 00000000000..1fe663266e2 --- /dev/null +++ b/src/utils/dlck/dlck_checker.h @@ -0,0 +1,115 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DLCK_CHECKER__ +#define __DLCK_CHECKER__ + +#include + +#include + +#define DLCK_PRINT_INDENT '-' +#define DLCK_CHECKER_MAIN_MAGIC 0x17A28DC5626110A5 +#define DLCK_CHECKER_WORKER_MAGIC 0xEB4F7DD311060A6D + +/** + * \struct dlck_checker_worker + * + * Custom payload of the worker checker. + */ +struct dlck_checker_worker { + uint64_t magic; + FILE *stream; + char prefix[CHECKER_INDENT_MAX + 2]; /** ' ' and '\0' hence 2 characters */ +}; + +/** + * \struct dlck_checker_main + * + * Custom payload of the main checker. 
+ */ +struct dlck_checker_main { + struct dlck_checker_worker core; + ABT_mutex stream_mutex; +}; + +#define DLCK_PRINT_MAIN_LOCK_FAIL_FMT \ + "Failed to lock the stream's synchronization mutex: " DF_RC "\n" +#define DLCK_PRINT_MAIN_UNLOCK_FAIL_FMT \ + "Failed to unlock the stream's synchronization mutex: " DF_RC "\n" + +/** + * \brief Init the main checker. + * + * Prints to stdout and it is guarded by a mutex. + * + * \param[out] dp Initialized checker. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + */ +int +dlck_checker_main_init(struct checker *ck); + +/** + * Finalize the main print utility. + * + * \param[in] ck Checker to finalize. + * + * \retval DER_SUCCESS Success. + * \retval -DER_* An error. + */ +int +dlck_checker_main_fini(struct checker *ck); + +/** + * Get the custom payload from the main print utility. + * + * \note Only for advance use-cases. Please see DLCK_PRINT*() macros first. + * + * \param[in] dp Print utility (only the main one will work). + * + * \return The custom payload. + */ +static inline struct dlck_checker_main * +dlck_checker_main_get_custom(struct checker *ck) +{ + struct dlck_checker_main *dcm = ck->ck_private; + D_ASSERT(dcm->core.magic == DLCK_CHECKER_MAIN_MAGIC); + return dcm; +} + +/** + * \brief Init a worker's checker. + * + * Creates and opens a logfile. The created checker will direct log into the created file. + * + * \param[in] options Control options. + * \param[in] log_dir Directory where a logfile will be created. + * \param[in] po_uuid Pool's UUID. + * \param[in] tgt_id Target's ID. + * \param[in] main_ck Main checker. To report errors when they occur. + * \param[out] ck Created checker. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + * \retval -DER_* Other error. 
+ */ +int +dlck_checker_worker_init(struct checker_options *options, const char *log_dir, uuid_t po_uuid, + int tgt_id, struct checker *main_ck, struct checker *ck); + +/** + * \brief Finalize the worker's checker. + * + * \note The worker output stream will be closed. + * + * \param[in] ck Checker to finalize. + */ +void +dlck_checker_worker_fini(struct checker *ck); + +#endif /** __DLCK_CHECKER__ */ diff --git a/src/utils/dlck/dlck_cmds.h b/src/utils/dlck/dlck_cmds.h index 46f6521e05b..38a034baad6 100644 --- a/src/utils/dlck/dlck_cmds.h +++ b/src/utils/dlck/dlck_cmds.h @@ -7,17 +7,23 @@ #ifndef __DLCK_CMDS__ #define __DLCK_CMDS__ -enum dlck_cmd { - DLCK_CMD_NOT_SET = -2, - DLCK_CMD_UNKNOWN = -1, -}; +#include "dlck_args.h" -struct dlck_control; - -typedef int (*dlck_cmd_func)(struct dlck_control *ctrl); - -#define DLCK_CMDS_FUNCS \ - { \ - } +/** + * \brief Validate the integrity of the pool(s) metadata. + * + * The \p ctrl argument specifies which pool(s) to check and how the output will be printed. + * + * \param[in] ctrl Control bundle. + * + * \retval DER_SUCCESS All checked pools are ok. + * \retval -DER_DF_INVAL Durable format error. + * \retval -DER_DF_INCOMPT Incompatible durable format. + * \retval -DER_ID_MISMATCH Pool UUID mismatch. + * \retval -DER_NOTYPE Unexpected contents. + * \retval -DER_* Other errors. 
+ */ +int +dlck_cmd_check(struct dlck_control *ctrl); #endif /** __DLCK_CMDS__ */ diff --git a/src/utils/dlck/dlck_engine.c b/src/utils/dlck/dlck_engine.c index 639b0598199..46670f3a443 100644 --- a/src/utils/dlck/dlck_engine.c +++ b/src/utils/dlck/dlck_engine.c @@ -419,6 +419,10 @@ dlck_engine_start(struct dlck_args_engine *args, struct dlck_engine **engine_ptr int tag = DAOS_SERVER_TAG - DAOS_TGT_TAG; int rc; + if (DAOS_FAIL_CHECK(DLCK_FAULT_ENGINE_START)) { /** fault injection */ + return daos_errno2der(daos_fail_value_get()); + } + rc = dlck_engine_alloc(args->targets, &engine); if (rc != DER_SUCCESS) { return rc; @@ -429,15 +433,10 @@ dlck_engine_start(struct dlck_args_engine *args, struct dlck_engine **engine_ptr goto fail_engine_free; } - rc = dlck_abt_init(engine); - if (rc != DER_SUCCESS) { - goto fail_engine_free; - } - rc = bio_nvme_init(args->nvme_conf, args->numa_node, args->max_dma_buf_size, args->nvme_hugepage_size, args->targets, bypass_health_chk); if (rc != DER_SUCCESS) { - goto fail_abt_fini; + goto fail_engine_free; } dss_register_key(&daos_srv_modkey); @@ -485,8 +484,6 @@ dlck_engine_start(struct dlck_args_engine *args, struct dlck_engine **engine_ptr dss_unregister_key(&vos_module_key); dss_unregister_key(&daos_srv_modkey); bio_nvme_fini(); -fail_abt_fini: - (void)dlck_abt_fini(engine); fail_engine_free: dlck_engine_free(engine); @@ -498,6 +495,15 @@ dlck_engine_stop(struct dlck_engine *engine) { int rc; + if (DAOS_FAIL_CHECK(DLCK_FAULT_ENGINE_STOP)) { /** fault injection */ + return daos_errno2der(daos_fail_value_get()); + } + + if (engine->join_fail) { + /** Cannot stop the engine in this case. It will probably crash. 
*/ + return -DER_BUSY; + } + rc = xstream_stop_all(engine); if (rc != DER_SUCCESS) { /** not all execution streams were stopped - can't pull out other resources */ @@ -521,148 +527,228 @@ dlck_engine_stop(struct dlck_engine *engine) bio_nvme_fini(); - rc = dlck_abt_fini(engine); - dlck_engine_free(engine); return rc; } -int -dlck_engine_exec_all(struct dlck_engine *engine, dlck_ult_func exec_one, - arg_alloc_fn_t arg_alloc_fn, void *custom, arg_free_fn_t arg_free_fn) -{ +/** + * \struct dlck_exec + * + * Job batch. ULTs + their arguments + the free function to clean it all up. + */ +struct dlck_exec { struct dlck_ult *ults; void **ult_args; - int rc; - int rc2; + void *custom; + arg_free_fn_t arg_free_fn; +}; + +/** + * \brief Join all ULTs but ignore errors. No error returned neither. + * + * \note It is designed as a cleanup procedure in case of an error either while starting or stopping + * ULTs. + * + * \param[in] engine Engine to clean up. + * \param[in,out] de Execution to stop and cleanup after. + */ +static void +dlck_engine_join_all_no_error(struct dlck_engine *engine, struct dlck_exec *de) +{ + int rc; - D_ALLOC_ARRAY(ults, engine->targets); - if (ults == NULL) { + for (int i = 0; i < engine->targets; ++i) { + if (de->ults[i].thread != ABT_THREAD_NULL) { + rc = ABT_thread_join(de->ults[i].thread); + if (rc != ABT_SUCCESS) { + engine->join_fail = true; + /** + * the ULT did not join - can't free the thread nor free the + * arguments + */ + continue; + } + + (void)ABT_thread_free(&de->ults[i].thread); + } + (void)de->arg_free_fn(de->custom, &de->ult_args[i]); + } + + D_FREE(de->ult_args); + D_FREE(de->ults); +} + +/** + * Spawn an ULT on each of the targets execution stream. + * + * \param[in] engine Engine to run the created ULTs on. + * \param[in] exec_one Function to run in the ULTs. + * \param[in] arg_alloc_fn Function to allocate arguments for an ULT. + * \param[in,out] de Execution state to store the created resources. 
+ * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + * \retval -DER_* Other error. + */ +static int +dlck_engine_targets_start(struct dlck_engine *engine, dlck_ult_func exec_one, + arg_alloc_fn_t arg_alloc_fn, struct dlck_exec *de) +{ + int rc = DER_SUCCESS; + + if (DAOS_FAIL_CHECK(DLCK_FAULT_ENGINE_EXEC)) { /** fault injection */ + return daos_errno2der(daos_fail_value_get()); + } + + D_ALLOC_ARRAY(de->ults, engine->targets); + if (de->ults == NULL) { return -DER_NOMEM; } - D_ALLOC_ARRAY(ult_args, engine->targets); - if (ult_args == NULL) { - D_FREE(ults); + D_ALLOC_ARRAY(de->ult_args, engine->targets); + if (de->ult_args == NULL) { + D_FREE(de->ults); return -DER_NOMEM; } for (int i = 0; i < engine->targets; ++i) { /** prepare arguments */ - rc = arg_alloc_fn(engine, i, custom, &ult_args[i]); + rc = arg_alloc_fn(engine, i, de->custom, &de->ult_args[i]); if (rc != DER_SUCCESS) { goto fail_join_and_free; } /** start an ULT */ - rc = dlck_ult_create(engine->xss[i].pool, exec_one, ult_args[i], &ults[i]); + rc = dlck_ult_create(engine->xss[i].pool, exec_one, de->ult_args[i], &de->ults[i]); if (rc != DER_SUCCESS) { goto fail_join_and_free; } } + return rc; + +fail_join_and_free: + dlck_engine_join_all_no_error(engine, de); + + return rc; +} + +/** + * Wait for all the target ULTs to conclude. + * + * \param[in] engine Engine where the ULTs run. + * \param[in] de Execution state to wait for and release. + * + * \retval DER_SUCCESS Success. + * \retval -DER_* Other error. 
+ */ +static int +dlck_engine_targets_stop(struct dlck_engine *engine, struct dlck_exec *de) +{ + int rc = DER_SUCCESS; + + if (DAOS_FAIL_CHECK(DLCK_FAULT_ENGINE_JOIN)) { /** fault injection */ + engine->join_fail = true; + return daos_errno2der(daos_fail_value_get()); + } + for (int i = 0; i < engine->targets; ++i) { - rc = ABT_thread_join(ults[i].thread); + rc = ABT_thread_join(de->ults[i].thread); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); + engine->join_fail = true; goto fail_join_and_free; } - rc = ABT_thread_free(&ults[i].thread); + rc = ABT_thread_free(&de->ults[i].thread); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); goto fail_join_and_free; } - rc = arg_free_fn(custom, &ult_args[i]); + rc = de->arg_free_fn(de->custom, &de->ult_args[i]); if (rc != 0) { goto fail_join_and_free; } } - D_FREE(ult_args); - D_FREE(ults); + D_FREE(de->ult_args); + D_FREE(de->ults); - return DER_SUCCESS; + return rc; fail_join_and_free: - for (int i = 0; i < engine->targets; ++i) { - if (ults[i].thread != ABT_THREAD_NULL) { - rc2 = ABT_thread_join(ults[i].thread); - if (rc2 != ABT_SUCCESS) { - /** - * the ULT did not join - can't free the thread nor free the - * arguments - */ - continue; - } - } - (void)ABT_thread_free(&ults[i].thread); - (void)arg_free_fn(custom, &ult_args[i]); - } - - D_FREE(ult_args); - D_FREE(ults); + dlck_engine_join_all_no_error(engine, de); return rc; } +#define STOP_TGT_STR "Wait for targets to stop" + int -dlck_pool_open_safe(ABT_mutex mtx, const char *storage_path, uuid_t po_uuid, int tgt_id, - daos_handle_t *poh) +dlck_engine_exec_all(struct dlck_engine *engine, dlck_ult_func exec_one, + arg_alloc_fn_t arg_alloc_fn, void *custom, arg_free_fn_t arg_free_fn, + struct checker *ck) { - int rc; - int rc_abt; - - rc_abt = ABT_mutex_lock(mtx); - if (rc_abt != ABT_SUCCESS) { - return dss_abterr2der(rc_abt); - } - - rc = dlck_pool_open(storage_path, po_uuid, tgt_id, poh); + struct dlck_exec de = {0}; + int rc; - /** unlock ASAP */ - rc_abt = 
ABT_mutex_unlock(mtx); + /** initialize batch */ + de.arg_free_fn = arg_free_fn; + de.custom = custom; - /** code returned from the open operation takes precedence */ + CK_PRINT(ck, "Start targets... "); + rc = dlck_engine_targets_start(engine, exec_one, arg_alloc_fn, &de); + CK_APPENDL_OK(ck); if (rc != DER_SUCCESS) { return rc; } - /** unlock error is an error */ - if (rc_abt != ABT_SUCCESS) { - return dss_abterr2der(rc_abt); - } + CK_PRINT(ck, STOP_TGT_STR "...\n"); + rc = dlck_engine_targets_stop(engine, &de); + CK_PRINTL_RC(ck, rc, STOP_TGT_STR); - return DER_SUCCESS; + return rc; } int -dlck_pool_close_safe(ABT_mutex mtx, daos_handle_t poh) +dlck_engine_xstream_arg_alloc(struct dlck_engine *engine, int idx, void *ctrl_ptr, + void **output_arg) { - int rc; - int rc_abt; + struct xstream_arg *xa; - rc_abt = ABT_mutex_lock(mtx); - if (rc_abt != ABT_SUCCESS) { - return dss_abterr2der(rc_abt); + D_ALLOC_PTR(xa); + if (xa == NULL) { + return -DER_NOMEM; } - rc = vos_pool_close(poh); + xa->ctrl = ctrl_ptr; + xa->engine = engine; + xa->xs = &engine->xss[idx]; + xa->rc = DER_SUCCESS; - /** unlock ASAP */ - rc_abt = ABT_mutex_unlock(mtx); + *output_arg = xa; - /** code returned from the close operation takes precedence */ - if (rc != DER_SUCCESS) { - return rc; - } + return DER_SUCCESS; +} - /** unlock error is an error */ - if (rc_abt != ABT_SUCCESS) { - return dss_abterr2der(rc_abt); +int +dlck_engine_xstream_arg_free(void *ctrl_ptr, void **arg) +{ + struct dlck_control *ctrl = ctrl_ptr; + struct xstream_arg *xa = *arg; + int rc; + + if (xa == NULL) { + return DER_SUCCESS; } - return DER_SUCCESS; + rc = xa->rc; + dlck_uadd_no_overflow(ctrl->warnings_num, xa->warnings_num, &ctrl->warnings_num); + + D_FREE(*arg); + *arg = NULL; + + return rc; } diff --git a/src/utils/dlck/dlck_engine.h b/src/utils/dlck/dlck_engine.h index a77820930eb..d619e7b14bb 100644 --- a/src/utils/dlck/dlck_engine.h +++ b/src/utils/dlck/dlck_engine.h @@ -11,6 +11,14 @@ #include "dlck_args.h" +#if 
defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || defined(__riscv) +#define CACHELINE_SIZE 64ULL +#elif defined(__PPC64__) +#define CACHELINE_SIZE 128ULL +#else +#error unable to recognize architecture at compile time +#endif + struct dlck_ult { ABT_thread thread; }; @@ -29,7 +37,7 @@ struct dlck_xstream { struct dlck_engine { unsigned targets; struct dlck_xstream *xss; - ABT_mutex open_mtx; + bool join_fail; }; typedef void (*dlck_ult_func)(void *arg); @@ -53,6 +61,7 @@ dlck_engine_start(struct dlck_args_engine *args, struct dlck_engine **engine_ptr * \param[in] engine Engine to stop. * * \retval DER_SUCCESS Success. + * \retval -DER_BUSY Joining ULTs failed. Unrecoverable. * \retval -DER_* Errors. */ int @@ -83,28 +92,6 @@ dlck_engine_xstream_fini(struct dlck_xstream *xs); /** dlck_abt.c */ -/** - * Initialize ABT as it is about to be used by the \p engine. - * - * \param[out] engine Engine for which ABT is initialized for. - * - * \retval DER_SUCCESS Success. - * \retval -DER_* Error. - */ -int -dlck_abt_init(struct dlck_engine *engine); - -/** - * Finalize ABT for the \p engine. - * - * \param[in,out] engine Engine for which ABT is finalized for. - * - * \retval DER_SUCCESS Success. - * \retval -DER_* Error. - */ -int -dlck_abt_fini(struct dlck_engine *engine); - /** * Just create an ABT execution stream. * @@ -158,45 +145,95 @@ typedef int (*arg_free_fn_t)(void *custom, void **arg); * \param[in] arg_alloc_fn Function to allocate arguments for an ULT. * \param[in] custom Custom parameters for \p arg_alloc_fn and \p arg_free_fn function. * \param[in] arg_free_fn Function to free arguments. + * \param[in] checker Checker. * * \retval DER_SUCCESS Success. * \retval -DER_* Error. 
*/ int dlck_engine_exec_all(struct dlck_engine *engine, dlck_ult_func exec_one, - arg_alloc_fn_t arg_alloc_fn, void *input_arg, arg_free_fn_t arg_free_fn); + arg_alloc_fn_t arg_alloc_fn, void *input_arg, arg_free_fn_t arg_free_fn, + struct checker *ck); + +#define DLCK_XSTREAM_PROGRESS_END UINT_MAX /** - * Open a pool but lock the \p mtx mutex first and unlock it after. Thread-safe. - * - * \param[in] mtx Mutex. - * \param[in] storage_path Storage path. - * \param[in] po_uuid Pool UUID. - * \param[in] tgt_id Target ID. - * \param[out] poh Pool handle. - * - * \retval DER_SUCCESS Success. - * \retval -DER_NOMEM Out of memory. - * \retval -DER_NO_PERM Permission problem. Please see open(3) and fallocate(2). - * \retval -DER_EXIST The file already exists. Please see open(3). - * \retval -DER_NONEXIST The file does not exist. Please see open(3). - * \retval -DER_NOSPACE There is not enough space left on the device. - * \retval -DER_* Possibly other errors. + * @struct xstream_arg + * + * Arguments passed to the main ULT on each of the execution streams. + */ +struct xstream_arg { + /** in */ + struct dlck_control *ctrl; /** Control state. */ + struct dlck_engine *engine; /** Engine itself. */ + struct dlck_xstream *xs; /** The execution stream the ULT is run in. */ + /** out */ + volatile unsigned progress __attribute__((__aligned__(CACHELINE_SIZE))); + int rc; /** return code */ + unsigned warnings_num; +}; + +static inline void +dlck_xstream_set_rc(struct xstream_arg *xa, int rc) +{ + if (rc == DER_SUCCESS) { + return; + } + + /** do not overwrite the first error found */ + if (xa->rc == DER_SUCCESS) { + xa->rc = rc; + } +} + +static inline void +dlck_uadd_no_overflow(unsigned a, unsigned b, unsigned *result) +{ + /** safeguard against integer overflow */ + if (__builtin_uadd_overflow(a, b, result)) { + *result = UINT_MAX; + } +} + +/** + * Allocate arguments for a ULT. + * + * \param[in] engine Engine the ULT is about to be run in. + * \param[in] idx ULT ID. 
+ * \param[in] ctrl_ptr Control state to be passed to the ULT. + * \param[out] output_arg Allocated argument for the ULT. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. */ int -dlck_pool_open_safe(ABT_mutex mtx, const char *storage_path, uuid_t po_uuid, int tgt_id, - daos_handle_t *poh); +dlck_engine_xstream_arg_alloc(struct dlck_engine *engine, int idx, void *ctrl_ptr, + void **output_arg); /** - * Close a pool but lock the \p mtx mutex first and unlock it after. Thread-safe. + * Free arguments of a ULT. * - * \param[in] mtx Mutex. - * \param[in] poh Pool handle. + * \param[out] ctrl_ptr Control state to collect stats in. + * \param[in,out] arg ULT arguments to process and free. * - * \retval DER_SUCCESS Success. - * \retval -DER_INVAL Issues with \p mtx. + * \return The return code for the ULT. */ int -dlck_pool_close_safe(ABT_mutex mtx, daos_handle_t poh); +dlck_engine_xstream_arg_free(void *ctrl_ptr, void **arg); + +/** + * Read the progress of the given execution stream \p xa. + * + * \param[in] xa Execution stream. + * \param[out] progress Progress read from \p xa. + * + * \retval DER_SUCCESS Success. + * \retval -DER_INVAL Invalid mutex. 
+ */ +static inline void +dlck_xstream_progress_get(struct xstream_arg *xa, unsigned *progress) +{ + *progress = xa->progress; +} #endif /** __DLCK_ENGINE__ */ diff --git a/src/utils/dlck/dlck_main.c b/src/utils/dlck/dlck_main.c index 7d957021fd8..d380de67c60 100644 --- a/src/utils/dlck/dlck_main.c +++ b/src/utils/dlck/dlck_main.c @@ -1,35 +1,209 @@ /** - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ +#define D_LOGFAC DD_FAC(dlck) #include #include +#include +#include +#include +#include +#include +#include +#include #include -#include "dlck_cmds.h" #include "dlck_args.h" +#include "dlck_checker.h" +#include "dlck_cmds.h" + +#define EFFECTIVE_USER_STR "Effective user: " +#define USER_BELONGS_TO_GRP_FMT "User %sbelong%s to group: %s (gid=%" PRIuMAX ")\n" +#define UNEXPECTED_USER_WARNING_MSG \ + "\nWARNING: It is recommended to run this program as root or as a user who belongs to " \ + "the '" DAOS_DEFAULT_SYS_NAME "' group.\n" \ + "Running it under any other account may cause the program to stop due to insufficient " \ + "privileges.\n\n" + +static bool +user_is_root(struct checker *ck) +{ + uid_t euid = geteuid(); + + if (DAOS_FAIL_CHECK(DLCK_MOCK_ROOT)) { /** fault injection */ + /** it does not have ANY effect on the actual privileges of the user */ + euid = 0; + } + + if (euid == 0) { + /** The root user is not always named "root" but its uid is always 0. 
*/ + CK_PRINT(ck, EFFECTIVE_USER_STR "root\n"); + return true; + } + + CK_PRINTF(ck, EFFECTIVE_USER_STR "uid=%" PRIuMAX "\n", (uintmax_t)euid); + return false; +} -static const dlck_cmd_func dlck_cmds[] = DLCK_CMDS_FUNCS; +#define MAX_GROUPS 128 + +static bool +user_belongs_to_group(const char *group_name, struct checker *ck) +{ + struct group *group = NULL; + gid_t group_id; + gid_t groups[MAX_GROUPS]; + int rc; + + /** get GID of the requested group */ + if (DAOS_FAIL_CHECK(DLCK_FAULT_GETGRNAM)) { /** fault injection */ + errno = daos_fail_value_get(); + } else if (DAOS_FAIL_CHECK(DLCK_MOCK_NO_DAOS_SERVER_GROUP)) { /** fault injection */ + errno = 0; + } else { + errno = 0; + group = getgrnam(group_name); + } + if (group == NULL) { + if (errno != 0) { + rc = daos_errno2der(errno); + CK_PRINTFL_RC(ck, rc, "getgrnam(%s) failed", group_name); + } else { + CK_PRINTF(ck, "The %s group does not exist.\n", group_name); + } + return false; + } + group_id = group->gr_gid; + + /** check primary group */ + if (getgid() == group_id) { + CK_PRINTF(ck, USER_BELONGS_TO_GRP_FMT, "", "s", group_name, (uintmax_t)group_id); + return true; + } + + /** get supplementary groups */ + if (DAOS_FAIL_CHECK(DLCK_FAULT_GETGROUPS)) { /** fault injection */ + rc = -1; + errno = daos_fail_value_get(); + } else { + rc = getgroups(MAX_GROUPS, groups); + } + if (rc < 0) { + rc = daos_errno2der(errno); + CK_PRINTFL_RC(ck, rc, "getgroups() failed", group_name); + return false; + } + + /** check supplementary groups */ + if (!DAOS_FAIL_CHECK(DLCK_MOCK_NOT_IN_DAOS_SERVER_GROUP)) { /** fault injection */ + for (int i = 0; i < rc; i++) { + if (groups[i] == group_id) { + CK_PRINTF(ck, USER_BELONGS_TO_GRP_FMT, "", "s", group_name, + (uintmax_t)group_id); + return true; + } + } + } + + CK_PRINTF(ck, USER_BELONGS_TO_GRP_FMT, "DOES NOT ", "", group_name, (uintmax_t)group_id); + + return false; +} + +static void +check_user_privileges(struct checker *ck) +{ + if (user_is_root(ck)) { + /** the root user is 
assumed to have all required privileges */ + return; + } + + if (user_belongs_to_group(DAOS_DEFAULT_SYS_NAME, ck)) { + return; + } + + CK_PRINT(ck, UNEXPECTED_USER_WARNING_MSG); +} int main(int argc, char *argv[]) { struct dlck_control ctrl = {0}; + int rc_abt; int rc; + rc = d_fault_inject_init(); + if (rc != DER_SUCCESS && rc != -DER_NOSYS) { + return rc; + } + + if (d_fault_inject_is_enabled()) { + /** an errno value the fault injection will trigger */ + daos_fail_value_set(EINVAL); + } + dlck_args_parse(argc, argv, &ctrl); - D_ASSERT(ctrl.common.cmd < ARRAY_SIZE(dlck_cmds)); - D_ASSERT(ctrl.common.cmd >= 0); + if (ctrl.common.verbose) { + rc = daos_debug_init_ex(DAOS_LOG_DEFAULT, DLOG_ERR); + if (rc != 0) { + goto err_args_free; + } + } + + rc_abt = ABT_init(0, NULL); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + goto err_args_free; + } + + rc = dlck_checker_main_init(&ctrl.checker); + if (rc != DER_SUCCESS) { + goto err_abt_fini; + } - ctrl.print.dp_printf = printf; + check_user_privileges(&ctrl.checker); + + rc = dlck_cmd_check(&ctrl); + if (rc != DER_SUCCESS) { + goto err_print_main_fini; + } + + rc = dlck_checker_main_fini(&ctrl.checker); + if (rc != DER_SUCCESS) { + goto err_abt_fini; + } + + rc_abt = ABT_finalize(); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + goto err_args_free; + } + + dlck_args_free(&ctrl); + + rc = d_fault_inject_fini(); + if (rc == -DER_NOSYS) { + rc = DER_SUCCESS; + } + + return rc; - rc = dlck_cmds[ctrl.common.cmd](&ctrl); +err_print_main_fini: + (void)dlck_checker_main_fini(&ctrl.checker); +err_abt_fini: + (void)ABT_finalize(); +err_args_free: + if (ctrl.common.verbose) { + daos_debug_fini(); + } dlck_args_free(&ctrl); + (void)d_fault_inject_fini(); return rc; } diff --git a/src/utils/dlck/dlck_pool.c b/src/utils/dlck/dlck_pool.c index 937d372ed12..f62cc5a7cbc 100644 --- a/src/utils/dlck/dlck_pool.c +++ b/src/utils/dlck/dlck_pool.c @@ -15,7 +15,7 @@ #include "dlck_pool.h" int 
-dlck_pool_mkdir(const char *storage_path, uuid_t po_uuid) +dlck_pool_mkdir(const char *storage_path, uuid_t po_uuid, struct checker *ck) { char po_uuid_str[UUID_STR_LEN]; char *path; @@ -28,21 +28,50 @@ dlck_pool_mkdir(const char *storage_path, uuid_t po_uuid) return -DER_NOMEM; } - rc = mkdir(path, 0777); - D_FREE(path); + if (DAOS_FAIL_CHECK(DLCK_FAULT_CREATE_POOL_DIR)) { + errno = daos_fail_value_get(); + rc = -1; + } else { + rc = mkdir(path, 0777); + } if (rc != 0 && errno != EEXIST) { - return daos_errno2der(errno); + rc = daos_errno2der(errno); + CK_PRINTFL_RC(ck, rc, "Cannot create a pool directory: %s", path); } else { - return DER_SUCCESS; + rc = DER_SUCCESS; } + + D_FREE(path); + return rc; } -static int -dlck_file_preallocate(const char *storage_path, uuid_t po_uuid, int tgt_id) +int +dlck_pool_mkdir_all(const char *storage_path, d_list_t *files, struct checker *ck) +{ + struct dlck_file *file; + int rc; + + d_list_for_each_entry(file, files, link) { + rc = dlck_pool_mkdir(storage_path, file->po_uuid, ck); + if (rc != DER_SUCCESS) { + return rc; + } + } + + return DER_SUCCESS; +} + +int +dlck_pool_file_preallocate(const char *storage_path, uuid_t po_uuid, int tgt_id) { struct smd_pool_info *pool_info = NULL; int rc; + /** no MD-on-SSD mode means no file preallocation is necessary */ + if (!bio_nvme_configured(SMD_DEV_TYPE_META)) { + return DER_SUCCESS; + } + rc = smd_pool_get_info(po_uuid, &pool_info); if (rc != 0) { return rc; @@ -59,26 +88,19 @@ int dlck_pool_open(const char *storage_path, uuid_t po_uuid, int tgt_id, daos_handle_t *poh) { char *path; - char po_uuid_str[UUID_STR_LEN]; - const unsigned int flags = VOS_POF_EXCL | VOS_POF_FOR_FEATURE_FLAG; int rc; - uuid_unparse(po_uuid, po_uuid_str); - - D_ASPRINTF(path, "%s/%s/" VOS_FILE "%d", storage_path, po_uuid_str, tgt_id); - if (path == NULL) { - return -DER_NOMEM; + rc = ds_mgmt_file(storage_path, po_uuid, VOS_FILE, &tgt_id, &path); + if (rc != DER_SUCCESS) { + return rc; } - /** no MD-on-SSD 
mode means no file preallocation is necessary */ - if (bio_nvme_configured(SMD_DEV_TYPE_META)) { - rc = dlck_file_preallocate(storage_path, po_uuid, tgt_id); - if (rc != 0) { - goto fail; - } + rc = dlck_pool_file_preallocate(storage_path, po_uuid, tgt_id); + if (rc != DER_SUCCESS) { + goto fail; } - rc = vos_pool_open(path, po_uuid, flags, poh); + rc = vos_pool_open(path, po_uuid, DLCK_POOL_OPEN_FLAGS, poh); fail: D_FREE(path); @@ -121,3 +143,60 @@ dlck_pool_cont_list(daos_handle_t poh, d_list_t *co_uuids) return vos_iterate(¶m, VOS_ITER_COUUID, false, &anchors, cont_list_append, NULL, co_uuids, NULL); } + +int +dlck_pool_list(d_list_t *file_list) +{ + D_LIST_HEAD(pool_list); + int pool_cnt = 0; + struct smd_pool_info *pool_info = NULL; + struct smd_pool_info *tmp; + struct dlck_file *file = NULL; + struct dlck_file *file_tmp; + int rc; + + D_ASSERT(d_list_empty(file_list)); + + /** get the list of pools */ + rc = smd_pool_list(&pool_list, &pool_cnt); + if (rc != DER_SUCCESS) { + return rc; + } + + d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { + /** allocate a new file */ + D_ALLOC_PTR(file); + if (file == NULL) { + rc = -DER_NOMEM; + goto err; + } + + /** populate and append the file */ + uuid_copy(file->po_uuid, pool_info->spi_id); + file->targets_bitmap = -1; /** all targets by default */ + d_list_add(&file->link, file_list); + + /** remove the pool from the list and free it */ + d_list_del(&pool_info->spi_link); + smd_pool_free_info(pool_info); + } + + return DER_SUCCESS; + +err: + /** free the list of files */ + d_list_for_each_entry_safe(file, file_tmp, file_list, link) { + d_list_del(&file->link); + D_FREE(file); + } + D_ASSERT(d_list_empty(file_list)); + + /** free the list of pools */ + d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { + d_list_del(&pool_info->spi_link); + smd_pool_free_info(pool_info); + pool_info = NULL; + } + + return rc; +} diff --git a/src/utils/dlck/dlck_pool.h b/src/utils/dlck/dlck_pool.h index 
384f29e02ba..fbeea38f9a1 100644 --- a/src/utils/dlck/dlck_pool.h +++ b/src/utils/dlck/dlck_pool.h @@ -11,21 +11,57 @@ #include "dlck_args.h" +#define DLCK_POOL_OPEN_FLAGS (VOS_POF_EXCL | VOS_POF_FOR_FEATURE_FLAG) + /** * Create a directory for the pool. * * \param[in] storage_path Storage path. * \param[in] po_uuid Pool UUID. + * \param[in] ck Checker. * * \retval DER_SUCCESS Success. * \retval -DER_NOMEM Out of memory. * \retval -DER_NO_PERM Permission problem. Please see mkdir(2). - * \retval -DER_EXIST Directory already exists. * \retval -DER_NONEXIST A component of the \p storage_path does not exist. * \retval -DER_* Possibly other errors. */ int -dlck_pool_mkdir(const char *storage_path, uuid_t po_uuid); +dlck_pool_mkdir(const char *storage_path, uuid_t po_uuid, struct checker *ck); + +/** + * Create pool directories for all \p files provided. + * + * \param[in] storage_path Engine the ULT is about to be run in. + * \param[in] files List of files. + * \param[in] ck Checker. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + * \retval -DER_NO_PERM Permission problem. Please see mkdir(2). + * \retval -DER_NONEXIST A component of the \p storage_path does not exist. + * \retval -DER_* Possibly other errors but not -DER_EXIST. + */ +int +dlck_pool_mkdir_all(const char *storage_path, d_list_t *files, struct checker *ck); + +/** + * Allocate the pool file if necessary (MD-on-SSD). + * + * \param[in] storage_path Storage path. + * \param[in] po_uuid Pool UUID. + * \param[in] tgt_id Target ID. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + * \retval -DER_NO_PERM Permission problem. Please see open(3) and fallocate(2). + * \retval -DER_EXIST The file already exists. Please see open(3). + * \retval -DER_NONEXIST The file does not exist. Please see open(3). + * \retval -DER_NOSPACE There is not enough space left on the device. + * \retval -DER_* Possibly other errors. 
+ */ +int +dlck_pool_file_preallocate(const char *storage_path, uuid_t po_uuid, int tgt_id); /** * Open a pool. @@ -66,4 +102,16 @@ struct co_uuid_list_elem { int dlck_pool_cont_list(daos_handle_t poh, d_list_t *co_uuids); +/** + * Add all files (pool UUIDs + all targets bitmap) to \p file_list. + * + * \param[out] file_list List of all files belonging to the given DAOS engine. + * + * \retval DER_SUCCESS Success. + * \retval -DER_NOMEM Out of memory. + * \retval -DER_* Possibly other errors. + */ +int +dlck_pool_list(d_list_t *file_list); + #endif /** __DLCK_POOL__ */ diff --git a/src/utils/dlck/dlck_report.c b/src/utils/dlck/dlck_report.c new file mode 100644 index 00000000000..99b82a9dc7c --- /dev/null +++ b/src/utils/dlck/dlck_report.c @@ -0,0 +1,61 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +#define D_LOGFAC DD_FAC(dlck) + +#include +#include + +#include +#include + +#include "dlck_checker.h" +#include "dlck_report.h" + +#define DLCK_PROGRESS_LINE_LEN 32 + +/** + * Produce and provide a simple separator: + * + * ======== + */ +static inline char * +get_separator() +{ + static char separator[DLCK_PROGRESS_LINE_LEN] = {0}; + static bool initialized = false; + + if (unlikely(!initialized)) { + memset(separator, '=', DLCK_PROGRESS_LINE_LEN); + initialized = true; + } + + return separator; +} + +#define DLCK_PRINT_SEPARATOR(ck) CK_PRINTF(ck, "%s\n", get_separator()) + +/** + * \note This function is called when no other threads are running in parallel. No locks are + * necessary. 
+ */ +void +dlck_report_results(int *rcs, unsigned targets, unsigned warnings_num, struct checker *ck) +{ + /** print header */ + DLCK_PRINT_SEPARATOR(ck); + CK_PRINT(ck, "Targets:\n"); + DLCK_PRINT_SEPARATOR(ck); + + /** print records */ + for (int i = 0; i < targets; ++i) { + CK_PRINTFL_RC(ck, rcs[i], "[%d] result", i); + } + + /** print footer */ + DLCK_PRINT_SEPARATOR(ck); + CK_PRINTF(ck, "Total: %u warning(s).\n", warnings_num); + DLCK_PRINT_SEPARATOR(ck); +} diff --git a/src/utils/dlck/dlck_report.h b/src/utils/dlck/dlck_report.h new file mode 100644 index 00000000000..c1a1c3da665 --- /dev/null +++ b/src/utils/dlck/dlck_report.h @@ -0,0 +1,25 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DLCK_REPORT__ +#define __DLCK_REPORT__ + +/** + * Report targets' results. + * + * \param[in] rcs Array of return codes for all targets. + * \param[in] targets Number of targets. + * \param[in] warnings_num Number of warnings. + * \param[in] dp Main print utility. + * + * \retval DER_SUCCESS Success. + * \retval -DER_MISC Printing error. + * \retval -DER_* Other errors. 
+ */ +void +dlck_report_results(int *rcs, unsigned targets, unsigned warnings_num, struct checker *ck); + +#endif /** __DLCK_REPORT__ */ diff --git a/src/utils/dlck/tests/SConscript b/src/utils/dlck/tests/SConscript index b447ca3e210..d83805c5390 100644 --- a/src/utils/dlck/tests/SConscript +++ b/src/utils/dlck/tests/SConscript @@ -31,11 +31,35 @@ def build_dlck_test_helper(henv): henv.d_test_program('dlck_test_helper', srcs, LIBS=libs) +def build_dlck_args_ut(henv): + """Build dlck_args_ut""" + henv.Append(OBJPREFIX="dlck_args_ut_") + henv.Append(CPPPATH=[Dir('../../../').srcnode()]) + henv.AppendUnique(LINKFLAGS=['-Wl,--wrap=argp_failure']) + henv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) + henv.require('cmocka') + + libs = [ + 'gurt', 'daos_common_pmem', 'cmocka', 'uuid' + ] + + srcs = [ + 'dlck_args_ut.c', + '../dlck_args_common.c', + '../dlck_args_engine.c', + '../dlck_args_files.c', + '../dlck_args_parse.c', + '../dlck_args.c', + ] + henv.d_test_program('dlck_args_ut', srcs, LIBS=libs) + + def scons(): """Execute build""" Import('env') build_dlck_test_helper(env.Clone()) + build_dlck_args_ut(env.Clone()) if __name__ == "SCons.Script": diff --git a/src/utils/dlck/tests/dlck_args_ut.c b/src/utils/dlck/tests/dlck_args_ut.c new file mode 100644 index 00000000000..e9bc4f79efc --- /dev/null +++ b/src/utils/dlck/tests/dlck_args_ut.c @@ -0,0 +1,148 @@ +/** + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +#define D_LOGFAC DD_FAC(tests) + +#include +#include +#include +#include + +#include + +#include "../dlck_args.h" + +/** globals */ + +#define APP_NAME_MOCK "app_name" +#define PARSER_FAILURE EINVAL + +extern struct argp argp_common; +extern struct argp argp_file; +extern struct argp argp_engine; + +struct dlck_control Ctrl; + +argp_parser_t Argp_engine_parser_real; + +/** wrappers and mocks */ + +void +__wrap_argp_failure(const struct argp_state *__restrict __state, int __status, int __errnum, + const char *__restrict __fmt, ...) +{ + check_expected(__state); + assert_int_equal(__status, PARSER_FAILURE); + assert_int_equal(__errnum, PARSER_FAILURE); +} + +static error_t +argp_common_parser_mock(int key, char *arg, struct argp_state *state) +{ + check_expected(key); + assert_non_null(state); + assert_ptr_equal(state->input, &Ctrl.common); + + return 0; +} + +static error_t +argp_file_parser_mock(int key, char *arg, struct argp_state *state) +{ + check_expected(key); + assert_non_null(state); + assert_ptr_equal(state->input, &Ctrl.files); + + return 0; +} + +static error_t +argp_engine_parser_mock(int key, char *arg, struct argp_state *state) +{ + check_expected_ptr(key); + assert_non_null(state); + assert_ptr_equal(state->input, &Ctrl.engine); + + return 0; +} + +/** setups & teardowns */ + +static int +setup_engine_args_default(void **state_ptr) +{ + static struct dlck_args_engine args = {0}; + static struct argp_state state = {0}; + error_t ret; + + /** bind the input */ + state.input = &args; + + /** set defaults */ + ret = Argp_engine_parser_real(ARGP_KEY_INIT, NULL, &state); + assert_int_equal(ret, 0); + + *state_ptr = &state; + + return 0; +} + +/** tests */ + +/** + * Test if all the children parsers are connected properly and if each of them receives all of + * the expected special key values. 
+ */ +static void +test_parser_children_connection(void **unused) +{ + /** special keys as they are provided for each of the parsers in order */ + int keys[] = {ARGP_KEY_INIT, ARGP_KEY_NO_ARGS, ARGP_KEY_END, ARGP_KEY_SUCCESS, + ARGP_KEY_FINI}; + + /** empty argument list */ + int argc = 1; + char *argv[] = {APP_NAME_MOCK}; + + for (int i = 0; i < ARRAY_SIZE(keys); ++i) { + expect_value(argp_common_parser_mock, key, keys[i]); + expect_value(argp_file_parser_mock, key, keys[i]); + expect_value(argp_engine_parser_mock, key, keys[i]); + } + + dlck_args_parse(argc, argv, &Ctrl); +} + +static void +test_engine_parser_END_no_storage_path_fail(void **state_ptr) +{ + struct argp_state *state = *state_ptr; + error_t ret; + + expect_value(__wrap_argp_failure, __state, state); + + ret = Argp_engine_parser_real(ARGP_KEY_END, NULL, state); + assert_int_equal(ret, PARSER_FAILURE); +} + +static const struct CMUnitTest dlck_args_tests[] = { + {"DLCK_ARGS100: parser - children connection", test_parser_children_connection, NULL, NULL}, + {"DLCK_ARGS200: engine parser + ARGP_KEY_END + no storage path", + test_engine_parser_END_no_storage_path_fail, setup_engine_args_default, NULL}, +}; + +int +main(int argc, char **argv) +{ + /** collect function pointers to real parsers */ + Argp_engine_parser_real = argp_engine.parser; + + /** overwrite real parsers with mocks */ + argp_common.parser = argp_common_parser_mock; + argp_file.parser = argp_file_parser_mock; + argp_engine.parser = argp_engine_parser_mock; + + return cmocka_run_group_tests_name("dlck_args_ut", dlck_args_tests, NULL, NULL); +} diff --git a/src/utils/dlck/tests/dlck_test_helper.c b/src/utils/dlck/tests/dlck_test_helper.c index 74d83af0b45..cd2dd268b9a 100644 --- a/src/utils/dlck/tests/dlck_test_helper.c +++ b/src/utils/dlck/tests/dlck_test_helper.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -23,6 +22,7 @@ #include #include "../dlck_args.h" +#include "../dlck_bitmap.h" #include 
"../dlck_engine.h" #include "../dlck_pool.h" @@ -263,13 +263,13 @@ exec_one(void *arg) } d_list_for_each_entry(file, &xst->args_files->list, link) { - /** do not process the given file if the target is excluded */ - if ((file->targets_bitmap & (1 << xst->xs->tgt_id)) == 0) { + /** do not process the given file if the target is not requested */ + if (dlck_bitmap_isclr32(file->targets_bitmap, xst->xs->tgt_id)) { continue; } - rc = dlck_pool_open_safe(xst->engine->open_mtx, xst->args_engine->storage_path, - file->po_uuid, xst->xs->tgt_id, &xst->poh); + rc = dlck_pool_open(xst->args_engine->storage_path, file->po_uuid, xst->xs->tgt_id, + &xst->poh); if (rc != DER_SUCCESS) { xst->rc = rc; break; @@ -277,7 +277,7 @@ exec_one(void *arg) cont_process(xst, xst->co_uuid); - rc = dlck_pool_close_safe(xst->engine->open_mtx, xst->poh); + rc = vos_pool_close(xst->poh); if (rc != DER_SUCCESS) { xst->rc = rc; break; @@ -374,23 +374,38 @@ static struct argp argp = {NULL, parser, NULL /** usage */, NULL, children}; static int setup(struct dlck_helper_args *args, struct bundle *bundle) { - struct dlck_file *file; struct dlck_engine *engine; unsigned int seed = SRAND_SEED; int rc; + int rc_abt; - /** prepare pool storage directories */ - d_list_for_each_entry(file, &args->files.list, link) { - rc = dlck_pool_mkdir(args->engine.storage_path, file->po_uuid); - assert_int_equal(rc, DER_SUCCESS); + rc_abt = ABT_init(0, NULL); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + return rc; } /** start an engine */ rc = dlck_engine_start(&args->engine, &engine); if (rc != DER_SUCCESS) { + (void)ABT_finalize(); return rc; } + if (d_list_empty(&args->files.list)) { + /** no files specified means all files are requested */ + rc = dlck_pool_list(&args->files.list); + if (rc != DER_SUCCESS) { + goto fail_engine_stop; + } + } + + /** prepare pool storage directories */ + rc = dlck_pool_mkdir_all(args->engine.storage_path, &args->files.list, NULL); + if (rc != DER_SUCCESS) { + goto 
fail_engine_stop; + } + D_ALLOC_ARRAY(bundle->co_uuids, args->engine.targets); if (bundle->co_uuids == NULL) { rc = -DER_NOMEM; @@ -413,12 +428,14 @@ setup(struct dlck_helper_args *args, struct bundle *bundle) fail_engine_stop: (void)dlck_engine_stop(engine); + (void)ABT_finalize(); return rc; } static int teardown(struct bundle *bundle) { + int rc_abt; int rc; dss_unregister_key(dtx_module.sm_key); @@ -426,6 +443,15 @@ teardown(struct bundle *bundle) D_FREE(bundle->co_uuids); rc = dlck_engine_stop(bundle->engine); + if (rc != DER_SUCCESS) { + (void)ABT_finalize(); + return rc; + } + + rc_abt = ABT_finalize(); + if (rc_abt != ABT_SUCCESS) { + rc = dss_abterr2der(rc_abt); + } return rc; } @@ -445,7 +471,7 @@ main(int argc, char **argv) goto fail_args_free; } - rc = dlck_engine_exec_all(bundle.engine, exec_one, arg_alloc, &bundle, arg_free); + rc = dlck_engine_exec_all(bundle.engine, exec_one, arg_alloc, &bundle, arg_free, NULL); if (rc != DER_SUCCESS) { goto fail_teardown; } diff --git a/src/utils/dlck/tests/fault_injection_dlck.yaml b/src/utils/dlck/tests/fault_injection_dlck.yaml new file mode 100644 index 00000000000..5d822eaaa02 --- /dev/null +++ b/src/utils/dlck/tests/fault_injection_dlck.yaml @@ -0,0 +1,38 @@ +# Uncomment a fault you would like to trigger +fault_config: + # - id: 131328 # DLCK_MOCK_ROOT + # - id: 131329 # DLCK_FAULT_GETGRNAM + # - id: 131330 # DLCK_MOCK_NO_DAOS_SERVER_GROUP + # - id: 131331 # DLCK_FAULT_GETGROUPS + # - id: 131332 # DLCK_MOCK_NOT_IN_DAOS_SERVER_GROUP + # - id: 131333 # DLCK_FAULT_CREATE_LOG_DIR + # - id: 131334 # DLCK_FAULT_CREATE_POOL_DIR + # - id: 131335 # DLCK_FAULT_ENGINE_START + # - id: 131336 # DLCK_FAULT_ENGINE_EXEC + # - id: 131337 # DLCK_FAULT_ENGINE_JOIN + # - id: 131338 # DLCK_FAULT_ENGINE_STOP + # - id: 131584 # DAOS_FAULT_POOL_NVME_HEALTH + # interval: 2 # skip sys_db + # - id: 131585 # DAOS_FAULT_POOL_OPEN_BIO + # - id: 131586 # DAOS_FAULT_POOL_OPEN_UMEM + # interval: 2 # skip sys_db + # - id: 131587 # 
DAOS_FAULT_POOL_OPEN_MAGIC + # interval: 2 # skip sys_db + # - id: 131588 # DAOS_FAULT_POOL_OPEN_VERSION + # interval: 2 # skip sys_db + # - id: 131589 # DAOS_FAULT_POOL_OPEN_UUID + # interval: 2 # skip sys_db + # - id: 131590 # DAOS_FAULT_BTREE_OPEN_INV_CLASS + # interval: 28 # containers tree fine-tuned; note: -t 1 + # interval: 29 # gc tree fine-tuned; note: -t 1 + # max_faults: 1 + # - id: 131591 # DAOS_FAULT_BTREE_OPEN_UNREG_CLASS + # interval: 28 # containers tree fine-tuned; note: -t 1 + # interval: 29 # gc tree fine-tuned; note: -t 1 + # max_faults: 1 + # - id: 131592 # DAOS_FAULT_BTREE_FEATURES + # interval: 28 # containers tree fine-tuned; note: -t 1 + # interval: 29 # gc tree fine-tuned; note: -t 1 + # max_faults: 1 + # - id: 131593 # DAOS_FAULT_POOL_EXT_PADDING + # - id: 131594 # DAOS_FAULT_POOL_EXT_RESERVED diff --git a/src/utils/self_test/self_test_lib.c b/src/utils/self_test/self_test_lib.c index 85eb435c6f3..0c39f7af5e1 100644 --- a/src/utils/self_test/self_test_lib.c +++ b/src/utils/self_test/self_test_lib.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -176,10 +176,13 @@ self_test_init(char *dest_name, crt_context_t *crt_ctx, crt_group_t **srv_grp, p d_rank_list_free(rank_list); - ret = crt_rank_self_set(max_rank + 1, 1 /* group_version_min */); - if (ret != 0) { - D_ERROR("crt_rank_self_set failed; ret = %d\n", ret); - return ret; + /* when running as a server set the rank to next highest one unused */ + if (listen) { + ret = crt_rank_self_set(max_rank + 1, 1 /* group_version_min */); + if (ret != 0) { + D_ERROR("crt_rank_self_set failed; ret = %d\n", ret); + return ret; + } } return 0; diff --git a/src/vea/vea_api.c b/src/vea/vea_api.c index e26bc353eb5..cbbf6e377ad 100644 --- a/src/vea/vea_api.c +++ b/src/vea/vea_api.c @@ -380,7 +380,7 @@ vea_load(struct umem_instance *umem, struct umem_tx_stage_data *txd, return rc; } -#define FLUSH_INTVL 2 /* seconds */ +#define FLUSH_INTVL 5 /* seconds */ static inline bool need_aging_flush(struct vea_space_info *vsi, bool force) diff --git a/src/vea/vea_free.c b/src/vea/vea_free.c index fafa656d76c..14681680989 100644 --- a/src/vea/vea_free.c +++ b/src/vea/vea_free.c @@ -767,7 +767,12 @@ aggregated_free(struct vea_space_info *vsi, struct vea_free_entry *vfe) return 0; } -#define EXPIRE_INTVL 3 /* seconds */ +/* + * Tune the EXPIRE_INTVL & FLUSH_INTVL with extreme care. While decreasing these + * values could be beneficial for accelerating space reclamation in overwrite mode, + * it is generally detrimental in punch mode. DAOS-18012. 
+ */ +#define EXPIRE_INTVL 10 /* seconds */ #define UNMAP_SIZE_THRESH (1UL << 20) /* 1MB */ static int diff --git a/src/vos/README.md b/src/vos/README.md index db3c83c8229..5a65e7feed1 100644 --- a/src/vos/README.md +++ b/src/vos/README.md @@ -58,7 +58,7 @@ Please refer to the Blob I/O (BIO) module for mor Special care is taken when developing and modifying the VOS layer because any software bug could corrupt data structures in persistent memory. The VOS, therefore, checksums its persistent data structures despite the presence of hardware ECC. -The VOS provides a lightweight I/O stack fully in user space, leveraging the PMDK open-source libraries developed to support this programming model. +The VOS provides a lightweight I/O stack fully in user space, leveraging the PMDK open-source libraries developed to support this programming model. @@ -73,7 +73,7 @@ It is worth noting that such transactions are different from the DAOS transactio Persistent memory transactions must guarantee consistency of VOS internal data structures when processing incoming requests, regardless of their epoch number. Transactions over persistent memory can be implemented in many different ways, e.g., undo logs, redo logs, a combination of both, or copy-on-write. -PMDK is an open source collection of libraries for using persistent memory, optimized specifically for NVRAM. +PMDK is an open source collection of libraries for using persistent memory, optimized specifically for NVRAM. Among these is the libpmemobj library, which implements relocatable persistent heaps called persistent memory pools. This includes memory allocation, transactions, and general facilities for persistent memory programming. The transactions are local to one thread (not multi-threaded) and rely on undo logs. 
diff --git a/src/vos/ilog.c b/src/vos/ilog.c index 1e3fca5be32..016cf8fa86f 100644 --- a/src/vos/ilog.c +++ b/src/vos/ilog.c @@ -1648,6 +1648,15 @@ ilog_version_get(daos_handle_t loh) return ilog_mag2ver(lctx->ic_root->lr_magic); } +bool +ilog_root_is_valid(struct ilog_df *ilog_df) +{ + struct ilog_root *root = (struct ilog_root *)ilog_df; + D_ASSERT(root != NULL); + + return ILOG_MAGIC_VALID(root->lr_magic); +} + bool ilog_is_valid(struct umem_instance *umm, umem_off_t rec, uint32_t dtx_lid, daos_epoch_t epoch) { diff --git a/src/vos/ilog.h b/src/vos/ilog.h index f8f64cbd455..3fdff8524a5 100644 --- a/src/vos/ilog.h +++ b/src/vos/ilog.h @@ -340,6 +340,16 @@ ilog_is_punch(const struct ilog_entry *entry) entry->ie_id.id_update_minor_eph; } +/** + * Validate ilog's root. + * + * \param[in] ilog_df + * + * \return true if the root is valid. + */ +bool +ilog_root_is_valid(struct ilog_df *ilog_df); + /** Validate the provided ilog. * * Note: It is designed for catastrophic recovery. Not to perform at run-time. 
diff --git a/src/vos/tests/SConscript b/src/vos/tests/SConscript index e5b6f72ecc7..0f61277fafe 100644 --- a/src/vos/tests/SConscript +++ b/src/vos/tests/SConscript @@ -14,7 +14,7 @@ def scons(): vts_objs = senv.StaticObject(vts_src) Export('vts_objs') - libraries = ['vos', 'bio', 'abt', 'pthread', 'daos_common_pmem', + libraries = ['vos', 'bio', 'ssl', 'abt', 'pthread', 'daos_common_pmem', 'daos_tests', 'gurt', 'uuid', 'pthread', 'pmemobj', 'cmocka', 'gomp'] @@ -42,7 +42,7 @@ def scons(): unit_env.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) libraries = ['daos_common_pmem', 'daos_tests', 'gurt', 'cart', 'cmocka', - 'vos', 'uuid', 'pmem', 'pmemobj', 'bio', 'pthread', 'abt'] + 'vos', 'uuid', 'pmem', 'pmemobj', 'bio', 'ssl', 'pthread', 'abt'] unit_env.d_test_program('pool_scrubbing_tests', ['pool_scrubbing_tests.c', '../vos_pool_scrub.c'], LIBS=libraries) @@ -51,7 +51,8 @@ def scons(): tenv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) tenv.Append(CPPDEFINES={'VOS_STANDALONE': '1'}) - libraries = ['uuid', 'bio', 'gurt', 'cmocka', 'daos_common_pmem', 'daos_tests', 'vos', 'abt'] + libraries = ['uuid', 'bio', 'gurt', 'cmocka', 'daos_common_pmem', 'daos_tests', 'vos', 'abt', + 'ssl'] tenv.require('spdk') bio_ut_src = ['bio_ut.c', 'wal_ut.c'] diff --git a/src/vos/tests/pool_scrubbing_tests.c b/src/vos/tests/pool_scrubbing_tests.c index a7111045b73..066742a76fa 100644 --- a/src/vos/tests/pool_scrubbing_tests.c +++ b/src/vos/tests/pool_scrubbing_tests.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2020-2022 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -497,7 +498,7 @@ sts_ctx_setup_scrub_ctx(struct sts_context *ctx) ctx->tsc_scrub_ctx.sc_drain_pool_tgt_fn = fake_target_drain; ctx->tsc_scrub_ctx.sc_pool = &ctx->tsc_pool; ctx->tsc_scrub_ctx.sc_dmi = &ctx->tsc_dmi; - ctx->tsc_scrub_ctx.sc_cont.scs_props_fetched = true; + ctx->tsc_scrub_ctx.sc_cont.scs_csummer_inited = true; } static void diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 12cd6d72728..57d80412c96 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -51,40 +51,13 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, vts_init_dte(&dth->dth_dte); - dth->dth_coh = coh; - dth->dth_epoch = epoch; - dth->dth_leader_oid = *oid; - - dth->dth_pinned = 0; - dth->dth_sync = 0; - dth->dth_cos_done = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = 0; - dth->dth_drop_cmt = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_dist = 0; - dth->dth_for_migration = 0; - dth->dth_ignore_uncommitted = 0; - dth->dth_prepared = 0; - dth->dth_epoch_owner = 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; - - dth->dth_dti_cos_count = 0; - dth->dth_dti_cos = NULL; - dth->dth_ent = NULL; - dth->dth_flags = DTE_LEADER; + dth->dth_coh = coh; + dth->dth_epoch = epoch; + dth->dth_leader_oid = *oid; + dth->dth_flags = DTE_LEADER; dth->dth_modification_cnt = 1; - - dth->dth_op_seq = 1; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = dkey_hash; + dth->dth_op_seq = 1; + dth->dth_dkey_hash = dkey_hash; 
D_INIT_LIST_HEAD(&dth->dth_share_cmt_list); D_INIT_LIST_HEAD(&dth->dth_share_abt_list); diff --git a/src/vos/tests/wal_ut.c b/src/vos/tests/wal_ut.c index 32b4b4c9957..0bdc85a38d7 100644 --- a/src/vos/tests/wal_ut.c +++ b/src/vos/tests/wal_ut.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -669,8 +670,8 @@ ut_fill_wal(struct bio_ut_args *args, int tx_nr, struct ut_tx_array **txa_ptr) tx = txa->ta_tx_ptrs[0]; /* - * Each tx is roughly 800k, 40 txs will consume 32000k, which is more than - * half of 50MB WAL size. + * Each tx is roughly 800k, 100 txs will consume 80MB, which is more than + * half of 128MB WAL size. */ for (i = 0; i < tx_nr; i++) { tx = txa->ta_tx_ptrs[i]; @@ -705,11 +706,11 @@ static void wal_ut_wrap(void **state) { struct bio_ut_args *args = *state; - uint64_t meta_sz = (50ULL << 20); /* 50 MB */ + uint64_t meta_sz = (128ULL << 20); /* 128 MB */ struct ut_tx_array *txa; struct umem_wal_tx *tx; struct ut_fake_tx *fake_tx; - int tx_nr = 40, rc; + int tx_nr = 100, rc; rc = ut_mc_init(args, meta_sz, meta_sz, meta_sz); assert_rc_equal(rc, 0); @@ -745,11 +746,11 @@ static void wal_ut_wrap_many(void **state) { struct bio_ut_args *args = *state; - uint64_t meta_sz = (50ULL << 20); /* 50 MB */ + uint64_t meta_sz = (128ULL << 20); /* 128 MB */ struct ut_tx_array *txa; struct umem_wal_tx *tx; struct ut_fake_tx *fake_tx; - int tx_nr = 40, rc; + int tx_nr = 100, rc; rc = ut_mc_init(args, meta_sz, meta_sz, meta_sz); assert_rc_equal(rc, 0); diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index a7397a94256..642621ebd08 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -215,12 +215,22 @@ vos_tx_publish(struct dtx_handle *dth, bool publish) } int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj) { int rc; - if (dth == NULL) - return umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (dth == NULL) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (rc == 0 && obj != NULL && unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(1), need to restart TX.\n", + DP_UOID(obj->obj_id)); + rc = umem_tx_end(umm, -DER_TX_RESTART); + } + + return rc; + } D_ASSERT(!is_sysdb); /** Note: On successful return, dth tls gets set and will be cleared by the corresponding @@ -235,6 +245,14 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); if (rc == 0) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + if (obj != NULL && unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(2), need to restart TX.\n", + DP_UOID(obj->obj_id)); + + return umem_tx_end(umm, -DER_TX_RESTART); + } + dth->dth_local_tx_started = 1; vos_dth_set(dth, false); } @@ -250,12 +268,6 @@ vos_local_tx_abort(struct dtx_handle *dth) if (dth->dth_local_oid_cnt == 0) return; - /** - * Since a local transaction spawns always a single pool an eaither one of the containers - * can be used to access the pool. - */ - record = &dth->dth_local_oid_array[0]; - /** * Evict all objects touched by the aborted transaction from the object cache to make sure * no invalid pointer stays there. 
Not all of the touched objects have to be evicted but @@ -979,7 +991,7 @@ vos_self_nvme_init(const char *vos_path, bool init_spdk) goto out; /* Only use hugepages if NVME SSD configuration existed. */ - fd = open(nvme_conf, O_RDONLY, 0600); + fd = open(nvme_conf, O_RDONLY); if (fd < 0) { rc = bio_nvme_init_ext(NULL, VOS_NVME_NUMA_NODE, 0, 0, VOS_NVME_NR_TARGET, true, init_spdk); diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index ad76bb8f6fe..9cb992b28be 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -545,6 +545,8 @@ int vos_cont_query(daos_handle_t coh, vos_cont_info_t *cont_info) { struct vos_container *cont; + struct vos_cont_df *cont_df; + uint64_t feats; cont = vos_hdl2cont(coh); if (cont == NULL) { @@ -552,9 +554,14 @@ vos_cont_query(daos_handle_t coh, vos_cont_info_t *cont_info) return -DER_INVAL; } - cont_info->ci_nobjs = cont->vc_cont_df->cd_nobjs; - cont_info->ci_used = cont->vc_cont_df->cd_used; - cont_info->ci_hae = cont->vc_cont_df->cd_hae; + cont_df = cont->vc_cont_df; + memset(cont_info, 0, sizeof(*cont_info)); + cont_info->ci_nobjs = cont_df->cd_nobjs; + cont_info->ci_used = cont_df->cd_used; + cont_info->ci_hae = cont_df->cd_hae; + + feats = dbtree_feats_get(&cont_df->cd_obj_root); + vos_feats_agg_time_get(feats, &cont_info->ci_agg_write); return 0; } @@ -1029,3 +1036,81 @@ vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch) return 0; } + +int +vos_cont_save_props(daos_handle_t coh, struct cont_props *props) +{ + struct umem_instance *umm; + struct vos_container *cont; + struct vos_cont_ext_df *ced; + int rc = 0; + + cont = vos_hdl2cont(coh); + D_ASSERT(cont != NULL); + + umm = vos_cont2umm(cont); + ced = umem_off2ptr(umm, cont->vc_cont_df->cd_ext); + + /* 
Do not allow to save property against old container without extension. */ + if (ced == NULL) + D_GOTO(out, rc = -DER_NOTSUPPORTED); + + /* Currently we only save chunksize and csum_type in vos_container. Maybe more in future. */ + + if (ced->ced_chunksize == props->dcp_chunksize && ced->ced_valid_bits & VCEB_CSUM && + ((props->dcp_csum_enabled == 1 && ced->ced_csum_type == props->dcp_csum_type) || + (props->dcp_csum_enabled == 0 && ced->ced_csum_type == DAOS_PROP_CO_CSUM_OFF))) + D_GOTO(out, rc = 0); + + rc = umem_tx_begin(umm, NULL); + if (rc != 0) + goto out; + + if (ced->ced_chunksize != props->dcp_chunksize) { + rc = umem_tx_add_ptr(umm, &ced->ced_chunksize, sizeof(ced->ced_chunksize)); + if (rc != 0) + goto abort; + + ced->ced_chunksize = props->dcp_chunksize; + } + + if (!(ced->ced_valid_bits & VCEB_CSUM)) { + rc = umem_tx_add_ptr(umm, &ced->ced_valid_bits, sizeof(ced->ced_valid_bits)); + if (rc != 0) + goto abort; + + ced->ced_valid_bits |= VCEB_CSUM; + } + + if (props->dcp_csum_enabled == 1) { + if (ced->ced_csum_type != props->dcp_csum_type) { + rc = umem_tx_add_ptr(umm, &ced->ced_csum_type, sizeof(ced->ced_csum_type)); + if (rc != 0) + goto abort; + + ced->ced_csum_type = props->dcp_csum_type; + } + } else { + if (ced->ced_csum_type != DAOS_PROP_CO_CSUM_OFF) { + rc = umem_tx_add_ptr(umm, &ced->ced_csum_type, sizeof(ced->ced_csum_type)); + if (rc != 0) + goto abort; + + ced->ced_csum_type = DAOS_PROP_CO_CSUM_OFF; + } + } + +abort: + if (rc != 0) + rc = umem_tx_abort(umm, rc); + else + rc = umem_tx_commit(umm); + +out: + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MGMT, rc, + "Save property (csum %s, hash_type %d, chunksize %u) for container " DF_UUID, + props->dcp_csum_enabled == 1 ? 
"enabled" : "disabled", props->dcp_csum_type, + props->dcp_chunksize, DP_UUID(cont->vc_id)); + + return rc; +} diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index f3cd477239b..d50f2b87cc8 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -151,37 +151,21 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } static void -dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, - struct dtx_handle *dth, bool evict, bool keep_df) +dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool evict, + bool keep_df) { - if (evict) { - daos_unit_oid_t *oids; - int count; - int i; + if (evict && dae->dae_oids != NULL) { + int i; - if (dth != NULL) { - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - count = dth->dth_oid_cnt; - oids = dth->dth_oid_array; - } else { - count = 1; - oids = &dth->dth_leader_oid; - } - } else { - count = dae->dae_oid_cnt; - oids = dae->dae_oids; - } - - for (i = 0; i < count; i++) - vos_obj_evict_by_oid(cont, oids[i]); + for (i = 0; i < dae->dae_oid_cnt; i++) + vos_obj_evict_by_oid(cont, dae->dae_oids[i]); } if (dae->dae_oids != NULL && dae->dae_oids != &dae->dae_oid_inline && dae->dae_oids != &DAE_OID(dae)) { D_FREE(dae->dae_oids); dae->dae_oid_cnt = 0; + dae->dae_oid_cap = 0; } DAE_REC_OFF(dae) = UMOFF_NULL; @@ -254,7 +238,7 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec, D_ASSERT(dae != NULL); *(struct vos_dtx_act_ent **)args = dae; } else if (dae != NULL) { - dtx_act_ent_cleanup(tins->ti_priv, dae, NULL, true, false); + dtx_act_ent_cleanup(tins->ti_priv, dae, true, false); } return 0; @@ -391,8 +375,9 @@ static int dtx_cmt_ent_update(struct 
btr_instance *tins, struct btr_record *rec, d_iov_t *key, d_iov_t *val, d_iov_t *val_out) { - struct vos_dtx_cmt_ent *dce_new = val->iov_buf; - struct vos_dtx_cmt_ent *dce_old; + struct vos_dtx_cmt_ent *dce_new = val->iov_buf; + struct vos_dtx_cmt_ent *dce_old; + int rc = 0; dce_old = umem_off2ptr(&tins->ti_umm, rec->rec_off); @@ -418,20 +403,11 @@ dtx_cmt_ent_update(struct btr_instance *tins, struct btr_record *rec, if (dce_old->dce_invalid) { rec->rec_off = umem_ptr2off(&tins->ti_umm, dce_new); D_FREE(dce_old); - } else if (!dce_old->dce_reindex) { - /* If two client threads (such as non-initialized context after fork) use the same - * DTX ID (by chance), then it is possible to arrive here. But once comes here, we - * have no chance to require related client/application to restart the transaction - * since related RPC may has already completed. - * */ - if (unlikely(dce_new->dce_reindex == 0)) - D_WARN("Commit DTX " DF_DTI " for more than once, maybe reused\n", - DP_DTI(&DCE_XID(dce_new))); - else - dce_new->dce_exist = 1; + } else { + rc = -DER_EXIST; } - return 0; + return rc; } static btr_ops_t dtx_committed_btr_ops = { @@ -482,6 +458,9 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df) while (!UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)) { dbd_off = cont_df->cd_dtx_committed_head; dbd = umem_off2ptr(umm, dbd_off); + D_ASSERTF_MEM(dbd->dbd_magic == DTX_CMT_BLOB_MAGIC, dbd, DTX_CMT_BLOB_SIZE, + "dbd_magic = %#x != DTX_CMT_BLOB_MAGIC (%#x)\n", dbd->dbd_magic, + DTX_CMT_BLOB_MAGIC); cont_df->cd_dtx_committed_head = dbd->dbd_next; rc = umem_free(umm, dbd_off); if (rc != 0) @@ -501,6 +480,9 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df) while (!UMOFF_IS_NULL(cont_df->cd_dtx_active_head)) { dbd_off = cont_df->cd_dtx_active_head; dbd = umem_off2ptr(umm, dbd_off); + D_ASSERTF_MEM(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC, dbd, DTX_ACT_BLOB_SIZE, + "dbd_magic = %#x != DTX_ACT_BLOB_MAGIC (%#x)\n", 
dbd->dbd_magic, + DTX_ACT_BLOB_MAGIC); for (i = 0; i < dbd->dbd_index; i++) { dae_df = &dbd->dbd_active_data[i]; @@ -887,7 +869,7 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_BYPASS, &kiov, &dae); if (rc == 0) { - dtx_act_ent_cleanup(cont, dae, NULL, false, false); + dtx_act_ent_cleanup(cont, dae, false, false); dtx_evict_lid(cont, dae); } @@ -944,12 +926,15 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t *dae_p = dae; out: - if (rc != -DER_ALREADY && rc != -DER_NONEXIST) - DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Commit the DTX " DF_DTI, DP_DTI(dti)); - if (rc != 0) D_FREE(dce); + if (rc == -DER_EXIST) + rc = 0; + + if (rc != -DER_ALREADY && rc != -DER_NONEXIST) + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Commit the DTX " DF_DTI, DP_DTI(dti)); + if (rm_cos != NULL && ((rc == 0 && !keep_act) || rc == -DER_NONEXIST || (rc == -DER_ALREADY && dae == NULL))) *rm_cos = true; @@ -1850,30 +1835,6 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - dae->dae_oid_cnt = dth->dth_oid_cnt; - if (dth->dth_oid_cnt == 1) { - dae->dae_oid_inline = dth->dth_oid_array[0]; - dae->dae_oids = &dae->dae_oid_inline; - } else { - size = sizeof(daos_unit_oid_t) * dth->dth_oid_cnt; - D_ALLOC_NZ(dae->dae_oids, size); - if (dae->dae_oids == NULL) { - /* Not fatal. 
*/ - D_WARN("No DRAM to store ACT DTX OIDs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - dae->dae_oid_cnt = 0; - } else { - memcpy(dae->dae_oids, dth->dth_oid_array, size); - } - } - } else { - dae->dae_oids = &DAE_OID(dae); - dae->dae_oid_cnt = 1; - } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); @@ -2281,17 +2242,16 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, keep_act, &dces[i], daes != NULL ? &daes[i] : NULL, rm_cos != NULL ? &rm_cos[i] : NULL); - if (rc == 0 && (daes == NULL || daes[i] != NULL)) - committed++; - if (rc == -DER_ALREADY || rc == -DER_NONEXIST) rc = 0; if (rc != 0) goto out; - if (dces[i] != NULL) + if (dces[i] != NULL) { + committed++; j++; + } } if (j > dbd->dbd_count) { @@ -2372,6 +2332,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto again; out: + if (committed > 0) { + cont->vc_dtx_committed_count += committed; + cont->vc_pool->vp_dtx_committed_count += committed; + } + return rc < 0 ? 
rc : committed; } @@ -2381,9 +2346,11 @@ vos_dtx_post_handle(struct vos_container *cont, struct vos_dtx_cmt_ent **dces, int count, bool abort, bool rollback, bool keep_act) { - d_iov_t kiov; - int rc; - int i; + struct vos_tls *tls = vos_tls_get(false); + d_iov_t kiov; + int rc; + int i; + int j; D_ASSERT(daes != NULL); @@ -2398,7 +2365,7 @@ vos_dtx_post_handle(struct vos_container *cont, if (dces == NULL) return; - for (i = 0; i < count; i++) { + for (i = 0, j = 0; i < count; i++) { if (dces[i] == NULL) continue; @@ -2406,32 +2373,39 @@ vos_dtx_post_handle(struct vos_container *cont, sizeof(DCE_XID(dces[i]))); rc = dbtree_delete(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, &kiov, NULL); - if (rc != 0 && rc != -DER_NONEXIST) { + if (rc != 0) { D_WARN("Failed to rollback cmt DTX entry " DF_DTI": "DF_RC"\n", DP_DTI(&DCE_XID(dces[i])), DP_RC(rc)); dces[i]->dce_invalid = 1; + } else { + j++; } } + if (j > 0) { + D_ASSERTF( + cont->vc_dtx_committed_count >= j, + "Unexpected committed DTX entries count when rollback for " DF_UUID + ": %u vs %u\n", + DP_UUID(cont->vc_id), cont->vc_dtx_committed_count, j); + + cont->vc_dtx_committed_count -= j; + cont->vc_pool->vp_dtx_committed_count -= j; + d_tm_dec_gauge(tls->vtl_committed, j); + } + return; } if (!abort && dces != NULL) { - struct vos_tls *tls = vos_tls_get(false); - int j = 0; - - D_ASSERT(cont->vc_pool->vp_sysdb == false); - for (i = 0; i < count; i++) { + for (i = 0, j = 0; i < count; i++) { if (dces[i] != NULL) j++; } - if (j > 0) { - cont->vc_dtx_committed_count += j; - cont->vc_pool->vp_dtx_committed_count += j; + if (j > 0) d_tm_inc_gauge(tls->vtl_committed, j); - } } for (i = 0; i < count; i++) { @@ -2446,7 +2420,7 @@ vos_dtx_post_handle(struct vos_container *cont, DAE_FLAGS(daes[i]) |= DTE_PARTIAL_COMMITTED; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, true); + dtx_act_ent_cleanup(cont, daes[i], false, true); continue; } @@ -2472,13 +2446,13 @@ vos_dtx_post_handle(struct 
vos_container *cont, daes[i]->dae_aborted = 1; daes[i]->dae_aborting = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, true, false); + dtx_act_ent_cleanup(cont, daes[i], true, false); } else { D_ASSERT(daes[i]->dae_aborting == 0); daes[i]->dae_committed = 1; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, false); + dtx_act_ent_cleanup(cont, daes[i], false, false); } DAE_FLAGS(daes[i]) &= ~(DTE_CORRUPTED | DTE_ORPHAN | DTE_PARTIAL_COMMITTED); } @@ -2586,8 +2560,7 @@ dtx_commit_pin(struct vos_container *cont, struct dtx_id dtis[], int count, int dae = riov.iov_buf; D_ASSERT(dae->dae_preparing == 0); - if (vos_dae_is_abort(dae) || dae->dae_committed || dae->dae_committing || - dae->dae_need_release == 0) + if (dae->dae_aborted || dae->dae_committed || dae->dae_need_release == 0) continue; rc = bkts_add_dae(vos_cont2pool(cont), &bkts, dae); @@ -3058,10 +3031,11 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co struct vos_cont_df *cont_df, umem_off_t dbd_off, const uint64_t *cmt_time) { struct vos_dtx_blob_df *dbd; - umem_off_t dbd_next_off = UMOFF_NULL; - uint64_t epoch; - int dtx_aggr_count; - bool is_dbd_freed = false; + umem_off_t dbd_next_off = UMOFF_NULL; + uint64_t epoch = cont_df->cd_newest_aggregated; + int dtx_aggr_count = 0; + int cached_count = 0; + bool is_dbd_freed = false; int i; int rc; @@ -3078,8 +3052,6 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co goto out; } - dtx_aggr_count = 0; - epoch = cont_df->cd_newest_aggregated; for (i = 0; i < dbd->dbd_count; i++) { struct vos_dtx_cmt_ent_df *dce_df; d_iov_t kiov; @@ -3091,6 +3063,8 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co d_iov_set(&kiov, &dce_df->dce_xid, sizeof(dce_df->dce_xid)); rc = dbtree_delete(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, &kiov, NULL); + if (rc == 0) + cached_count++; if (rc == -DER_NONEXIST) rc = 0; if (unlikely(rc != 0)) { @@ -3190,6 +3164,16 
@@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co } out_tx_end: + if (cached_count > 0) { + D_ASSERTF(cont->vc_dtx_committed_count >= cached_count, + "Unexpected committed DTX entries count during aggregation for " DF_UUID + ": %u vs %u\n", + DP_UUID(cont->vc_id), cont->vc_dtx_committed_count, cached_count); + + cont->vc_dtx_committed_count -= cached_count; + cont->vc_pool->vp_dtx_committed_count -= cached_count; + } + rc = umem_tx_end(umm, rc); if (likely(rc != 0)) { DL_ERROR(rc, @@ -3199,26 +3183,22 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co goto out; } - if (dtx_aggr_count > 0) { - cont->vc_dtx_committed_count -= dtx_aggr_count; - cont->vc_pool->vp_dtx_committed_count -= dtx_aggr_count; - d_tm_dec_gauge(tls->vtl_committed, dtx_aggr_count); - - D_DEBUG(DB_IO, - "Release %i DTX committed entries of blob %p (" UMOFF_PF - ") of cont " DF_UUID, - dtx_aggr_count, dbd, UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); - } + D_DEBUG(DB_TRACE, + "Release %d/%d DTX committed entries of blob %p (" UMOFF_PF ") of cont " DF_UUID, + cached_count, dtx_aggr_count, dbd, UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); if (is_dbd_freed) { cont->vc_cmt_dtx_reindex_pos = dbd_next_off; - D_DEBUG(DB_IO, + D_DEBUG(DB_TRACE, "Removed blob of DTX committed entries %p (" UMOFF_PF ") of cont " DF_UUID, dbd, UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); rc = 1; } out: + if (cached_count > 0) + d_tm_dec_gauge(tls->vtl_committed, cached_count); + return rc; } @@ -3239,6 +3219,9 @@ vos_dtx_aggregate(daos_handle_t coh, const uint64_t *cmt_time) D_ASSERT(cont != NULL); D_ASSERT(cont->vc_pool->vp_sysdb == false); + if (unlikely(cont->vc_dtx_reset == 1)) + return 0; + umm = vos_cont2umm(cont); cont_df = cont->vc_cont_df; dbd_off = cont_df->cd_dtx_committed_head; @@ -3593,25 +3576,18 @@ vos_dtx_cmt_reindex(daos_handle_t coh) D_GOTO(out, rc = -DER_NOMEM); memcpy(&dce->dce_base, dce_df, sizeof(dce->dce_base)); - dce->dce_reindex = 1; 
d_iov_set(&kiov, &DCE_XID(dce), sizeof(DCE_XID(dce))); d_iov_set(&riov, dce, sizeof(*dce)); rc = dbtree_upsert(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); if (rc != 0) { + if (rc == -DER_EXIST) + rc = 1; D_FREE(dce); goto out; } - /* The committed DTX entry is already in the index. - * Related re-index logic can stop. - */ - if (dce->dce_exist) { - D_FREE(dce); - D_GOTO(out, rc = 1); - } - cnt++; } @@ -3667,7 +3643,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) */ if (dae != NULL) { D_ASSERT(!vos_dae_is_prepare(dae)); - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); } } else { d_iov_set(&kiov, &dth->dth_xid, sizeof(dth->dth_xid)); @@ -3690,7 +3666,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) if (DAE_EPOCH(dae) != dth->dth_epoch) goto out; - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); rc = dbtree_delete(cont->vc_dtx_active_hdl, riov.iov_buf != NULL ? BTR_PROBE_BYPASS : BTR_PROBE_EQ, @@ -3969,25 +3945,31 @@ vos_dtx_cache_reset(daos_handle_t coh, bool force) cmt: if (daos_handle_is_valid(cont->vc_dtx_committed_hdl)) { - rc = dbtree_destroy(cont->vc_dtx_committed_hdl, NULL); - if (rc != 0) { - D_ERROR("Failed to destroy committed DTX tree for "DF_UUID": "DF_RC"\n", - DP_UUID(cont->vc_id), DP_RC(rc)); - return rc; - } + uint32_t count = cont->vc_dtx_committed_count; - D_ASSERTF(cont->vc_pool->vp_dtx_committed_count >= cont->vc_dtx_committed_count, - "Unexpected committed DTX entries count: %u vs %u\n", - cont->vc_pool->vp_dtx_committed_count, cont->vc_dtx_committed_count); + cont->vc_dtx_reset = 1; + rc = dbtree_destroy(cont->vc_dtx_committed_hdl, NULL); + /* + * If dbtree_destroy() failed, then the count of DTX entries in the committed index + * tree may not match cont->vc_dtx_committed_count any more and not easy to recover. + * Let's assert here. 
+ */ + D_ASSERTF(rc == 0, + "Failed to destroy committed DTX tree for " DF_UUID ": " DF_RC "\n", + DP_UUID(cont->vc_id), DP_RC(rc)); - cont->vc_pool->vp_dtx_committed_count -= cont->vc_dtx_committed_count; D_ASSERT(cont->vc_pool->vp_sysdb == false); - d_tm_dec_gauge(vos_tls_get(false)->vtl_committed, cont->vc_dtx_committed_count); + D_ASSERTF(cont->vc_pool->vp_dtx_committed_count >= count, + "Unexpected committed DTX entries count for " DF_UUID ": %u vs %u\n", + DP_UUID(cont->vc_id), cont->vc_pool->vp_dtx_committed_count, count); - cont->vc_dtx_committed_hdl = DAOS_HDL_INVAL; + cont->vc_dtx_committed_hdl = DAOS_HDL_INVAL; cont->vc_dtx_committed_count = 0; - cont->vc_cmt_dtx_indexed = 0; + cont->vc_cmt_dtx_indexed = 0; cont->vc_cmt_dtx_reindex_pos = cont->vc_cont_df->cd_dtx_committed_head; + cont->vc_dtx_reset = 0; + cont->vc_pool->vp_dtx_committed_count -= count; + d_tm_dec_gauge(vos_tls_get(false)->vtl_committed, count); } rc = dbtree_create_inplace_ex(VOS_BTR_DTX_CMT_TABLE, 0, DTX_BTREE_ORDER, &uma, @@ -4048,7 +4030,7 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) goto error; } - rc = vos_tx_begin(dth, umm, pool->vp_sysdb); + rc = vos_tx_begin(dth, umm, pool->vp_sysdb, NULL); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); goto error; @@ -4078,10 +4060,12 @@ vos_dtx_local_end(struct dtx_handle *dth, int result) return result; } +enum { DTS_EPOCH_ACC = 0, DTS_CMT_TIME_ACC, DTS_ACC_COUNT }; + struct dtx_time_stat_priv { struct dtx_time_stat dts_pub; /* DAOS-17322: Use of floating point to avoid integer overflow issue */ - long double dts_mean[2]; + long double dts_mean[DTS_ACC_COUNT]; }; int @@ -4107,8 +4091,8 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat cmt_cnt_tmp = 0; umm = vos_cont2umm(cont); dbd = umem_off2ptr(umm, cont->vc_cont_df->cd_dtx_committed_head); - dts_tmp.dts_pub.dts_epoch[0] = DAOS_EPOCH_MAX; - dts_tmp.dts_pub.dts_cmt_time[0] = UINT64_MAX; + 
dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MIN] = DAOS_EPOCH_MAX; + dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MIN] = UINT64_MAX; while (dbd != NULL) { if (dbd->dbd_magic != DTX_CMT_BLOB_MAGIC) { D_ERROR("Committed DTX blob with bad magic: container=" DF_UUID @@ -4127,17 +4111,25 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat dce_df = &dbd->dbd_committed_data[i]; - if (dts_tmp.dts_pub.dts_epoch[0] > dce_df->dce_epoch) - dts_tmp.dts_pub.dts_epoch[0] = dce_df->dce_epoch; - if (dts_tmp.dts_pub.dts_epoch[1] < dce_df->dce_epoch) - dts_tmp.dts_pub.dts_epoch[1] = dce_df->dce_epoch; - dts_tmp.dts_mean[0] += dce_df->dce_epoch; - - if (dts_tmp.dts_pub.dts_cmt_time[0] > dce_df->dce_cmt_time) - dts_tmp.dts_pub.dts_cmt_time[0] = dce_df->dce_cmt_time; - if (dts_tmp.dts_pub.dts_cmt_time[1] < dce_df->dce_cmt_time) - dts_tmp.dts_pub.dts_cmt_time[1] = dce_df->dce_cmt_time; - dts_tmp.dts_mean[1] += dce_df->dce_cmt_time; + if (dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MIN] > + dce_df->dce_epoch) + dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MIN] = + dce_df->dce_epoch; + if (dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MAX] < + dce_df->dce_epoch) + dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MAX] = + dce_df->dce_epoch; + dts_tmp.dts_mean[DTS_EPOCH_ACC] += dce_df->dce_epoch; + + if (dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MIN] > + dce_df->dce_cmt_time) + dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MIN] = + dce_df->dce_cmt_time; + if (dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MAX] < + dce_df->dce_cmt_time) + dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MAX] = + dce_df->dce_cmt_time; + dts_tmp.dts_mean[DTS_CMT_TIME_ACC] += dce_df->dce_cmt_time; } } @@ -4148,11 +4140,13 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat if (dts != NULL) { if (cmt_cnt_tmp != 0) { - dts_tmp.dts_mean[0] /= (long double)cmt_cnt_tmp; - dts_tmp.dts_pub.dts_epoch[2] = (daos_epoch_t)dts_tmp.dts_mean[0]; + dts_tmp.dts_mean[DTS_EPOCH_ACC] /= (long 
double)cmt_cnt_tmp; + dts_tmp.dts_pub.dts_epoch[DTX_TIME_STAT_MEAN] = + (daos_epoch_t)dts_tmp.dts_mean[DTS_EPOCH_ACC]; - dts_tmp.dts_mean[1] /= (long double)cmt_cnt_tmp; - dts_tmp.dts_pub.dts_cmt_time[2] = (uint64_t)dts_tmp.dts_mean[1]; + dts_tmp.dts_mean[DTS_CMT_TIME_ACC] /= (long double)cmt_cnt_tmp; + dts_tmp.dts_pub.dts_cmt_time[DTX_TIME_STAT_MEAN] = + (uint64_t)dts_tmp.dts_mean[DTS_CMT_TIME_ACC]; } memcpy(dts, &dts_tmp, sizeof(struct dtx_time_stat)); @@ -4163,3 +4157,68 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat out: return rc; } + +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid) +{ + struct dtx_local_oid_record *oid_array; + struct dtx_local_oid_record *record; + struct vos_dtx_act_ent *dae; + daos_unit_oid_t *oids; + int rc = 0; + + if (dth == NULL) + D_GOTO(out, rc = 0); + + if (dth->dth_local) { + if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { + D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, + dth->dth_local_oid_cap << 1); + if (oid_array == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dth->dth_local_oid_array = oid_array; + dth->dth_local_oid_cap <<= 1; + } + + record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; + record->dor_cont = cont; + vos_cont_addref(cont); + record->dor_oid = oid; + dth->dth_local_oid_cnt++; + + D_GOTO(out, rc = 0); + } + + if (daos_is_zero_dti(&dth->dth_xid)) + D_GOTO(out, rc = 0); + + dae = dth->dth_ent; + D_ASSERT(dae != NULL); + + if (dae->dae_oid_cnt == 0) { + if (daos_unit_oid_compare(oid, DAE_OID(dae)) == 0) + dae->dae_oids = &DAE_OID(dae); + else + dae->dae_oids = &dae->dae_oid_inline; + } else if (dae->dae_oid_cnt >= dae->dae_oid_cap) { + D_ALLOC_ARRAY(oids, dae->dae_oid_cnt << 1); + if (oids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(oids, dae->dae_oids, sizeof(*oids) * dae->dae_oid_cnt); + if (dae->dae_oids != &DAE_OID(dae) && dae->dae_oids != &dae->dae_oid_inline) + 
D_FREE(dae->dae_oids); + + dae->dae_oids = oids; + dae->dae_oid_cap = dae->dae_oid_cnt << 1; + } + + dae->dae_oids[dae->dae_oid_cnt++] = oid; + +out: + if (rc != 0) + D_ERROR("Failed to record oid " DF_UOID ": " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); + + return rc; +} diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c index dc76f95297a..8427b7fee6e 100644 --- a/src/vos/vos_gc.c +++ b/src/vos/vos_gc.c @@ -167,6 +167,12 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, int creds = *credits; int rc; + /** + * Since the key's structure does not have a magic value and the ilog root (which has + * a magic value) is already destroyed at this stage there is no way to verify the pointer + * actually points to a valid data. + */ + if (key->kr_bmap & KREC_BF_NO_AKEY && gc->gc_type == GC_DKEY) { /** Special case, this will defer to the free callback * and the tree will be inserted as akey. @@ -1492,14 +1498,31 @@ gc_close_bkt(struct vos_gc_info *gc_info) gc_info->gi_last_pinned = UMEM_DEFAULT_MBKT_ID; } +#define CK_GC_TREE_STR "Garbage collector's tree" + static inline int -gc_open_bkt(struct umem_attr *uma, struct vos_gc_bkt_df *bkt_df, struct vos_gc_info *gc_info) +gc_open_bkt(struct umem_attr *uma, struct vos_gc_bkt_df *bkt_df, struct checker *ck, + struct vos_gc_info *gc_info) { - int rc; + const bool error_on_non_zero_padding = + (IS_CHECKER(ck) ? 
(ck->ck_options.cko_non_zero_padding == CHECKER_EVENT_ERROR) : false); + int rc; + + if (IS_CHECKER(ck)) { + CK_PRINT(ck, CK_GC_TREE_STR "...\n"); + CK_INDENT(ck, rc = dbtree_check_inplace(&bkt_df->gd_bins_root, uma, ck_report, ck, + error_on_non_zero_padding)); + CK_PRINTL_RC(ck, rc, CK_GC_TREE_STR); + if (rc != DER_SUCCESS) { + return rc; + } + } rc = dbtree_open_inplace(&bkt_df->gd_bins_root, uma, &gc_info->gi_bins_btr); - if (rc) + if (rc) { DL_ERROR(rc, "Failed to open GC bin tree."); + } + return rc; } @@ -1509,13 +1532,61 @@ gc_close_pool(struct vos_pool *pool) return gc_close_bkt(&pool->vp_gc_info); } +#define CK_NON_ZERO_PADDING_FMT "non-zero padding[%d] (%#" PRIx64 ")" +#define CK_NON_ZERO_RESERVED_FMT "non-zero reserved space (%#" PRIx64 ")" + +static int +dlck_pd_ext_check(struct vos_pool_ext_df *pd_ext, umem_off_t off, struct checker *ck) +{ + CK_PRINTF(ck, "Pool extension (off=%#lx)... ", off); + + if (pd_ext == NULL) { + CK_APPENDL_OK(ck); + return DER_SUCCESS; + } + + for (int i = 0; i < VOS_POOL_EXT_DF_PADDING_SIZE; ++i) { + if (pd_ext->ped_paddings[i] != 0 || DAOS_FAIL_CHECK(DAOS_FAULT_POOL_EXT_PADDING)) { + if (ck->ck_options.cko_non_zero_padding == CHECKER_EVENT_ERROR) { + CK_APPENDFL_ERR(ck, CK_NON_ZERO_PADDING_FMT, i, + pd_ext->ped_paddings[i]); + return -DER_NOTYPE; + } else { + CK_APPENDFL_WARN(ck, CK_NON_ZERO_PADDING_FMT, i, + pd_ext->ped_paddings[i]); + } + } + } + + if (pd_ext->ped_reserve != 0 || DAOS_FAIL_CHECK(DAOS_FAULT_POOL_EXT_RESERVED)) { + if (ck->ck_options.cko_non_zero_padding == CHECKER_EVENT_ERROR) { + CK_APPENDFL_ERR(ck, CK_NON_ZERO_RESERVED_FMT, pd_ext->ped_reserve); + return -DER_NOTYPE; + } else { + CK_APPENDFL_WARN(ck, CK_NON_ZERO_RESERVED_FMT, pd_ext->ped_reserve); + } + } + + CK_APPENDL_OK(ck); + + return DER_SUCCESS; +} + int -gc_open_pool(struct vos_pool *pool) +gc_open_pool(struct vos_pool *pool, struct checker *ck) { - struct vos_pool_ext_df *pd_ext = umem_off2ptr(&pool->vp_umm, pool->vp_pool_df->pd_ext); + struct 
vos_pool_ext_df *pd_ext = umem_off2ptr(&pool->vp_umm, pool->vp_pool_df->pd_ext); + int rc; + + if (IS_CHECKER(ck)) { + rc = dlck_pd_ext_check(pd_ext, pool->vp_pool_df->pd_ext, ck); + if (rc != DER_SUCCESS) { + return rc; + } + } if (pd_ext != NULL) - return gc_open_bkt(&pool->vp_uma, &pd_ext->ped_gc_bkt, &pool->vp_gc_info); + return gc_open_bkt(&pool->vp_uma, &pd_ext->ped_gc_bkt, ck, &pool->vp_gc_info); return 0; } @@ -1533,7 +1604,7 @@ gc_open_cont(struct vos_container *cont) struct vos_cont_ext_df *cd_ext = umem_off2ptr(&pool->vp_umm, cont->vc_cont_df->cd_ext); if (cd_ext != NULL) - return gc_open_bkt(&pool->vp_uma, &cd_ext->ced_gc_bkt, &cont->vc_gc_info); + return gc_open_bkt(&pool->vp_uma, &cd_ext->ced_gc_bkt, NULL, &cont->vc_gc_info); return 0; } diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 8c4a3e800be..f7a00dd0918 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -263,7 +263,8 @@ struct vos_cache_metrics { struct d_tm_node_t *vcm_obj_hit; }; -void vos_cache_metrics_init(struct vos_cache_metrics *vc_metrcis, const char *path, int tgt_id); +void +vos_cache_metrics_init(struct vos_cache_metrics *vc_metrics, const char *path, int tgt_id); struct vos_pool_metrics { void *vp_vea_metrics; @@ -315,9 +316,7 @@ struct vos_pool { /** memory attribute of the @vp_umm */ struct umem_attr vp_uma; /** memory class instance of the pool */ - struct umem_instance vp_umm; - /** Size of pool file */ - uint64_t vp_size; + struct umem_instance vp_umm; /** Features enabled for this pool */ uint64_t vp_feats; /** btr handle for the container table */ @@ -439,9 +438,7 @@ struct vos_container { /* GC runtime for container */ struct vos_gc_info vc_gc_info; /* Various 
flags */ - unsigned int vc_in_aggregation:1, - vc_in_discard:1, - vc_cmt_dtx_indexed:1; + uint32_t vc_in_aggregation : 1, vc_in_discard : 1, vc_cmt_dtx_indexed : 1, vc_dtx_reset : 1; unsigned int vc_obj_discard_count; unsigned int vc_open_count; /* The latest pool map version that DTX resync has been done. */ @@ -470,10 +467,6 @@ struct vos_dtx_act_ent { * then 'dae_oids' points to the 'dae_oid_inline'. * * Otherwise, 'dae_oids' points to new buffer to hold more. - * - * These information is used for EC aggregation optimization. - * If server restarts, then we will lose the optimization but - * it is not fatal. */ daos_unit_oid_t *dae_oids; /* The time (hlc) when the DTX entry is created. */ @@ -485,6 +478,9 @@ struct vos_dtx_act_ent { /* Back pointer to the DTX handle. */ struct dtx_handle *dae_dth; + /* The capacity of dae_oids if it points to new allocated area. */ + uint32_t dae_oid_cap; + unsigned int dae_committable:1, dae_committing:1, dae_committed:1, @@ -517,11 +513,8 @@ struct vos_dtx_act_ent { #define DAE_MBS_OFF(dae) ((dae)->dae_base.dae_mbs_off) struct vos_dtx_cmt_ent { - struct vos_dtx_cmt_ent_df dce_base; - - uint32_t dce_reindex:1, - dce_exist:1, - dce_invalid:1; + struct vos_dtx_cmt_ent_df dce_base; + uint32_t dce_invalid : 1; }; #define DCE_XID(dce) ((dce)->dce_base.dce_xid) @@ -858,6 +851,9 @@ vos_dtx_post_handle(struct vos_container *cont, int vos_dtx_act_reindex(struct vos_container *cont); +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid); + enum vos_tree_class { /** the first reserved tree class */ VOS_BTR_BEGIN = DBTREE_VOS_BEGIN, @@ -1339,7 +1335,8 @@ vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, struct vos_object *obj); int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj); /** Finish the transaction and publish or 
cancel the reservations or * return if err == 0 and it's a multi-modification transaction that @@ -1458,7 +1455,7 @@ vos_gc_pool_tight(daos_handle_t poh, int *credits); void gc_reserve_space(struct vos_pool *pool, daos_size_t *rsrvd); int -gc_open_pool(struct vos_pool *pool); +gc_open_pool(struct vos_pool *pool, struct checker *ck); void gc_close_pool(struct vos_pool *pool); int @@ -1931,20 +1928,6 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v return false; } -/** - * Insert object ID and its parent container into the array of objects touched by the ongoing - * local transaction. - * - * \param[in] dth DTX handle for ongoing local transaction - * \param[in] cont VOS container - * \param[in] oid Object ID - * - * \return 0 : Success. - * -DER_NOMEM : Run out of the volatile memory. - */ -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); - static inline bool vos_pool_is_p2(struct vos_pool *pool) { diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index cebf9181aaa..4d105b91412 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2552,32 +2552,6 @@ update_cancel(struct vos_io_context *ioc) true /* abort */); } -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid) -{ - struct dtx_local_oid_record *oid_array = NULL; - struct dtx_local_oid_record *record = NULL; - - /** The array has to grow to accommodate the next record. 
*/ - if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { - D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, - dth->dth_local_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - dth->dth_local_oid_array = oid_array; - dth->dth_local_oid_cap <<= 1; - } - - record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; - record->dor_cont = cont; - vos_cont_addref(cont); - record->dor_oid = *oid; - dth->dth_local_oid_cnt++; - - return 0; -} - int vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, daos_size_t *size, struct dtx_handle *dth) @@ -2598,6 +2572,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; + if (ioc->ic_pinned_obj != NULL && unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted during update, need to restart TX.\n", + DP_UOID(ioc->ic_oid)); + + D_GOTO(abort, err = -DER_TX_RESTART); + } + err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(err == 0); @@ -2606,7 +2587,10 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb); + if (ioc->ic_pinned_obj != NULL) + D_ASSERT(ioc->ic_pinned_obj == ioc->ic_obj); + + err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb, ioc->ic_obj); if (err != 0) goto abort; @@ -2663,9 +2647,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, goto abort; } - if (dtx_is_valid_handle(dth) && dth->dth_local) { - err = vos_insert_oid(dth, ioc->ic_cont, &ioc->ic_oid); - } + err = vos_dtx_record_oid(dth, ioc->ic_cont, ioc->ic_oid); abort: if (err == -DER_NONEXIST || err == -DER_EXIST || @@ -2727,7 +2709,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, *size = ioc->ic_io_size; D_FREE(daes); D_FREE(dces); - vos_ioc_destroy(ioc, err != 0); + 
vos_ioc_destroy(ioc, err != 0 && tx_started); return err; } diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index b13790fe74f..19335f3df6e 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -116,6 +116,8 @@ struct vos_gc_bkt_df { /** 2.8 features */ #define VOS_POOL_FEAT_2_8 (VOS_POOL_FEAT_GANG_SV) +#define VOS_POOL_EXT_DF_PADDING_SIZE 53 + /* VOS pool durable format extension */ struct vos_pool_ext_df { /* Extension for GC bucket */ @@ -123,7 +125,7 @@ struct vos_pool_ext_df { /* Memory file size for md-on-ssd phase2 pool */ uint64_t ped_mem_sz; /* Paddings for other potential new feature */ - uint64_t ped_paddings[53]; + uint64_t ped_paddings[VOS_POOL_EXT_DF_PADDING_SIZE]; /* Reserved for future extension */ uint64_t ped_reserve; }; @@ -270,19 +272,27 @@ enum vos_io_stream { VOS_IOS_CNT }; +enum vos_cont_ext_bits { + VCEB_CSUM = (1 << 0), +}; + /* VOS container durable format extension */ struct vos_cont_ext_df { /* GC bucket extension */ - struct vos_gc_bkt_df ced_gc_bkt; + struct vos_gc_bkt_df ced_gc_bkt; + uint32_t ced_valid_bits; + uint32_t ced_padding0; /* * Any modification involved in current target (container shard) under the global * stable epoch have already been persistently stored globally. */ - uint64_t ced_global_stable_epoch; + uint64_t ced_global_stable_epoch; + uint32_t ced_csum_type; + uint32_t ced_chunksize; /* Reserved for potential new features */ - uint64_t ced_paddings[37]; + uint64_t ced_padding1[35]; /* Reserved for future extension */ - uint64_t ced_reserve; + uint64_t ced_reserve; }; /* VOS Container Value */ diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 0015d91d916..117cf8baaab 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -494,7 +494,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb); + rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb, obj); if (rc != 0) goto reset; @@ -572,11 +572,9 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (rc == 0) { - vos_ts_set_wupdate(ts_set, epr.epr_hi); - - if (dtx_is_valid_handle(dth) && dth->dth_local) { - rc = vos_insert_oid(dth, cont, &oid); - } + rc = vos_dtx_record_oid(dth, cont, oid); + if (rc == 0) + vos_ts_set_wupdate(ts_set, epr.epr_hi); } rc = vos_tx_end(cont, dth, NULL, NULL, tx_started, NULL, rc); @@ -592,7 +590,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (obj != NULL) - vos_obj_release(obj, 0, rc != 0); + vos_obj_release(obj, 0, rc != 0 && tx_started); D_FREE(daes); D_FREE(dces); @@ -816,7 +814,8 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, daos_handle_t toh = DAOS_HDL_INVAL; int rc = 0; int i; - bool dirty = false; + bool dirty = false; + bool tx_started = false; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -842,6 +841,7 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, } } +restart: rc = vos_obj_hold(cont, oid, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, &obj, NULL); if (rc != 0) @@ -851,6 +851,16 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, if (rc != 0) goto log; + if (unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted, needs to restart TX.\n", DP_UOID(oid)); + umem_tx_end(umm, -DER_TX_RESTART); + vos_obj_release(obj, 0, false); + obj = NULL; + goto restart; + } + + 
tx_started = true; + rc = vos_obj_incarnate(obj, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, NULL); if (rc != 0) @@ -906,12 +916,14 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, ", dkey (empty), akey_nr %u, epoch " DF_X64 ", pm_ver %u", DP_UOID(oid), akey_nr, epoch, pm_ver); + if (rc == -DER_ALREADY) + rc = 0; if (daos_handle_is_valid(toh)) dbtree_close(toh); if (obj != NULL) - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, rc != 0 && tx_started); - return rc == -DER_ALREADY ? 0 : rc; + return rc; } static int diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index f572ebb03d9..be67bd27ac6 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -123,6 +124,12 @@ void vos_obj_evict(struct vos_object *obj); int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); +static inline bool +vos_obj_is_evicted(struct vos_object *obj) +{ + return daos_lru_is_evicted(&obj->obj_llink); +} + /** * Create an object cache. * diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index ba1898e0f25..a94450cfd0d 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -849,7 +849,8 @@ vos_bkt_array_subset(struct vos_bkt_array *super, struct vos_bkt_array *sub) return false; for (i = 0; i < sub->vba_cnt; i++) { - idx = daos_array_find(super, super->vba_cnt, sub->vba_bkts[i], &bkt_sort_ops); + idx = daos_array_find(super->vba_bkts, super->vba_cnt, sub->vba_bkts[i], + &bkt_sort_ops); if (idx < 0) return false; } diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index ad558c62fd8..a6e06d6e026 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2025 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -996,7 +996,8 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, umem_create: D_DEBUG(DB_MGMT, "umempobj_create sz: " DF_U64 " store_sz: " DF_U64, scm_sz, store.stor_size); - pop = umempobj_create(path, layout, UMEMPOBJ_ENABLE_STATS, scm_sz, 0600, &store); + pop = umempobj_create(path, layout, UMEMPOBJ_ENABLE_STATS, scm_sz, UMEM_FILE_MODE_DEFAULT, + &store); if (pop != NULL) { *ph = pop; return 0; @@ -1018,9 +1019,11 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, return rc; } +#define BIO_META_CLOSE_FAIL_STR "Failed to close BIO meta context" + static int vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned int flags, - void *metrics, struct umem_pool **ph) + void *metrics, struct checker *ck, struct umem_pool **ph) { struct bio_xs_context *xs_ctxt = vos_xsctxt_get(); struct umem_store store = { 0 }; @@ -1047,6 +1050,7 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned xs_ctxt, DP_UUID(pool_id)); rc = bio_mc_open(xs_ctxt, pool_id, 
mc_flags, &mc); + CK_PRINTL_RC(ck, rc, "Open BIO meta context"); if (rc) { D_ERROR("Failed to open BIO meta context for xs:%p pool:"DF_UUID", "DF_RC"\n", xs_ctxt, DP_UUID(pool_id), DP_RC(rc)); @@ -1058,17 +1062,21 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned umem_open: pop = umempobj_open(path, layout, UMEMPOBJ_ENABLE_STATS, &store); + rc = (pop == NULL) ? daos_errno2der(errno) : DER_SUCCESS; + CK_PRINTL_RC(ck, rc, "Open the pool"); if (pop != NULL) { *ph = pop; return 0; } - rc = daos_errno2der(errno); + D_ASSERT(rc != 0); if (store.stor_priv != NULL) { ret = bio_mc_close(store.stor_priv); - if (ret) - D_ERROR("Failed to close BIO meta context. "DF_RC"\n", DP_RC(ret)); + if (ret) { + CK_PRINTL_RC(ck, ret, BIO_META_CLOSE_FAIL_STR); + D_ERROR(BIO_META_CLOSE_FAIL_STR ". " DF_RC "\n", DP_RC(ret)); + } } return rc; @@ -1154,16 +1162,6 @@ pool_hop_free(struct d_ulink *hlink) if (daos_handle_is_valid(pool->vp_cont_th)) dbtree_close(pool->vp_cont_th); - if (pool->vp_size != 0) { - rc = munlock((void *)pool->vp_umm.umm_base, pool->vp_size); - if (rc != 0) - D_WARN("Failed to unlock pool memory at "DF_X64": errno=%d (%s)\n", - pool->vp_umm.umm_base, errno, strerror(errno)); - else - D_DEBUG(DB_MGMT, "Unlocked VOS pool memory: "DF_U64" bytes at "DF_X64"\n", - pool->vp_size, pool->vp_umm.umm_base); - } - if (pool->vp_uma.uma_pool) vos_pmemobj_close(pool->vp_uma.uma_pool); @@ -1317,7 +1315,7 @@ pool_open_prep(uuid_t uuid, unsigned int flags, struct vos_pool **p_pool); static int pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned int flags, - void *metrics, struct vos_pool *pool, int ret); + void *metrics, struct vos_pool *pool, struct checker *ck, int ret); int vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t nvme_sz, @@ -1488,7 +1486,7 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ post: if (rc == 0 && poh != NULL) { - rc = pool_open_post(&ph, 
pool_df, flags, NULL, pool, rc); + rc = pool_open_post(&ph, pool_df, flags, NULL, pool, NULL, rc); if (rc == 0) *poh = vos_pool2hdl(pool); } else { @@ -1600,65 +1598,6 @@ vos_pool_destroy(const char *path, uuid_t uuid) return vos_pool_destroy_ex(path, uuid, 0); } -enum { - /** Memory locking flag not initialized */ - LM_FLAG_UNINIT, - /** Memory locking disabled */ - LM_FLAG_DISABLED, - /** Memory locking enabled */ - LM_FLAG_ENABLED -}; - -static void -lock_pool_memory(struct vos_pool *pool) -{ - static int lock_mem = LM_FLAG_UNINIT; - struct rlimit rlim; - size_t lock_bytes; - int rc; - - if (lock_mem == LM_FLAG_UNINIT) { - rc = getrlimit(RLIMIT_MEMLOCK, &rlim); - if (rc != 0) { - D_WARN("getrlimit() failed; errno=%d (%s)\n", errno, strerror(errno)); - lock_mem = LM_FLAG_DISABLED; - return; - } - - if (rlim.rlim_cur != RLIM_INFINITY || rlim.rlim_max != RLIM_INFINITY) { - D_WARN("Infinite rlimit not detected, not locking VOS pool memory\n"); - lock_mem = LM_FLAG_DISABLED; - return; - } - - lock_mem = LM_FLAG_ENABLED; - } - - if (lock_mem == LM_FLAG_DISABLED) - return; - - /* - * Mlock may take several tens of seconds to complete when memory - * is tight, so mlock is skipped in current MD-on-SSD scenario. 
- */ - if (bio_nvme_configured(SMD_DEV_TYPE_META)) - return; - - lock_bytes = pool->vp_pool_df->pd_scm_sz; - rc = mlock((void *)pool->vp_umm.umm_base, lock_bytes); - if (rc != 0) { - D_WARN("Could not lock memory for VOS pool "DF_U64" bytes at "DF_X64 - "; errno=%d (%s)\n", lock_bytes, pool->vp_umm.umm_base, - errno, strerror(errno)); - return; - } - - /* Only save the size if the locking was successful */ - pool->vp_size = lock_bytes; - D_DEBUG(DB_MGMT, "Locking VOS pool in memory "DF_U64" bytes at "DF_X64"\n", pool->vp_size, - pool->vp_umm.umm_base); -} - static int pool_open_prep(uuid_t uuid, unsigned int flags, struct vos_pool **p_pool) { @@ -1701,11 +1640,15 @@ pool_open_prep(uuid_t uuid, unsigned int flags, struct vos_pool **p_pool) return rc; } +#define CK_CONT_TREE_STR "Containers tree" + static int pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned int flags, - void *metrics, struct vos_pool *pool, int ret) + void *metrics, struct vos_pool *pool, struct checker *ck, int ret) { struct umem_attr *uma; + const bool error_on_non_zero_padding = + (IS_CHECKER(ck) ? 
(ck->ck_options.cko_non_zero_padding == CHECKER_EVENT_ERROR) : false); daos_handle_t poh; int rc; @@ -1741,6 +1684,16 @@ pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned in goto out; } + if (IS_CHECKER(ck)) { + CK_PRINT(ck, CK_CONT_TREE_STR "...\n"); + CK_INDENT(ck, rc = dbtree_check_inplace(&pool_df->pd_cont_root, &pool->vp_uma, + ck_report, ck, error_on_non_zero_padding)); + CK_PRINTL_RC(ck, rc, CK_CONT_TREE_STR); + if (rc != DER_SUCCESS) { + goto out; + } + } + /* Cache container table btree hdl */ rc = dbtree_open_inplace_ex(&pool_df->pd_cont_root, &pool->vp_uma, DAOS_HDL_INVAL, pool, &pool->vp_cont_th); @@ -1781,7 +1734,7 @@ pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned in if (rc) goto out; - rc = gc_open_pool(pool); + rc = gc_open_pool(pool, ck); if (rc) goto out; @@ -1794,7 +1747,6 @@ pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned in vos_space_sys_init(pool); /* Ensure GC is triggered after server restart */ gc_add_pool(pool); - lock_pool_memory(pool); out: DL_CDEBUG(rc != 0, DLOG_ERR, DB_MGMT, rc, @@ -1811,7 +1763,7 @@ pool_open_post(struct umem_pool **p_ph, struct vos_pool_df *pool_df, unsigned in int vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *metrics, - daos_handle_t *poh) + struct checker *ck, daos_handle_t *poh) { struct vos_pool_df *pool_df = NULL; struct vos_pool *pool = NULL; @@ -1830,6 +1782,10 @@ vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *m return -DER_NOTSUPPORTED; } + /** header with parameters */ + CK_PRINTF(ck, "Check pool:\n\tpath: %s\n\tuuid: " DF_UUIDF "\n", path, DP_UUID(uuid)); + checker_print_indent_inc(ck); + D_DEBUG(DB_MGMT, "Pool Path: %s, UUID: "DF_UUID"\n", path, DP_UUID(uuid)); @@ -1840,7 +1796,8 @@ vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *m rc = pool_lookup(&ukey, &pool, true); if (rc == 0) { - D_ASSERT(pool != NULL); + CK_ASSERT(ck, 
"Pool is not NULL... ", pool != NULL); + CK_PRINT(ck, "Pool is already opened.\n"); D_DEBUG(DB_MGMT, "Found already opened(%d) pool : %p\n", pool->vp_opened, pool); if (pool->vp_dying) { @@ -1863,12 +1820,13 @@ vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *m return rc; rc = bio_xsctxt_health_check(vos_xsctxt_get(), false, false); + CK_PRINTL_RC(ck, rc, "NVMe devices (if applicable)"); if (rc) { DL_WARN(rc, DF_UUID": Skip pool open due to faulty NVMe.", DP_UUID(uuid)); goto out; } - rc = vos_pmemobj_open(path, uuid, VOS_POOL_LAYOUT, flags, metrics, &ph); + rc = vos_pmemobj_open(path, uuid, VOS_POOL_LAYOUT, flags, metrics, ck, &ph); if (rc) { D_ERROR("Error in opening the pool "DF_UUID". "DF_RC"\n", DP_UUID(uuid), DP_RC(rc)); @@ -1876,14 +1834,19 @@ vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *m } pool_df = vos_pool_pop2df(ph); - if (pool_df->pd_magic != POOL_DF_MAGIC) { + CK_PRINT(ck, "Magic... "); + if (pool_df->pd_magic != POOL_DF_MAGIC || DAOS_FAIL_CHECK(DAOS_FAULT_POOL_OPEN_MAGIC)) { + CK_APPENDFL_ERR(ck, "invalid (%#x)", pool_df->pd_magic); D_CRIT("Unknown DF magic %x\n", pool_df->pd_magic); rc = -DER_DF_INVAL; goto out; } + CK_APPENDL_OK(ck); - if (pool_df->pd_version > POOL_DF_VERSION || - pool_df->pd_version < POOL_DF_VER_1) { + CK_PRINT(ck, "Version... 
"); + if (pool_df->pd_version > POOL_DF_VERSION || pool_df->pd_version < POOL_DF_VER_1 || + DAOS_FAIL_CHECK(DAOS_FAULT_POOL_OPEN_VERSION)) { + CK_APPENDFL_ERR(ck, "unsupported (%#x)", pool_df->pd_version); D_ERROR("Unsupported DF version %x\n", pool_df->pd_version); /** Send a RAS notification */ vos_report_layout_incompat("VOS pool", pool_df->pd_version, @@ -1892,31 +1855,41 @@ vos_pool_open_metrics(const char *path, uuid_t uuid, unsigned int flags, void *m rc = -DER_DF_INCOMPT; goto out; } + CK_APPENDL_OK(ck); - if (uuid_compare(uuid, pool_df->pd_id)) { - D_ERROR("Mismatch uuid, user="DF_UUIDF", pool="DF_UUIDF"\n", - DP_UUID(uuid), DP_UUID(pool_df->pd_id)); + CK_PRINT(ck, "UUID... "); + if (uuid_compare(uuid, pool_df->pd_id) || DAOS_FAIL_CHECK(DAOS_FAULT_POOL_OPEN_UUID)) { + CK_APPENDFL_ERR(ck, "mismatch (requested=" DF_UUIDF ", received=" DF_UUIDF ")", + DP_UUID(uuid), DP_UUID(pool_df->pd_id)); + D_ERROR("Mismatch uuid, user=" DF_UUIDF ", pool=" DF_UUIDF "\n", DP_UUID(uuid), + DP_UUID(pool_df->pd_id)); rc = -DER_ID_MISMATCH; goto out; } + CK_APPENDL_OK(ck); out: - rc = pool_open_post(&ph, pool_df, flags, metrics, pool, rc); - if (rc == 0) + rc = pool_open_post(&ph, pool_df, flags, metrics, pool, ck, rc); + if (rc == 0) { *poh = vos_pool2hdl(pool); + checker_print_indent_dec(ck); + CK_PRINTL_RC(ck, rc, "Check pool"); + } + /* Close this local handle, if it hasn't been consumed nor already * been closed by pool_open upon error. 
*/ if (ph != NULL) vos_pmemobj_close(ph); + return rc; } int vos_pool_open(const char *path, uuid_t uuid, unsigned int flags, daos_handle_t *poh) { - return vos_pool_open_metrics(path, uuid, flags, NULL, poh); + return vos_pool_open_metrics(path, uuid, flags, NULL, NULL, poh); } int diff --git a/src/vos/vos_pool_scrub.c b/src/vos/vos_pool_scrub.c index a6cb3ffb510..bd7b4091a4f 100644 --- a/src/vos/vos_pool_scrub.c +++ b/src/vos/vos_pool_scrub.c @@ -878,7 +878,7 @@ cont_iter_is_loaded_cb(daos_handle_t ih, vos_iter_entry_t *entry, * initialized if csums are enabled */ if (!args->args_found_unloaded_container) - args->args_found_unloaded_container = !args->args_ctx->sc_cont.scs_props_fetched; + args->args_found_unloaded_container = !args->args_ctx->sc_cont.scs_csummer_inited; sc_cont_teardown(ctx); return 0; diff --git a/third_party_programs.txt b/third_party_programs.txt index a98b70907d6..3fd62a5416a 100644 --- a/third_party_programs.txt +++ b/third_party_programs.txt @@ -117,9 +117,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Copyright (c) 2009,2014 Google Inc. All rights reserved. 
pmdk (BSD 3-clause "New" or "Revised" License) - https://github.com/pmem/pmdk - https://github.com/pmem/pmdk/blob/master/LICENSE - Copyright 2014-2020, Intel Corporation + https://github.com/daos-stack/pmdk + https://github.com/daos-stack/pmdk/blob/master/LICENSE + Copyright 2014-2024, Intel Corporation Portable Hardware Locality (hwloc) (BSD 3-clause "New" or "Revised" License) https://www.open-mpi.org/projects/hwloc/ diff --git a/utils/build.config b/utils/build.config index d565444cf64..76f5c0f6fcb 100644 --- a/utils/build.config +++ b/utils/build.config @@ -4,12 +4,12 @@ component=daos [commit_versions] argobots=v1.2 fused=v1.0.0 -pmdk=stable-2.1.0-daos +pmdk=2.1.3 isal=v2.31.1 -isal_crypto=v2.24.0 -spdk=v22.01.2 +isal_crypto=v2.25.0 +spdk=v24.09 ofi=v1.22.0 -mercury=v2.4.0 +mercury=v2.4.1 protobufc=v1.3.3 ucx=v1.14.1 @@ -26,6 +26,6 @@ protobufc=https://github.com/protobuf-c/protobuf-c.git ucx=https://github.com/openucx/ucx.git [patch_versions] -spdk=0001_b0aba3fcd5aceceea530a702922153bc75664978.diff,0002_445a4c808badbad3942696ecf16fa60e8129a747.diff -mercury=0001_na_ucx.patch,0002_na_ucx_ep_flush.patch,0003_combined_plugin_path.patch +spdk=0001_3428322b812fe31cc3e1d0308a7f5bd4b06b9886.diff,0002_spdk_rwf_nowait.patch,0003_external_isal.patch +mercury=0001_dep_versions.patch,0002_ofi_counters.patch,0003_ofi_auth_key.patch argobots=0001_411e5b344642ebc82190fd8b125db512e5b449d1.diff,0002_bb0c908abfac4bfe37852eee621930634183c6aa.diff diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index 16c2760bfc1..b712a600f56 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -56,6 +56,20 @@ #port: 10001 # # +## Network interface for the control plane listener +# +## Bind the control plane to a specific network interface. When set, the server +## will use the first IPv4 address on this interface for binding the control +## plane listener and for reporting its address to other system components. 
+## This is useful in environments with multiple network interfaces where you +## want to restrict control plane traffic to a specific network. +# +## If not set, the default behavior is to bind to 0.0.0.0 (all interfaces). +# +## default: (not set - binds to all interfaces) +#control_iface: eth0 +# +# ## Transport credentials specifying certificates to secure communications # #transport_config: @@ -117,7 +131,7 @@ # ## CART: Fabric authorization key ## If the fabric requires an authorization key, set it here to -## be used on the server and clients. +## be used on the server. # #fabric_auth_key: foo:bar # @@ -198,16 +212,16 @@ #socket_dir: ./.daos/daos_server # # -## Number of hugepages to allocate for DMA buffer memory +## Number of hugepages to allocate for DMA buffer memory (total value for all engines) # -## Optional parameter that should only be set if overriding the automatically calculated value is # -## #necessary. Specifies the number (not size) of hugepages to allocate for use by NVMe through -## #SPDK. For optimum performance each target requires 1 GiB of hugepage space. The provided value +## Optional parameter that should only be set if overriding the automatically calculated value is +## necessary. Specifies the number (not size) of hugepages to allocate for use by NVMe through +## SPDK. For optimum performance each target requires 1 GiB of hugepage space. The provided value ## should be calculated by dividing the total amount of hugepages memory required for all targets ## across all engines on a host by the system hugepage size. If not set here, the value will be ## automatically calculated based on the number of targets (using the default system hugepage size). 
# -## Example: (2 engines * (16 targets/engine * 1GiB)) / 2MiB hugepage size = 16834 +## Example: (2 engines * (16 targets/engine * 1GiB)) / 2MiB hugepage size = 16384 # ## default: 0 #nr_hugepages: 0 @@ -228,6 +242,16 @@ #allow_numa_imbalance: true # # +## Allow DAOS server to run with transparent hugepages (THP) enabled on the host machine. +# +## WARNING: Transparent hugepages can conflict with how the DAOS server uses hugepages, and enabling +## them may cause intermittent errors. Unless transparent hugepages are required by another +## application on the machine, it is strongly recommended to leave this setting disabled. +# +## default: false +#allow_thp: true +# +# ## Reserve an amount of RAM for system use when calculating the size of RAM-disks that will be ## created for DAOS I/O engines. Units are in GiB and represents the total RAM that will be ## reserved when calculating RAM-disk sizes for all engines. @@ -240,7 +264,7 @@ ## minimum of 4gib. Increasing the value may help avoid the potential of OOM killer terminating ## engine processes but could also result in stopping DAOS from using available memory resources. # -## default: 26 +## default: 64 #system_ram_reserved: 5 # # @@ -367,7 +391,10 @@ # # Force specific debug mask for the engine at start up time. # # By default, just use the default debug mask used by DAOS. # # Mask specifies minimum level of message significance to pass to logger. -# +# # Currently supported values: +# # DEBUG, DBUG (alias for DEBUG), INFO, NOTE, WARN, ERROR, ERR (alias for ERROR), CRIT, ALRT, +# # FATAL, EMRG, EMIT +# # # # default: ERR # log_mask: INFO # @@ -406,10 +433,10 @@ # # #scm_size: 0 # -# # When class is set to ram, tmpfs will be mounted with hugepage -# # support, if the kernel supports it. If this is not desirable, -# # the behavior may be disabled here. 
-# scm_hugepages_disabled: true +# # When class is set to ram, tmpfs will only be mounted with hugepage +# # support if the kernel supports it and this flag is explicitly set to false. +# # default: true +# scm_hugepages_disabled: false # # - # # Backend block device type. Force a SPDK driver to be used by this engine @@ -465,6 +492,10 @@ # max_io_errs: 100 # max_csum_errs: 200 # +# # Set SPDK iobuf tunable values. Defaults if unset are 8192 for small and 1024 for large. +# spdk_iobuf: +# small_pool_count: 16384 +# large_pool_count: 2048 # #- # # Number of I/O service threads (and network endpoints) per engine. @@ -531,7 +562,10 @@ # # Force specific debug mask for the engine at start up time. # # By default, just use the default debug mask used by DAOS. # # Mask specifies minimum level of message significance to pass to logger. -# +# # Currently supported values: +# # DEBUG, DBUG (alias for DEBUG), INFO, NOTE, WARN, ERROR, ERR (alias for ERROR), CRIT, ALRT, +# # FATAL, EMRG, EMIT +# # # # default: ERR # log_mask: INFO # @@ -574,6 +608,11 @@ # #class: dcpm # #scm_list: [/dev/pmem1] # +# # When class is set to ram, tmpfs will only be mounted with hugepage +# # support if the kernel supports it and this flag is explicitly set to false. +# # default: true +# scm_hugepages_disabled: false +# # - # # Backend block device type. Force a SPDK driver to be used by this engine # # instance. diff --git a/utils/cq/d_logging_check.py b/utils/cq/d_logging_check.py index 328f0155469..d3d2b3d84ce 100755 --- a/utils/cq/d_logging_check.py +++ b/utils/cq/d_logging_check.py @@ -18,7 +18,7 @@ import re import sys -ARGS = None +ARGS = None # pylint: disable=invalid-name class FileLine(): diff --git a/utils/cq/requirements.txt b/utils/cq/requirements.txt index b3fb1ab2f2f..ef9340fcb73 100644 --- a/utils/cq/requirements.txt +++ b/utils/cq/requirements.txt @@ -3,9 +3,9 @@ pyenchant ## flake8 6 removed --diff option which breaks flake precommit hook. 
## https://github.com/pycqa/flake8/issues/1389 https://github.com/PyCQA/flake8/pull/1720 flake8==7.3.0 -isort==6.1.0 -pylint==3.3.9 -yamllint==1.37.1 +isort==8.0.1 +pylint==4.0.5 +yamllint==1.38.0 codespell==2.4.1 # Used by ci/jira_query.py which pip installs it standalone. jira diff --git a/utils/docker/Dockerfile.el.8 b/utils/docker/Dockerfile.el.8 index 9108b8aab7d..bf80f3fc122 100644 --- a/utils/docker/Dockerfile.el.8 +++ b/utils/docker/Dockerfile.el.8 @@ -22,6 +22,16 @@ ARG REPO_FILE_URL ARG JENKINS_URL ARG REPOS ARG DAOS_LAB_CA_FILE_URL + +# Accept DAOS_NO_PROXY at build time +ARG DAOS_NO_PROXY +# Propagate into the build environment +ENV no_proxy=${DAOS_NO_PROXY} +ENV NO_PROXY=${DAOS_NO_PROXY} +# Persist into /etc/environment for use by shells and services +RUN echo "no_proxy=${DAOS_NO_PROXY}" >> /etc/environment && \ + echo "NO_PROXY=${DAOS_NO_PROXY}" >> /etc/environment + # script to install OS updates basic tools and daos dependencies COPY ./utils/scripts/install-el8.sh /tmp/install.sh # script to setup local repo if available @@ -37,9 +47,7 @@ FROM basic # with a local repository, yet needing a proxy to reach outside repositories. # This needs to be moved to a shell script like above in the future to # properly only remove the proxy variables only when they need to be removed -RUN if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ - dnf upgrade && \ +RUN dnf upgrade && \ /tmp/install.sh && \ dnf clean all && \ rm -f /tmp/install.sh @@ -59,8 +67,6 @@ RUN mkdir -p /opt/daos /mnt/daos /var/run/daos_server /var/run/daos_agent /home/ chown -R daos_server.daos_server /opt/daos /mnt/daos /var/run/daos_server /home/daos && \ chown daos_agent.daos_agent /var/run/daos_agent -ARG JENKINS_URL - USER daos_server:daos_server # Setup a python venv so that python packages can be installed locally. @@ -96,8 +102,6 @@ ARG DAOS_PACKAGES_BUILD=yes # ensure that latest dependencies are used. 
USER root:root RUN [ "$DAOS_DEPS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ dnf upgrade --exclude=spdk,spdk-devel,dpdk-devel,dpdk,mercury-devel,mercury && \ dnf clean all; \ } @@ -106,8 +110,6 @@ USER daos_server:daos_server ARG DEPS_JOBS=1 RUN [ "$DAOS_DEPS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ scons --build-deps=only --jobs $DEPS_JOBS PREFIX=/opt/daos \ TARGET_TYPE=$DAOS_TARGET_TYPE && \ ([ "$DAOS_KEEP_BUILD" != "no" ] || /bin/rm -rf build *.gz); \ @@ -118,8 +120,7 @@ COPY --chown=daos_server:daos_server utils/sl utils/sl # Build third party RPMs RUN [ "$DAOS_PACKAGES_BUILD" != "yes" ] || [ "$DAOS_DEPS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ + export DISTRO="el8" && \ utils/rpms/build_packages.sh deps && \ mkdir -p /home/daos/rpms && \ mv *.rpm /home/daos/rpms; \ @@ -129,10 +130,7 @@ USER root:root # force an upgrade to get any newly built RPMs, but only if CB1 is set. 
ARG CB1 RUN [ -z "$CB1" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ - dnf upgrade \ - --exclude=spdk,spdk-devel,dpdk-devel,dpdk,mercury-devel,mercury && \ + dnf upgrade --exclude=spdk,spdk-devel,dpdk-devel,dpdk,mercury-devel,mercury && \ dnf clean all; \ } USER daos_server:daos_server @@ -158,8 +156,6 @@ ARG DAOS_BUILD=$DAOS_DEPS_BUILD # Build DAOS RUN [ "$DAOS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ scons --jobs $JOBS install PREFIX=/opt/daos COMPILER=$COMPILER \ FIRMWARE_MGMT=1 BUILD_TYPE=$DAOS_BUILD_TYPE TARGET_TYPE=$DAOS_TARGET_TYPE && \ ([ "$DAOS_KEEP_BUILD" != "no" ] || /bin/rm -rf build) && \ @@ -171,8 +167,6 @@ COPY --chown=daos_server:daos_server utils utils # Build DAOS RPMs RUN [ "$DAOS_PACKAGES_BUILD" != "yes" ] || [ "$DAOS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ utils/rpms/build_packages.sh daos && \ mkdir -p /home/daos/rpms && \ cp *.rpm /home/daos/rpms; \ @@ -188,8 +182,6 @@ WORKDIR /home/daos/daos/src/client/java ARG DAOS_JAVA_BUILD=$DAOS_BUILD RUN [ "$DAOS_JAVA_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ mkdir /home/daos/.m2 && \ cp /home/daos/daos/utils/scripts/helpers/maven-settings.xml.in /home/daos/.m2/settings.xml && \ export JAVA_HOME=$(daos-java/find_java_home.sh) && \ diff --git a/utils/docker/Dockerfile.el.9 b/utils/docker/Dockerfile.el.9 index 5187d0d4407..82c5287ff26 100644 --- a/utils/docker/Dockerfile.el.9 +++ b/utils/docker/Dockerfile.el.9 @@ -22,6 +22,16 @@ ARG REPO_FILE_URL ARG JENKINS_URL ARG REPOS ARG DAOS_LAB_CA_FILE_URL + +# Accept DAOS_NO_PROXY at build time +ARG DAOS_NO_PROXY +# Propagate into 
the build environment +ENV no_proxy=${DAOS_NO_PROXY} +ENV NO_PROXY=${DAOS_NO_PROXY} +# Persist into /etc/environment for use by shells and services +RUN echo "no_proxy=${DAOS_NO_PROXY}" >> /etc/environment && \ + echo "NO_PROXY=${DAOS_NO_PROXY}" >> /etc/environment + # script to install OS updates basic tools and daos dependencies COPY ./utils/scripts/install-el9.sh /tmp/install.sh # script to setup local repo if available diff --git a/utils/docker/Dockerfile.leap.15 b/utils/docker/Dockerfile.leap.15 index 7ea185fb60c..93b40c12630 100644 --- a/utils/docker/Dockerfile.leap.15 +++ b/utils/docker/Dockerfile.leap.15 @@ -22,6 +22,16 @@ ARG REPO_FILE_URL ARG JENKINS_URL ARG REPOS ARG DAOS_LAB_CA_FILE_URL + +# Accept DAOS_NO_PROXY at build time +ARG DAOS_NO_PROXY +# Propagate into the build environment +ENV no_proxy=${DAOS_NO_PROXY} +ENV NO_PROXY=${DAOS_NO_PROXY} +# Persist into /etc/environment for use by shells and services +RUN echo "no_proxy=${DAOS_NO_PROXY}" >> /etc/environment && \ + echo "NO_PROXY=${DAOS_NO_PROXY}" >> /etc/environment + # script to install OS updates basic tools and daos dependencies COPY ./utils/scripts/install-leap15.sh /tmp/install.sh # script to setup local repo if available @@ -38,9 +48,7 @@ FROM basic # with a local repository, yet needing a proxy to reach outside repositories. # This needs to be moved to a shell script like above in the future to # properly only remove the proxy variables only when they need to be removed -RUN if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ - dnf upgrade && \ +RUN dnf upgrade && \ /tmp/install.sh && \ dnf clean all && \ rm -f /tmp/install.sh @@ -96,8 +104,6 @@ ARG DAOS_PACKAGES_BUILD=yes # The dnf upgrade can add or re-enable distro repositories. 
USER root:root RUN [ "$DAOS_DEPS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ dnf upgrade --exclude=fuse,fuse-libs,fuse-devel,libraft0,raft-devel,mercury,mercury-devel && \ dnf clean all; \ } @@ -106,8 +112,6 @@ USER daos_server:daos_server ARG DEPS_JOBS=1 RUN [ "$DAOS_DEPS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ scons --build-deps=only --jobs $DEPS_JOBS PREFIX=/opt/daos \ TARGET_TYPE=$DAOS_TARGET_TYPE && \ ([ "$DAOS_KEEP_BUILD" != "no" ] || /bin/rm -rf build *.gz); \ @@ -140,8 +144,6 @@ RUN if [ "$COMPILER" = "icc" ]; then rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ dnf config-manager --add-repo https://yum.repos.intel.com/oneapi oneAPI; \ fi; \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; "\ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ dnf install intel-oneapi-compiler-dpcpp-cpp && \ dnf clean all; \ fi @@ -149,8 +151,6 @@ RUN if [ "$COMPILER" = "icc" ]; then # force an upgrade to get any newly built RPMs, but only if CB1 is set. 
ARG CB1 RUN [ -z "$CB1" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ dnf upgrade --exclude=fuse,fuse-libs,fuse-devel,libraft0,raft-devel,mercury,mercury-devel && \ dnf clean all; \ } @@ -175,8 +175,6 @@ ARG DAOS_BUILD=$DAOS_DEPS_BUILD # Build DAOS RUN [ "$DAOS_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ scons --jobs $JOBS install PREFIX=/opt/daos COMPILER=$COMPILER \ BUILD_TYPE=$DAOS_BUILD_TYPE TARGET_TYPE=$DAOS_TARGET_TYPE \ FIRMWARE_MGMT=1 && \ @@ -205,8 +203,6 @@ WORKDIR /home/daos/daos/src/client/java ARG DAOS_JAVA_BUILD=$DAOS_BUILD RUN [ "$DAOS_JAVA_BUILD" != "yes" ] || { \ - if [ -n "$REPO_FILE_URL" ]; then direct="${REPO_FILE_URL##*//}; " \ - direct="${direct%%/*}"; export no_proxy="${direct}"; fi; \ mkdir /home/daos/.m2 && \ cp /home/daos/daos/utils/scripts/helpers/maven-settings.xml.in /home/daos/.m2/settings.xml && \ mvn clean install -ntp -T 1C -DskipITs -Dgpg.skip -Ddaos.install.path=/opt/daos; \ diff --git a/utils/docker/Dockerfile.ubuntu b/utils/docker/Dockerfile.ubuntu index b3790b5f980..589c3f3dd9d 100644 --- a/utils/docker/Dockerfile.ubuntu +++ b/utils/docker/Dockerfile.ubuntu @@ -22,6 +22,16 @@ ENV DEBIAN_FRONTEND=noninteractive # Install basic tools ARG DAOS_LAB_CA_FILE_URL + +# Accept DAOS_NO_PROXY at build time +ARG DAOS_NO_PROXY +# Propagate into the build environment +ENV no_proxy=${DAOS_NO_PROXY} +ENV NO_PROXY=${DAOS_NO_PROXY} +# Persist into /etc/environment for use by shells and services +RUN echo "no_proxy=${DAOS_NO_PROXY}" >> /etc/environment && \ + echo "NO_PROXY=${DAOS_NO_PROXY}" >> /etc/environment + # script to setup local repo if available and install packages COPY ./utils/scripts/helpers/repo-helper-ubuntu.sh /tmp/repo-helper.sh COPY ./utils/scripts/install-ubuntu.sh /tmp/install.sh diff --git a/utils/githooks/branches.default 
b/utils/githooks/branches.default index 2e21a25fbd3..4a74c43cef0 100755 --- a/utils/githooks/branches.default +++ b/utils/githooks/branches.default @@ -1,5 +1,4 @@ #!/bin/bash set -eEuo pipefail -echo feature/cat_recovery echo feature/multiprovider echo feature/firewall diff --git a/utils/githooks/pre-commit.d/30-Jenkinsfile.sh b/utils/githooks/pre-commit.d/30-Jenkinsfile.sh index 84385123c9e..098a7790332 100755 --- a/utils/githooks/pre-commit.d/30-Jenkinsfile.sh +++ b/utils/githooks/pre-commit.d/30-Jenkinsfile.sh @@ -21,7 +21,6 @@ if [ -z "$(_git_diff_cached_files "Jenkinsfile")" ] ; then exit 0 fi - echo "Checking syntax" : "${JENKINS_HOST:=jenkins.daos.hpc.amslabs.hpecorp.net}" diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 1e24e4d9c3c..438e55718be 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -19,3 +19,4 @@ engines: - class: ram scm_mount: /mnt/daos + scm_hugepages_disabled: false diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 7d4b0ff4dc2..18337560726 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2,7 +2,7 @@ """Node local test (NLT). (C) Copyright 2020-2024 Intel Corporation. 
-(C) Copyright 2025 Hewlett Packard Enterprise Development LP +(C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP (C) Copyright 2025 Google LLC (C) Copyright 2025 Enakta Labs Ltd @@ -1015,7 +1015,8 @@ def _make_pool(self): else: size = 1024 * 4 - rc = self.run_dmg(['pool', 'create', 'NLT', '--scm-size', f'{size}M']) + rc = self.run_dmg(['pool', 'create', 'NLT', '--scm-size', f'{size}M', '--properties', + 'rd_fac:0,space_rb:0']) print(rc) assert rc.returncode == 0 self.fetch_pools() @@ -1839,6 +1840,8 @@ def create_cont(conf, pool=None, ctype=None, label=None, path=None, oclass=None, if attrs: cmd.extend(['--attrs', ','.join([f"{name}:{val}" for name, val in attrs.items()])]) + cmd.extend(['--properties', 'cksum:off,srv_cksum:off,rd_fac:0']) + def _create_cont(): """Helper function for create_cont""" rc = run_daos_cmd(conf, cmd, use_json=True, log_check=log_check, valgrind=valgrind, @@ -2829,7 +2832,14 @@ def test_il_cat(self): with open(fname, 'w'): pass - self.dfuse.il_cmd(['cat', fname], check_write=False) + self.dfuse.il_cmd([ + 'dd', + f'if={fname}', + 'of=/dev/null', + 'bs=4096', + 'iflag=fullblock', + 'status=none' + ], check_write=False, check_fstat=False) @needs_dfuse_with_opt(caching_variants=[False]) def test_il(self): @@ -2847,14 +2857,40 @@ def test_il(self): with open(file, 'w') as fd: fd.write('Hello') # Copy it across containers. - self.dfuse.il_cmd(['cp', file, sub_cont_dir]) + dst = join(sub_cont_dir, 'file') + self.dfuse.il_cmd([ + 'dd', + f'if={file}', + f'of={dst}', + 'bs=4096', + 'iflag=fullblock', + 'status=none' + ], check_fstat=False) # Copy it within the container. 
child_dir = join(self.dfuse.dir, 'new_dir') os.mkdir(child_dir) - self.dfuse.il_cmd(['cp', file, child_dir]) + dst = join(child_dir, 'file') + + self.dfuse.il_cmd([ + 'dd', + f'if={file}', + f'of={dst}', + 'bs=128K', + 'status=none' + ], check_fstat=False) + # Copy something into a container - self.dfuse.il_cmd(['cp', '/bin/bash', sub_cont_dir], check_read=False) + dst = join(sub_cont_dir, 'bash') + + self.dfuse.il_cmd([ + 'dd', + 'if=/bin/bash', + f'of={dst}', + 'bs=128K', + 'status=none' + ], check_read=False, check_fstat=False) + # Read it from within a container self.dfuse.il_cmd(['md5sum', join(sub_cont_dir, 'bash')], check_read=False, check_write=False, check_fstat=False) @@ -5018,7 +5054,16 @@ def create_and_read_via_il(dfuse, path): ofd.flush() assert_file_size(ofd, 12) print(os.fstat(ofd.fileno())) - dfuse.il_cmd(['cat', fname], check_write=False) + + # Replace Python snippet with dd to guarantee read() + dfuse.il_cmd([ + 'dd', + f'if={fname}', + 'of=/dev/null', + 'bs=4096', + 'iflag=fullblock', + 'status=none' + ], check_write=False, check_fstat=False) def run_container_query(conf, path): @@ -5428,6 +5473,7 @@ def test_pydaos_kv(server, conf): print("That's not good") del kv + container.destroy('my_test_kv') del container print('Running PyDAOS container checker') @@ -6166,7 +6212,7 @@ def get_cmd(cont_id): 'create', pool.id(), '--properties', - f'srv_cksum:on,label:{cont_id}'] + f'srv_cksum:on,label:{cont_id},rd_fac:0'] test_cmd = AllocFailTest(conf, 'cont-create', get_cmd) test_cmd.wf = wf diff --git a/utils/rpms/argobots.changelog b/utils/rpms/argobots.changelog new file mode 100644 index 00000000000..4864f0fa8e3 --- /dev/null +++ b/utils/rpms/argobots.changelog @@ -0,0 +1,55 @@ +%changelog +* Wed Nov 19 2025 Tomasz Gromadzki - 1.2-4 +- Restore the RPM changelog, which has not been available since version 1.2-1 + +* Fri Sep 12 2025 Jeff Olivier - 1.2-3 +- Fix leap package name + +* Mon Aug 11 2025 Jeff Olivier - 1.2-2 +- Switch to fpm build for RPMs + 
+* Wed Oct 02 2024 Cedric Koch-Hofer - 1.2-1 +- Update to 1.2 +- Add patch 411e5b3 Fix DAOS-14248: ULTs stacks dump works only once +- Add patch bb0c908 Restore the libunwind support + +* Tue Jun 06 2023 Brian J. Murrell - 1.1-3 +- Update to build on EL9 + +* Wed Oct 19 2022 Brian J. Murrell - 1.1-2 +- Create debuginfo packages for SUSE +- Fix up some issues found by rpmlint + +* Thu Apr 01 2021 Brian J. Murrell - 1.1-1 +- Update to 1.1 + +* Wed Mar 17 2021 Brian J. Murrell - 1.1~rc1-1 +- Update to 1.1rc1 + +* Tue Feb 23 2021 B.Faccini - 1.1~b1-1 +- Update to 1.1b1 +- Build with unwinding enabled + +* Mon Aug 17 2020 Brian J. Murrell - 1.0-1 +- Update to 1.0 final + +* Mon Jun 22 2020 Brian J. Murrell - 1.0rc-5 +- Update License: + +* Sat Sep 21 2019 Brian J. Murrell - 1.0rc-4 +- Add BR: pkgconfig + +* Sat Sep 21 2019 Brian J. Murrell - 1.0rc-3 +- Revert libabt0 packaging for EL7; RH just doesn't do that + +* Fri Sep 20 2019 Brian J. Murrell - 1.0rc-2 +- Add patch to bring up to 89507c1f8c +- Create a libabt0 subpackage +- Force autogen.sh since we add a patch that modifies a Makefile.am + +* Wed Apr 17 2019 Brian J. Murrell - 1.0rc-1 +- Update to 1.0rc1 +- Add patch to bring up to 9d48af08 + +* Wed Apr 03 2019 Brian J. Murrell - 0.99-1 +- Initial package diff --git a/utils/rpms/argobots.sh b/utils/rpms/argobots.sh index 65a3dc937f8..71726148e93 100755 --- a/utils/rpms/argobots.sh +++ b/utils/rpms/argobots.sh @@ -16,6 +16,7 @@ DESCRIPTION="Argobots is a lightweight, low-level threading and tasking framewor This release is an experimental version of Argobots that contains features related to user-level threads, tasklets, and some schedulers." 
URL="https://argobots.org" +RPM_CHANGELOG="argobots.changelog" files=() TARGET_PATH="${libdir}" diff --git a/utils/rpms/build_packages.sh b/utils/rpms/build_packages.sh index e696c48fee8..6c72cfd52e6 100755 --- a/utils/rpms/build_packages.sh +++ b/utils/rpms/build_packages.sh @@ -10,7 +10,7 @@ if [[ "${build_type}" =~ deps|all ]]; then utils/rpms/libfabric.sh utils/rpms/mercury.sh utils/rpms/pmdk.sh - utils/rpms/spdk.sh + utils/rpms/daos-spdk.sh fi if [[ "${build_type}" =~ daos|all ]]; then utils/rpms/daos.sh diff --git a/utils/rpms/daos-spdk.changelog b/utils/rpms/daos-spdk.changelog new file mode 100644 index 00000000000..181aa36bc16 --- /dev/null +++ b/utils/rpms/daos-spdk.changelog @@ -0,0 +1,6 @@ +* Tue Nov 25 2025 Jeff Olivier - 2.0.0-1 +- Upgrade to SPDK 24.09. +- Restore missing changelog + +* Wed Sep 10 2025 Jeff Olivier - 1.0.0-1 +- Switch to daos-spdk package for spdk, deprecates old spdk diff --git a/utils/rpms/spdk.sh b/utils/rpms/daos-spdk.sh similarity index 89% rename from utils/rpms/spdk.sh rename to utils/rpms/daos-spdk.sh index 2cff9fd4138..4a7bed3428c 100755 --- a/utils/rpms/spdk.sh +++ b/utils/rpms/daos-spdk.sh @@ -17,6 +17,7 @@ DESCRIPTION="The Storage Performance Development Kit provides a set of tools and libraries for writing high performance, scalable, user-mode storage applications." 
URL="https://spdk.io" +RPM_CHANGELOG="daos-spdk.changelog" files=() TARGET_PATH="${bindir}" @@ -35,8 +36,8 @@ list_files files "${SL_SPDK_PREFIX}/lib64/daos_srv/libspdk*.so.*" \ clean_bin "${files[@]}" append_install_list "${files[@]}" -TARGET_PATH="${libdir}/daos_srv/dpdk/pmds-22.0" -list_files files "${SL_SPDK_PREFIX}/lib64/daos_srv/dpdk/pmds-22.0/lib*.so.*" +TARGET_PATH="${libdir}/daos_srv/dpdk/pmds-24.1" +list_files files "${SL_SPDK_PREFIX}/lib64/daos_srv/dpdk/pmds-24.1/lib*.so.*" clean_bin "${files[@]}" append_install_list "${files[@]}" @@ -65,8 +66,8 @@ list_files files "${SL_SPDK_PREFIX}/lib64/pkgconfig/daos_spdk.pc" replace_paths "${SL_SPDK_PREFIX}" "${files[@]}" append_install_list "${files[@]}" -TARGET_PATH="${libdir}/daos_srv/dpdk/pmds-22.0" -list_files files "${SL_SPDK_PREFIX}/lib64/daos_srv/dpdk/pmds-22.0/lib*.so" +TARGET_PATH="${libdir}/daos_srv/dpdk/pmds-24.1" +list_files files "${SL_SPDK_PREFIX}/lib64/daos_srv/dpdk/pmds-24.1/lib*.so" append_install_list "${files[@]}" TARGET_PATH="${includedir}/daos_srv/spdk" diff --git a/utils/rpms/daos.changelog b/utils/rpms/daos.changelog new file mode 100644 index 00000000000..16f420643ac --- /dev/null +++ b/utils/rpms/daos.changelog @@ -0,0 +1,848 @@ +%changelog +* Wed Feb 18 2026 Oksana Salyk 2.7.104-2 +- Update PMDK to release 2.1.3 + +* Tue Feb 10 2026 Dalton Bohning 2.7.104-1 +- Bump version to 2.7.104 + +* Fri Jan 16 2026 Jerome Soumagne 2.7.103-2 +- Drop libfabric-devel build requirement +- Drop libfabric requirement that is already provided by mercury-libfabric + +* Fri Dec 19 2025 Dalton Bohning 2.7.103-1 +- Bump version to 2.7.103 + +* Tue Nov 25 2025 Jeff Olivier 2.7.102-3 +- Upgrade spdk +- Add some missing dependencies numactl and pcituils + +* Mon Nov 24 2025 Jeff Olivier 2.7.102-2 +- Require isal_crypto 2.25.0 due to API deprecation warnings + +* Fri Nov 17 2025 Phillip Henderson 2.7.102-1 +- Bump version to 2.7.102 + +* Wed Nov 05 2025 Tomasz Gromadzki - 2.7.101-17 +- Update PMDK to release 2.1.2 
+ +* Thu Oct 16 2025 Jeff Olivier 2.7.101-16 +- Make daos-spdk conflict with spdk + +* Thu Sep 12 2025 Jeff Olivier 2.7.101-15 +- Fix leap package name + +* Thu Sep 11 2025 Jeff Olivier 2.7.101-14 +- Fix pmdk package for leap +- Fix daos-spdk package + +* Mon Aug 11 2025 Jeff Olivier 2.7.101-13 +- Switch to fpm build for RPMs + +* Wed Jul 30 2025 Tomasz Gromadzki 2.7.101-12 +- pmemobj errors and warnings reported via DAOS logging system + +* Mon Jun 2 2025 Samirkumar Raval 2.7.101-11 +- Changing the default log location to /var/log/daos from /tmp + +* Mon May 19 2025 Jeff Olivier 2.7.101-10 +- Start to deprecate this file being used to build DAOS but rather only source + RPM + +* Mon May 12 2025 Tomasz Gromadzki 2.7.101-9 +- Bump lua-lmod version to >=8.7.36 +- Bump lmod version to >=8.7.36 +- Bump mpich version to 4.1~a1 +- Bump python3-mpi4py-tests version to >= 3.1.6 +- Add openmpi requiremnent for daos-client-tests on Leap. + +* Fri Mar 21 2025 Cedric Koch-Hofer 2.7.101-8 +- Add support of the libasan + +* Tue Mar 18 2025 Jeff Olivier 2.7.101-7 +- Remove raft as external dependency + +* Mon Mar 10 2025 Jeff Olivier 2.7.101-6 +- Remove server from Ubuntu packaging and fix client only build + +* Wed Jan 22 2025 Jan Michalski 2.7.101-5 +- Add ddb_ut and dtx_ut to the server-tests package + +* Fri Dec 20 2024 Jeff Olivier 2.7.101-4 +- Switch libfuse3 to libfused + +* Thu Dec 19 2024 Phillip Henderson 2.7.101-3 +- Fix protobuf-c requiremnent for daos-client-tests on Leap. + +* Thu Nov 14 2024 Denis Barakhtanov 2.7.101-2 +- Add pydaos.torch module to daos-client rpm. + +* Fri Nov 08 2024 Phillip Henderson 2.7.101-1 +- Bump version to 2.7.100 + +* Tue Nov 5 2024 Michael MacDonald 2.7.100-11 +- Move daos_metrics tool to daos package for use on both clients + and servers. + +* Fri Nov 1 2024 Sherin T George 2.7.100-10 +- The modified DAV allocator with memory bucket support for md_on_ssd + phase-2 is delivered as dav_v2.so. + +* Tue Oct 15 2024 Brian J. 
Murrell - 2.7.100-9 +- Drop BRs for UCX as they were obsoleted as of e01970d + +* Mon Oct 07 2024 Cedric Koch-Hofer 2.7.100-8 +- Update BR: argobots to 1.2 + +* Tue Oct 01 2024 Tomasz Gromadzki 2.7.100-7 +- Add support of the PMDK package 2.1.0 with NDCTL enabled. + * Increase the default ULT stack size to 20KiB if the engine uses + the DCPM storage class. + * Prevent using the RAM storage class (simulated PMem) when + the shutdown state (SDS) is active. + * Automatically disable SDS for the RAM storage class on engine startup. + * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' + environment variable to deactivate SDS for the DAOS tools + (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. + Otherwise, a user is supposed to be stopped by an error + like: "Unsafe shutdown count is not supported for this source". + +* Mon Sep 23 2024 Kris Jacque 2.7.100-6 +- Bump min supported go version to 1.21 + +* Thu Aug 15 2024 Michael MacDonald 2.7.100-5 +- Add libdaos_self_test.so to client RPM + +* Mon Aug 05 2024 Jerome Soumagne 2.7.100-4 +- Bump mercury version to 2.4.0rc4 + +* Thu Jul 11 2024 Dalton Bohning 2.7.100-3 +- Add pciutils-devel build dep for client-tests package + +* Mon Jun 24 2024 Tom Nabarro 2.7.100-2 +- Add pciutils runtime dep for daos_server lspci call +- Add pciutils-devel build dep for pciutils CGO bindings + +* Mon May 20 2024 Phillip Henderson 2.7.100-1 +- Bump version to 2.7.100 + +* Fri May 03 2024 Lei Huang 2.5.101-5 +- Add libaio as a dependent package + +* Fri Apr 05 2024 Fan Yong 2.5.101-4 +- Catastrophic Recovery + +* Thu Apr 04 2024 Ashley M. 
Pittman 2.5.101-3 +- Update pydaos install process +- Add a dependency from daos-client-tests to daos-devel + +* Mon Mar 18 2024 Jan Michalski 2.5.101-2 +- Add dtx_tests to the server-tests package + +* Fri Mar 15 2024 Phillip Henderson 2.5.101-1 +- Bump version to 2.5.101 + +* Tue Feb 27 2024 Li Wei 2.5.100-16 +- Update raft to 0.11.0-1.416.g12dbc15 + +* Mon Feb 12 2024 Ryon Jensen 2.5.100-15 +- Updated isa-l package name to match EPEL + +* Tue Jan 09 2024 Brian J. Murrell 2.5.100-14 +- Move /etc/ld.so.conf.d/daos.conf to daos-server sub-package + +* Wed Dec 06 2023 Brian J. Murrell 2.5.100-13 +- Update for EL 8.8 and Leap 15.5 +- Update raft to 0.10.1-2.411.gefa15f4 + +* Fri Nov 17 2023 Tomasz Gromadzki 2.5.100-12 +- Update to PMDK 2.0.0 + * Remove libpmemblk from dependencies. + * Start using BUILD_EXAMPLES=n and BUILD_BENCHMARKS=n instead of patches. + * Stop using BUILD_RPMEM=n (removed) and NDCTL_DISABLE=y (invalid). + * Point https://github.com/pmem/pmdk as the main PMDK reference source. + NOTE: PMDK upgrade to 2.0.0 does not affect any API call used by DAOS. + libpmemobj (and libpmem) API stays unchanged. + +* Wed Nov 15 2023 Jerome Soumagne 2.5.100-11 +- Bump mercury min version to 2.3.1 + +* Fri Nov 03 2023 Phillip Henderson 2.5.100-10 +- Move verify_perms.py location + +* Wed Aug 23 2023 Brian J. Murrell 2.5.100-9 +- Update fuse3 requirement to R: /usr/bin/fusermount3 by path + rather than by package name, for portability and future-proofing +- Adding fuse3-devel as a requirement for daos-client-tests subpackage + +* Tue Aug 08 2023 Brian J. Murrell 2.5.100-8 +- Build on EL9 +- Add a client-tests-mpich subpackage for mpich test dependencies. + +* Fri Jul 07 2023 Brian J. 
Murrell 2.5.100-7 +- Fix golang daos-client-tests dependency to be go instead + +* Thu Jun 29 2023 Michael MacDonald 2.5.100-6 +- Install golang >= 1.18 as a daos-client-tests dependency + +* Thu Jun 22 2023 Li Wei 2.5.100-5 +- Update raft to 0.10.1-1.408.g9524cdb + +* Wed Jun 14 2023 Mohamad Chaarawi - 2.5.100-4 +- Add pipeline lib + +* Wed Jun 14 2023 Wang Shilong 2.5.100-3 +- Remove lmdb-devel for MD on SSD + +* Wed Jun 07 2023 Ryon Jensen 2.5.100-2 +- Removed unnecessary test files + +* Tue Jun 06 2023 Jeff Olivier 2.5.100-1 +- Switch version to 2.5.100 for 2.6 test builds + +* Mon Jun 5 2023 Jerome Soumagne 2.3.107-7 +- Remove libfabric pinning and allow for 1.18 builds + +* Fri May 26 2023 Jeff Olivier 2.3.107-6 +- Add lmdb-devel and bio_ut for MD on SSD + +* Tue May 23 2023 Lei Huang 2.3.107-5 +- Add libcapstone-devel to deps of client-tests package + +* Tue May 16 2023 Lei Huang 2.3.107-4 +- Add libcapstone as a new prerequisite package +- Add libpil4dfs.so in daos-client rpm + +* Mon May 15 2023 Jerome Soumagne 2.3.107-3 +- Fix libfabric/libfabric1 dependency mismatch on SuSE + +* Wed May 10 2023 Jerome Soumagne 2.3.107-2 +- Temporarily pin libfabric to < 1.18 + +* Fri May 5 2023 Johann Lombardi 2.3.107-1 +- Bump version to 2.3.107 + +* Fri Mar 17 2023 Tom Nabarro 2.3.106-2 +- Add numactl requires for server package + +* Tue Mar 14 2023 Brian J. Murrell 2.3.106-1 +- Bump version to be higher than TB5 + +* Wed Feb 22 2023 Li Wei 2.3.103-6 +- Update raft to 0.9.2-1.403.g3d20556 + +* Tue Feb 21 2023 Michael MacDonald 2.3.103-5 +- Bump min supported go version to 1.17 + +* Fri Feb 17 2023 Ashley M. Pittman 2.3.103-4 +- Add protobuf-c-devel to deps of client-tests package + +* Mon Feb 13 2023 Brian J. 
Murrell 2.3.103-3 +- Remove explicit R: protobuf-c and let the auto-dependency generator + handle it + +* Wed Feb 8 2023 Michael Hennecke 2.3.103-2 +- Change ipmctl requirement from v2 to v3 + +* Fri Jan 27 2023 Phillip Henderson 2.3.103-1 +- Bump version to 2.3.103 + +* Wed Jan 25 2023 Johann Lombardi 2.3.102-1 +- Bump version to 2.3.102 + +* Tue Jan 24 2023 Phillip Henderson 2.3.101-7 +- Fix daos-tests-internal requirement for daos-tests + +* Fri Jan 6 2023 Brian J. Murrell 2.3.101-6 +- Don't need to O: cart any more +- Add %%doc to all packages +- _datadir -> _datarootdir +- Don't use PREFIX= with scons in %%build +- Fix up some hard-coded paths to use macros instead +- Use some guards to prevent creating empty scriptlets + +* Tue Dec 06 2022 Joseph G. Moore 2.3.101-5 +- Update Mercury to 2.2.0-6 + +* Thu Dec 01 2022 Tom Nabarro 2.3.101-4 +- Update SPDK dependency requirement to greater than or equal to 22.01.2. + +* Tue Oct 18 2022 Brian J. Murrell 2.3.101-3 +- Set flag to build per-subpackage debuginfo packages for Leap 15 + +* Thu Oct 6 2022 Michael MacDonald 2.3.101-2 +- Rename daos_admin -> daos_server_helper + +* Tue Sep 20 2022 Johann Lombardi 2.3.101-1 +- Bump version to 2.3.101 + +* Thu Sep 8 2022 Jeff Olivier 2.3.100-22 +- Move io_conf files from bin to TESTING + +* Tue Aug 16 2022 Jeff Olivier 2.3.100-21 +- Update PMDK to 1.12.1~rc1 to fix DAOS-11151 + +* Thu Aug 11 2022 Wang Shilong 2.3.100-20 +- Add daos_debug_set_params to daos-client-tests rpm for fault injection test. + +* Fri Aug 5 2022 Jerome Soumagne 2.3.100-19 +- Update to mercury 2.2.0 + +* Tue Jul 26 2022 Michael MacDonald 2.3.100-18 +- Bump min supported go version to 1.16 + +* Mon Jul 18 2022 Jerome Soumagne 2.3.100-17 +- Remove now unused openpa dependency + +* Fri Jul 15 2022 Jeff Olivier 2.3.100-16 +- Add pool_scrubbing_tests to test package + +* Wed Jul 13 2022 Tom Nabarro 2.3.100-15 +- Update SPDK dependency requirement to greater than or equal to 22.01.1. 
+ +* Mon Jun 27 2022 Jerome Soumagne 2.3.100-14 +- Update to mercury 2.2.0rc6 + +* Fri Jun 17 2022 Jeff Olivier 2.3.100-13 +- Remove libdts.so, replace with build time static + +* Thu Jun 2 2022 Jeff Olivier 2.3.100-12 +- Make ucx required for build on all platforms + +* Wed Jun 1 2022 Michael MacDonald 2.3.100-11 +- Move dmg to new daos-admin RPM + +* Wed May 18 2022 Lei Huang 2.3.100-10 +- Update to libfabric to v1.15.1-1 to include critical performance patches + +* Tue May 17 2022 Phillip Henderson 2.3.100-9 +- Remove doas-client-tests-openmpi dependency from daos-tests +- Add daos-tests-internal package + +* Mon May 9 2022 Ashley Pittman 2.3.100-8 +- Extend dfusedaosbuild test to run in different configurations. + +* Fri May 6 2022 Ashley Pittman 2.3.100-7 +- Add dfuse unit-test binary to call from ftest. + +* Wed May 4 2022 Joseph Moore 2.3.100-6 +- Update to mercury 2.1.0.rc4-9 to enable non-unified mode in UCX + +* Tue Apr 26 2022 Phillip Henderson 2.3.100-5 +- Move daos_gen_io_conf and daos_run_io_conf to daos-client-tests + +* Wed Apr 20 2022 Lei Huang 2.3.100-4 +- Update to libfabric to v1.15.0rc3-1 to include critical performance patches + +* Tue Apr 12 2022 Li Wei 2.3.100-3 +- Update raft to 0.9.1-1401.gc18bcb8 to fix uninitialized node IDs + +* Wed Apr 6 2022 Jeff Olivier 2.3.100-2 +- Remove direct MPI dependency from most of tests + +* Wed Apr 6 2022 Johann Lombardi 2.3.100-1 +- Switch version to 2.3.100 for 2.4 test builds + +* Wed Apr 6 2022 Joseph Moore 2.1.100-26 +- Add build depends entries for UCX libraries. 
+ +* Sat Apr 2 2022 Joseph Moore 2.1.100-25 +- Update to mercury 2.1.0.rc4-8 to include UCX provider patch + +* Fri Mar 11 2022 Alexander Oganezov 2.1.100-24 +- Update to mercury 2.1.0.rc4-6 to include CXI provider patch + +* Wed Mar 02 2022 Michael Hennecke 2.1.100-23 +- DAOS-6344: Create secondary group daos_daemons for daos_server and daos_agent + +* Tue Feb 22 2022 Alexander Oganezov 2.1.100-22 +- Update mercury to include DAOS-9561 workaround + +* Sun Feb 13 2022 Michael MacDonald 2.1.100-21 +- Update go toolchain requirements + +* Thu Feb 10 2022 Li Wei 2.1.100-20 +- Update raft to 0.9.0-1394.gc81505f to fix membership change bugs + +* Wed Jan 19 2022 Michael MacDonald 2.1.100-19 +- Move libdaos_common.so from daos-client to daos package + +* Mon Jan 17 2022 Johann Lombardi 2.1.100-18 +- Update libfabric to 1.14.0 GA and apply fix for DAOS-9376 + +* Thu Dec 23 2021 Alexander Oganezov 2.1.100-17 +- Update to v2.1.0-rc4-3 to pick fix for DAOS-9325 high cpu usage +- Change mercury pinning to be >= instead of strict = + +* Thu Dec 16 2021 Brian J. Murrell 2.1.100-16 +- Add BR: python-rpm-macros for Leap 15 as python3-base dropped that + as a R: + +* Sat Dec 11 2021 Brian J. Murrell 2.1.100-15 +- Create a shim package to allow daos openmpi packages built with the + distribution openmpi to install on MOFED systems + +* Fri Dec 10 2021 Brian J. Murrell 2.1.100-14 +- Don't make daos-*-tests-openmi a dependency of anything + - If they are wanted, they should be installed explicitly, due to + potential conflicts with other MPI stacks + +* Wed Dec 08 2021 Alexander Oganezov 2.1.100-13 +- Remove DAOS-9173 workaround from mercury. Apply DAOS-9173 to ofi + +* Tue Dec 07 2021 Alexander Oganezov 2.1.100-12 +- Apply DAOS-9173 workaround to mercury + +* Fri Dec 03 2021 Alexander Oganezov 2.1.100-11 +- Update mercury to v2.1.0rc4 + +* Thu Dec 02 2021 Danielle M. 
Sikich 2.1.100-10 +- Fix name of daos serialize package + +* Sun Nov 28 2021 Tom Nabarro 2.1.100-9 +- Set rmem_{max,default} sysctl values on server package install to enable + SPDK pci_event module to operate in unprivileged process (daos_engine). + +* Wed Nov 24 2021 Brian J. Murrell 2.1.100-8 +- Remove invalid "%%else if" syntax +- Fix a few other rpmlint warnings + +* Tue Nov 16 2021 Wang Shilong 2.1.100-7 +- Update for libdaos major version bump +- Fix version of libpemobj1 for SUSE + +* Sat Nov 13 2021 Alexander Oganezov 2.1.100-6 +- Update OFI to v1.14.0rc3 + +* Tue Oct 26 2021 Brian J. Murrell 2.1.100-5 +- Create new daos-{client,server}tests-openmpi and daos-server-tests subpackages +- Rename daos-tests daos-client-tests and make daos-tests require all + other test suites to maintain existing behavior + +* Mon Oct 25 2021 Alexander Oganezov 2.1.100-4 +- Update mercury to v2.1.0rc2 + +* Wed Oct 20 2021 Jeff Olivier 2.1.100-3 +- Explicitly require 1.11.0-3 of PMDK + +* Wed Oct 13 2021 David Quigley 2.1.100-2 +- Add defusedxml as a required dependency for the test package. + +* Wed Oct 13 2021 Johann Lombardi 2.1.100-1 +- Switch version to 2.1.100 for 2.2 test builds + +* Tue Oct 12 2021 Johann Lombardi 1.3.106-1 +- Version bump to 1.3.106 for 2.0 test build 6 + +* Fri Oct 8 2021 Alexander Oganezov 1.13.105-4 +- Update OFI to v1.13.2rc1 + +* Wed Sep 15 2021 Li Wei 1.3.105-3 +- Update raft to fix InstallSnapshot performance as well as to avoid some + incorrect 0.8.0 RPMs + +* Fri Sep 03 2021 Brian J. 
Murrell 1.3.105-2 +- Remove R: hwloc; RPM's auto-requires/provides will take care of this + +* Tue Aug 24 2021 Jeff Olivier 1.3.105-1 +- Version bump to 1.3.105 for 2.0 test build 5 + +* Mon Aug 09 2021 Yawei 1.3.104-5 +- Fix duplicates +- Add vos_perf + +* Thu Aug 05 2021 Christopher Hoffman 1.3.104-4 +- Update conditional statement to include checking for distributions to + determine which unit files to use for daos-server and daos-agent + +* Wed Aug 04 2021 Kris Jacque 1.3.104-3 +- Move daos_metrics tool from tests package to server package + +* Wed Aug 04 2021 Tom Nabarro 1.3.104-2 +- Update to spdk 21.07 and (indirectly) dpdk 21.05 + +* Mon Aug 02 2021 Jeff Olivier 1.3.104-1 +- Version bump to 1.3.104 for 2.0 test build 4 + +* Mon Jul 19 2021 Danielle M. Sikich 1.3.103-5 +- Add DAOS serialization library that requires hdf5 + +* Wed Jul 14 2021 Li Wei 1.3.103-4 +- Update raft to fix slow leader re-elections + +* Tue Jul 13 2021 Maureen Jean 1.3.103-3 +- Add python modules to python3.6 site-packages + +* Mon Jul 12 2021 Alexander Oganezov 1.3.103-2 +- Update to mercury release v2.0.1 + +* Mon Jul 12 2021 Johann Lombardi 1.3.103-1 +- Version bump to 1.3.103 for 2.0 test build 3 + +* Wed Jul 7 2021 Phillip Henderson 1.3.102-6 +- Update daos-devel to always require the same version daos-client + +* Wed Jun 30 2021 Tom Nabarro 1.3.102-5 +- Update to spdk 21.04 and (indirectly) dpdk 21.05 + +* Fri Jun 25 2021 Brian J. 
Murrell - 1.3.102-4 +- Add libuuid-devel back as a requirement of daos-devel + +* Wed Jun 23 2021 Li Wei 1.3.102-3 +- Update raft to pick up Pre-Vote + +* Mon Jun 14 2021 Jeff Olivier 1.3.102-2 +- Update to pmdk 1.11.0-rc1 +- Remove dependence on libpmem since we use libpmemobj directly + +* Fri Jun 11 2021 Johann Lombardi 1.3.102-1 +- Version bump to 1.3.102 for 2.0 test build 2 + +* Wed Jun 02 2021 Johann Lombardi 1.3.101-3 +- Remove libs from devel package + +* Thu May 20 2021 Jeff Olivier 1.3.0-101-2 +- Remove client libs from common package + +* Wed May 19 2021 Johann Lombardi 1.3.101-1 +- Version bump to 1.3.101 for 2.0 test build 1 + +* Fri May 07 2021 Brian J. Murrell 1.3.0-16 +- Enable debuginfo package building on SUSE platforms + +* Thu May 06 2021 Brian J. Murrell 1.3.0-15 +- Update to build on EL8 + +* Wed May 05 2021 Brian J. Murrell 1.3.0-14 +- Package /etc/daos/certs in main/common package so that both server + and client get it created + +* Wed Apr 21 2021 Tom Nabarro - 1.3.0-13 +- Relax ipmctl version requirement on leap15 as we have runtime checks + +* Fri Apr 16 2021 Mohamad Chaarawi - 1.3.0-12 +- remove dfuse_hl + +* Wed Apr 14 2021 Jeff Olivier - 1.3.0-11 +- Remove storage_estimator and io_conf from client packages to remove + any client side dependence on bio and vos (and and PMDK/SPDK) + +* Mon Apr 12 2021 Dalton A. Bohning - 1.3.0-10 +- Add attr to the test dependencies + +* Tue Apr 06 2021 Kris Jacque 1.3.0-9 +- Add package for daos_firmware helper binary + +* Fri Apr 02 2021 Jeff Olivier 1.3.0-8 +- Remove unused readline-devel + +* Thu Apr 01 2021 Brian J. Murrell 1.3.0-7 +- Update argobots to 1.1 + +* Tue Mar 30 2021 Maureen Jean 1.3.0-6 +- Change pydaos_shim_3 to pydaos_shim + +* Mon Mar 29 2021 Brian J. 
Murrell - 1.3.0-5 +- Move libdts.so to the daos-tests subpackage + +* Tue Mar 23 2021 Alexander Oganezov 1.3.0-4 +- Update libfabric to v1.12.0 +- Disable grdcopy/gdrapi linkage in libfabric + + +* Thu Mar 18 2021 Maureen Jean 1.3.0-3 +- Update to python3 + +* Thu Feb 25 2021 Li Wei 1.3.0-2 +- Require raft-devel 0.7.3 that fixes an unstable leadership problem caused by + removed replicas as well as some Coverity issues + +* Wed Feb 24 2021 Brian J. Murrell - 1.3.0-1 +- Version bump up to 1.3.0 + +* Mon Feb 22 2021 Brian J. Murrell 1.1.3-3 +- Remove all *-devel Requires from daos-devel as none of those are + actually necessary to build libdaos clients + +* Tue Feb 16 2021 Alexander Oganezov 1.1.3-2 +- Update libfabric to v1.12.0rc1 + +* Wed Feb 10 2021 Johann Lombardi 1.1.3-1 +- Version bump up to 1.1.3 + +* Tue Feb 9 2021 Vish Venkatesan 1.1.2.1-11 +- Add new pmem specific version of DAOS common library + +* Fri Feb 5 2021 Saurabh Tandan 1.1.2.1-10 +- Added dbench as requirement for test package. + +* Wed Feb 3 2021 Hua Kuang 1.1.2.1-9 +- Changed License to BSD-2-Clause-Patent + +* Wed Feb 03 2021 Brian J. Murrell - 1.1.2-8 +- Update minimum required libfabric to 1.11.1 + +* Thu Jan 28 2021 Phillip Henderson 1.1.2.1-7 +- Change ownership and permissions for the /etc/daos/certs directory. + +* Sat Jan 23 2021 Alexander Oganezov 1.1.2.1-6 +- Update to mercury v2.0.1rc1 + +* Fri Jan 22 2021 Michael MacDonald 1.1.2.1-5 +- Install daos_metrics utility to %%{_bindir} + +* Wed Jan 20 2021 Kenneth Cain 1.1.2.1-4 +- Version update for API major version 1, libdaos.so.1 (1.0.0) + +* Fri Jan 15 2021 Michael Hennecke 1.1.2.1-3 +- Harmonize daos_server and daos_agent groups. + +* Tue Dec 15 2020 Ashley Pittman 1.1.2.1-2 +- Combine the two memcheck suppressions files. 
+ +* Wed Dec 09 2020 Johann Lombardi 1.1.2.1-1 +- Version bump up to 1.1.2.1 + +* Fri Dec 04 2020 Li Wei 1.1.2-3 +- Require raft-devel 0.7.1 that fixes recent Coverity issues + +* Wed Dec 02 2020 Maureen Jean - 1.1.2-2 +- define scons_args to be BUILD_TYPE= +- the scons default is BUILD_TYPE=release +- BUILD_TYPE=release will disable fault injection in build + +* Tue Dec 01 2020 Brian J. Murrell - 1.1.2-1 +- Version bump up to 1.1.2 + +* Tue Nov 17 2020 Li Wei 1.1.1-8 +- Require raft-devel 0.7.0 that changes log indices and terms to 63-bit + +* Wed Nov 11 2020 Tom Nabarro 1.1.1-7 +- Add version validation for runtime daos_server ipmctl requirement to avoid + potential corruption of PMMs when setting PMem goal, issue fixed in + https://github.com/intel/ipmctl/commit/9e3898cb15fa9eed3ef3e9de4488be1681d53ff4 + +* Thu Oct 29 2020 Jonathan Martinez Montes 1.1.1-6 +- Restore obj_ctl utility + +* Wed Oct 28 2020 Brian J. Murrell - 1.1.1-5 +- Use %%autosetup +- Only use systemd_requires if it exists +- Obsoletes: cart now that it's included in daos + +* Sat Oct 24 2020 Maureen Jean 1.1.1-4 +- Add daos.conf to the daos package to resolve the path to libbio.so + +* Tue Oct 13 2020 Jonathan Martinez Montes 1.1.1-3 +- Remove obj_ctl from Tests RPM package +- Add libdts.so shared library that is used by daos_perf, daos_racer and + the daos utility. + +* Tue Oct 13 2020 Amanda Justiniano 1.1.1-3 +- Add lbzip2 requirement to the daos-tests package + +* Tue Oct 13 2020 Michael MacDonald 1.1.1-2 +- Create unprivileged user for daos_agent + +* Mon Oct 12 2020 Johann Lombardi 1.1.1-1 +- Version bump up to 1.1.1 + +* Sat Oct 03 2020 Michael MacDonald 1.1.0-34 +- Add go-race to BuildRequires on OpenSUSE Leap + +* Wed Sep 16 2020 Alexander Oganezov 1.1.0-33 +- Update OFI to v1.11.0 + +* Mon Aug 17 2020 Michael MacDonald 1.1.0-32 +- Install completion script in /etc/bash_completion.d + +* Wed Aug 05 2020 Brian J. 
Murrell - 1.1.0-31 +- Change fuse requirement to fuse3 +- Use Lmod for MPI module loading +- Remove unneeded (and un-distro gated) Requires: json-c + +* Wed Jul 29 2020 Jonathan Martinez Montes - 1.1.0-30 +- Add the daos_storage_estimator.py tool. It merges the functionality of the + former tools vos_size, vos_size.py, vos_size_dfs_sample.py and parse_csv.py. + +* Wed Jul 29 2020 Jeffrey V Olivier - 1.1.0-29 +- Revert prior changes from version 28 + +* Mon Jul 13 2020 Brian J. Murrell - 1.1.0-28 +- Change fuse requirement to fuse3 +- Use Lmod for MPI module loading + +* Tue Jul 7 2020 Alexander A Oganezov - 1.1.0-27 +- Update to mercury release 2.0.0~rc1-1 + +* Sun Jun 28 2020 Jonathan Martinez Montes - 1.1.0-26 +- Add the vos_size_dfs_sample.py tool. It is used to generate dynamically + the vos_dfs_sample.yaml file using the real DFS super block data. + +* Tue Jun 23 2020 Jeff Olivier - 1.1.0-25 +- Add -no-rpath option and use it for rpm build rather than modifying + SCons files in place + +* Tue Jun 16 2020 Jeff Olivier - 1.1.0-24 +- Modify RPATH removal snippet to replace line with pass as some lines + can't be removed without breaking the code + +* Fri Jun 05 2020 Ryon Jensen - 1.1.0-23 +- Add libisa-l_crypto dependency + +* Fri Jun 05 2020 Tom Nabarro - 1.1.0-22 +- Change server systemd run-as user to daos_server in unit file + +* Thu Jun 04 2020 Hua Kuang - 1.1.0-21 +- Remove dmg_old from DAOS RPM package + +* Thu May 28 2020 Tom Nabarro - 1.1.0-20 +- Create daos group to run as in systemd unit file + +* Tue May 26 2020 Brian J. Murrell - 1.1.0-19 +- Enable parallel building with _smp_mflags + +* Fri May 15 2020 Kenneth Cain - 1.1.0-18 +- Require raft-devel >= 0.6.0 that adds new API raft_election_start() + +* Thu May 14 2020 Brian J. Murrell - 1.1.0-17 +- Add cart-devel's Requires to daos-devel as they were forgotten + during the cart merge + +* Thu May 14 2020 Brian J. 
Murrell - 1.1.0-16 +- Fix fuse3-libs -> libfuse3 for SLES/Leap 15 + +* Thu Apr 30 2020 Brian J. Murrell - 1.1.0-15 +- Use new properly pre-release tagged mercury RPM + +* Thu Apr 30 2020 Brian J. Murrell - 1.1.0-14 +- Move fuse dependencies to the client subpackage + +* Mon Apr 27 2020 Michael MacDonald 1.1.0-13 +- Rename /etc/daos.yml -> /etc/daos_control.yml + +* Thu Apr 16 2020 Brian J. Murrell - 1.1.0-12 +- Use distro fuse + +* Fri Apr 10 2020 Alexander Oganezov - 1.1.0-11 +- Update to mercury 4871023 to pick na_ofi.c race condition fix for + "No route to host" errors. + +* Sun Apr 05 2020 Brian J. Murrell - 1.1.0-10 +- Clean up spdk dependencies + +* Mon Mar 30 2020 Tom Nabarro - 1.1.0-9 +- Set version of spdk to < v21, > v19 + +* Fri Mar 27 2020 David Quigley - 1.1.0-8 +- add daos and dmg man pages to the daos-client files list + +* Thu Mar 26 2020 Michael MacDonald 1.1.0-7 +- Add systemd scriptlets for managing daos_server/daos_agent services + +* Thu Mar 26 2020 Alexander Oganeozv - 1.1.0-6 +- Update ofi to 62f6c937601776dac8a1f97c8bb1b1a6acfbc3c0 + +* Tue Mar 24 2020 Jeffrey V. Olivier - 1.1.0-5 +- Remove cart as an external dependence + +* Mon Mar 23 2020 Jeffrey V. Olivier - 1.1.0-4 +- Remove scons_local as dependency + +* Tue Mar 03 2020 Brian J. Murrell - 1.1.0-3 +- Bump up go minimum version to 1.12 + +* Thu Feb 20 2020 Brian J. Murrell - 1.1.0-2 +- daos-server requires daos-client (same version) + +* Fri Feb 14 2020 Brian J. Murrell - 1.1.0-1 +- Version bump up to 1.1.0 + +* Wed Feb 12 2020 Brian J. 
Murrell - 0.9.0-2 +- Remove undefine _missing_build_ids_terminate_build + +* Thu Feb 06 2020 Johann Lombardi - 0.9.0-1 +- Version bump up to 0.9.0 + +* Sat Jan 18 2020 Jeff Olivier - 0.8.0-3 +- Fixing a few warnings in the RPM spec file + +* Fri Dec 27 2019 Jeff Olivier - 0.8.0-2 +- Remove openmpi, pmix, and hwloc builds, use hwloc and openmpi packages + +* Tue Dec 17 2019 Johann Lombardi - 0.8.0-1 +- Version bump up to 0.8.0 + +* Thu Dec 05 2019 Johann Lombardi - 0.7.0-1 +- Version bump up to 0.7.0 + +* Tue Nov 19 2019 Tom Nabarro 0.6.0-15 +- Temporarily unconstrain max. version of spdk + +* Wed Nov 06 2019 Brian J. Murrell 0.6.0-14 +- Constrain max. version of spdk + +* Wed Nov 06 2019 Brian J. Murrell 0.6.0-13 +- Use new cart with R: mercury to < 1.0.1-20 due to incompatibility + +* Wed Nov 06 2019 Michael MacDonald 0.6.0-12 +- Add daos_admin privileged helper for daos_server + +* Fri Oct 25 2019 Brian J. Murrell 0.6.0-11 +- Handle differences in Leap 15 Python packaging + +* Wed Oct 23 2019 Brian J. Murrell 0.6.0-9 +- Update BR: libisal-devel for Leap + +* Mon Oct 07 2019 Brian J. Murrell 0.6.0-8 +- Use BR: cart-devel-%%{cart_sha1} if available +- Remove cart's BRs as it's -devel Requires them now + +* Tue Oct 01 2019 Brian J. Murrell 0.6.0-7 +- Constrain cart BR to <= 1.0.0 + +* Sat Sep 21 2019 Brian J. Murrell +- Remove Requires: {argobots, cart} + - autodependencies should take care of these + +* Thu Sep 19 2019 Jeff Olivier +- Add valgrind-devel requirement for argobots change + +* Tue Sep 10 2019 Tom Nabarro +- Add requires ndctl as runtime dep for control plane. + +* Thu Aug 15 2019 David Quigley +- Add systemd unit files to packaging. + +* Thu Jul 25 2019 Brian J. Murrell +- Add git hash and commit count to release + +* Thu Jul 18 2019 David Quigley +- Add certificate generation files to packaging. 
+ +* Tue Jul 09 2019 Johann Lombardi +- Version bump up to 0.6.0 + +* Fri Jun 21 2019 David Quigley +- Add daos_agent.yml to the list of packaged files + +* Thu Jun 13 2019 Brian J. Murrell +- move obj_ctl daos_gen_io_conf daos_run_io_conf to + daos-tests sub-package +- daos-server needs spdk-tools + +* Fri May 31 2019 Ken Cain +- Add new daos utility binary + +* Wed May 29 2019 Brian J. Murrell +- Version bump up to 0.5.0 +- Add Requires: libpsm_infinipath1 for SLES 12.3 + +* Tue May 07 2019 Brian J. Murrell +- Move some files around among the sub-packages + +* Mon May 06 2019 Brian J. Murrell +- Only BR fio + - fio-{devel,src} is not needed + +* Wed Apr 03 2019 Brian J. Murrell +- initial package diff --git a/utils/rpms/daos.sh b/utils/rpms/daos.sh index 5d7b3abd7a4..2db52a22eba 100755 --- a/utils/rpms/daos.sh +++ b/utils/rpms/daos.sh @@ -1,6 +1,10 @@ #!/bin/bash -# (C) Copyright 2025 Google LLC -# WORK IN PROGRESS +# +# (C) Copyright 2025 Google LLC +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP +# +# SPDX-License-Identifier: BSD-2-Clause-Patent +# set -eEuo pipefail root="$(realpath "$(dirname "${BASH_SOURCE[0]}")")" . "${root}/fpm_common.sh" @@ -31,6 +35,8 @@ to-end data integrity, fine grained data control and elastic storage to optimize performance and cost." 
URL="https://daos.io" +RPM_CHANGELOG="daos.changelog" + # Some extra "install" steps # daos package files=() @@ -62,7 +68,8 @@ install_list+=("${tmp}${sysconfdir}/daos/certs=${sysconfdir}/daos") EXTRA_OPTS+=("--rpm-attr" "0755,root,root:${sysconfdir}/daos/certs") -DEPENDS=( "mercury >= ${mercury_full}" "${libfabric_lib} >= ${libfabric_full}" ) +DEPENDS=( "mercury >= ${mercury_version}" ) +DEPENDS+=( "${isal_crypto_lib} >= ${isal_crypto_version}" ) build_package "daos" # Only build server RPMs if we built the server @@ -177,7 +184,8 @@ EOF EXTRA_OPTS+=("--rpm-attr" "2755,root,daos_server:${bindir}/daos_server") DEPENDS=( "daos = ${VERSION}-${RELEASE}" "daos-spdk = ${daos_spdk_full}" ) - DEPENDS+=( "${pmemobj_lib} >= ${pmdk_full}" "${argobots_lib} >= ${argobots_full}" ) + DEPENDS+=( "${pmemobj_lib} = ${pmdk_full}" "${argobots_lib} >= ${argobots_full}" ) + DEPENDS+=( "${isal_crypto_lib} >= ${isal_crypto_version}" "numactl" "pciutils" ) build_package "daos-server" TARGET_PATH="${bindir}" diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 0098ec2a96c..bbba1cf3406 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -12,7 +12,6 @@ %global daos_build_args client test %endif %global mercury_version 2.4 -%global libfabric_version 1.15.1-1 %global argobots_version 1.2 %global __python %{__python3} %global daos_log_dir "/var/log/daos" @@ -24,8 +23,8 @@ %endif Name: daos -Version: 2.7.101 -Release: 16%{?relval}%{?dist} +Version: 2.7.104 +Release: 2%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -40,7 +39,6 @@ BuildRequires: python3-scons >= 2.4 %else BuildRequires: scons >= 2.4 %endif -BuildRequires: libfabric-devel >= %{libfabric_version} BuildRequires: mercury-devel >= %{mercury_version} BuildRequires: gcc-c++ %if (0%{?rhel} >= 8) @@ -63,7 +61,7 @@ BuildRequires: libjson-c-devel BuildRequires: boost-devel %endif %if %{with server} -BuildRequires: libpmemobj-devel >= 2.1.0 +BuildRequires: libpmemobj-devel >= 2.1.3 %endif 
BuildRequires: fused-devel %if (0%{?suse_version} >= 1500) @@ -166,13 +164,11 @@ Requires: ndctl # needed to set PMem configuration goals in BIOS through control-plane %if (0%{?suse_version} >= 1500) Requires: ipmctl >= 03.00.00.0423 -Requires: libpmemobj1 >= 2.1.0-1.suse1500 -Requires: libfabric1 >= %{libfabric_version} +Requires: libpmemobj1 >= 2.1.3 %else Requires: ipmctl >= 03.00.00.0468 -Requires: libpmemobj >= 2.1.0-1%{?dist} +Requires: libpmemobj >= 2.1.3 %endif -Requires: libfabric >= %{libfabric_version} Requires: mercury >= %{mercury_version} Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig @@ -195,10 +191,6 @@ This package contains DAOS administrative tools (e.g. dmg). Summary: The DAOS client Requires: %{name}%{?_isa} = %{version}-%{release} Requires: mercury >= %{mercury_version} -Requires: libfabric >= %{libfabric_version} -%if (0%{?suse_version} >= 1500) -Requires: libfabric1 >= %{libfabric_version} -%endif Requires: /usr/bin/fusermount3 %{?systemd_requires} @@ -657,826 +649,3 @@ fi %doc README.md # No files in a shim package %endif - -%changelog -* Thu Oct 16 2025 Jeff Olivier 2.7.101-16 -- Make daos-spdk conflict with spdk - -* Thu Sep 12 2025 Jeff Olivier 2.7.101-15 -- Fix leap package name - -* Thu Sep 11 2025 Jeff Olivier 2.7.101-14 -- Fix pmdk package for leap -- Fix daos-spdk package - -* Mon Aug 11 2025 Jeff Olivier 2.7.101-13 -- Switch to fpm build for RPMs - -* Wed Jul 30 2025 Tomasz Gromadzki 2.7.101-12 -- pmemobj errors and warnings reported via DAOS logging system - -* Mon Jun 2 2025 Samirkumar Raval 2.7.101-11 -- Changing the default log location to /var/log/daos from /tmp - -* Mon May 19 2025 Jeff Olivier 2.7.101-10 -- Start to deprecate this file being used to build DAOS but rather only source - RPM - -* Mon May 12 2025 Tomasz Gromadzki 2.7.101-9 -- Bump lua-lmod version to >=8.7.36 -- Bump lmod version to >=8.7.36 -- Bump mpich version to 4.1~a1 -- Bump python3-mpi4py-tests version to >= 3.1.6 -- Add openmpi 
requiremnent for daos-client-tests on Leap. - -* Fri Mar 21 2025 Cedric Koch-Hofer 2.7.101-8 -- Add support of the libasan - -* Tue Mar 18 2025 Jeff Olivier 2.7.101-7 -- Remove raft as external dependency - -* Mon Mar 10 2025 Jeff Olivier 2.7.101-6 -- Remove server from Ubuntu packaging and fix client only build - -* Wed Jan 22 2025 Jan Michalski 2.7.101-5 -- Add ddb_ut and dtx_ut to the server-tests package - -* Fri Dec 20 2024 Jeff Olivier 2.7.101-4 -- Switch libfuse3 to libfused - -* Thu Dec 19 2024 Phillip Henderson 2.7.101-3 -- Fix protobuf-c requiremnent for daos-client-tests on Leap. - -* Thu Nov 14 2024 Denis Barakhtanov 2.7.101-2 -- Add pydaos.torch module to daos-client rpm. - -* Fri Nov 08 2024 Phillip Henderson 2.7.101-1 -- Bump version to 2.7.100 - -* Tue Nov 5 2024 Michael MacDonald 2.7.100-11 -- Move daos_metrics tool to daos package for use on both clients - and servers. - -* Fri Nov 1 2024 Sherin T George 2.7.100-10 -- The modified DAV allocator with memory bucket support for md_on_ssd - phase-2 is delivered as dav_v2.so. - -* Tue Oct 15 2024 Brian J. Murrell - 2.7.100-9 -- Drop BRs for UCX as they were obsoleted as of e01970d - -* Mon Oct 07 2024 Cedric Koch-Hofer 2.7.100-8 -- Update BR: argobots to 1.2 - -* Tue Oct 01 2024 Tomasz Gromadzki 2.7.100-7 -- Add support of the PMDK package 2.1.0 with NDCTL enabled. - * Increase the default ULT stack size to 20KiB if the engine uses - the DCPM storage class. - * Prevent using the RAM storage class (simulated PMem) when - the shutdown state (SDS) is active. - * Automatically disable SDS for the RAM storage class on engine startup. - * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' - environment variable to deactivate SDS for the DAOS tools - (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. - Otherwise, a user is supposed to be stopped by an error - like: "Unsafe shutdown count is not supported for this source". 
- -* Mon Sep 23 2024 Kris Jacque 2.7.100-6 -- Bump min supported go version to 1.21 - -* Thu Aug 15 2024 Michael MacDonald 2.7.100-5 -- Add libdaos_self_test.so to client RPM - -* Mon Aug 05 2024 Jerome Soumagne 2.7.100-4 -- Bump mercury version to 2.4.0rc4 - -* Thu Jul 11 2024 Dalton Bohning 2.7.100-3 -- Add pciutils-devel build dep for client-tests package - -* Mon Jun 24 2024 Tom Nabarro 2.7.100-2 -- Add pciutils runtime dep for daos_server lspci call -- Add pciutils-devel build dep for pciutils CGO bindings - -* Mon May 20 2024 Phillip Henderson 2.7.100-1 -- Bump version to 2.7.100 - -* Fri May 03 2024 Lei Huang 2.5.101-5 -- Add libaio as a dependent package - -* Fri Apr 05 2024 Fan Yong 2.5.101-4 -- Catastrophic Recovery - -* Thu Apr 04 2024 Ashley M. Pittman 2.5.101-3 -- Update pydaos install process -- Add a dependency from daos-client-tests to daos-devel - -* Mon Mar 18 2024 Jan Michalski 2.5.101-2 -- Add dtx_tests to the server-tests package - -* Fri Mar 15 2024 Phillip Henderson 2.5.101-1 -- Bump version to 2.5.101 - -* Tue Feb 27 2024 Li Wei 2.5.100-16 -- Update raft to 0.11.0-1.416.g12dbc15 - -* Mon Feb 12 2024 Ryon Jensen 2.5.100-15 -- Updated isa-l package name to match EPEL - -* Tue Jan 09 2024 Brian J. Murrell 2.5.100-14 -- Move /etc/ld.so.conf.d/daos.conf to daos-server sub-package - -* Wed Dec 06 2023 Brian J. Murrell 2.5.100-13 -- Update for EL 8.8 and Leap 15.5 -- Update raft to 0.10.1-2.411.gefa15f4 - -* Fri Nov 17 2023 Tomasz Gromadzki 2.5.100-12 -- Update to PMDK 2.0.0 - * Remove libpmemblk from dependencies. - * Start using BUILD_EXAMPLES=n and BUILD_BENCHMARKS=n instead of patches. - * Stop using BUILD_RPMEM=n (removed) and NDCTL_DISABLE=y (invalid). - * Point https://github.com/pmem/pmdk as the main PMDK reference source. - NOTE: PMDK upgrade to 2.0.0 does not affect any API call used by DAOS. - libpmemobj (and libpmem) API stays unchanged. 
- -* Wed Nov 15 2023 Jerome Soumagne 2.5.100-11 -- Bump mercury min version to 2.3.1 - -* Fri Nov 03 2023 Phillip Henderson 2.5.100-10 -- Move verify_perms.py location - -* Wed Aug 23 2023 Brian J. Murrell 2.5.100-9 -- Update fuse3 requirement to R: /usr/bin/fusermount3 by path - rather than by package name, for portability and future-proofing -- Adding fuse3-devel as a requirement for daos-client-tests subpackage - -* Tue Aug 08 2023 Brian J. Murrell 2.5.100-8 -- Build on EL9 -- Add a client-tests-mpich subpackage for mpich test dependencies. - -* Fri Jul 07 2023 Brian J. Murrell 2.5.100-7 -- Fix golang daos-client-tests dependency to be go instead - -* Thu Jun 29 2023 Michael MacDonald 2.5.100-6 -- Install golang >= 1.18 as a daos-client-tests dependency - -* Thu Jun 22 2023 Li Wei 2.5.100-5 -- Update raft to 0.10.1-1.408.g9524cdb - -* Wed Jun 14 2023 Mohamad Chaarawi - 2.5.100-4 -- Add pipeline lib - -* Wed Jun 14 2023 Wang Shilong 2.5.100-3 -- Remove lmdb-devel for MD on SSD - -* Wed Jun 07 2023 Ryon Jensen 2.5.100-2 -- Removed unnecessary test files - -* Tue Jun 06 2023 Jeff Olivier 2.5.100-1 -- Switch version to 2.5.100 for 2.6 test builds - -* Mon Jun 5 2023 Jerome Soumagne 2.3.107-7 -- Remove libfabric pinning and allow for 1.18 builds - -* Fri May 26 2023 Jeff Olivier 2.3.107-6 -- Add lmdb-devel and bio_ut for MD on SSD - -* Tue May 23 2023 Lei Huang 2.3.107-5 -- Add libcapstone-devel to deps of client-tests package - -* Tue May 16 2023 Lei Huang 2.3.107-4 -- Add libcapstone as a new prerequisite package -- Add libpil4dfs.so in daos-client rpm - -* Mon May 15 2023 Jerome Soumagne 2.3.107-3 -- Fix libfabric/libfabric1 dependency mismatch on SuSE - -* Wed May 10 2023 Jerome Soumagne 2.3.107-2 -- Temporarily pin libfabric to < 1.18 - -* Fri May 5 2023 Johann Lombardi 2.3.107-1 -- Bump version to 2.3.107 - -* Fri Mar 17 2023 Tom Nabarro 2.3.106-2 -- Add numactl requires for server package - -* Tue Mar 14 2023 Brian J. 
Murrell 2.3.106-1 -- Bump version to be higher than TB5 - -* Wed Feb 22 2023 Li Wei 2.3.103-6 -- Update raft to 0.9.2-1.403.g3d20556 - -* Tue Feb 21 2023 Michael MacDonald 2.3.103-5 -- Bump min supported go version to 1.17 - -* Fri Feb 17 2023 Ashley M. Pittman 2.3.103-4 -- Add protobuf-c-devel to deps of client-tests package - -* Mon Feb 13 2023 Brian J. Murrell 2.3.103-3 -- Remove explicit R: protobuf-c and let the auto-dependency generator - handle it - -* Wed Feb 8 2023 Michael Hennecke 2.3.103-2 -- Change ipmctl requirement from v2 to v3 - -* Fri Jan 27 2023 Phillip Henderson 2.3.103-1 -- Bump version to 2.3.103 - -* Wed Jan 25 2023 Johann Lombardi 2.3.102-1 -- Bump version to 2.3.102 - -* Tue Jan 24 2023 Phillip Henderson 2.3.101-7 -- Fix daos-tests-internal requirement for daos-tests - -* Fri Jan 6 2023 Brian J. Murrell 2.3.101-6 -- Don't need to O: cart any more -- Add %%doc to all packages -- _datadir -> _datarootdir -- Don't use PREFIX= with scons in %%build -- Fix up some hard-coded paths to use macros instead -- Use some guards to prevent creating empty scriptlets - -* Tue Dec 06 2022 Joseph G. Moore 2.3.101-5 -- Update Mercury to 2.2.0-6 - -* Thu Dec 01 2022 Tom Nabarro 2.3.101-4 -- Update SPDK dependency requirement to greater than or equal to 22.01.2. - -* Tue Oct 18 2022 Brian J. Murrell 2.3.101-3 -- Set flag to build per-subpackage debuginfo packages for Leap 15 - -* Thu Oct 6 2022 Michael MacDonald 2.3.101-2 -- Rename daos_admin -> daos_server_helper - -* Tue Sep 20 2022 Johann Lombardi 2.3.101-1 -- Bump version to 2.3.101 - -* Thu Sep 8 2022 Jeff Olivier 2.3.100-22 -- Move io_conf files from bin to TESTING - -* Tue Aug 16 2022 Jeff Olivier 2.3.100-21 -- Update PMDK to 1.12.1~rc1 to fix DAOS-11151 - -* Thu Aug 11 2022 Wang Shilong 2.3.100-20 -- Add daos_debug_set_params to daos-client-tests rpm for fault injection test. 
- -* Fri Aug 5 2022 Jerome Soumagne 2.3.100-19 -- Update to mercury 2.2.0 - -* Tue Jul 26 2022 Michael MacDonald 2.3.100-18 -- Bump min supported go version to 1.16 - -* Mon Jul 18 2022 Jerome Soumagne 2.3.100-17 -- Remove now unused openpa dependency - -* Fri Jul 15 2022 Jeff Olivier 2.3.100-16 -- Add pool_scrubbing_tests to test package - -* Wed Jul 13 2022 Tom Nabarro 2.3.100-15 -- Update SPDK dependency requirement to greater than or equal to 22.01.1. - -* Mon Jun 27 2022 Jerome Soumagne 2.3.100-14 -- Update to mercury 2.2.0rc6 - -* Fri Jun 17 2022 Jeff Olivier 2.3.100-13 -- Remove libdts.so, replace with build time static - -* Thu Jun 2 2022 Jeff Olivier 2.3.100-12 -- Make ucx required for build on all platforms - -* Wed Jun 1 2022 Michael MacDonald 2.3.100-11 -- Move dmg to new daos-admin RPM - -* Wed May 18 2022 Lei Huang 2.3.100-10 -- Update to libfabric to v1.15.1-1 to include critical performance patches - -* Tue May 17 2022 Phillip Henderson 2.3.100-9 -- Remove doas-client-tests-openmpi dependency from daos-tests -- Add daos-tests-internal package - -* Mon May 9 2022 Ashley Pittman 2.3.100-8 -- Extend dfusedaosbuild test to run in different configurations. - -* Fri May 6 2022 Ashley Pittman 2.3.100-7 -- Add dfuse unit-test binary to call from ftest. 
- -* Wed May 4 2022 Joseph Moore 2.3.100-6 -- Update to mercury 2.1.0.rc4-9 to enable non-unified mode in UCX - -* Tue Apr 26 2022 Phillip Henderson 2.3.100-5 -- Move daos_gen_io_conf and daos_run_io_conf to daos-client-tests - -* Wed Apr 20 2022 Lei Huang 2.3.100-4 -- Update to libfabric to v1.15.0rc3-1 to include critical performance patches - -* Tue Apr 12 2022 Li Wei 2.3.100-3 -- Update raft to 0.9.1-1401.gc18bcb8 to fix uninitialized node IDs - -* Wed Apr 6 2022 Jeff Olivier 2.3.100-2 -- Remove direct MPI dependency from most of tests - -* Wed Apr 6 2022 Johann Lombardi 2.3.100-1 -- Switch version to 2.3.100 for 2.4 test builds - -* Wed Apr 6 2022 Joseph Moore 2.1.100-26 -- Add build depends entries for UCX libraries. - -* Sat Apr 2 2022 Joseph Moore 2.1.100-25 -- Update to mercury 2.1.0.rc4-8 to include UCX provider patch - -* Fri Mar 11 2022 Alexander Oganezov 2.1.100-24 -- Update to mercury 2.1.0.rc4-6 to include CXI provider patch - -* Wed Mar 02 2022 Michael Hennecke 2.1.100-23 -- DAOS-6344: Create secondary group daos_daemons for daos_server and daos_agent - -* Tue Feb 22 2022 Alexander Oganezov 2.1.100-22 -- Update mercury to include DAOS-9561 workaround - -* Sun Feb 13 2022 Michael MacDonald 2.1.100-21 -- Update go toolchain requirements - -* Thu Feb 10 2022 Li Wei 2.1.100-20 -- Update raft to 0.9.0-1394.gc81505f to fix membership change bugs - -* Wed Jan 19 2022 Michael MacDonald 2.1.100-19 -- Move libdaos_common.so from daos-client to daos package - -* Mon Jan 17 2022 Johann Lombardi 2.1.100-18 -- Update libfabric to 1.14.0 GA and apply fix for DAOS-9376 - -* Thu Dec 23 2021 Alexander Oganezov 2.1.100-17 -- Update to v2.1.0-rc4-3 to pick fix for DAOS-9325 high cpu usage -- Change mercury pinning to be >= instead of strict = - -* Thu Dec 16 2021 Brian J. Murrell 2.1.100-16 -- Add BR: python-rpm-macros for Leap 15 as python3-base dropped that - as a R: - -* Sat Dec 11 2021 Brian J. 
Murrell 2.1.100-15 -- Create a shim package to allow daos openmpi packages built with the - distribution openmpi to install on MOFED systems - -* Fri Dec 10 2021 Brian J. Murrell 2.1.100-14 -- Don't make daos-*-tests-openmi a dependency of anything - - If they are wanted, they should be installed explicitly, due to - potential conflicts with other MPI stacks - -* Wed Dec 08 2021 Alexander Oganezov 2.1.100-13 -- Remove DAOS-9173 workaround from mercury. Apply DAOS-9173 to ofi - -* Tue Dec 07 2021 Alexander Oganezov 2.1.100-12 -- Apply DAOS-9173 workaround to mercury - -* Fri Dec 03 2021 Alexander Oganezov 2.1.100-11 -- Update mercury to v2.1.0rc4 - -* Thu Dec 02 2021 Danielle M. Sikich 2.1.100-10 -- Fix name of daos serialize package - -* Sun Nov 28 2021 Tom Nabarro 2.1.100-9 -- Set rmem_{max,default} sysctl values on server package install to enable - SPDK pci_event module to operate in unprivileged process (daos_engine). - -* Wed Nov 24 2021 Brian J. Murrell 2.1.100-8 -- Remove invalid "%%else if" syntax -- Fix a few other rpmlint warnings - -* Tue Nov 16 2021 Wang Shilong 2.1.100-7 -- Update for libdaos major version bump -- Fix version of libpemobj1 for SUSE - -* Sat Nov 13 2021 Alexander Oganezov 2.1.100-6 -- Update OFI to v1.14.0rc3 - -* Tue Oct 26 2021 Brian J. Murrell 2.1.100-5 -- Create new daos-{client,server}tests-openmpi and daos-server-tests subpackages -- Rename daos-tests daos-client-tests and make daos-tests require all - other test suites to maintain existing behavior - -* Mon Oct 25 2021 Alexander Oganezov 2.1.100-4 -- Update mercury to v2.1.0rc2 - -* Wed Oct 20 2021 Jeff Olivier 2.1.100-3 -- Explicitly require 1.11.0-3 of PMDK - -* Wed Oct 13 2021 David Quigley 2.1.100-2 -- Add defusedxml as a required dependency for the test package. 
- -* Wed Oct 13 2021 Johann Lombardi 2.1.100-1 -- Switch version to 2.1.100 for 2.2 test builds - -* Tue Oct 12 2021 Johann Lombardi 1.3.106-1 -- Version bump to 1.3.106 for 2.0 test build 6 - -* Fri Oct 8 2021 Alexander Oganezov 1.13.105-4 -- Update OFI to v1.13.2rc1 - -* Wed Sep 15 2021 Li Wei 1.3.105-3 -- Update raft to fix InstallSnapshot performance as well as to avoid some - incorrect 0.8.0 RPMs - -* Fri Sep 03 2021 Brian J. Murrell 1.3.105-2 -- Remove R: hwloc; RPM's auto-requires/provides will take care of this - -* Tue Aug 24 2021 Jeff Olivier 1.3.105-1 -- Version bump to 1.3.105 for 2.0 test build 5 - -* Mon Aug 09 2021 Yawei 1.3.104-5 -- Fix duplicates -- Add vos_perf - -* Thu Aug 05 2021 Christopher Hoffman 1.3.104-4 -- Update conditional statement to include checking for distributions to - determine which unit files to use for daos-server and daos-agent - -* Wed Aug 04 2021 Kris Jacque 1.3.104-3 -- Move daos_metrics tool from tests package to server package - -* Wed Aug 04 2021 Tom Nabarro 1.3.104-2 -- Update to spdk 21.07 and (indirectly) dpdk 21.05 - -* Mon Aug 02 2021 Jeff Olivier 1.3.104-1 -- Version bump to 1.3.104 for 2.0 test build 4 - -* Mon Jul 19 2021 Danielle M. Sikich 1.3.103-5 -- Add DAOS serialization library that requires hdf5 - -* Wed Jul 14 2021 Li Wei 1.3.103-4 -- Update raft to fix slow leader re-elections - -* Tue Jul 13 2021 Maureen Jean 1.3.103-3 -- Add python modules to python3.6 site-packages - -* Mon Jul 12 2021 Alexander Oganezov 1.3.103-2 -- Update to mercury release v2.0.1 - -* Mon Jul 12 2021 Johann Lombardi 1.3.103-1 -- Version bump to 1.3.103 for 2.0 test build 3 - -* Wed Jul 7 2021 Phillip Henderson 1.3.102-6 -- Update daos-devel to always require the same version daos-client - -* Wed Jun 30 2021 Tom Nabarro 1.3.102-5 -- Update to spdk 21.04 and (indirectly) dpdk 21.05 - -* Fri Jun 25 2021 Brian J. 
Murrell - 1.3.102-4 -- Add libuuid-devel back as a requirement of daos-devel - -* Wed Jun 23 2021 Li Wei 1.3.102-3 -- Update raft to pick up Pre-Vote - -* Mon Jun 14 2021 Jeff Olivier 1.3.102-2 -- Update to pmdk 1.11.0-rc1 -- Remove dependence on libpmem since we use libpmemobj directly - -* Fri Jun 11 2021 Johann Lombardi 1.3.102-1 -- Version bump to 1.3.102 for 2.0 test build 2 - -* Wed Jun 02 2021 Johann Lombardi 1.3.101-3 -- Remove libs from devel package - -* Thu May 20 2021 Jeff Olivier 1.3.0-101-2 -- Remove client libs from common package - -* Wed May 19 2021 Johann Lombardi 1.3.101-1 -- Version bump to 1.3.101 for 2.0 test build 1 - -* Fri May 07 2021 Brian J. Murrell 1.3.0-16 -- Enable debuginfo package building on SUSE platforms - -* Thu May 06 2021 Brian J. Murrell 1.3.0-15 -- Update to build on EL8 - -* Wed May 05 2021 Brian J. Murrell 1.3.0-14 -- Package /etc/daos/certs in main/common package so that both server - and client get it created - -* Wed Apr 21 2021 Tom Nabarro - 1.3.0-13 -- Relax ipmctl version requirement on leap15 as we have runtime checks - -* Fri Apr 16 2021 Mohamad Chaarawi - 1.3.0-12 -- remove dfuse_hl - -* Wed Apr 14 2021 Jeff Olivier - 1.3.0-11 -- Remove storage_estimator and io_conf from client packages to remove - any client side dependence on bio and vos (and and PMDK/SPDK) - -* Mon Apr 12 2021 Dalton A. Bohning - 1.3.0-10 -- Add attr to the test dependencies - -* Tue Apr 06 2021 Kris Jacque 1.3.0-9 -- Add package for daos_firmware helper binary - -* Fri Apr 02 2021 Jeff Olivier 1.3.0-8 -- Remove unused readline-devel - -* Thu Apr 01 2021 Brian J. Murrell 1.3.0-7 -- Update argobots to 1.1 - -* Tue Mar 30 2021 Maureen Jean 1.3.0-6 -- Change pydaos_shim_3 to pydaos_shim - -* Mon Mar 29 2021 Brian J. 
Murrell - 1.3.0-5 -- Move libdts.so to the daos-tests subpackage - -* Tue Mar 23 2021 Alexander Oganezov 1.3.0-4 -- Update libfabric to v1.12.0 -- Disable grdcopy/gdrapi linkage in libfabric - - -* Thu Mar 18 2021 Maureen Jean 1.3.0-3 -- Update to python3 - -* Thu Feb 25 2021 Li Wei 1.3.0-2 -- Require raft-devel 0.7.3 that fixes an unstable leadership problem caused by - removed replicas as well as some Coverity issues - -* Wed Feb 24 2021 Brian J. Murrell - 1.3.0-1 -- Version bump up to 1.3.0 - -* Mon Feb 22 2021 Brian J. Murrell 1.1.3-3 -- Remove all *-devel Requires from daos-devel as none of those are - actually necessary to build libdaos clients - -* Tue Feb 16 2021 Alexander Oganezov 1.1.3-2 -- Update libfabric to v1.12.0rc1 - -* Wed Feb 10 2021 Johann Lombardi 1.1.3-1 -- Version bump up to 1.1.3 - -* Tue Feb 9 2021 Vish Venkatesan 1.1.2.1-11 -- Add new pmem specific version of DAOS common library - -* Fri Feb 5 2021 Saurabh Tandan 1.1.2.1-10 -- Added dbench as requirement for test package. - -* Wed Feb 3 2021 Hua Kuang 1.1.2.1-9 -- Changed License to BSD-2-Clause-Patent - -* Wed Feb 03 2021 Brian J. Murrell - 1.1.2-8 -- Update minimum required libfabric to 1.11.1 - -* Thu Jan 28 2021 Phillip Henderson 1.1.2.1-7 -- Change ownership and permissions for the /etc/daos/certs directory. - -* Sat Jan 23 2021 Alexander Oganezov 1.1.2.1-6 -- Update to mercury v2.0.1rc1 - -* Fri Jan 22 2021 Michael MacDonald 1.1.2.1-5 -- Install daos_metrics utility to %%{_bindir} - -* Wed Jan 20 2021 Kenneth Cain 1.1.2.1-4 -- Version update for API major version 1, libdaos.so.1 (1.0.0) - -* Fri Jan 15 2021 Michael Hennecke 1.1.2.1-3 -- Harmonize daos_server and daos_agent groups. - -* Tue Dec 15 2020 Ashley Pittman 1.1.2.1-2 -- Combine the two memcheck suppressions files. 
- -* Wed Dec 09 2020 Johann Lombardi 1.1.2.1-1 -- Version bump up to 1.1.2.1 - -* Fri Dec 04 2020 Li Wei 1.1.2-3 -- Require raft-devel 0.7.1 that fixes recent Coverity issues - -* Wed Dec 02 2020 Maureen Jean - 1.1.2-2 -- define scons_args to be BUILD_TYPE= -- the scons default is BUILD_TYPE=release -- BUILD_TYPE=release will disable fault injection in build - -* Tue Dec 01 2020 Brian J. Murrell - 1.1.2-1 -- Version bump up to 1.1.2 - -* Tue Nov 17 2020 Li Wei 1.1.1-8 -- Require raft-devel 0.7.0 that changes log indices and terms to 63-bit - -* Wed Nov 11 2020 Tom Nabarro 1.1.1-7 -- Add version validation for runtime daos_server ipmctl requirement to avoid - potential corruption of PMMs when setting PMem goal, issue fixed in - https://github.com/intel/ipmctl/commit/9e3898cb15fa9eed3ef3e9de4488be1681d53ff4 - -* Thu Oct 29 2020 Jonathan Martinez Montes 1.1.1-6 -- Restore obj_ctl utility - -* Wed Oct 28 2020 Brian J. Murrell - 1.1.1-5 -- Use %%autosetup -- Only use systemd_requires if it exists -- Obsoletes: cart now that it's included in daos - -* Sat Oct 24 2020 Maureen Jean 1.1.1-4 -- Add daos.conf to the daos package to resolve the path to libbio.so - -* Tue Oct 13 2020 Jonathan Martinez Montes 1.1.1-3 -- Remove obj_ctl from Tests RPM package -- Add libdts.so shared library that is used by daos_perf, daos_racer and - the daos utility. - -* Tue Oct 13 2020 Amanda Justiniano 1.1.1-3 -- Add lbzip2 requirement to the daos-tests package - -* Tue Oct 13 2020 Michael MacDonald 1.1.1-2 -- Create unprivileged user for daos_agent - -* Mon Oct 12 2020 Johann Lombardi 1.1.1-1 -- Version bump up to 1.1.1 - -* Sat Oct 03 2020 Michael MacDonald 1.1.0-34 -- Add go-race to BuildRequires on OpenSUSE Leap - -* Wed Sep 16 2020 Alexander Oganezov 1.1.0-33 -- Update OFI to v1.11.0 - -* Mon Aug 17 2020 Michael MacDonald 1.1.0-32 -- Install completion script in /etc/bash_completion.d - -* Wed Aug 05 2020 Brian J. 
Murrell - 1.1.0-31 -- Change fuse requirement to fuse3 -- Use Lmod for MPI module loading -- Remove unneeded (and un-distro gated) Requires: json-c - -* Wed Jul 29 2020 Jonathan Martinez Montes - 1.1.0-30 -- Add the daos_storage_estimator.py tool. It merges the functionality of the - former tools vos_size, vos_size.py, vos_size_dfs_sample.py and parse_csv.py. - -* Wed Jul 29 2020 Jeffrey V Olivier - 1.1.0-29 -- Revert prior changes from version 28 - -* Mon Jul 13 2020 Brian J. Murrell - 1.1.0-28 -- Change fuse requirement to fuse3 -- Use Lmod for MPI module loading - -* Tue Jul 7 2020 Alexander A Oganezov - 1.1.0-27 -- Update to mercury release 2.0.0~rc1-1 - -* Sun Jun 28 2020 Jonathan Martinez Montes - 1.1.0-26 -- Add the vos_size_dfs_sample.py tool. It is used to generate dynamically - the vos_dfs_sample.yaml file using the real DFS super block data. - -* Tue Jun 23 2020 Jeff Olivier - 1.1.0-25 -- Add -no-rpath option and use it for rpm build rather than modifying - SCons files in place - -* Tue Jun 16 2020 Jeff Olivier - 1.1.0-24 -- Modify RPATH removal snippet to replace line with pass as some lines - can't be removed without breaking the code - -* Fri Jun 05 2020 Ryon Jensen - 1.1.0-23 -- Add libisa-l_crypto dependency - -* Fri Jun 05 2020 Tom Nabarro - 1.1.0-22 -- Change server systemd run-as user to daos_server in unit file - -* Thu Jun 04 2020 Hua Kuang - 1.1.0-21 -- Remove dmg_old from DAOS RPM package - -* Thu May 28 2020 Tom Nabarro - 1.1.0-20 -- Create daos group to run as in systemd unit file - -* Tue May 26 2020 Brian J. Murrell - 1.1.0-19 -- Enable parallel building with _smp_mflags - -* Fri May 15 2020 Kenneth Cain - 1.1.0-18 -- Require raft-devel >= 0.6.0 that adds new API raft_election_start() - -* Thu May 14 2020 Brian J. Murrell - 1.1.0-17 -- Add cart-devel's Requires to daos-devel as they were forgotten - during the cart merge - -* Thu May 14 2020 Brian J. 
Murrell - 1.1.0-16 -- Fix fuse3-libs -> libfuse3 for SLES/Leap 15 - -* Thu Apr 30 2020 Brian J. Murrell - 1.1.0-15 -- Use new properly pre-release tagged mercury RPM - -* Thu Apr 30 2020 Brian J. Murrell - 1.1.0-14 -- Move fuse dependencies to the client subpackage - -* Mon Apr 27 2020 Michael MacDonald 1.1.0-13 -- Rename /etc/daos.yml -> /etc/daos_control.yml - -* Thu Apr 16 2020 Brian J. Murrell - 1.1.0-12 -- Use distro fuse - -* Fri Apr 10 2020 Alexander Oganezov - 1.1.0-11 -- Update to mercury 4871023 to pick na_ofi.c race condition fix for - "No route to host" errors. - -* Sun Apr 05 2020 Brian J. Murrell - 1.1.0-10 -- Clean up spdk dependencies - -* Mon Mar 30 2020 Tom Nabarro - 1.1.0-9 -- Set version of spdk to < v21, > v19 - -* Fri Mar 27 2020 David Quigley - 1.1.0-8 -- add daos and dmg man pages to the daos-client files list - -* Thu Mar 26 2020 Michael MacDonald 1.1.0-7 -- Add systemd scriptlets for managing daos_server/daos_agent services - -* Thu Mar 26 2020 Alexander Oganeozv - 1.1.0-6 -- Update ofi to 62f6c937601776dac8a1f97c8bb1b1a6acfbc3c0 - -* Tue Mar 24 2020 Jeffrey V. Olivier - 1.1.0-5 -- Remove cart as an external dependence - -* Mon Mar 23 2020 Jeffrey V. Olivier - 1.1.0-4 -- Remove scons_local as dependency - -* Tue Mar 03 2020 Brian J. Murrell - 1.1.0-3 -- Bump up go minimum version to 1.12 - -* Thu Feb 20 2020 Brian J. Murrell - 1.1.0-2 -- daos-server requires daos-client (same version) - -* Fri Feb 14 2020 Brian J. Murrell - 1.1.0-1 -- Version bump up to 1.1.0 - -* Wed Feb 12 2020 Brian J. 
Murrell - 0.9.0-2 -- Remove undefine _missing_build_ids_terminate_build - -* Thu Feb 06 2020 Johann Lombardi - 0.9.0-1 -- Version bump up to 0.9.0 - -* Sat Jan 18 2020 Jeff Olivier - 0.8.0-3 -- Fixing a few warnings in the RPM spec file - -* Fri Dec 27 2019 Jeff Olivier - 0.8.0-2 -- Remove openmpi, pmix, and hwloc builds, use hwloc and openmpi packages - -* Tue Dec 17 2019 Johann Lombardi - 0.8.0-1 -- Version bump up to 0.8.0 - -* Thu Dec 05 2019 Johann Lombardi - 0.7.0-1 -- Version bump up to 0.7.0 - -* Tue Nov 19 2019 Tom Nabarro 0.6.0-15 -- Temporarily unconstrain max. version of spdk - -* Wed Nov 06 2019 Brian J. Murrell 0.6.0-14 -- Constrain max. version of spdk - -* Wed Nov 06 2019 Brian J. Murrell 0.6.0-13 -- Use new cart with R: mercury to < 1.0.1-20 due to incompatibility - -* Wed Nov 06 2019 Michael MacDonald 0.6.0-12 -- Add daos_admin privileged helper for daos_server - -* Fri Oct 25 2019 Brian J. Murrell 0.6.0-11 -- Handle differences in Leap 15 Python packaging - -* Wed Oct 23 2019 Brian J. Murrell 0.6.0-9 -- Update BR: libisal-devel for Leap - -* Mon Oct 07 2019 Brian J. Murrell 0.6.0-8 -- Use BR: cart-devel-%%{cart_sha1} if available -- Remove cart's BRs as it's -devel Requires them now - -* Tue Oct 01 2019 Brian J. Murrell 0.6.0-7 -- Constrain cart BR to <= 1.0.0 - -* Sat Sep 21 2019 Brian J. Murrell -- Remove Requires: {argobots, cart} - - autodependencies should take care of these - -* Thu Sep 19 2019 Jeff Olivier -- Add valgrind-devel requirement for argobots change - -* Tue Sep 10 2019 Tom Nabarro -- Add requires ndctl as runtime dep for control plane. - -* Thu Aug 15 2019 David Quigley -- Add systemd unit files to packaging. - -* Thu Jul 25 2019 Brian J. Murrell -- Add git hash and commit count to release - -* Thu Jul 18 2019 David Quigley -- Add certificate generation files to packaging. 
- -* Tue Jul 09 2019 Johann Lombardi -- Version bump up to 0.6.0 - -* Fri Jun 21 2019 David Quigley -- Add daos_agent.yml to the list of packaged files - -* Thu Jun 13 2019 Brian J. Murrell -- move obj_ctl daos_gen_io_conf daos_run_io_conf to - daos-tests sub-package -- daos-server needs spdk-tools - -* Fri May 31 2019 Ken Cain -- Add new daos utility binary - -* Wed May 29 2019 Brian J. Murrell -- Version bump up to 0.5.0 -- Add Requires: libpsm_infinipath1 for SLES 12.3 - -* Tue May 07 2019 Brian J. Murrell -- Move some files around among the sub-packages - -* Mon May 06 2019 Brian J. Murrell -- Only BR fio - - fio-{devel,src} is not needed - -* Wed Apr 03 2019 Brian J. Murrell -- initial package diff --git a/utils/rpms/fpm_common.sh b/utils/rpms/fpm_common.sh index 2e78742a1b2..18cce473275 100644 --- a/utils/rpms/fpm_common.sh +++ b/utils/rpms/fpm_common.sh @@ -179,6 +179,9 @@ build_package() { --architecture "${ARCH}" \ --description "${DESCRIPTION}" \ --url "${URL}" \ + --vendor "" \ + --maintainer "DAOS Foundation " \ + --prefix "" \ "${depends[@]}" \ "${conflicts[@]}" \ "${EXTRA_OPTS[@]}" \ diff --git a/utils/rpms/isa-l.changelog b/utils/rpms/isa-l.changelog new file mode 100644 index 00000000000..0d86547362a --- /dev/null +++ b/utils/rpms/isa-l.changelog @@ -0,0 +1,41 @@ +%changelog +* Wed Nov 19 2025 Tomasz Gromadzki - 2.31.1-8 +- Restore the RPM changelog, which has not been available since version 2.30.0-2 + +* Fri Sep 12 2025 Jeff Olivier - 2.31.1-7 +- Fix leap package name + +* Mon Aug 11 2025 Jeff Olivier - 2.31.1-6 +- Switch to fpm build for RPMs +- Update isa-l to 2.31.1 + +* Fri May 19 2023 Brian J. Murrell - 2.30.0-2 +- Disable static library build +- Add debuginfo generation for Leap 15 +- Add hardened build flags for CentOS 7 and Leap 15 + +* Thu Jan 28 2021 Brian J. Murrell - 2.30.0-1 +- Update to latest +- Add %%{_libdir}/pkgconfig/libisal.pc to -devel package + +* Tue Jun 16 2020 Brian J. 
Murrell - 2.26.0-3 +- Add %%license files + +* Wed Oct 02 2019 John E. Malmberg - 2.26.0-2 +- Fix some SUSE rpmlint packaging complaints + +* Wed May 15 2019 Brian J. Murrell - 2.26.0-1 +- Update to latest +- Split into a man utilities package with igzip and a library + package + - Obsoletes: the older isa-l packages accordingly + +* Tue May 07 2019 Brian J. Murrell - 2.21.0-3 +- Bump release for RPM cache coherency + +* Fri May 03 2019 Brian J. Murrell - 2.21.0-2 +- Use the more stable "archive" URL for the source +- Define a make_build macro for SLES 12.3 + +* Fri Apr 05 2019 Brian J. Murrell - 2.21.0-1 +- initial package \ No newline at end of file diff --git a/utils/rpms/isa-l.sh b/utils/rpms/isa-l.sh index 96580340f56..d258c42d6c3 100755 --- a/utils/rpms/isa-l.sh +++ b/utils/rpms/isa-l.sh @@ -17,6 +17,7 @@ DESCRIPTION="Intelligent Storage Acceleration Library. Provides various algorithms for erasure coding, crc, raid, compression and decompression" URL="https://github.com/intel/isa-l" +RPM_CHANGELOG="isa-l.changelog" files=() TARGET_PATH="${bindir}" diff --git a/utils/rpms/isa-l_crypto.changelog b/utils/rpms/isa-l_crypto.changelog new file mode 100644 index 00000000000..fa390eeb012 --- /dev/null +++ b/utils/rpms/isa-l_crypto.changelog @@ -0,0 +1,24 @@ +* Thu Nov 6 2025 Jeff Olivier - 2.25.0-1 +- Update DAOS to 2.25.0 release +- Restore change log missing since 2.24.0-1 + +* Wed Sep 10 2025 Jeff Olivier - 2.24.0-2 +- Use fpm to build isa-l_crypto + +* Thu Jun 22 2023 Brian J. Murrell - 2.24.0-1 +- Update to new version +- Disable static library build +- Add debuginfo generation for Leap 15 + +* Mon Feb 01 2021 Brian J. Murrell - 2.23.0-1 +- Update to new version +- Add %%{_libdir}/pkgconfig/libisal_crypto.pc to -devel package + +* Wed Oct 02 2019 John E. Malmberg - 2.21.0-3 +- Fix the Red Hat family devel package name. + +* Wed Oct 02 2019 John E. 
Malmberg - 2.21.0-2 +- Fix some SUSE rpmlint packaging complaints + +* Fri Aug 16 2019 Ryon Jensen - 2.21.0-1 +- initial package diff --git a/utils/rpms/isa-l_crypto.sh b/utils/rpms/isa-l_crypto.sh index 5fbe9a7eee3..32c6910cf45 100755 --- a/utils/rpms/isa-l_crypto.sh +++ b/utils/rpms/isa-l_crypto.sh @@ -26,6 +26,7 @@ SHA1, SHA256, SHA512, MD5) Provides various algorithms for erasure coding, crc, raid, compression and decompression" URL="https://github.com/intel/isa-l_crypto" +RPM_CHANGELOG="isa-l_crypto.changelog" files=() TARGET_PATH="${libdir}" diff --git a/utils/rpms/libfabric.changelog b/utils/rpms/libfabric.changelog new file mode 100644 index 00000000000..b3e780be7bd --- /dev/null +++ b/utils/rpms/libfabric.changelog @@ -0,0 +1,258 @@ +%changelog +* Wed Nov 19 2025 Tomasz Gromadzki - 1.22.0-5 +- Restore the RPM changelog, which has not been available since version 1.22.0-2 + +* Fri Sep 12 2025 Jeff Olivier - 1.22.0-4 +- Fix leap package name + +* Mon Aug 11 2025 Jeff Olivier - 1.22.0-3 +- Switch to fpm build for RPMs + +* Thu Feb 06 2025 Jerome Soumagne - 1.22.0-2 +- Re-enable psm2 provider for other applications depending on libfabric + +* Fri Oct 25 2024 Jerome Soumagne - 1.22.0-1 +- Update to 1.22.0 +- Drop prov/verbs patch merged upstream + +* Thu Mar 14 2024 Jerome Soumagne - 1.19.1-1 +- Update to 1.19.1 +- Drop prov/tcp multi-recv patch merged upstream +- Add prov/verbs assert patch + +* Mon Oct 30 2023 Jerome Soumagne - 1.19.0-1 +- Update to 1.19.0 +- Drop prov/tcp patches that were merged in 1.19.0 +- Drop prov/opx patch that was merged in 1.19.0 +- Add prov/tcp multi-recv patch +- Drop support for CentOS7 + +* Fri Jul 21 2023 Jerome Soumagne - 1.18.1-1 +- Update to 1.18.1 +- Drop patches that have been merged to 1.18.1 +- Add additional prov/tcp patches + +* Fri Jun 23 2023 Brian J. 
Murrell - 1.18.0-4 +- Rebuild for EL9 + +* Wed Jun 14 2023 Jerome Soumagne - 1.18.0-3 +- Add prov/tcp patch to fix registration lock issue +- Add prov/opx patch to fix 32-bit conversion issue +- Fix build_opx macro logic + +* Thu Jun 1 2023 Jerome Soumagne - 1.18.0-2 +- Add prov/tcp patch to fix busy spin issue + +* Wed May 3 2023 Jerome Soumagne - 1.18.0-1 +- Update to 1.18.0 +- Enable opx provider and add libuuid-devel dependency +- Add libnuma/numactl-devel dependency +- Clean up spec file and disable unused / deprecated providers +- Use tar.bz2 archive instead of tar.gz to skip autogen process +- Add prov/verbs patch to recover from qp error state + +* Thu Apr 13 2023 Alexander Oganezov - 1.17.1-1 +- Update to v1.17.1 +- Apply DAOS-12407 workaround to ofi + +* Thu Jan 26 2023 Brian J. Murrell - 1.15.1-4 +- Remove libpsm2[-devel] dependencies + +* Mon Aug 1 2022 Jerome Soumagne - 1.15.1-3 +- Drop CXI compat patch that is no longer needed + +* Tue Jul 5 2022 Jerome Soumagne - 1.15.1-2 +- Add patch to keep backward compatibility with CXI provider using v1.14.x + +* Wed May 18 2022 Lei Huang - 1.15.1-1 +- Update to v1.15.1 + +* Wed May 4 2022 Brian J. 
Murrell - 1.15.0~rc3-2 +- Add _hardened_build flag to build PIE binaries on CentOS 7 +- Add options to C*FLAGS to build PIE binaries on Leap 15 + +* Tue Apr 19 2022 Lei Huang - 1.15.0~rc3-1 +- Update to v1.15.0rc3 +- Remove patches already landed + +* Mon Apr 04 2022 Dmitry Eremin - 1.14.0-2 +- Apply patch for TCP provider +- Revert patch with performance degradation + +* Mon Jan 17 2022 Johann Lombardi - 1.14.0-1 +- Upgrade to 1.14.0 GA +- Apply patch for DAOS-9376 + +* Fri Dec 17 2021 Phillip Henderson - 1.14.0~rc3-3 +- Enable building debuginfo package on SUSE platforms + +* Wed Dec 8 2021 Alexander Oganezov - 1.14.0~rc3-2 +- Apply patch for DAOS-9173 + +* Sat Nov 13 2021 Alexander Oganezov - 1.14.0~rc3-1 +- Update to v1.14.0rc3 + +* Fri Oct 8 2021 Alexander Oganezov - 1.13.2~rc1-1 +- Update to v1.13.2rc1 + +* Wed Mar 10 2021 Alexander Oganezov - 1.12.0-1 +- Update to v1.12.0 + +* Tue Feb 16 2021 Alexander Oganezov - 1.12.0~rc1-1 +- Update to v1.12.0rc1 + +* Tue Nov 24 2020 Brian J. Murrell - 1.11.1-1 +- Update to 1.11.1 GA +- Make the use of %%{dl_verison} more automatic + +* Thu Oct 15 2020 Alexander Oganezov - 1.11.1~rc1-2 +- Fix to include DL_VERSION in Makefile + +* Fri Oct 9 2020 Alexander Oganezov - 1.11.1~rc1-1 +- Update to libfabric v1.11.1rc1 + +* Thu Oct 1 2020 Alexander Oganezov - 1.11.0-2 +- Disable EFA provider + +* Mon Sep 14 2020 Alexander Oganezov - 1.11.0-1 +- Update to libfabric v1.11.0 + +* Thu Aug 20 2020 Li Wei - 1.9.0-8 +- Update sockets_provider.patch to report the original connect errors + +* Wed Jul 1 2020 Alexander Oganezov - 1.9.0-7 +- Commented out infinipath from BuildRequires +- Removed --enable-psm from configuration flags + +* Mon May 18 2020 Alexander Oganezov - 1.9.0-6 +- update to 8fa7c5bbbfee7df5194b65d9294929a893eb4093 +- apply custom patch for sockets provider + +* Wed Mar 25 2020 Alexander Oganezov - 1.9.0-5 +- update to 62f6c937601776dac8a1f97c8bb1b1a6acfbc3c0 + +* Tue Mar 17 2020 Alexander Oganezov - 1.9.0-4 +- update to 
15ce5c62e2f87715b32bc546d33bb132b97aea4c + +* Fri Mar 6 2020 Alexander Oganezov - 1.9.0-3 +- update to 8af3c112bfce155eb04218bef656f58f3609ce19 + +* Thu Feb 6 2020 Alexander Oganezov - 1.9.0-2 +- update to 955f3a07dd011fb1dbfa6b6c772ada03d5af320e to pick configure.ac fix + +* Wed Feb 5 2020 Brian J. Murrell - 1.9.0-1 +- Update to 1b8ed7876204692fd95b07df8cba21683707e5dc + +* Sat Nov 9 2019 Alexander Oganezov - 1.8.0-6 +- Update to 863407 + +* Wed Sep 25 2019 Brian J. Murrell - 1.8.0-5 +- Update BR: for psm2 to 11.2.78 +- Accordingly, devel subpackage should Requires: psm2-devel + +* Mon Sep 23 2019 Brian J. Murrell - 1.8.0-4 +- %%setup -> %%autosetup +- Add patch to bring up to 3712eb0 +- Set _default_patch_fuzz 1 due to GitHub's dirty compare/ patches +- Once again create the libfabric1 subpackage for SLES + +* Thu Aug 22 2019 Brian J. Murrell - 1.8.0-3 +- Revert previous change as it was causing (on SLES 12.3): +/usr/lib64/libfabric.so.1: undefined reference to `psm2_epaddr_to_epid@PSM2_1.0' +/usr/lib64/libfabric.so.1: undefined reference to `psm2_ep_disconnect2@PSM2_1.0' +/usr/lib64/libfabric.so.1: undefined reference to `psm2_am_register_handlers_2@PSM2_1.0' +/usr/lib64/libfabric.so.1: undefined reference to `psm2_info_query@PSM2_1.0' +/usr/lib64/libfabric.so.1: undefined reference to `psm2_get_capability_mask@PSM2_1.0' +/usr/lib64/libfabric.so.1: undefined reference to `psm2_ep_epid_lookup2@PSM2_1.0' + +* Tue Aug 20 2019 Brian J. Murrell - 1.8.0-2 +- Install libnl3-devel on all platforms +- Create a libfabric1 subpackage with the shared library +- Clean up much of SUSE's post build linting errors/warnings + +* Thu Jul 25 2019 Alexander A. Oganezov - 1.8.0-1 +- Update to 1.8.0 + +* Wed Jun 26 2019 Brian J. Murrell - 1.7.1rc1-4 +- Add BuildRequires: libpsm2-devel >= 10.3.58 + - needed for psm2_am_register_handlers_2@PSM2_1.0 + +* Tue May 14 2019 Brian J. Murrell - 1.7.1rc1-3 +- Fix SLES 12.3 OS conditionals >= 1315 + +* Wed May 01 2019 Brian J. 
Murrell - 1.7.1rc1-2 +- Disable psm2 on SLES 12.3 as the psm2 library there is too old + +* Tue Mar 19 2019 Brian J. Murrell - 1.7.1rc1-1 +- Update to 1.7.1 RC1 + +* Mon Mar 11 2019 Brian J. Murrell - 1.7.0rc3-1 +- Rebase to latest release 1.7.0rc3 + +* Wed Aug 15 2018 Brian J. Murrell - 1.6.0-1 +- Rebase to latest release 1.6.0 +- Remove obsolete patch +- Strip out local libtool Rpathing per + https://fedoraproject.org/wiki/RPath_Packaging_Draft#Removing_Rpath + +* Wed Jan 10 2018 Honggang Li - 1.5.3-1 +- Rebase to latest release 1.5.3 +- Resolves: bz1533293 + +* Thu Jan 4 2018 Honggang Li - 1.5.1-3 +- Add support of different CQ formats for the verbs/RDM +- Resolves: bz1530715 + +* Fri Oct 20 2017 Honggang Li - 1.5.1-2 +- Fix PPC32 compiling issue +- Resolves: bz1504395 + +* Tue Oct 17 2017 Honggang Li - 1.5.1-1 +- Rebase to v1.5.1 +- Resolves: bz1452791 + +* Tue May 16 2017 Honggang Li - 1.4.2-1 +- Update to upstream v1.4.2 release +- Related: bz1451100 + +* Wed Mar 01 2017 Jarod Wilson - 1.4.1-1 +- Update to upstream v1.4.1 release +- Related: bz1382827 + +* Mon May 30 2016 Honggang Li - 1.3.0-3 +- Rebuild against latest infinipath-psm. +- Related: bz1280143 + +* Mon May 30 2016 Honggang Li - 1.3.0-2 +- Rebuild libfabric to support Intel OPA PSM2. 
+- Related: bz1280143 + +* Wed May 4 2016 Honggang Li - 1.3.0-1 +- Update to latest upstream release +- Related: bz1280143 + +* Wed Sep 30 2015 Doug Ledford - 1.1.0-2 +- Rebuild against libnl3 now that the UD RoCE bug is fixed +- Related: bz1261028 + +* Fri Aug 14 2015 Honggang Li - 1.1.0-1 +- Rebase to upstream 1.1.0 +- Resolves: bz1253381 + +* Fri Aug 07 2015 Michal Schmidt - 1.1.0-0.2.rc4 +- Packaging Guidelines conformance fixes and spec file cleanups +- Related: bz1235266 + +* Thu Aug 6 2015 Honggang Li - 1.1.0-0.1.rc4 +- fix N-V-R issue and disable static library +- Related: bz1235266 + +* Tue Aug 4 2015 Honggang Li - 1.1.0rc4 +- Initial build for RHEL-7.2 +- Related: bz1235266 + +* Fri Jun 26 2015 Open Fabrics Interfaces Working Group 1.1.0rc1 +- Release 1.1.0rc1 + +* Sun May 3 2015 Open Fabrics Interfaces Working Group 1.0.0 +- Release 1.0.0 diff --git a/utils/rpms/libfabric.sh b/utils/rpms/libfabric.sh index 5b3077c410e..10853a4e5f3 100755 --- a/utils/rpms/libfabric.sh +++ b/utils/rpms/libfabric.sh @@ -16,6 +16,7 @@ ARCH=${isa} DESCRIPTION="Provides a user-space API to access high-performance fabric services, such as RDMA. This package contains the runtime library." URL="https://github.com/ofiwg/libfabric" +RPM_CHANGELOG="libfabric.changelog" files=() TARGET_PATH="${bindir}" diff --git a/utils/rpms/mercury.changelog b/utils/rpms/mercury.changelog new file mode 100644 index 00000000000..c00f682eb61 --- /dev/null +++ b/utils/rpms/mercury.changelog @@ -0,0 +1,274 @@ +%changelog +* Mon Jan 26 2026 Jerome Soumagne - 2.4.1-1 +- Update to 2.4.1 +- Separate libfabric plugin from main build to align with ucx plugin +- Add patches for runtime version checks and libfabric plugin counters +- Add patch for libfabric auth key + +* Wed Jun 25 2025 Joseph Moore - 2.4.0-5 +- Update release number to differentiate from test RPMs for prior issue. + +* Tue Mar 11 2025 Joseph Moore - 2.4.0-4 +- Change to addr_release for handling of "already present" warning. 
+ +* Wed Jan 15 2025 Joseph Moore - 2.4.0-3 +- Add patch to na_ucx.c to flush end point prior to close. + +* Tue Jan 07 2025 Joseph Moore - 2.4.0-2 +- Enable debug RPMs for Leap sub-packages. + +* Mon Nov 04 2024 Jerome Soumagne - 2.4.0-1 +- Update to 2.4.0 +- Update required libfabric version (>= 1.20) + +* Mon Oct 07 2024 Joseph Moore - 2.4.0~rc5-5 +- Update patch to na_ucx.c to set thread-safe on clients. + +* Thu Sep 26 2024 Joseph Moore - 2.4.0~rc5-4 +- Update patch to na_ucx.c to add fix for connection accept. + +* Wed Sep 04 2024 Brian J. Murrell - 2.4.0~rc5-3 +- Add --without ucx build switch + +* Thu Aug 29 2024 Joseph Moore - 2.4.0~rc5-2 +- Add patch to na_ucx.c to check ep in key_resolve. + +* Mon Aug 26 2024 Jerome Soumagne - 2.4.0~rc5-1 +- Update to 2.4.0rc5 + +* Fri Aug 02 2024 Jerome Soumagne - 2.4.0~rc4-1 +- Update to 2.4.0rc4 +- Remove previous patches now included in 2.4 +- Require libfabric >= 1.15 + +* Tue Mar 19 2024 Jerome Soumagne - 2.3.1-3 +- Add patch to fix ucx hg_info +- Add patch to remove ofi cxi MR warnings +- Add patch to fix potential segfault on log free + +* Wed Nov 22 2023 Jerome Soumagne - 2.3.1-2 +- Rebuild for EL 8.8 and Leap 15.5 + +* Fri Oct 27 2023 Jerome Soumagne - 2.3.1-1 +- Update to 2.3.1 +- Add json-c dependency for hg_info JSON output support +- Drop support for CentOS7 + +* Tue Sep 26 2023 Joseph Moore - 2.3.1~rc1-2 +- Add patch to na_ucx.c to force retry of out-of-memory error. + +* Tue Aug 29 2023 Jerome Soumagne - 2.3.1~rc1-1 +- Update to 2.3.1rc1 + +* Thu Jun 22 2023 Brian J. 
Murrell - 2.3.0-2 +- Rebuild for EL9 + +* Wed Jun 7 2023 Jerome Soumagne - 2.3.0-1 +- Update to 2.3.0 +- Add hg_info tool +- Fix pie flags on CentOS7 +- Remove na_ucx_src_port.patch and old patches + +* Tue Apr 25 2023 Jerome Soumagne - 2.3.0~rc5-1 +- Update to 2.3.0rc5 +- Remove na_ucx.c patch and add temporary na_ucx_src_port.patch +- Update build to make use of NA dynamic plugins +- Fix source URL and package perf tests + +* Thu Dec 22 2022 Joseph Moore - 2.2.0-6 +- Regenerate packages for LEAP15.4 + +* Thu Nov 17 2022 Joseph Moore - 2.2.0-5 +- Update na_ucx.c patch to support reconnection following a disconnect. + +* Wed Oct 05 2022 Joseph Moore - 2.2.0-4 +- Update na_ucx.c patch to include UCX status to NA error mapping. + +* Tue Sep 20 2022 Joseph Moore - 2.2.0-3 +- Fix defect in connect function. + +* Fri Sep 09 2022 Joseph Moore - 2.2.0-2 +- Add na_ucx.c patch to change ep creation for single IB device. + +* Fri Aug 5 2022 Jerome Soumagne - 2.2.0-1 +- Update to 2.2.0 + +* Mon Aug 1 2022 Jerome Soumagne - 2.2.0~rc6-2 +- Rebuild after libfabric rpm dropped CXI compat patch +- Drop CXI compat patch + +* Mon Jun 27 2022 Jerome Soumagne - 2.2.0~rc6-1 +- Update to 2.2.0rc6 +- Skip install rpath, enable debug log. +- Remove openpa dependency. + +* Fri Apr 22 2022 Joseph Moore - 2.1.0~rc4-9 +- Change ucx unified mode to off (updated UCX patch file). + +* Fri Apr 1 2022 Brian J. Murrell - 2.1.0~rc4-8 +- Build with ucx subpackage on supported platforms +- Removed invalid build options: + * MERCURY_ENABLE_VERBOSE_ERROR + * MERCURY_USE_SELF_FORWARD + +* Thu Mar 31 2022 Joseph Moore - 2.1.0~rc4-7 +- Apply daos-9679 address parsing change and active message revision to na_ucx.c. + +* Fri Mar 11 2022 Alexander Oganezov - 2.1.0~rc4-6 +- Apply cxi provider patch + +* Tue Feb 22 2022 Alexander Oganezov - 2.1.0~rc4-5 +- Apply doas-9561 workaround + +* Thu Feb 17 2022 Brian J. 
Murrell - 2.1.0~rc4-4 +- Fix issues with %%post* ldconfig + - No lines are allowed after %%post -p + - These are not needed on EL8 as it's glibc does the work + +* Thu Dec 23 2021 Alexander Oganezov - 2.1.0~rc4-3 +- Remove daos-9173 workaround +- Apply cpu usage fix to mercury + +* Tue Dec 7 2021 Alexander Oganezov - 2.1.0~rc4-2 +- Apply DAOS-9173 workaround patch to na_ofi.c + +* Tue Nov 30 2021 Alexander Oganezov - 2.1.0~rc4-1 +- Update to version v2.1.0rc4 + +* Tue Oct 12 2021 Alexander Oganezov - 2.1.0~rc2-1 +- Update to version v2.1.0rc2 + +* Fri May 14 2021 Alexander Oganezov - 2.0.1-1 +- Update to version v2.0.1 + +* Mon May 10 2021 Brian J. Murryyell - 2.0.1~rc1-2 +- Enable debuginfo package building for SUSE + +* Wed Jan 20 2021 Alexander Oganezov - 2.0.1~rc1-1 +- Update to version v2.0.1rc1 + +* Wed Nov 18 2020 Alexander Oganezov - 2.0.0-1 +- Update to release v2.0.0 + +* Wed Oct 28 2020 Alexander Oganezov - 2.0.0~rc3-1 +- Update to release v2.0.0rc3 + +* Mon Oct 12 2020 Alexander Oganezov - 2.0.0~rc2-1 +- Update to release v2.0.0rc2 + +* Tue Aug 18 2020 Brian J. Murryyell - 2.0.0~rc1-2 +- Use release tarball and not individual submodule tarballs + +* Mon Jul 6 2020 Alexander A Oganezov - 2.0.0~rc1-1 +- Update to release v2.0.0rc1 + +* Mon Jun 22 2020 Brian J. Murryyell - 2.0.0~a1-2 +- Fix License: +- Add %%license + +* Thu May 07 2020 Brian J. 
Murrell - 2.0.0~a1-1 +- Fix pre-release tag in Version: +- Add Requires: libfabric-devel to devel package + +* Thu Apr 9 2020 Alexander A Oganezov - 2.0.0a1-0.8 +- Update to 4871023058887444d47ead4d089c99db979f3d93 + +* Tue Mar 17 2020 Alexander A Oganezov - 2.0.0a1-0.7 +- Update to 41caa143a07ed179a3149cac4af0dc7aa3f946fd + +* Thu Mar 12 2020 Alexander A Oganezov - 2.0.0a1-0.6 +- Update to 299b06d47e6c1d59a45985dcbbebe3caca0189d0 + +* Tue Mar 10 2020 Alexander A Oganezov - 2.0.0a1-0.5 +- Updated to ad5a3b3dbf171a97e1ca5f1683299db1c69b03ea + +* Thu Mar 05 2020 Vikram Chhabra - 2.0.0a1-0.4 +- Updated to latest master with HG_Forward fix. + +* Tue Feb 11 2020 Yulu Jia - 2.0.0a1-0.3 +- Remove nameserver patch + +* Sun Feb 09 2020 Yulu Jia - 2.0.0a1-0.2 +- Update patch to enable ip:port URI format for psm2 + +* Tue Feb 04 2020 Brian J. Murrell - 2.0.0a1-0.1 +- Update to 2.0.0a1 + +* Tue Jan 28 2020 Yulu Jia - 1.0.1-22 +- Update to c2c2628 +- Apply patch to enable ip:port URI format for psm2 + +* Mon Dec 2 2019 Alexander Oganezov - 1.0.1-21 +- Removed sl_patch on top of 7b529b +- Updated to 9889a0 + +* Thu Oct 31 2019 Alexander Oganezov - 1.0.1-20 +- sl_patch on top of 7b529b + +* Wed Oct 23 2019 Alexander Oganezov - 1.0.1-19 +- Update to 7b529b + +* Tue Oct 22 2019 Alexander Oganezov - 1.0.1-18 +- Reverting from 6a8b693 due to mercury segfaults + +* Mon Oct 21 2019 Alexander Oganezov - 1.0.1-17 +- Update to 6a8b693 + +* Wed Oct 16 2019 Alexander Oganezov - 1.0.1-16 +- Fixed spec to apply patch for 616fee properly + +* Tue Oct 15 2019 Alexander Oganezov - 1.0.1-15 +- Update to 616fee to get latest changes + +* Wed Oct 02 2019 Brian J. Murrell - 1.0.1-14 +- Update to cc0807 to include the HG_Cancel() fix. +- Update to f0b9f9 to get latest changes + +* Wed Oct 02 2019 Brian J. Murrell - 1.0.1-13 +- Once again revert previous update + +* Wed Oct 02 2019 Brian J. Murrell - 1.0.1-12 +- Update to cc0807 to include the HG_Cancel() fix. 
+- Update to f0b9f9 to get latest changes + +* Wed Sep 25 2019 Brian J. Murrell - 1.0.1-11 +- Back out previous update + - not all consumers are ready for it yet so they need to + pin their BR + +* Fri Sep 20 2019 Brian J. Murrell - 1.0.1-10 +- Update to cc0807 to include the HG_Cancel() fix. +- Update to f0b9f9 to get latest changes + +* Thu Aug 08 2019 Brian J. Murrell - 1.0.1-9 +- Revert previous update + +* Fri Aug 02 2019 Yulu Jia - 1.0.1-8 +- Update to cc0807 to include the HG_Cancel() fix. +- Roll the version number back to 1.0.1 + +* Fri Aug 02 2019 Brian J. Murrell - 1.0.1-7 +- Revert back to the 1.0.1-4 release as the upgrade included + in -5 (and the subsequent fix in -6) was premature + +* Thu Aug 01 2019 Brian J. Murrell - 1.0.1-6 +- Roll the version number back to 1.0.1 + +* Fri Jul 26 2019 Yulu Jia - 1.0.1-5 +- Update to cc0807 to include the HG_Cancel() fix. + +* Thu May 02 2019 Brian J. Murrell - 1.0.1-4 +- Devel package needs to require the lib package + +* Fri Mar 15 2019 Brian J. Murrell - 1.0.1-2 +- Add patch to revert back to Dec 06, 2018 c68870f + +* Mon Mar 11 2019 Brian J. Murrell - 1.0.1-1 +- Update to 1.0.1 +- Add patch for "HG Core: fix missing static inline in mercury_core.h" + +* Wed Oct 24 2018 Brian J. Murrell - 0.9.0-1.git.0f8f25b +- Update mercury to git sha1 0f8f25bb3d57f117979de65cc3c05cf192cf4b31 + +* Mon Aug 20 2018 Brian J. Murrell - 0.9.0-1.git.f7f6955 +- Initial package diff --git a/utils/rpms/mercury.sh b/utils/rpms/mercury.sh index f14fe057043..5f30dd89372 100755 --- a/utils/rpms/mercury.sh +++ b/utils/rpms/mercury.sh @@ -1,5 +1,10 @@ #!/bin/bash -# (C) Copyright 2025 Google LLC +# +# (C) Copyright 2025 Google LLC +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP +# +# SPDX-License-Identifier: BSD-2-Clause-Patent +# set -eEuo pipefail root="$(realpath "$(dirname "${BASH_SOURCE[0]}")")" . "${root}/fpm_common.sh" @@ -24,6 +29,7 @@ Access (RMA). 
Its interface is generic and allows any function call to be serialized. Since code generation is done using the C preprocessor, no external tool is required." URL="http://mercury-hpc.github.io" +RPM_CHANGELOG="mercury.changelog" files=() TARGET_PATH="${bindir}" @@ -36,14 +42,17 @@ list_files files "${SL_MERCURY_PREFIX}/lib64/lib*.so.*" clean_bin "${files[@]}" append_install_list "${files[@]}" +ARCH="${isa}" +build_package "mercury" + TARGET_PATH="${libdir}/mercury" list_files files "${SL_MERCURY_PREFIX}/lib64/mercury/libna_plugin_ofi.so" clean_bin "${files[@]}" append_install_list "${files[@]}" ARCH="${isa}" -DEPENDS=("${libfabric_lib} >= ${libfabric_version}") -build_package "mercury" +DEPENDS=("${libfabric_lib} >= ${libfabric_min_version}") +build_package "mercury-libfabric" DEPENDS=() TARGET_PATH="${libdir}/mercury" diff --git a/utils/rpms/package_info.sh b/utils/rpms/package_info.sh index 54920d77365..ef3338124d4 100644 --- a/utils/rpms/package_info.sh +++ b/utils/rpms/package_info.sh @@ -1,4 +1,10 @@ #!/bin/bash +# +# (C) Copyright 2025 Google LLC +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP +# +# SPDX-License-Identifier: BSD-2-Clause-Patent +# root="$(realpath "$(dirname "$(dirname "$(dirname "${BASH_SOURCE[0]}")")")")" set_lib_name() { comp="$1"; shift @@ -34,26 +40,27 @@ daos_release="$(grep "^Release: " "${root}/utils/rpms/daos.spec" | \ sed 's/^Release: *//' | sed 's/%.*//')${DAOS_RELVAL:-}${distro_name}" export daos_release +export libfabric_min_version="1.20" export libfabric_version="1.22.0" -export libfabric_release="4${distro_name}" +export libfabric_release="5${distro_name}" export libfabric_full="${libfabric_version}-${libfabric_release}" -export mercury_version="2.4.0" -export mercury_release="8${distro_name}" +export mercury_version="2.4.1" +export mercury_release="1${distro_name}" export mercury_full="${mercury_version}-${mercury_release}" export argobots_version="1.2" -export argobots_release="3${distro_name}" +export 
argobots_release="4${distro_name}" export argobots_full="${argobots_version}-${argobots_release}" -export pmdk_version="2.1.0" -export pmdk_release="7${distro_name}" +export pmdk_version="2.1.3" +export pmdk_release="1${distro_name}" export pmdk_full="${pmdk_version}-${pmdk_release}" export isal_version="2.31.1" -export isal_release="7${distro_name}" +export isal_release="8${distro_name}" export isal_full="${isal_version}-${isal_release}" -export isal_crypto_version="2.24.0" -export isal_crypto_release="3${distro_name}" +export isal_crypto_version="2.25.0" +export isal_crypto_release="1${distro_name}" export isal_crypto_full="${isal_crypto_version}-${isal_crypto_release}" -export daos_spdk_version="1.0.0" -export daos_spdk_release="4${distro_name}" +export daos_spdk_version="2.0.0" +export daos_spdk_release="1${distro_name}" export daos_spdk_full="${daos_spdk_version}-${daos_spdk_release}" export fused_version="1.0.0" export fused_release="3${distro_name}" @@ -93,11 +100,13 @@ set_lib_name mercury dev mercury mercury mercury export mercury_dev set_lib_name mercury lib mercury mercury mercury export mercury_lib +set_lib_name mercury_libfabric lib mercury-libfabric mercury-libfabric mercury-libfabric +export mercury_libfabric_lib set_lib_name pmemobj lib libpmemobj libpmemobj1 libpmemobj1 -set_lib_name pmemobj dev libpmemobj libpmemobj1 libpmemobj1 +set_lib_name pmemobj dev libpmemobj libpmemobj libpmemobj set_lib_name pmem lib libpmem libpmem1 libpmem1 -set_lib_name pmem dev libpmem libpmem libpmem1 +set_lib_name pmem dev libpmem libpmem libpmem set_lib_name pmempool lib libpmempool libpmempool1 libpmempool1 export pmem_lib export pmem_dev diff --git a/utils/rpms/pmdk.changelog b/utils/rpms/pmdk.changelog index 464a3120c75..0b6f288aa1f 100644 --- a/utils/rpms/pmdk.changelog +++ b/utils/rpms/pmdk.changelog @@ -1,3 +1,17 @@ +%changelog +* Mon Jan 19 2026 Oksana Salyk - 2.1.3-1 +- Expand the sds.at_create CTL to disable unnecessary bad-block checking when running 
without PMem, preventing the stack overflow (DAOS-18296). +- Fix an issue in the PMEMOBJ allocator with a potential to corrupt the allocator's metadata (DAOS-18195). + +* Wed Nov 05 2025 Tomasz Gromadzki - 2.1.2-1 +- Expand the sds.at_create CTL to also cover pmemobj_open() (DAOS-17449) + - Previously, this CTL affected only pmemobj_create(). + - Now, it affects both pmemobj_create() and pmemobj_open(). + - pmemobj_open() won't be able to open a pool with SDS enabled if the feature is currently + force-disabled. + - Conversely, pmemobj_open() does not issue a warning when attempting to open a pool with SDS disabled + while the feature is force-disabled. + * Fri Oct 31 2025 Tomasz Gromadzki - 2.1.0-7 - Restore the RPM changelog, which has not been available since version 2.1.0-4. @@ -17,6 +31,7 @@ * Mon Aug 11 2025 Jeff Olivier - 2.1.0-4 - Switch to fpm build for RPMs +- New location of the PMDK repository (https://github.com/daos-stack/pmdk) * Wed Nov 06 2024 Tomasz Gromadzki - 2.1.0-3 - Apply patches to silence annoying error messages on: diff --git a/utils/rpms/pmdk.sh b/utils/rpms/pmdk.sh index 11dc0490cc5..d26dbeee2ac 100755 --- a/utils/rpms/pmdk.sh +++ b/utils/rpms/pmdk.sh @@ -1,5 +1,6 @@ #!/bin/bash # (C) Copyright 2025 Google LLC +# Copyright 2026 Hewlett Packard Enterprise Development LP set -eEuo pipefail root="$(realpath "$(dirname "${BASH_SOURCE[0]}")")" . "${root}/fpm_common.sh" @@ -15,7 +16,7 @@ LICENSE="BSD-3-Clause" ARCH=${isa} DESCRIPTION="The Persistent Memory Development Kit is a collection of libraries for using memory-mapped persistence, optimized specifically for persistent memory." -URL="https://github.com/pmem/pmdk" +URL="https://github.com/daos-stack/pmdk" RPM_CHANGELOG="pmdk.changelog" files=() diff --git a/utils/run_utest.py b/utils/run_utest.py index a555e9f8203..39261be2a51 100755 --- a/utils/run_utest.py +++ b/utils/run_utest.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """ Copyright 2023-2024 Intel Corporation. 
+ Copyright 2025 Google LLC Copyright 2025 Hewlett Packard Enterprise Development LP All rights reserved. @@ -270,7 +271,7 @@ def create_config(self, name): }}, {{ "params": {{ - "retry_count": 4, + "bdev_retry_count": 4, "timeout_us": 0, "nvme_adminq_poll_period_us": 100000, "action_on_timeout": "none", diff --git a/utils/scripts/helpers/repo-helper-el8.sh b/utils/scripts/helpers/repo-helper-el8.sh index 88bcf6654aa..28b2e7f01a3 100755 --- a/utils/scripts/helpers/repo-helper-el8.sh +++ b/utils/scripts/helpers/repo-helper-el8.sh @@ -74,8 +74,8 @@ if [ -n "$REPO_FILE_URL" ]; then popd # These may have been created in the Dockerfile must be removed # when using a local repository. - unset HTTPS_PROXY - unset https_proxy + # unset HTTPS_PROXY + # unset https_proxy fi dnf -y --disablerepo \*epel\* install dnf-plugins-core dnf -y config-manager --save --setopt=assumeyes=True @@ -130,10 +130,12 @@ disable_repos /etc/yum.repos.d/ "${save_repos[@]}" if [ -n "$REPO_FILE_URL" ]; then trusted_host="${REPO_FILE_URL##*//}" trusted_host="${trusted_host%%/*}"; \ - { - echo "[global]" - echo "trusted-host = ${trusted_host}" - echo "index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple" - echo "proxy = \"\"" - } > /etc/pip.conf + cat << EOF > /etc/pip.conf +[global] + trusted-host = ${trusted_host} + index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple + progress_bar = off + no_color = true + quiet = 1 +EOF fi diff --git a/utils/scripts/helpers/repo-helper-el9.sh b/utils/scripts/helpers/repo-helper-el9.sh index f96a144b674..ece81cf786f 100644 --- a/utils/scripts/helpers/repo-helper-el9.sh +++ b/utils/scripts/helpers/repo-helper-el9.sh @@ -126,10 +126,12 @@ disable_repos /etc/yum.repos.d/ "${save_repos[@]}" if [ -n "$REPO_FILE_URL" ]; then trusted_host="${REPO_FILE_URL##*//}" trusted_host="${trusted_host%%/*}"; \ - { - echo "[global]" - echo "trusted-host = ${trusted_host}" - echo "index-url = 
https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple" - echo "proxy = " - } > /etc/pip.conf + cat << EOF > /etc/pip.conf +[global] + trusted-host = ${trusted_host} + index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple + progress_bar = off + no_color = true + quiet = 1 +EOF fi diff --git a/utils/scripts/helpers/repo-helper-leap15.sh b/utils/scripts/helpers/repo-helper-leap15.sh index ab01f2cda51..bacae8b2698 100755 --- a/utils/scripts/helpers/repo-helper-leap15.sh +++ b/utils/scripts/helpers/repo-helper-leap15.sh @@ -174,10 +174,12 @@ update-ca-certificates if [ -n "$REPO_FILE_URL" ]; then trusted_host="${REPO_FILE_URL##*//}" trusted_host="${trusted_host%%/*}"; \ - { - echo "[global]" - echo "trusted-host = ${trusted_host}" - echo "index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple" - echo "proxy = \"\"" - } > /etc/pip.conf + cat << EOF > /etc/pip.conf +[global] + trusted-host = ${trusted_host} + index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple + progress_bar = off + no_color = true + quiet = 1 +EOF fi diff --git a/utils/scripts/helpers/repo-helper-ubuntu.sh b/utils/scripts/helpers/repo-helper-ubuntu.sh index 32cf3663065..54fd74b669c 100644 --- a/utils/scripts/helpers/repo-helper-ubuntu.sh +++ b/utils/scripts/helpers/repo-helper-ubuntu.sh @@ -111,10 +111,12 @@ apt-get clean all if [ -n "$REPO_FILE_URL" ]; then trusted_host="${REPO_FILE_URL##*//}" trusted_host="${trusted_host%%/*}"; \ - { - echo "[global]" - echo "trusted-host = ${trusted_host}" - echo "index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple" - echo "proxy = \"\"" - } > /etc/pip.conf + cat << EOF > /etc/pip.conf +[global] + trusted-host = ${trusted_host} + index-url = https://${trusted_host}/artifactory/api/pypi/pypi-proxy/simple + progress_bar = off + no_color = true + quiet = 1 +EOF fi diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh index cb51c8a7f65..670ef8f6eef 100755 --- 
a/utils/scripts/install-el8.sh +++ b/utils/scripts/install-el8.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# (C) Copyright 2025 Google LLC # Install OS updates and packages as required for building DAOS on EL 8 and # derivatives. Include basic tools and daos dependencies that come from the core repos. @@ -59,6 +60,7 @@ dnf --nodocs install ${dnf_install_args} \ Lmod \ lz4-devel \ make \ + nasm \ ndctl \ ndctl-devel \ numactl \ @@ -79,6 +81,7 @@ dnf --nodocs install ${dnf_install_args} \ systemd \ valgrind-devel \ which \ + ncurses-devel \ yasm if [[ -z "${NO_OPENMPI_DEVEL+set}" ]]; then diff --git a/utils/scripts/install-el9.sh b/utils/scripts/install-el9.sh index 268f1c109ca..21980fac63f 100755 --- a/utils/scripts/install-el9.sh +++ b/utils/scripts/install-el9.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# (C) Copyright 2025 Google LLC # Install OS updates and packages as required for building DAOS on EL 9 and # derivatives. Include basic tools and daos dependencies that come from the core repos. @@ -59,11 +60,11 @@ dnf --nodocs install ${dnf_install_args} \ lz4-devel \ Lmod \ make \ + nasm \ ndctl \ ndctl-devel \ numactl \ numactl-devel \ - openmpi-devel \ openssl-devel \ pandoc \ patch \ @@ -79,8 +80,15 @@ dnf --nodocs install ${dnf_install_args} \ sudo \ valgrind-devel \ which \ + ncurses-devel \ yasm +if [[ -z "${NO_OPENMPI_DEVEL+set}" ]]; then + # shellcheck disable=SC2086 + dnf --nodocs install ${dnf_install_args} \ + openmpi-devel +fi + ruby_version=$(dnf module list ruby | grep -Eow "3\.[0-9]+" | tail -1) # shellcheck disable=SC2086 dnf --nodocs install ${dnf_install_args} \ diff --git a/utils/scripts/install-leap15.sh b/utils/scripts/install-leap15.sh index ae859e4fffb..5029eb1000a 100755 --- a/utils/scripts/install-leap15.sh +++ b/utils/scripts/install-leap15.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# (C) Copyright 2025 Google LLC # Install OS updates and package. Include basic tools and daos dependencies # that come from the core repo. 
@@ -60,6 +61,7 @@ dnf --nodocs install ${dnf_install_args} \ lua-lmod \ make \ maven \ + nasm \ numactl \ openmpi3-devel \ pandoc \ diff --git a/utils/scripts/install-ubuntu.sh b/utils/scripts/install-ubuntu.sh index 8c41006d70b..0db922daeb7 100755 --- a/utils/scripts/install-ubuntu.sh +++ b/utils/scripts/install-ubuntu.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# (C) Copyright 2025 Google LLC # Install OS updates and package. Include basic tools and daos dependencies # that come from the core repo. @@ -53,6 +54,7 @@ apt-get install ${apt_get_install_args} \ libyaml-dev \ locales \ maven \ + nasm \ numactl \ openjdk-8-jdk \ pandoc \ @@ -65,6 +67,7 @@ apt-get install ${apt_get_install_args} \ sudo \ uuid-dev \ valgrind \ + libncurses-dev \ yasm sudo gem install fpm diff --git a/utils/test_memcheck.supp b/utils/test_memcheck.supp index a4a59101891..9c3e7e2b2df 100644 --- a/utils/test_memcheck.supp +++ b/utils/test_memcheck.supp @@ -165,7 +165,7 @@ Memcheck:Leak match-leak-kinds: all ... - fun:?alloc + fun:*alloc ... fun:ompi_mpi_finalize ... @@ -271,15 +271,11 @@ ... } { - Tcp provider - Memcheck:Param - sendmsg(msg.msg_iov[1]) - ... - fun:sendmsg - fun:ofi_sockapi_sendv_socket - fun:ofi_bsock_sendv - ... - fun:fi_senddata + + Memcheck:Leak + match-leak-kinds: reachable + fun:malloc + fun:hg_dlog_mkcount64 ... } { @@ -289,20 +285,14 @@ ... fun:ofi_bsock_sendv ... - fun:fi_tsend - ... } { Tcp provider with ofi rxm 2 Memcheck:Param sendmsg(msg.msg_iov[2]) ... - fun:sendmsg - fun:ofi_sockapi_sendv_socket fun:ofi_bsock_sendv ... - fun:fi_tsend - ... } { par_init mpi or dlopen leak @@ -356,12 +346,11 @@ fun:start_thread } { - getpwnam_r leak + getpwnam_r() leak Memcheck:Leak + fun:*alloc ... - fun:_nss_systemd_getpwnam_r fun:getpwnam_r* - fun:daos_acl_principal_to_uid } { getgrgid_r leak @@ -404,6 +393,14 @@ ... fun:bdev_aio_writev } +{ + bdev_aio_rw param error + Memcheck:Param + io_submit(PWRITEV(iov[i])) + fun:syscall + ... 
+ fun:bdev_aio_rw +} { Memcheck:Param @@ -451,3 +448,19 @@ fun:spdk_mem_map_set_translation ... } +{ + setgrent() leak + Memcheck:Leak + match-leak-kinds: reachable + fun:*alloc + ... + fun:setgrent +} +{ + OPENSSL_init_crypto leak + Memcheck:Leak + match-leak-kinds: reachable + fun:malloc + ... + fun:OPENSSL_init_crypto +} diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore index c3452b8f4fa..4a63364afab 100644 --- a/utils/trivy/.trivyignore +++ b/utils/trivy/.trivyignore @@ -10,3 +10,9 @@ CVE-2025-48924 ## CVE-2025-58057,MEDIUM,7.5,"netty-codec: netty-codec-compression: Netty's BrotliDecoder is vulnerable to DoS via zip bomb style attack","io.netty:netty-codec","4.1.100.Final","4.1.125.Final",https://avd.aquasec.com/nvd/cve-2025-58057 CVE-2025-58057 + +## CVE-2025-33042,MEDIUM,,"org.apache.avro/avro: Apache Avro Java SDK: Code injection on Java generated code","org.apache.avro:avro","1.11.4","1.12.1, 1.11.5",https://avd.aquasec.com/nvd/cve-2025-33042 +CVE-2025-33042 + +## GHSA-72hv-8253-57qq,HIGH,,"jackson-core: Number Length Constraint Bypass in Async Parser Leads to Potential DoS Condition","com.fasterxml.jackson.core:jackson-core","2.14.3","2.18.6, 2.21.1, 3.1.0",https://github.com/advisories/GHSA-72hv-8253-57qq +GHSA-72hv-8253-57qq diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml index 5ac0b5a86c7..40ab9c24d3b 100644 --- a/utils/trivy/trivy.yaml +++ b/utils/trivy/trivy.yaml @@ -1,5 +1,12 @@ # SPDX-License-Identifier: BSD-2-Clause-Patent -# Copyright (c) 2024 Intel Corporation. +# Copyright 2024 Intel Corporation. +# Copyright 2026 Hewlett Packard Enterprise Development LP + +# +# Use the following command to run the trivy scan manually +# trivy fs -c utils/trivy/trivy.yaml . +# Scan results are written to the trivy-report-daos.txt file. 
+# cache: backend: fs diff --git a/utils/utest.yaml b/utils/utest.yaml index 8710a1bae46..e5077e57202 100644 --- a/utils/utest.yaml +++ b/utils/utest.yaml @@ -1,5 +1,5 @@ # (C) Copyright 2023-2024 Intel Corporation. -# (C) Copyright 2025 Hewlett Packard Enterprise Development LP. +# (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. # # SPDX-License-Identifier: BSD-2-Clause-Patent - name: common @@ -14,6 +14,7 @@ - cmd: ["src/common/tests/acl_real_tests"] - cmd: ["src/common/tests/prop_tests"] - cmd: ["src/common/tests/fault_domain_tests"] + - cmd: ["src/common/tests/control_tests"] - name: common_md_on_ssd base: "BUILD_DIR" required_src: ["src/common/tests/ad_mem_tests.c"] @@ -52,6 +53,7 @@ - name: gurt base: "BUILD_DIR" tests: + - cmd: ["src/gurt/tests/d_log_memory_ut"] - cmd: ["src/gurt/tests/test_gurt"] - cmd: ["src/gurt/tests/test_gurt_telem_producer"] - name: DTX @@ -190,6 +192,10 @@ tests: - cmd: ["bin/ddb_tests"] - cmd: ["bin/ddb_ut"] +- name: dlck + base: "BUILD_DIR" + tests: + - cmd: ["src/utils/dlck/tests/dlck_args_ut"] - name: Source metadata testing gha: True memcheck: False