diff --git a/.gitignore b/.gitignore index cd800581..4c03a849 100644 --- a/.gitignore +++ b/.gitignore @@ -234,3 +234,13 @@ tests/data # Local working directory (personal scripts, docs, tools) local/ +nitin_docs/ +nitin_scripts/ + +# Local notebooks (kept for development, not committed) +docs/user_guide/13_index_migrations.ipynb + +# Migration temp files (generated by rvl migrate commands) +migration_plan.yaml +migration_report.yaml +schema_patch.yaml diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..3004e051 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,165 @@ +# AGENTS.md - RedisVL Project Context + +## Frequently Used Commands + +```bash +# Development workflow +make install # Install dependencies +make format # Format code (black + isort) +make check-types # Run mypy type checking +make lint # Run all linting (format + types) +make test # Run tests (no external APIs) +make test-all # Run all tests (includes API tests) +make check # Full check (lint + test) + +# Redis setup +make redis-start # Start Redis container +make redis-stop # Stop Redis container + +# Documentation +make docs-build # Build documentation +make docs-serve # Serve docs locally +``` + +Pre-commit hooks are also configured, which you should +run before you commit: +```bash +pre-commit run --all-files +``` + +## Important Architectural Patterns + +### Async/Sync Dual Interfaces +- Most core classes have both sync and async versions (e.g., `SearchIndex` / `AsyncSearchIndex`) +- Follow existing patterns when adding new functionality + +### Schema-Driven Design +```python +# Index schemas define structure +schema = IndexSchema.from_yaml("schema.yaml") +index = SearchIndex(schema, redis_url="redis://localhost:6379") +``` + +## Critical Rules + +### Do Not Modify +- **CRITICAL**: Do not change this line unless explicitly asked: + ```python + token.strip().strip(",").replace("“", "").replace("”", "").lower() + ``` + +### Git Operations +**CRITICAL**: NEVER use `git push` or attempt 
to push to remote repositories. The user will handle all git push operations. + +### Branch and Commit Policy +**IMPORTANT**: Use conventional branch names and conventional commits. + +Branch naming: +- Human-created branches should use `/` +- Automation-created branches may use `codex//` +- Preferred branch types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat/index-migrator` + - `fix/async-sentinel-pool` + - `docs/index-migrator-benchmarking` + - `codex/feat/index-migrator` + +Commit messages: +- Use Conventional Commits: `(optional-scope): ` +- Preferred commit types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat(migrate): add drop recreate planning docs` + - `docs(index-migrator): add benchmarking guidance` + - `fix(cli): validate migrate plan inputs` + +### Code Quality +**IMPORTANT**: Always run `make format` before committing code to ensure proper formatting and linting compliance. + +### README.md Maintenance +**IMPORTANT**: DO NOT modify README.md unless explicitly requested. + +**If you need to document something, use these alternatives:** +- Development info → CONTRIBUTING.md +- API details → docs/ directory +- Examples → docs/examples/ +- Project memory (explicit preferences, directives, etc.) 
→ AGENTS.md + +## Code Style Preferences + +### Import Organization +- **Prefer module-level imports** by default for clarity and standard Python conventions +- **Use local/inline imports only when necessary** for specific reasons: + - Avoiding circular import dependencies + - Improving startup time for heavy/optional dependencies + - Lazy loading for performance-critical paths +- When using local imports, add a brief comment explaining why (e.g., `# Local import to avoid circular dependency`) + +### Comments and Output +- **No emojis in code comments or print statements** +- Keep comments professional and focused on technical clarity +- Use emojis sparingly only in user-facing documentation (markdown files), not in Python code + +### General Guidelines +- Follow existing patterns in the RedisVL codebase +- Maintain consistency with the project's established conventions +- Run `make format` before committing to ensure code quality standards + +## Testing Notes +RedisVL uses `pytest` with `testcontainers` for testing. + +- `make test` - unit tests only (no external APIs) +- `make test-all` - run the full suite, including tests that call external APIs +- `pytest --run-api-tests` - explicitly run API-dependent tests (e.g., LangCache, + external vectorizer/reranker providers). These require the appropriate API + keys and environment variables to be set. 
+ +## Project Structure + +``` +redisvl/ +├── cli/ # Command-line interface (rvl command) +├── extensions/ # AI extensions (cache, memory, routing) +│ ├── cache/ # Semantic caching for LLMs +│ ├── llmcache/ # LLM-specific caching +│ ├── message_history/ # Chat history management +│ ├── router/ # Semantic routing +│ └── session_manager/ # Session management +├── index/ # SearchIndex classes (sync/async) +├── query/ # Query builders (Vector, Range, Filter, Count) +├── redis/ # Redis client utilities +├── schema/ # Index schema definitions +└── utils/ # Utilities (vectorizers, rerankers, optimization) + ├── rerank/ # Result reranking + └── vectorize/ # Embedding providers integration +``` + +## Core Components + +### 1. Index Management +- `SearchIndex` / `AsyncSearchIndex` - Main interface for Redis vector indices +- `IndexSchema` - Define index structure with fields (text, tags, vectors, etc.) +- Support for JSON and Hash storage types + +### 2. Query System +- `VectorQuery` - Semantic similarity search +- `RangeQuery` - Vector search within distance range +- `FilterQuery` - Metadata filtering and full-text search +- `CountQuery` - Count matching records +- Etc. + +### 3. AI Extensions +- `SemanticCache` - LLM response caching with semantic similarity +- `EmbeddingsCache` - Cache for vector embeddings +- `MessageHistory` - Chat history with recency/relevancy retrieval +- `SemanticRouter` - Route queries to topics/intents + +### 4. 
Vectorizers (Optional Dependencies) +- OpenAI, Azure OpenAI, Cohere, HuggingFace, Mistral, VoyageAI +- Custom vectorizer support +- Batch processing capabilities + +## Documentation +- Main docs: https://docs.redisvl.com +- Built with Sphinx from `docs/` directory +- Includes API reference and user guides +- Example notebooks in documentation `docs/user_guide/...` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 54138d77..6f4afb7f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -251,12 +251,33 @@ Before suggesting a new feature: ## Pull Request Process -1. **Fork and create a branch**: Create a descriptive branch name (e.g., `fix-search-bug` or `add-vector-similarity`) +1. **Fork and create a branch**: Use a conventional branch name such as `feat/index-migrator`, `fix/search-bug`, or `docs/vectorizer-guide` 2. **Make your changes**: Follow our coding standards and include tests 3. **Test thoroughly**: Ensure your changes work and don't break existing functionality 4. **Update documentation**: Add or update documentation as needed 5. **Submit your PR**: Include a clear description of what your changes do +### Branch Naming and Commit Messages + +We use conventional branch names and Conventional Commits to keep history easy to scan and automate. 
+ +Branch naming: + +- Use `/` +- Recommended types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat/index-migrator` + - `fix/async-sentinel-pool` + - `docs/migration-benchmarking` + +Commit messages: + +- Use `(optional-scope): ` +- Examples: + - `feat(migrate): add drop recreate plan generation` + - `docs(index-migrator): add benchmark guidance` + - `fix(cli): reject unsupported migration diffs` + ### Review Process - The core team reviews Pull Requests regularly diff --git a/docs/concepts/field-attributes.md b/docs/concepts/field-attributes.md index c7764a4a..96060d2f 100644 --- a/docs/concepts/field-attributes.md +++ b/docs/concepts/field-attributes.md @@ -267,7 +267,7 @@ Key vector attributes: - `dims`: Vector dimensionality (required) - `algorithm`: `flat`, `hnsw`, or `svs-vamana` - `distance_metric`: `COSINE`, `L2`, or `IP` -- `datatype`: `float16`, `float32`, `float64`, or `bfloat16` +- `datatype`: Vector precision (see table below) - `index_missing`: Allow searching for documents without vectors ```yaml @@ -281,6 +281,48 @@ Key vector attributes: index_missing: true # Handle documents without embeddings ``` +### Vector Datatypes + +The `datatype` attribute controls how vector components are stored. Smaller datatypes reduce memory usage but may affect precision. + +| Datatype | Bits | Memory (768 dims) | Use Case | +|----------|------|-------------------|----------| +| `float32` | 32 | 3 KB | Default. Best precision for most applications. | +| `float16` | 16 | 1.5 KB | Good balance of memory and precision. Recommended for large-scale deployments. | +| `bfloat16` | 16 | 1.5 KB | Better dynamic range than float16. Useful when embeddings have large value ranges. | +| `float64` | 64 | 6 KB | Maximum precision. Rarely needed. | +| `int8` | 8 | 768 B | Integer quantization. Significant memory savings with some precision loss. | +| `uint8` | 8 | 768 B | Unsigned integer quantization. 
For embeddings with non-negative values. | + +**Algorithm Compatibility:** + +| Datatype | FLAT | HNSW | SVS-VAMANA | +|----------|------|------|------------| +| `float32` | Yes | Yes | Yes | +| `float16` | Yes | Yes | Yes | +| `bfloat16` | Yes | Yes | No | +| `float64` | Yes | Yes | No | +| `int8` | Yes | Yes | No | +| `uint8` | Yes | Yes | No | + +**Choosing a Datatype:** + +- **Start with `float32`** unless you have memory constraints +- **Use `float16`** for production systems with millions of vectors (50% memory savings, minimal precision loss) +- **Use `int8`/`uint8`** only after benchmarking recall on your specific dataset +- **SVS-VAMANA users**: Must use `float16` or `float32` + +**Quantization with the Migrator:** + +You can change vector datatypes on existing indexes using the migration wizard: + +```bash +rvl migrate wizard --index my_index --url redis://localhost:6379 +# Select "Update field" > choose vector field > change datatype +``` + +The migrator automatically re-encodes stored vectors to the new precision. See {doc}`/user_guide/how_to_guides/migrate-indexes` for details. + ## Redis-Specific Subtleties ### Modifier Ordering @@ -304,6 +346,53 @@ Not all attributes work with all field types: | `unf` | ✓ | ✗ | ✓ | ✗ | ✗ | | `withsuffixtrie` | ✓ | ✓ | ✗ | ✗ | ✗ | +### Migration Support + +The migration wizard (`rvl migrate wizard`) supports updating field attributes on existing indexes. The table below shows which attributes can be updated via the wizard vs requiring manual schema patch editing. 
+ +**Wizard Prompts:** + +| Attribute | Text | Tag | Numeric | Geo | Vector | +|-----------|------|-----|---------|-----|--------| +| `sortable` | Wizard | Wizard | Wizard | Wizard | N/A | +| `index_missing` | Wizard | Wizard | Wizard | Wizard | N/A | +| `index_empty` | Wizard | Wizard | N/A | N/A | N/A | +| `no_index` | Wizard | Wizard | Wizard | Wizard | N/A | +| `unf` | Wizard* | N/A | Wizard* | N/A | N/A | +| `separator` | N/A | Wizard | N/A | N/A | N/A | +| `case_sensitive` | N/A | Wizard | N/A | N/A | N/A | +| `no_stem` | Wizard | N/A | N/A | N/A | N/A | +| `weight` | Wizard | N/A | N/A | N/A | N/A | +| `algorithm` | N/A | N/A | N/A | N/A | Wizard | +| `datatype` | N/A | N/A | N/A | N/A | Wizard | +| `distance_metric` | N/A | N/A | N/A | N/A | Wizard | +| `m`, `ef_construction` | N/A | N/A | N/A | N/A | Wizard | + +*\* `unf` is only prompted when `sortable` is enabled.* + +**Manual Schema Patch Required:** + +| Attribute | Notes | +|-----------|-------| +| `phonetic_matcher` | Enable phonetic search | +| `withsuffixtrie` | Suffix/contains search optimization | + +**Example manual patch** for adding `index_missing` to a field: + +```yaml +# schema_patch.yaml +version: 1 +changes: + update_fields: + - name: category + attrs: + index_missing: true +``` + +```bash +rvl migrate plan --index my_index --schema-patch schema_patch.yaml +``` + ### JSON Path for Nested Fields When using JSON storage, use the `path` attribute to index nested fields: diff --git a/docs/concepts/index-migrations.md b/docs/concepts/index-migrations.md new file mode 100644 index 00000000..a162f0ff --- /dev/null +++ b/docs/concepts/index-migrations.md @@ -0,0 +1,145 @@ +--- +myst: + html_meta: + "description lang=en": | + Learn how RedisVL index migrations work and which schema changes are supported. +--- + +# Index Migrations + +Redis Search indexes are immutable. To change an index schema, you must drop the existing index and create a new one. 
RedisVL provides a migration workflow that automates this process while preserving your data. + +This page explains how migrations work and which changes are supported. For step by step instructions, see the [migration guide](../user_guide/how_to_guides/migrate-indexes.md). + +## Supported and blocked changes + +The migrator classifies schema changes into two categories: + +| Change | Status | +|--------|--------| +| Add or remove a field | Supported | +| Change field options (sortable, separator) | Supported | +| Change vector algorithm (FLAT, HNSW, SVS-VAMANA) | Supported | +| Change distance metric (COSINE, L2, IP) | Supported | +| Tune algorithm parameters (M, EF_CONSTRUCTION) | Supported | +| Quantize vectors (float32 to float16) | Supported | +| Change vector dimensions | Blocked | +| Change key prefix | Blocked | +| Rename a field | Blocked | +| Change storage type (hash to JSON) | Blocked | +| Add a new vector field | Blocked | + +**Supported** changes can be applied automatically using `rvl migrate`. The migrator handles the index rebuild and any necessary data transformations. + +**Blocked** changes require manual intervention because they involve incompatible data formats or missing data. The migrator will reject these changes and explain why. + +## How the migrator works + +The migrator uses a plan first workflow: + +1. **Plan**: Capture the current schema, classify your changes, and generate a migration plan +2. **Review**: Inspect the plan before making any changes +3. **Apply**: Drop the index, transform data if needed, and recreate with the new schema +4. **Validate**: Verify the result matches expectations + +This separation ensures you always know what will happen before any changes are made. + +## Migration mode: drop_recreate + +The `drop_recreate` mode rebuilds the index in place while preserving your documents. + +The process: + +1. Drop only the index structure (documents remain in Redis) +2. 
For datatype changes, re-encode vectors to the target precision +3. Recreate the index with the new schema +4. Wait for Redis to re-index the existing documents +5. Validate the result + +**Tradeoff**: The index is unavailable during the rebuild. The migrator requires explicit acknowledgment of this downtime before proceeding. + +## Index only vs document dependent changes + +Schema changes fall into two categories based on whether they require modifying stored data. + +**Index only changes** affect how Redis Search indexes data, not the data itself: + +- Algorithm changes: The stored vector bytes are identical. Only the index structure differs. +- Distance metric changes: Same vectors, different similarity calculation. +- Adding or removing fields: The documents already contain the data. The index just starts or stops indexing it. + +These changes complete quickly because they only require rebuilding the index. + +**Document dependent changes** require modifying the stored data: + +- Datatype changes (float32 to float16): Stored vector bytes must be re-encoded. +- Field renames: Stored field names must be updated in every document. +- Dimension changes: Vectors must be re-embedded with a different model. + +The migrator handles datatype changes automatically. Other document dependent changes are blocked because they require application level logic or external services. + +## Vector quantization + +Changing vector precision from float32 to float16 reduces memory usage at the cost of slight precision loss. The migrator handles this automatically by: + +1. Reading all vectors from Redis +2. Converting to the target precision +3. Writing updated vectors back +4. Recreating the index with the new schema + +Typical reductions: + +| Metric | Value | +|--------|-------| +| Index size reduction | ~50% | +| Memory reduction | ~35% | + +Quantization time is proportional to document count. Plan for downtime accordingly. 
+ +## Why some changes are blocked + +### Vector dimension changes + +Vector dimensions are determined by your embedding model. A 384 dimensional vector from one model is mathematically incompatible with a 768 dimensional index expecting vectors from a different model. There is no way to resize an embedding. + +**Resolution**: Re-embed your documents using the new model and load them into a new index. + +### Prefix changes + +Changing a prefix from `docs:` to `articles:` requires copying every document to a new key. This operation doubles storage temporarily and can leave orphaned keys if interrupted. + +**Resolution**: Create a new index with the new prefix and reload your data. + +### Field renames + +Field names are stored in the documents themselves as hash field names or JSON keys. Renaming requires iterating through every document and updating the field name. + +**Resolution**: Create a new index with the correct field name and reload your data. + +### Storage type changes + +Hash and JSON have different data layouts. Hash stores flat key value pairs. JSON stores nested structures. Converting between them requires understanding your schema and restructuring each document. + +**Resolution**: Export your data, transform it to the new format, and reload into a new index. + +### Adding a vector field + +Adding a vector field means all existing documents need vectors for that field. The migrator cannot generate these vectors because it does not know which embedding model to use or what content to embed. + +**Resolution**: Add vectors to your documents using your application, then run the migration. + +## Downtime considerations + +With `drop_recreate`, your index is unavailable between the drop and when re-indexing completes. 
Plan for: + +- Search unavailability during the migration window +- Partial results while indexing is in progress +- Resource usage from the re-indexing process +- Quantization time if changing vector datatypes + +The duration depends on document count, field count, and vector dimensions. For large indexes, consider running migrations during low traffic periods. + +## Learn more + +- [Migration guide](../user_guide/how_to_guides/migrate-indexes.md): Step by step instructions +- [Search and indexing](search-and-indexing.md): How Redis Search indexes work diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 0e522b1a..02f4d8b0 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -26,6 +26,13 @@ How RedisVL components connect: schemas, indexes, queries, and extensions. Schemas, fields, documents, storage types, and query patterns. ::: +:::{grid-item-card} 🔄 Index Migrations +:link: index-migrations +:link-type: doc + +How RedisVL handles migration planning, rebuilds, and future shadow migration. +::: + :::{grid-item-card} 🏷️ Field Attributes :link: field-attributes :link-type: doc @@ -62,6 +69,7 @@ Pre-built patterns: caching, message history, and semantic routing. architecture search-and-indexing +index-migrations field-attributes queries utilities diff --git a/docs/concepts/search-and-indexing.md b/docs/concepts/search-and-indexing.md index b4fe6956..5312d7df 100644 --- a/docs/concepts/search-and-indexing.md +++ b/docs/concepts/search-and-indexing.md @@ -106,9 +106,14 @@ To change a schema, you create a new index with the updated configuration, reind Planning your schema carefully upfront reduces the need for migrations, but the capability exists when requirements evolve. 
---- +RedisVL now includes a dedicated migration workflow for this lifecycle: + +- `drop_recreate` for document-preserving rebuilds, including vector quantization (`float32` → `float16`) -**Related concepts:** {doc}`field-attributes` explains how to configure field options like `sortable` and `index_missing`. {doc}`queries` covers the different query types available. +That means schema evolution is no longer only a manual operational pattern. It is also a product surface in RedisVL with a planner, CLI, and validation artifacts. + +--- -**Learn more:** {doc}`/user_guide/01_getting_started` walks through building your first index. {doc}`/user_guide/05_hash_vs_json` compares storage options in depth. {doc}`/user_guide/02_complex_filtering` covers query composition. +**Related concepts:** {doc}`field-attributes` explains how to configure field options like `sortable` and `index_missing`. {doc}`queries` covers the different query types available. {doc}`index-migrations` explains migration modes, supported changes, and architecture. +**Learn more:** {doc}`/user_guide/01_getting_started` walks through building your first index. {doc}`/user_guide/05_hash_vs_json` compares storage options in depth. {doc}`/user_guide/02_complex_filtering` covers query composition. {doc}`/user_guide/how_to_guides/migrate-indexes` shows how to use the migration CLI in practice. 
diff --git a/docs/user_guide/cli.ipynb b/docs/user_guide/cli.ipynb index ba9d645a..dc9377d4 100644 --- a/docs/user_guide/cli.ipynb +++ b/docs/user_guide/cli.ipynb @@ -6,7 +6,7 @@ "source": [ "# The RedisVL CLI\n", "\n", - "RedisVL is a Python library with a dedicated CLI to help load and create vector search indices within Redis.\n", + "RedisVL is a Python library with a dedicated CLI to help load, inspect, migrate, and create vector search indices within Redis.\n", "\n", "This notebook will walk through how to use the Redis Vector Library CLI (``rvl``).\n", "\n", @@ -50,7 +50,12 @@ "| `rvl index` | `delete --index` or `-i ` | remove the specified index, leaving the data still in Redis|\n", "| `rvl index` | `destroy --index` or `-i `| remove the specified index, as well as the associated data|\n", "| `rvl stats` | `--index` or `-i ` | display the index statistics, including number of docs, average bytes per record, indexing time, etc|\n", - "| `rvl stats` | `--schema` or `-s ` | display the index statistics of a schema defined in . The index must have already been created within Redis|" + "| `rvl stats` | `--schema` or `-s ` | display the index statistics of a schema defined in . 
The index must have already been created within Redis|\n", + "| `rvl migrate` | `helper` or `list` | show migration guidance and list indexes available for migration|\n", + "| `rvl migrate` | `wizard` | interactively build a migration plan and schema patch|\n", + "| `rvl migrate` | `plan` | generate `migration_plan.yaml` from a patch or target schema|\n", + "| `rvl migrate` | `apply --allow-downtime` | execute a reviewed `drop_recreate` migration|\n", + "| `rvl migrate` | `validate` | validate a completed migration and emit report artifacts|" ] }, { diff --git a/docs/user_guide/how_to_guides/index.md b/docs/user_guide/how_to_guides/index.md index c03d705d..f6511d54 100644 --- a/docs/user_guide/how_to_guides/index.md +++ b/docs/user_guide/how_to_guides/index.md @@ -34,6 +34,7 @@ How-to guides are **task-oriented** recipes that help you accomplish specific go :::{grid-item-card} 💾 Storage - [Choose a Storage Type](../05_hash_vs_json.ipynb) -- Hash vs JSON formats and nested data +- [Migrate an Index](migrate-indexes.md) -- use the migrator helper, wizard, plan, apply, and validate workflow ::: :::{grid-item-card} 💻 CLI Operations @@ -59,6 +60,7 @@ How-to guides are **task-oriented** recipes that help you accomplish specific go | Optimize index performance | [Optimize Indexes with SVS-VAMANA](../09_svs_vamana.ipynb) | | Decide on storage format | [Choose a Storage Type](../05_hash_vs_json.ipynb) | | Manage indices from terminal | [Manage Indices with the CLI](../cli.ipynb) | +| Plan and run a supported index migration | [Migrate an Index](migrate-indexes.md) | ```{toctree} :hidden: @@ -74,4 +76,5 @@ Optimize Indexes with SVS-VAMANA <../09_svs_vamana> Cache Embeddings <../10_embeddings_cache> Use Advanced Query Types <../11_advanced_queries> Write SQL Queries for Redis <../12_sql_to_redis_queries> +Migrate an Index ``` diff --git a/docs/user_guide/how_to_guides/migrate-indexes.md b/docs/user_guide/how_to_guides/migrate-indexes.md new file mode 100644 index 
00000000..2942f09f --- /dev/null +++ b/docs/user_guide/how_to_guides/migrate-indexes.md @@ -0,0 +1,394 @@ +--- +myst: + html_meta: + "description lang=en": | + How to migrate a RedisVL index schema without losing data. +--- + +# Migrate an Index + +This guide shows how to safely change your index schema using the RedisVL migrator. + +## Quick Start + +Add a field to your index in 4 commands: + +```bash +# 1. See what indexes exist +rvl migrate list --url redis://localhost:6379 + +# 2. Use the wizard to build a migration plan +rvl migrate wizard --index myindex --url redis://localhost:6379 + +# 3. Apply the migration +rvl migrate apply --plan migration_plan.yaml --allow-downtime --url redis://localhost:6379 + +# 4. Verify the result +rvl migrate validate --plan migration_plan.yaml --url redis://localhost:6379 +``` + +## Prerequisites + +- Redis with the Search module (Redis Stack, Redis Cloud, or Redis Enterprise) +- An existing index to migrate +- `redisvl` installed (`pip install redisvl`) + +```bash +# Local development with Redis Stack +docker run -d --name redis -p 6379:6379 redis/redis-stack-server:latest +``` + +## Step 1: Discover Available Indexes + +```bash +rvl migrate helper --url redis://localhost:6379 +rvl migrate list --url redis://localhost:6379 +``` + +**Example output:** +``` +Index Migrator +============== +The migrator helps you safely change your index schema. 
+ +Supported changes: + - Add, remove, or update text/tag/numeric/geo fields + - Change vector algorithm (FLAT, HNSW, SVS-VAMANA) + - Change distance metric (COSINE, L2, IP) + - Quantize vectors (float32 → float16) + +Commands: + rvl migrate list List all indexes + rvl migrate wizard Build a migration interactively + rvl migrate plan Generate a migration plan + rvl migrate apply Execute a migration + rvl migrate validate Verify a migration +``` + +## Step 2: Build Your Schema Change + +Choose one of these approaches: + +### Option A: Use the Wizard (Recommended) + +The wizard guides you through building a migration interactively. Run: + +```bash +rvl migrate wizard --index myindex --url redis://localhost:6379 +``` + +**Example wizard session (adding a field):** + +```text +Building a migration plan for index 'myindex' +Current schema: +- Index name: myindex +- Storage type: hash + - title (text) + - embedding (vector) + +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 1 + +Field name: category +Field type options: text, tag, numeric, geo +Field type: tag + Sortable: enables sorting and aggregation on this field +Sortable [y/n]: n + Separator: character that splits multiple values (default: comma) +Separator [leave blank to keep existing/default]: | + +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 5 + +Migration plan written to /path/to/migration_plan.yaml +Mode: drop_recreate +Supported: True +Warnings: +- Index downtime is required +``` + +**Example wizard session (quantizing vectors):** + +```text +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. 
Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 2 + +Updatable fields: +1. title (text) +2. embedding (vector) +Select a field to update by number or name: 2 + +Current vector config for 'embedding': + algorithm: HNSW + datatype: float32 + distance_metric: cosine + dims: 384 (cannot be changed) + m: 16 + ef_construction: 200 + +Leave blank to keep current value. + Algorithm: vector search method (FLAT=brute force, HNSW=graph, SVS-VAMANA=compressed graph) +Algorithm [current: HNSW]: + Datatype: float16, float32, bfloat16, float64, int8, uint8 + (float16 reduces memory ~50%, int8/uint8 reduce ~75%) +Datatype [current: float32]: float16 + Distance metric: how similarity is measured (cosine, l2, ip) +Distance metric [current: cosine]: + M: number of connections per node (higher=better recall, more memory) +M [current: 16]: + EF_CONSTRUCTION: build-time search depth (higher=better recall, slower build) +EF_CONSTRUCTION [current: 200]: + +Choose an action: +... +5. Finish +Enter a number: 5 + +Migration plan written to /path/to/migration_plan.yaml +Mode: drop_recreate +Supported: True +``` + +### Option B: Write a Schema Patch (YAML) + +Create `schema_patch.yaml` manually: + +```yaml +version: 1 +changes: + add_fields: + - name: category + type: tag + path: $.category + attrs: + separator: "|" + remove_fields: + - legacy_field + update_fields: + - name: title + attrs: + sortable: true + - name: embedding + attrs: + datatype: float16 # quantize vectors + algorithm: HNSW + distance_metric: cosine +``` + +Then generate the plan: + +```bash +rvl migrate plan \ + --index myindex \ + --schema-patch schema_patch.yaml \ + --url redis://localhost:6379 \ + --plan-out migration_plan.yaml +``` + +### Option C: Provide a Target Schema + +If you have the complete target schema, use it directly: + +```bash +rvl migrate plan \ + --index myindex \ + --target-schema target_schema.yaml \ + --url redis://localhost:6379 \ + --plan-out migration_plan.yaml +``` + +## 
Step 3: Review the Migration Plan + +Before applying, review `migration_plan.yaml`: + +```yaml +# migration_plan.yaml (example) +version: 1 +index_name: myindex +migration_mode: drop_recreate + +source: + schema_snapshot: + index: + name: myindex + prefix: "doc:" + storage_type: json + fields: + - name: title + type: text + - name: embedding + type: vector + attrs: + dims: 384 + algorithm: HNSW + datatype: float32 + doc_count: 10000 + key_sample: + - "doc:1" + - "doc:2" + - "doc:3" + +diff_classification: + supported: true + mode: drop_recreate + warnings: + - "Index will be unavailable during migration" + blocked_reasons: [] + +changes: + add_fields: + - name: category + type: tag + +merged_target_schema: + index: + name: myindex + prefix: "doc:" + storage_type: json + fields: + - name: title + type: text + - name: category + type: tag + - name: embedding + type: vector + attrs: + dims: 384 + algorithm: HNSW + datatype: float32 +``` + +**Key fields to check:** +- `diff_classification.supported` - Must be `true` to proceed +- `diff_classification.blocked_reasons` - Must be empty +- `merged_target_schema` - The final schema after migration + +## Step 4: Apply the Migration + +The `apply` command requires `--allow-downtime` since the index will be temporarily unavailable. + +```bash +rvl migrate apply \ + --plan migration_plan.yaml \ + --allow-downtime \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml \ + --benchmark-out benchmark_report.yaml +``` + +What `apply` does: + +1. checks that the live source schema still matches the saved source snapshot +2. drops only the index structure +3. preserves the existing documents +4. recreates the same index name with the merged target schema +5. waits for indexing readiness +6. validates the result +7. 
writes report artifacts + +## Step 5: Validate the Result + +Validation happens automatically during `apply`, but you can run it separately: + +```bash +rvl migrate validate \ + --plan migration_plan.yaml \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml +``` + +**Validation checks:** +- Live schema matches `merged_target_schema` +- Document count matches the source snapshot +- Sampled keys still exist +- No increase in indexing failures + +## What's Supported + +| Change | Supported | Notes | +|--------|-----------|-------| +| Add text/tag/numeric/geo field | ✅ | | +| Remove a field | ✅ | | +| Make a field sortable | ✅ | | +| Change field options (separator, stemming) | ✅ | | +| Change vector algorithm (FLAT ↔ HNSW ↔ SVS-VAMANA) | ✅ | Index-only | +| Change distance metric (COSINE ↔ L2 ↔ IP) | ✅ | Index-only | +| Tune HNSW parameters (M, EF_CONSTRUCTION) | ✅ | Index-only | +| Quantize vectors (float32 → float16) | ✅ | Auto re-encode | + +## What's Blocked + +| Change | Why | Workaround | +|--------|-----|------------| +| Change vector dimensions | Requires re-embedding | Re-embed with new model, reload data | +| Change prefix/keyspace | Documents at wrong keys | Create new index, reload data | +| Rename a field | Stored data uses old name | Create new index, reload data | +| Change storage type (hash ↔ JSON) | Different data format | Export, transform, reload | +| Add a new vector field | Requires vectors for all docs | Add vectors first, then migrate | + +## CLI Reference + +| Command | Description | +|---------|-------------| +| `rvl migrate helper` | Show supported changes and usage tips | +| `rvl migrate list` | List all indexes | +| `rvl migrate wizard` | Build a migration interactively | +| `rvl migrate plan` | Generate a migration plan | +| `rvl migrate apply` | Execute a migration | +| `rvl migrate validate` | Verify a migration result | + +**Common flags:** +- `--url` : Redis connection URL +- `--index` : Index name to migrate +- 
`--plan` / `--plan-out` : Path to migration plan +- `--allow-downtime` : Acknowledge index unavailability (required for apply) +- `--report-out` : Path for validation report +- `--benchmark-out` : Path for performance metrics + +## Troubleshooting + +### Migration blocked: "unsupported change" + +The planner detected a change that requires data transformation. Check `diff_classification.blocked_reasons` in the plan for details. + +### Apply failed: "source schema mismatch" + +The live index schema changed since the plan was generated. Re-run `rvl migrate plan` to create a fresh plan. + +### Apply failed: "timeout waiting for index ready" + +The index is taking longer to rebuild than expected. This can happen with large datasets. Check Redis logs and consider increasing the timeout or running during lower traffic periods. + +### Validation failed: "document count mismatch" + +Documents were added or removed between plan and apply. This is expected if your application is actively writing. Re-run validation with `--skip-count-check` if acceptable. + +### How to recover from a failed migration + +If `apply` fails mid-migration: + +1. **Check if the index exists:** `rvl index info --index myindex` +2. **If the index exists but is wrong:** Re-run `apply` with the same plan +3. **If the index was dropped:** Recreate it from the plan's `merged_target_schema` + +The underlying documents are never deleted by `drop_recreate`. + +## Learn more + +- {doc}`/concepts/index-migrations`: How migrations work and which changes are supported diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index 5d2cf6df..d85177e7 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -39,7 +39,7 @@ Schema → Index → Load → Query **Solve specific problems.** Task-oriented recipes for LLM extensions, querying, embeddings, optimization, and storage. 
+++ -LLM Caching • Filtering • Vectorizers • Reranking +LLM Caching • Filtering • Vectorizers • Reranking • Migrations ::: :::{grid-item-card} 💻 CLI Reference @@ -49,7 +49,7 @@ LLM Caching • Filtering • Vectorizers • Reranking **Command-line tools.** Manage indices, inspect stats, and work with schemas using the `rvl` CLI. +++ -rvl index • rvl stats • Schema YAML +rvl index • rvl stats • rvl migrate • Schema YAML ::: :::{grid-item-card} 💡 Use Cases diff --git a/redisvl/cli/main.py b/redisvl/cli/main.py index 1353192f..e62c4f62 100644 --- a/redisvl/cli/main.py +++ b/redisvl/cli/main.py @@ -2,6 +2,7 @@ import sys from redisvl.cli.index import Index +from redisvl.cli.migrate import Migrate from redisvl.cli.stats import Stats from redisvl.cli.version import Version from redisvl.utils.log import get_logger @@ -14,6 +15,7 @@ def _usage(): "rvl []\n", "Commands:", "\tindex Index manipulation (create, delete, etc.)", + "\tmigrate Index migration planning and execution", "\tversion Obtain the version of RedisVL", "\tstats Obtain statistics about an index", ] @@ -49,3 +51,7 @@ def version(self): def stats(self): Stats() exit(0) + + def migrate(self): + Migrate() + exit(0) diff --git a/redisvl/cli/migrate.py b/redisvl/cli/migrate.py new file mode 100644 index 00000000..fe527ed0 --- /dev/null +++ b/redisvl/cli/migrate.py @@ -0,0 +1,397 @@ +import argparse +import sys +from argparse import Namespace +from typing import Optional + +from redisvl.cli.utils import add_redis_connection_options, create_redis_url +from redisvl.migration import MigrationExecutor, MigrationPlanner, MigrationValidator +from redisvl.migration.utils import ( + list_indexes, + load_migration_plan, + write_benchmark_report, + write_migration_report, +) +from redisvl.migration.wizard import MigrationWizard +from redisvl.utils.log import get_logger + +logger = get_logger("[RedisVL]") + + +class Migrate: + usage = "\n".join( + [ + "rvl migrate []\n", + "Commands:", + "\thelper Show migration guidance and 
supported capabilities", + "\tlist List all available indexes", + "\tplan Generate a migration plan for a document-preserving drop/recreate migration", + "\twizard Interactively build a migration plan and schema patch", + "\tapply Execute a reviewed drop/recreate migration plan", + "\tvalidate Validate a completed migration plan against the live index", + "\n", + ] + ) + + def __init__(self): + parser = argparse.ArgumentParser(usage=self.usage) + parser.add_argument("command", help="Subcommand to run") + + args = parser.parse_args(sys.argv[2:3]) + if not hasattr(self, args.command): + parser.print_help() + exit(0) + + try: + getattr(self, args.command)() + except Exception as e: + logger.error(e) + exit(1) + + def helper(self): + parser = argparse.ArgumentParser( + usage="rvl migrate helper [--host --port | --url ]" + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + redis_url = create_redis_url(args) + indexes = list_indexes(redis_url=redis_url) + + print( + """RedisVL Index Migrator + +Available indexes:""" + ) + if indexes: + for position, index_name in enumerate(indexes, start=1): + print(f" {position}. 
{index_name}") + else: + print(" (none found)") + + print( + """ +Supported changes: + - Adding or removing non-vector fields (text, tag, numeric, geo) + - Changing field options (sortable, separator, weight) + - Changing vector algorithm (FLAT, HNSW, SVS_VAMANA) + - Changing distance metric (COSINE, L2, IP) + - Tuning algorithm parameters (M, EF_CONSTRUCTION) + - Quantizing vectors (float32 to float16) + +Not yet supported: + - Changing vector dimensions + - Changing key prefix or separator + - Changing storage type (hash to JSON) + - Renaming fields + +Commands: + rvl migrate list List all indexes + rvl migrate wizard --index Guided migration builder + rvl migrate plan --index --schema-patch + rvl migrate apply --plan --allow-downtime + rvl migrate validate --plan """ + ) + + def list(self): + parser = argparse.ArgumentParser( + usage="rvl migrate list [--host --port | --url ]" + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + redis_url = create_redis_url(args) + indexes = list_indexes(redis_url=redis_url) + print("Available indexes:") + for position, index_name in enumerate(indexes, start=1): + print(f"{position}. 
{index_name}") + + def plan(self): + parser = argparse.ArgumentParser( + usage=( + "rvl migrate plan --index " + "(--schema-patch | --target-schema )" + ) + ) + parser.add_argument("-i", "--index", help="Source index name", required=True) + parser.add_argument("--schema-patch", help="Path to a schema patch file") + parser.add_argument("--target-schema", help="Path to a target schema file") + parser.add_argument( + "--plan-out", + help="Path to write migration_plan.yaml", + default="migration_plan.yaml", + ) + parser.add_argument( + "--key-sample-limit", + help="Maximum number of keys to sample from the index keyspace", + type=int, + default=10, + ) + parser = add_redis_connection_options(parser) + + args = parser.parse_args(sys.argv[3:]) + redis_url = create_redis_url(args) + planner = MigrationPlanner(key_sample_limit=args.key_sample_limit) + plan = planner.create_plan( + args.index, + redis_url=redis_url, + schema_patch_path=args.schema_patch, + target_schema_path=args.target_schema, + ) + planner.write_plan(plan, args.plan_out) + self._print_plan_summary(args.plan_out, plan) + + def wizard(self): + parser = argparse.ArgumentParser( + usage=( + "rvl migrate wizard [--index ] " + "[--patch ] " + "[--plan-out ] [--patch-out ]" + ) + ) + parser.add_argument("-i", "--index", help="Source index name", required=False) + parser.add_argument( + "--patch", + help="Load an existing schema patch to continue editing", + default=None, + ) + parser.add_argument( + "--plan-out", + help="Path to write migration_plan.yaml", + default="migration_plan.yaml", + ) + parser.add_argument( + "--patch-out", + help="Path to write schema_patch.yaml (for later editing)", + default="schema_patch.yaml", + ) + parser.add_argument( + "--target-schema-out", + help="Optional path to write the merged target schema", + default=None, + ) + parser.add_argument( + "--key-sample-limit", + help="Maximum number of keys to sample from the index keyspace", + type=int, + default=10, + ) + parser = 
add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + redis_url = create_redis_url(args) + wizard = MigrationWizard( + planner=MigrationPlanner(key_sample_limit=args.key_sample_limit) + ) + plan = wizard.run( + index_name=args.index, + redis_url=redis_url, + existing_patch_path=args.patch, + plan_out=args.plan_out, + patch_out=args.patch_out, + target_schema_out=args.target_schema_out, + ) + self._print_plan_summary(args.plan_out, plan) + + def apply(self): + parser = argparse.ArgumentParser( + usage=( + "rvl migrate apply --plan --allow-downtime " + "[--report-out ]" + ) + ) + parser.add_argument("--plan", help="Path to migration_plan.yaml", required=True) + parser.add_argument( + "--allow-downtime", + help="Explicitly acknowledge downtime for drop_recreate", + action="store_true", + ) + parser.add_argument( + "--report-out", + help="Path to write migration_report.yaml", + default="migration_report.yaml", + ) + parser.add_argument( + "--benchmark-out", + help="Optional path to write benchmark_report.yaml", + default=None, + ) + parser.add_argument( + "--query-check-file", + help="Optional YAML file containing fetch_ids and keys_exist checks", + default=None, + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + if not args.allow_downtime: + raise ValueError( + "apply requires --allow-downtime for drop_recreate migrations" + ) + + redis_url = create_redis_url(args) + plan = load_migration_plan(args.plan) + executor = MigrationExecutor() + + print(f"\nApplying migration to '{plan.source.index_name}'...") + + def progress_callback(step: str, detail: str) -> None: + step_labels = { + "drop": "[1/5] Drop index", + "quantize": "[2/5] Quantize vectors", + "create": "[3/5] Create index", + "index": "[4/5] Re-indexing", + "validate": "[5/5] Validate", + } + label = step_labels.get(step, step) + # Use carriage return to update in place for progress + if detail and not detail.startswith("done"): + 
print(f" {label}: {detail} ", end="\r", flush=True) + else: + print(f" {label}: {detail} ") + + report = executor.apply( + plan, + redis_url=redis_url, + query_check_file=args.query_check_file, + progress_callback=progress_callback, + ) + + # Print completion summary + if report.result == "succeeded": + total_time = report.timings.total_migration_duration_seconds or 0 + downtime = report.timings.downtime_duration_seconds or 0 + print(f"\nMigration completed in {total_time}s (downtime: {downtime}s)") + else: + print(f"\nMigration {report.result}") + # Show errors immediately for visibility + if report.validation.errors: + for error in report.validation.errors: + print(f" ERROR: {error}") + + write_migration_report(report, args.report_out) + if args.benchmark_out: + write_benchmark_report(report, args.benchmark_out) + self._print_report_summary(args.report_out, report, args.benchmark_out) + + def validate(self): + parser = argparse.ArgumentParser( + usage=( + "rvl migrate validate --plan " + "[--report-out ]" + ) + ) + parser.add_argument("--plan", help="Path to migration_plan.yaml", required=True) + parser.add_argument( + "--report-out", + help="Path to write migration_report.yaml", + default="migration_report.yaml", + ) + parser.add_argument( + "--benchmark-out", + help="Optional path to write benchmark_report.yaml", + default=None, + ) + parser.add_argument( + "--query-check-file", + help="Optional YAML file containing fetch_ids and keys_exist checks", + default=None, + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + redis_url = create_redis_url(args) + plan = load_migration_plan(args.plan) + validator = MigrationValidator() + validation, target_info, validation_duration = validator.validate( + plan, + redis_url=redis_url, + query_check_file=args.query_check_file, + ) + + from redisvl.migration.models import ( + MigrationBenchmarkSummary, + MigrationReport, + MigrationTimings, + ) + from redisvl.migration.utils import 
timestamp_utc + + source_size = float( + plan.source.stats_snapshot.get("vector_index_sz_mb", 0) or 0 + ) + target_size = float(target_info.get("vector_index_sz_mb", 0) or 0) + + report = MigrationReport( + source_index=plan.source.index_name, + target_index=plan.merged_target_schema["index"]["name"], + result="succeeded" if not validation.errors else "failed", + started_at=timestamp_utc(), + finished_at=timestamp_utc(), + timings=MigrationTimings(validation_duration_seconds=validation_duration), + validation=validation, + benchmark_summary=MigrationBenchmarkSummary( + source_index_size_mb=round(source_size, 3), + target_index_size_mb=round(target_size, 3), + index_size_delta_mb=round(target_size - source_size, 3), + ), + warnings=list(plan.warnings), + manual_actions=( + ["Review validation errors before proceeding."] + if validation.errors + else [] + ), + ) + write_migration_report(report, args.report_out) + if args.benchmark_out: + write_benchmark_report(report, args.benchmark_out) + self._print_report_summary(args.report_out, report, args.benchmark_out) + + def _print_plan_summary(self, plan_out: str, plan) -> None: + import os + + abs_path = os.path.abspath(plan_out) + print(f"Migration plan written to {abs_path}") + print(f"Mode: {plan.mode}") + print(f"Supported: {plan.diff_classification.supported}") + if plan.warnings: + print("Warnings:") + for warning in plan.warnings: + print(f"- {warning}") + if plan.diff_classification.blocked_reasons: + print("Blocked reasons:") + for reason in plan.diff_classification.blocked_reasons: + print(f"- {reason}") + + print("\nNext steps:") + print(f" Review the plan: cat {plan_out}") + print( + f" Apply the migration: rvl migrate apply --plan {plan_out} --allow-downtime" + ) + print(f" Validate the result: rvl migrate validate --plan {plan_out}") + print( + f"\nTo add more changes: rvl migrate wizard --index {plan.source.index_name} --patch schema_patch.yaml" + ) + print( + f"To start over: rvl migrate wizard --index 
{plan.source.index_name}" + ) + print(f"To cancel: rm {plan_out}") + + def _print_report_summary( + self, + report_out: str, + report, + benchmark_out: Optional[str], + ) -> None: + print(f"Migration report written to {report_out}") + print(f"Result: {report.result}") + print(f"Schema match: {report.validation.schema_match}") + print(f"Doc count match: {report.validation.doc_count_match}") + print(f"Key sample exists: {report.validation.key_sample_exists}") + print(f"Indexing failures delta: {report.validation.indexing_failures_delta}") + if report.validation.errors: + print("Errors:") + for error in report.validation.errors: + print(f"- {error}") + if report.manual_actions: + print("Manual actions:") + for action in report.manual_actions: + print(f"- {action}") + if benchmark_out: + print(f"Benchmark report written to {benchmark_out}") diff --git a/redisvl/cli/utils.py b/redisvl/cli/utils.py index 5d76a184..8245b69a 100644 --- a/redisvl/cli/utils.py +++ b/redisvl/cli/utils.py @@ -26,11 +26,7 @@ def create_redis_url(args: Namespace) -> str: return url -def add_index_parsing_options(parser: ArgumentParser) -> ArgumentParser: - parser.add_argument("-i", "--index", help="Index name", type=str, required=False) - parser.add_argument( - "-s", "--schema", help="Path to schema file", type=str, required=False - ) +def add_redis_connection_options(parser: ArgumentParser) -> ArgumentParser: parser.add_argument("-u", "--url", help="Redis URL", type=str, required=False) parser.add_argument("--host", help="Redis host", type=str, default="localhost") parser.add_argument("-p", "--port", help="Redis port", type=int, default=6379) @@ -38,3 +34,11 @@ def add_index_parsing_options(parser: ArgumentParser) -> ArgumentParser: parser.add_argument("--ssl", help="Use SSL", action="store_true") parser.add_argument("-a", "--password", help="Redis password", type=str, default="") return parser + + +def add_index_parsing_options(parser: ArgumentParser) -> ArgumentParser: + 
parser.add_argument("-i", "--index", help="Index name", type=str, required=False) + parser.add_argument( + "-s", "--schema", help="Path to schema file", type=str, required=False + ) + return add_redis_connection_options(parser) diff --git a/redisvl/migration/__init__.py b/redisvl/migration/__init__.py new file mode 100644 index 00000000..6cedb500 --- /dev/null +++ b/redisvl/migration/__init__.py @@ -0,0 +1,15 @@ +from redisvl.migration.executor import MigrationExecutor +from redisvl.migration.models import MigrationPlan, MigrationReport, SchemaPatch +from redisvl.migration.planner import MigrationPlanner +from redisvl.migration.validation import MigrationValidator +from redisvl.migration.wizard import MigrationWizard + +__all__ = [ + "MigrationExecutor", + "MigrationPlan", + "MigrationPlanner", + "MigrationReport", + "MigrationValidator", + "MigrationWizard", + "SchemaPatch", +] diff --git a/redisvl/migration/executor.py b/redisvl/migration/executor.py new file mode 100644 index 00000000..e34b4b04 --- /dev/null +++ b/redisvl/migration/executor.py @@ -0,0 +1,335 @@ +from __future__ import annotations + +import logging +import time +from typing import Any, Callable, Dict, Optional + +from redisvl.index import SearchIndex +from redisvl.migration.models import ( + MigrationBenchmarkSummary, + MigrationPlan, + MigrationReport, + MigrationTimings, + MigrationValidation, +) +from redisvl.migration.planner import MigrationPlanner +from redisvl.migration.utils import ( + current_source_matches_snapshot, + timestamp_utc, + wait_for_index_ready, +) +from redisvl.migration.validation import MigrationValidator +from redisvl.redis.utils import array_to_buffer, buffer_to_array +from redisvl.schema import StorageType + +logger = logging.getLogger(__name__) + + +class MigrationExecutor: + def __init__(self, validator: Optional[MigrationValidator] = None): + self.validator = validator or MigrationValidator() + + def apply( + self, + plan: MigrationPlan, + *, + redis_url: 
Optional[str] = None, + redis_client: Optional[Any] = None, + query_check_file: Optional[str] = None, + progress_callback: Optional[Callable[[str, Optional[str]], None]] = None, + ) -> MigrationReport: + """Apply a migration plan. + + Args: + plan: The migration plan to apply. + redis_url: Redis connection URL. + redis_client: Optional existing Redis client. + query_check_file: Optional file with query checks. + progress_callback: Optional callback(step, detail) for progress updates. + step: Current step name (e.g., "drop", "quantize", "create", "index", "validate") + detail: Optional detail string (e.g., "1000/5000 docs (20%)") + """ + started_at = timestamp_utc() + started = time.perf_counter() + + report = MigrationReport( + source_index=plan.source.index_name, + target_index=plan.merged_target_schema["index"]["name"], + result="failed", + started_at=started_at, + finished_at=started_at, + warnings=list(plan.warnings), + ) + + if not plan.diff_classification.supported: + report.validation.errors.extend(plan.diff_classification.blocked_reasons) + report.manual_actions.append( + "This change requires document migration, which is not yet supported." + ) + report.finished_at = timestamp_utc() + return report + + if not current_source_matches_snapshot( + plan.source.index_name, + plan.source.schema_snapshot, + redis_url=redis_url, + redis_client=redis_client, + ): + report.validation.errors.append( + "The current live source schema no longer matches the saved source snapshot." + ) + report.manual_actions.append( + "Re-run `rvl migrate plan` to refresh the migration plan before applying." 
+ ) + report.finished_at = timestamp_utc() + return report + + source_index = SearchIndex.from_existing( + plan.source.index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + target_index = SearchIndex.from_dict( + plan.merged_target_schema, + redis_url=redis_url, + redis_client=redis_client, + ) + + drop_duration = 0.0 + quantize_duration = 0.0 + recreate_duration = 0.0 + indexing_duration = 0.0 + target_info: Dict[str, Any] = {} + docs_quantized = 0 + + # Check if we need to re-encode vectors for datatype changes + datatype_changes = MigrationPlanner.get_vector_datatype_changes( + plan.source.schema_snapshot, plan.merged_target_schema + ) + + def _notify(step: str, detail: Optional[str] = None) -> None: + if progress_callback: + progress_callback(step, detail) + + try: + _notify("drop", "Dropping index definition...") + drop_started = time.perf_counter() + source_index.delete(drop=False) + drop_duration = round(time.perf_counter() - drop_started, 3) + _notify("drop", f"done ({drop_duration}s)") + + # Re-encode vectors if datatype changes are needed + if datatype_changes: + _notify("quantize", "Re-encoding vectors...") + quantize_started = time.perf_counter() + docs_quantized = self._quantize_vectors( + source_index, + datatype_changes, + plan, + progress_callback=lambda done, total: _notify( + "quantize", f"{done:,}/{total:,} docs" + ), + ) + quantize_duration = round(time.perf_counter() - quantize_started, 3) + _notify( + "quantize", + f"done ({docs_quantized:,} docs in {quantize_duration}s)", + ) + report.warnings.append( + f"Re-encoded {docs_quantized} documents for vector quantization: " + f"{datatype_changes}" + ) + + _notify("create", "Creating index with new schema...") + recreate_started = time.perf_counter() + target_index.create() + recreate_duration = round(time.perf_counter() - recreate_started, 3) + _notify("create", f"done ({recreate_duration}s)") + + _notify("index", "Waiting for re-indexing...") + + def _index_progress(indexed: int, 
total: int, pct: float) -> None: + _notify("index", f"{indexed:,}/{total:,} docs ({pct:.0f}%)") + + target_info, indexing_duration = wait_for_index_ready( + target_index, progress_callback=_index_progress + ) + _notify("index", f"done ({indexing_duration}s)") + + _notify("validate", "Validating migration...") + validation, target_info, validation_duration = self.validator.validate( + plan, + redis_url=redis_url, + redis_client=redis_client, + query_check_file=query_check_file, + ) + _notify("validate", f"done ({validation_duration}s)") + report.validation = validation + total_duration = round(time.perf_counter() - started, 3) + report.timings = MigrationTimings( + total_migration_duration_seconds=total_duration, + drop_duration_seconds=drop_duration, + quantize_duration_seconds=( + quantize_duration if quantize_duration else None + ), + recreate_duration_seconds=recreate_duration, + initial_indexing_duration_seconds=indexing_duration, + validation_duration_seconds=validation_duration, + downtime_duration_seconds=round( + drop_duration + + quantize_duration + + recreate_duration + + indexing_duration, + 3, + ), + ) + report.benchmark_summary = self._build_benchmark_summary( + plan, + target_info, + report.timings, + ) + report.result = "succeeded" if not validation.errors else "failed" + if validation.errors: + report.manual_actions.append( + "Review validation errors before treating the migration as complete." 
+ ) + except Exception as exc: + total_duration = round(time.perf_counter() - started, 3) + report.timings = MigrationTimings( + total_migration_duration_seconds=total_duration, + drop_duration_seconds=drop_duration or None, + quantize_duration_seconds=quantize_duration or None, + recreate_duration_seconds=recreate_duration or None, + initial_indexing_duration_seconds=indexing_duration or None, + downtime_duration_seconds=( + round( + drop_duration + + quantize_duration + + recreate_duration + + indexing_duration, + 3, + ) + if drop_duration + or quantize_duration + or recreate_duration + or indexing_duration + else None + ), + ) + report.validation = MigrationValidation( + errors=[f"Migration execution failed: {exc}"] + ) + report.manual_actions.extend( + [ + "Inspect the Redis index state before retrying.", + "If the source index was dropped, recreate it from the saved migration plan.", + ] + ) + finally: + report.finished_at = timestamp_utc() + + return report + + def _quantize_vectors( + self, + source_index: SearchIndex, + datatype_changes: Dict[str, Dict[str, str]], + plan: MigrationPlan, + progress_callback: Optional[Callable[[int, int], None]] = None, + ) -> int: + """Re-encode vectors in documents for datatype changes (quantization). + + This iterates over all documents matching the index prefix and converts + vector fields from source datatype to target datatype. 
+ + Args: + source_index: The source SearchIndex (already dropped but client available) + datatype_changes: Dict mapping field_name -> {"source": dtype, "target": dtype} + plan: The migration plan containing schema info + progress_callback: Optional callback(docs_done, total_docs) + + Returns: + Number of documents processed + """ + client = source_index._redis_client + prefix = plan.source.schema_snapshot["index"]["prefix"] + storage_type = ( + plan.source.schema_snapshot["index"].get("storage_type", "hash").lower() + ) + + # Get estimated total from source stats + estimated_total = int(plan.source.stats_snapshot.get("num_docs", 0) or 0) + + # Get vector field dimensions for validation + field_dims: Dict[str, int] = {} + for field in plan.source.schema_snapshot.get("fields", []): + if field.get("type") == "vector" and field["name"] in datatype_changes: + field_dims[field["name"]] = field.get("attrs", {}).get("dims", 0) + + docs_processed = 0 + batch_size = 500 + cursor = 0 + + while True: + cursor, keys = client.scan( + cursor=cursor, + match=f"{prefix}*", + count=batch_size, + ) + + if keys: + pipe = client.pipeline() + keys_to_update = [] + + for key in keys: + if storage_type == "hash": + # Read all vector fields that need conversion + for field_name, change in datatype_changes.items(): + field_data = client.hget(key, field_name) + if field_data: + # Convert: source dtype -> array -> target dtype -> bytes + array = buffer_to_array(field_data, change["source"]) + new_bytes = array_to_buffer(array, change["target"]) + pipe.hset(key, field_name, new_bytes) + keys_to_update.append(key) + else: + # JSON storage - vectors stored as arrays, need different handling + logger.warning( + f"JSON storage quantization for key {key} - " + "vectors stored as arrays may not need re-encoding" + ) + + if keys_to_update: + pipe.execute() + docs_processed += len(set(keys_to_update)) + if progress_callback: + progress_callback(docs_processed, estimated_total) + + if cursor == 0: + 
break + + logger.info(f"Quantized {docs_processed} documents: {datatype_changes}") + return docs_processed + + def _build_benchmark_summary( + self, + plan: MigrationPlan, + target_info: dict, + timings: MigrationTimings, + ) -> MigrationBenchmarkSummary: + source_index_size = float( + plan.source.stats_snapshot.get("vector_index_sz_mb", 0) or 0 + ) + target_index_size = float(target_info.get("vector_index_sz_mb", 0) or 0) + source_num_docs = int(plan.source.stats_snapshot.get("num_docs", 0) or 0) + indexed_per_second = None + indexing_time = timings.initial_indexing_duration_seconds + if indexing_time and indexing_time > 0: + indexed_per_second = round(source_num_docs / indexing_time, 3) + + return MigrationBenchmarkSummary( + documents_indexed_per_second=indexed_per_second, + source_index_size_mb=round(source_index_size, 3), + target_index_size_mb=round(target_index_size, 3), + index_size_delta_mb=round(target_index_size - source_index_size, 3), + ) diff --git a/redisvl/migration/models.py b/redisvl/migration/models.py new file mode 100644 index 00000000..9feda0c5 --- /dev/null +++ b/redisvl/migration/models.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field, model_validator + + +class FieldUpdate(BaseModel): + """Partial field update for schema patch inputs.""" + + name: str + type: Optional[str] = None + path: Optional[str] = None + attrs: Dict[str, Any] = Field(default_factory=dict) + options: Dict[str, Any] = Field(default_factory=dict) + + @model_validator(mode="after") + def merge_options_into_attrs(self) -> "FieldUpdate": + if self.options: + merged_attrs = dict(self.attrs) + merged_attrs.update(self.options) + self.attrs = merged_attrs + self.options = {} + return self + + +class SchemaPatchChanges(BaseModel): + add_fields: List[Dict[str, Any]] = Field(default_factory=list) + remove_fields: List[str] = Field(default_factory=list) + update_fields: 
List[FieldUpdate] = Field(default_factory=list) + index: Dict[str, Any] = Field(default_factory=dict) + + +class SchemaPatch(BaseModel): + version: int = 1 + changes: SchemaPatchChanges = Field(default_factory=SchemaPatchChanges) + + +class KeyspaceSnapshot(BaseModel): + storage_type: str + prefixes: List[str] + key_separator: str + key_sample: List[str] = Field(default_factory=list) + + +class SourceSnapshot(BaseModel): + index_name: str + schema_snapshot: Dict[str, Any] + stats_snapshot: Dict[str, Any] + keyspace: KeyspaceSnapshot + + +class DiffClassification(BaseModel): + supported: bool + blocked_reasons: List[str] = Field(default_factory=list) + + +class ValidationPolicy(BaseModel): + require_doc_count_match: bool = True + require_schema_match: bool = True + + +class MigrationPlan(BaseModel): + version: int = 1 + mode: str = "drop_recreate" + source: SourceSnapshot + requested_changes: Dict[str, Any] + merged_target_schema: Dict[str, Any] + diff_classification: DiffClassification + warnings: List[str] = Field(default_factory=list) + validation: ValidationPolicy = Field(default_factory=ValidationPolicy) + + +class QueryCheckResult(BaseModel): + name: str + passed: bool + details: Optional[str] = None + + +class MigrationValidation(BaseModel): + schema_match: bool = False + doc_count_match: bool = False + key_sample_exists: bool = False + indexing_failures_delta: int = 0 + query_checks: List[QueryCheckResult] = Field(default_factory=list) + errors: List[str] = Field(default_factory=list) + + +class MigrationTimings(BaseModel): + total_migration_duration_seconds: Optional[float] = None + drop_duration_seconds: Optional[float] = None + quantize_duration_seconds: Optional[float] = None + recreate_duration_seconds: Optional[float] = None + initial_indexing_duration_seconds: Optional[float] = None + validation_duration_seconds: Optional[float] = None + downtime_duration_seconds: Optional[float] = None + + +class MigrationBenchmarkSummary(BaseModel): + 
documents_indexed_per_second: Optional[float] = None + source_index_size_mb: Optional[float] = None + target_index_size_mb: Optional[float] = None + index_size_delta_mb: Optional[float] = None + + +class MigrationReport(BaseModel): + version: int = 1 + mode: str = "drop_recreate" + source_index: str + target_index: str + result: str + started_at: str + finished_at: str + timings: MigrationTimings = Field(default_factory=MigrationTimings) + validation: MigrationValidation = Field(default_factory=MigrationValidation) + benchmark_summary: MigrationBenchmarkSummary = Field( + default_factory=MigrationBenchmarkSummary + ) + warnings: List[str] = Field(default_factory=list) + manual_actions: List[str] = Field(default_factory=list) diff --git a/redisvl/migration/planner.py b/redisvl/migration/planner.py new file mode 100644 index 00000000..8f79db1b --- /dev/null +++ b/redisvl/migration/planner.py @@ -0,0 +1,439 @@ +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + +from redisvl.index import SearchIndex +from redisvl.migration.models import ( + DiffClassification, + KeyspaceSnapshot, + MigrationPlan, + SchemaPatch, + SourceSnapshot, +) +from redisvl.schema.schema import IndexSchema + + +class MigrationPlanner: + """Migration planner for document-preserving drop/recreate flows. + + The `drop_recreate` mode drops the index definition and recreates it with + a new schema. Documents remain untouched in Redis. 
+ + This means: + - Index-only changes work (algorithm, distance metric, tuning params) + - Document-dependent changes fail (the index expects data in a format + that doesn't match what's stored) + + Document-dependent changes (not supported): + - Vector dimensions: stored vectors have wrong number of dimensions + - Prefix/keyspace: documents are at keys the new index won't scan + - Field rename: documents store data under the old field name + - Storage type: documents are in hash format but index expects JSON + """ + + def __init__(self, key_sample_limit: int = 10): + self.key_sample_limit = key_sample_limit + + def create_plan( + self, + index_name: str, + *, + redis_url: Optional[str] = None, + schema_patch_path: Optional[str] = None, + target_schema_path: Optional[str] = None, + redis_client: Optional[Any] = None, + ) -> MigrationPlan: + if not schema_patch_path and not target_schema_path: + raise ValueError( + "Must provide either --schema-patch or --target-schema for migration planning" + ) + if schema_patch_path and target_schema_path: + raise ValueError( + "Provide only one of --schema-patch or --target-schema for migration planning" + ) + + snapshot = self.snapshot_source( + index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + source_schema = IndexSchema.from_dict(snapshot.schema_snapshot) + + if schema_patch_path: + schema_patch = self.load_schema_patch(schema_patch_path) + else: + schema_patch = self.normalize_target_schema_to_patch( + source_schema, target_schema_path + ) + + return self.create_plan_from_patch( + index_name, + schema_patch=schema_patch, + redis_url=redis_url, + redis_client=redis_client, + ) + + def create_plan_from_patch( + self, + index_name: str, + *, + schema_patch: SchemaPatch, + redis_url: Optional[str] = None, + redis_client: Optional[Any] = None, + ) -> MigrationPlan: + snapshot = self.snapshot_source( + index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + source_schema = 
IndexSchema.from_dict(snapshot.schema_snapshot) + merged_target_schema = self.merge_patch(source_schema, schema_patch) + diff_classification = self.classify_diff( + source_schema, schema_patch, merged_target_schema + ) + + return MigrationPlan( + source=snapshot, + requested_changes=schema_patch.model_dump(exclude_none=True), + merged_target_schema=merged_target_schema.to_dict(), + diff_classification=diff_classification, + warnings=["Index downtime is required"], + ) + + def snapshot_source( + self, + index_name: str, + *, + redis_url: Optional[str] = None, + redis_client: Optional[Any] = None, + ) -> SourceSnapshot: + index = SearchIndex.from_existing( + index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + schema_dict = index.schema.to_dict() + stats_snapshot = index.info() + prefixes = index.schema.index.prefix + prefix_list = prefixes if isinstance(prefixes, list) else [prefixes] + + return SourceSnapshot( + index_name=index_name, + schema_snapshot=schema_dict, + stats_snapshot=stats_snapshot, + keyspace=KeyspaceSnapshot( + storage_type=index.schema.index.storage_type.value, + prefixes=prefix_list, + key_separator=index.schema.index.key_separator, + key_sample=self._sample_keys( + client=index.client, + prefixes=prefix_list, + key_separator=index.schema.index.key_separator, + ), + ), + ) + + def load_schema_patch(self, schema_patch_path: str) -> SchemaPatch: + patch_path = Path(schema_patch_path).resolve() + if not patch_path.exists(): + raise FileNotFoundError( + f"Schema patch file {schema_patch_path} does not exist" + ) + + with open(patch_path, "r") as f: + patch_data = yaml.safe_load(f) or {} + return SchemaPatch.model_validate(patch_data) + + def normalize_target_schema_to_patch( + self, source_schema: IndexSchema, target_schema_path: str + ) -> SchemaPatch: + target_schema = IndexSchema.from_yaml(target_schema_path) + source_dict = source_schema.to_dict() + target_dict = target_schema.to_dict() + + changes: Dict[str, Any] = { + 
"add_fields": [], + "remove_fields": [], + "update_fields": [], + "index": {}, + } + + source_fields = {field["name"]: field for field in source_dict["fields"]} + target_fields = {field["name"]: field for field in target_dict["fields"]} + + for field_name, target_field in target_fields.items(): + if field_name not in source_fields: + changes["add_fields"].append(target_field) + elif source_fields[field_name] != target_field: + changes["update_fields"].append(target_field) + + for field_name in source_fields: + if field_name not in target_fields: + changes["remove_fields"].append(field_name) + + for index_key, target_value in target_dict["index"].items(): + source_value = source_dict["index"].get(index_key) + if source_value != target_value: + changes["index"][index_key] = target_value + + return SchemaPatch.model_validate({"version": 1, "changes": changes}) + + def merge_patch( + self, source_schema: IndexSchema, schema_patch: SchemaPatch + ) -> IndexSchema: + schema_dict = deepcopy(source_schema.to_dict()) + changes = schema_patch.changes + fields_by_name = { + field["name"]: deepcopy(field) for field in schema_dict["fields"] + } + + for field_name in changes.remove_fields: + fields_by_name.pop(field_name, None) + + for field_update in changes.update_fields: + if field_update.name not in fields_by_name: + raise ValueError( + f"Cannot update field '{field_update.name}' because it does not exist in the source schema" + ) + existing_field = fields_by_name[field_update.name] + if field_update.type is not None: + existing_field["type"] = field_update.type + if field_update.path is not None: + existing_field["path"] = field_update.path + if field_update.attrs: + merged_attrs = dict(existing_field.get("attrs", {})) + merged_attrs.update(field_update.attrs) + existing_field["attrs"] = merged_attrs + + for field in changes.add_fields: + field_name = field["name"] + if field_name in fields_by_name: + raise ValueError( + f"Cannot add field '{field_name}' because it already 
exists in the source schema" + ) + fields_by_name[field_name] = deepcopy(field) + + schema_dict["fields"] = list(fields_by_name.values()) + schema_dict["index"].update(changes.index) + return IndexSchema.from_dict(schema_dict) + + def classify_diff( + self, + source_schema: IndexSchema, + schema_patch: SchemaPatch, + merged_target_schema: IndexSchema, + ) -> DiffClassification: + blocked_reasons: List[str] = [] + changes = schema_patch.changes + source_dict = source_schema.to_dict() + target_dict = merged_target_schema.to_dict() + + for index_key, target_value in changes.index.items(): + source_value = source_dict["index"].get(index_key) + if source_value == target_value: + continue + if index_key == "name": + blocked_reasons.append( + "Changing the index name requires document migration (not yet supported)." + ) + elif index_key == "prefix": + blocked_reasons.append( + "Changing index prefixes requires document migration (not yet supported)." + ) + elif index_key == "key_separator": + blocked_reasons.append( + "Changing the key separator requires document migration (not yet supported)." + ) + elif index_key == "storage_type": + blocked_reasons.append( + "Changing the storage type requires document migration (not yet supported)." + ) + + source_fields = {field["name"]: field for field in source_dict["fields"]} + target_fields = {field["name"]: field for field in target_dict["fields"]} + + for field in changes.add_fields: + if field["type"] == "vector": + blocked_reasons.append( + f"Adding vector field '{field['name']}' requires document migration (not yet supported)." + ) + + for field_update in changes.update_fields: + source_field = source_fields[field_update.name] + target_field = target_fields[field_update.name] + source_type = source_field["type"] + target_type = target_field["type"] + + if source_type != target_type: + blocked_reasons.append( + f"Changing field '{field_update.name}' type from {source_type} to {target_type} is not supported by drop_recreate." 
+ ) + continue + + source_path = source_field.get("path") + target_path = target_field.get("path") + if source_path != target_path: + blocked_reasons.append( + f"Changing field '{field_update.name}' path from {source_path} to {target_path} is not supported by drop_recreate." + ) + continue + + if target_type == "vector" and source_field != target_field: + # Check for document-dependent changes that are not yet supported + vector_blocked = self._classify_vector_field_change( + source_field, target_field + ) + blocked_reasons.extend(vector_blocked) + + blocked_reasons.extend( + self._detect_possible_field_renames(source_fields, target_fields) + ) + + return DiffClassification( + supported=len(blocked_reasons) == 0, + blocked_reasons=self._dedupe(blocked_reasons), + ) + + def write_plan(self, plan: MigrationPlan, plan_out: str) -> None: + plan_path = Path(plan_out).resolve() + with open(plan_path, "w") as f: + yaml.safe_dump(plan.model_dump(exclude_none=True), f, sort_keys=False) + + def _sample_keys( + self, *, client: Any, prefixes: List[str], key_separator: str + ) -> List[str]: + key_sample: List[str] = [] + if client is None or self.key_sample_limit <= 0: + return key_sample + + for prefix in prefixes: + if len(key_sample) >= self.key_sample_limit: + break + match_pattern = ( + f"{prefix}*" + if prefix.endswith(key_separator) + else f"{prefix}{key_separator}*" + ) + cursor = 0 + while True: + cursor, keys = client.scan( + cursor=cursor, + match=match_pattern, + count=max(self.key_sample_limit, 10), + ) + for key in keys: + decoded_key = key.decode() if isinstance(key, bytes) else str(key) + if decoded_key not in key_sample: + key_sample.append(decoded_key) + if len(key_sample) >= self.key_sample_limit: + return key_sample + if cursor == 0: + break + return key_sample + + def _detect_possible_field_renames( + self, + source_fields: Dict[str, Dict[str, Any]], + target_fields: Dict[str, Dict[str, Any]], + ) -> List[str]: + blocked_reasons: List[str] = [] + 
added_fields = [ + field for name, field in target_fields.items() if name not in source_fields + ] + removed_fields = [ + field for name, field in source_fields.items() if name not in target_fields + ] + + for removed_field in removed_fields: + for added_field in added_fields: + if self._fields_match_except_name(removed_field, added_field): + blocked_reasons.append( + f"Possible field rename from '{removed_field['name']}' to '{added_field['name']}' is not supported by drop_recreate." + ) + return blocked_reasons + + @staticmethod + def _classify_vector_field_change( + source_field: Dict[str, Any], target_field: Dict[str, Any] + ) -> List[str]: + """Classify vector field changes as supported or blocked for drop_recreate. + + Index-only changes (allowed with drop_recreate): + - algorithm (FLAT -> HNSW -> SVS-VAMANA) + - distance_metric (COSINE, L2, IP) + - initial_cap + - Algorithm tuning: m, ef_construction, ef_runtime, epsilon, block_size, + graph_max_degree, construction_window_size, search_window_size, etc. + + Quantization changes (allowed with drop_recreate, requires vector re-encoding): + - datatype (float32 -> float16, etc.) - executor will re-encode vectors + + Document-dependent changes (blocked, not yet supported): + - dims (vectors stored with wrong number of dimensions) + """ + blocked_reasons: List[str] = [] + field_name = source_field.get("name", "unknown") + source_attrs = source_field.get("attrs", {}) + target_attrs = target_field.get("attrs", {}) + + # Document-dependent properties (not yet supported) + if source_attrs.get("dims") != target_attrs.get("dims"): + blocked_reasons.append( + f"Changing vector field '{field_name}' dims from {source_attrs.get('dims')} " + f"to {target_attrs.get('dims')} requires document migration (not yet supported). " + "Vectors are stored with incompatible dimensions." 
+ ) + + # Datatype changes are now ALLOWED - executor will re-encode vectors + # before recreating the index + + # All other vector changes are index-only and allowed + return blocked_reasons + + @staticmethod + def get_vector_datatype_changes( + source_schema: Dict[str, Any], target_schema: Dict[str, Any] + ) -> Dict[str, Dict[str, str]]: + """Identify vector fields that need datatype conversion (quantization). + + Returns: + Dict mapping field_name -> {"source": source_dtype, "target": target_dtype} + """ + changes: Dict[str, Dict[str, str]] = {} + source_fields = {f["name"]: f for f in source_schema.get("fields", [])} + target_fields = {f["name"]: f for f in target_schema.get("fields", [])} + + for name, source_field in source_fields.items(): + if source_field.get("type") != "vector": + continue + target_field = target_fields.get(name) + if not target_field or target_field.get("type") != "vector": + continue + + source_dtype = source_field.get("attrs", {}).get("datatype", "float32") + target_dtype = target_field.get("attrs", {}).get("datatype", "float32") + + if source_dtype != target_dtype: + changes[name] = {"source": source_dtype, "target": target_dtype} + + return changes + + @staticmethod + def _fields_match_except_name( + source_field: Dict[str, Any], target_field: Dict[str, Any] + ) -> bool: + comparable_source = {k: v for k, v in source_field.items() if k != "name"} + comparable_target = {k: v for k, v in target_field.items() if k != "name"} + return comparable_source == comparable_target + + @staticmethod + def _dedupe(values: List[str]) -> List[str]: + deduped: List[str] = [] + for value in values: + if value not in deduped: + deduped.append(value) + return deduped diff --git a/redisvl/migration/utils.py b/redisvl/migration/utils.py new file mode 100644 index 00000000..a5b12766 --- /dev/null +++ b/redisvl/migration/utils.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import json +import time +from pathlib import Path +from typing import 
Any, Callable, Dict, List, Optional, Tuple

import yaml

from redisvl.index import SearchIndex
from redisvl.migration.models import MigrationPlan, MigrationReport
from redisvl.redis.connection import RedisConnectionFactory
from redisvl.schema.schema import IndexSchema


def list_indexes(
    *, redis_url: Optional[str] = None, redis_client: Optional[Any] = None
):
    """Return the names of all search indexes known to the Redis server.

    A throwaway SearchIndex (never created server-side) is built purely to
    reach the ``listall`` command on the shared client.
    """
    if redis_client is None:
        if not redis_url:
            raise ValueError("Must provide either redis_url or redis_client")
        redis_client = RedisConnectionFactory.get_redis_connection(redis_url=redis_url)
    helper = SearchIndex.from_dict(
        {"index": {"name": "__redisvl_migration_helper__"}, "fields": []},
        redis_client=redis_client,
    )
    return helper.listall()


def load_yaml(path: str) -> Dict[str, Any]:
    """Load a YAML document from *path*, returning {} for an empty file."""
    with open(Path(path).resolve(), "r") as handle:
        return yaml.safe_load(handle) or {}


def write_yaml(data: Dict[str, Any], path: str) -> None:
    """Serialize *data* to YAML at *path*, preserving key insertion order."""
    with open(Path(path).resolve(), "w") as handle:
        yaml.safe_dump(data, handle, sort_keys=False)


def load_migration_plan(path: str) -> MigrationPlan:
    """Read and validate a MigrationPlan from a YAML file."""
    return MigrationPlan.model_validate(load_yaml(path))


def write_migration_report(report: MigrationReport, path: str) -> None:
    """Persist the full migration report as YAML."""
    write_yaml(report.model_dump(exclude_none=True), path)


def write_benchmark_report(report: MigrationReport, path: str) -> None:
    """Persist a trimmed, benchmark-focused view of *report* as YAML."""
    trimmed: Dict[str, Any] = {
        "version": report.version,
        "mode": report.mode,
        "source_index": report.source_index,
        "target_index": report.target_index,
        "result": report.result,
        "timings": report.timings.model_dump(exclude_none=True),
        "benchmark_summary": report.benchmark_summary.model_dump(exclude_none=True),
        "validation": {
            "schema_match": report.validation.schema_match,
            "doc_count_match": report.validation.doc_count_match,
            "indexing_failures_delta": report.validation.indexing_failures_delta,
            "key_sample_exists": report.validation.key_sample_exists,
        },
    }
    write_yaml(trimmed, path)

+ +def canonicalize_schema(schema_dict: Dict[str, Any]) -> Dict[str, Any]: + schema = IndexSchema.from_dict(schema_dict).to_dict() + schema["fields"] = sorted(schema.get("fields", []), key=lambda field: field["name"]) + prefixes = schema["index"].get("prefix") + if isinstance(prefixes, list): + schema["index"]["prefix"] = sorted(prefixes) + stopwords = schema["index"].get("stopwords") + if isinstance(stopwords, list): + schema["index"]["stopwords"] = list(stopwords) + return schema + + +def schemas_equal(left: Dict[str, Any], right: Dict[str, Any]) -> bool: + return json.dumps(canonicalize_schema(left), sort_keys=True) == json.dumps( + canonicalize_schema(right), sort_keys=True + ) + + +def wait_for_index_ready( + index: SearchIndex, + *, + timeout_seconds: int = 1800, + poll_interval_seconds: float = 0.5, + progress_callback: Optional[Callable[[int, int, float], None]] = None, +) -> Tuple[Dict[str, Any], float]: + """Wait for index to finish indexing all documents. + + Args: + index: The SearchIndex to monitor. + timeout_seconds: Maximum time to wait. + poll_interval_seconds: How often to check status. + progress_callback: Optional callback(indexed_docs, total_docs, percent). 
+ """ + start = time.perf_counter() + deadline = start + timeout_seconds + latest_info = index.info() + + stable_ready_checks = 0 + while time.perf_counter() < deadline: + latest_info = index.info() + indexing = latest_info.get("indexing") + percent_indexed = latest_info.get("percent_indexed") + + if percent_indexed is not None or indexing is not None: + ready = float(percent_indexed or 0) >= 1.0 and not bool(indexing) + if progress_callback: + total_docs = int(latest_info.get("num_docs", 0)) + pct = float(percent_indexed or 0) + indexed_docs = int(total_docs * pct) + progress_callback(indexed_docs, total_docs, pct * 100) + else: + current_docs = latest_info.get("num_docs") + if current_docs is None: + ready = True + else: + if stable_ready_checks == 0: + stable_ready_checks = int(current_docs) + time.sleep(poll_interval_seconds) + continue + ready = int(current_docs) == stable_ready_checks + + if ready: + return latest_info, round(time.perf_counter() - start, 3) + + time.sleep(poll_interval_seconds) + + raise TimeoutError( + f"Index {index.schema.index.name} did not become ready within {timeout_seconds} seconds" + ) + + +def current_source_matches_snapshot( + index_name: str, + expected_schema: Dict[str, Any], + *, + redis_url: Optional[str] = None, + redis_client: Optional[Any] = None, +) -> bool: + current_index = SearchIndex.from_existing( + index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + return schemas_equal(current_index.schema.to_dict(), expected_schema) + + +def timestamp_utc() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) diff --git a/redisvl/migration/validation.py b/redisvl/migration/validation.py new file mode 100644 index 00000000..f5bc0ca7 --- /dev/null +++ b/redisvl/migration/validation.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +import time +from typing import Any, Dict, Optional + +from redisvl.index import SearchIndex +from redisvl.migration.models import ( + MigrationPlan, + 
MigrationValidation, + QueryCheckResult, +) +from redisvl.migration.utils import load_yaml, schemas_equal + + +class MigrationValidator: + def validate( + self, + plan: MigrationPlan, + *, + redis_url: Optional[str] = None, + redis_client: Optional[Any] = None, + query_check_file: Optional[str] = None, + ) -> tuple[MigrationValidation, Dict[str, Any], float]: + started = time.perf_counter() + target_index = SearchIndex.from_existing( + plan.merged_target_schema["index"]["name"], + redis_url=redis_url, + redis_client=redis_client, + ) + target_info = target_index.info() + validation = MigrationValidation() + + live_schema = target_index.schema.to_dict() + validation.schema_match = schemas_equal(live_schema, plan.merged_target_schema) + + source_num_docs = int(plan.source.stats_snapshot.get("num_docs", 0) or 0) + target_num_docs = int(target_info.get("num_docs", 0) or 0) + validation.doc_count_match = source_num_docs == target_num_docs + + source_failures = int( + plan.source.stats_snapshot.get("hash_indexing_failures", 0) or 0 + ) + target_failures = int(target_info.get("hash_indexing_failures", 0) or 0) + validation.indexing_failures_delta = target_failures - source_failures + + key_sample = plan.source.keyspace.key_sample + if not key_sample: + validation.key_sample_exists = True + else: + existing_count = target_index.client.exists(*key_sample) + validation.key_sample_exists = existing_count == len(key_sample) + + if query_check_file: + validation.query_checks = self._run_query_checks( + target_index, + query_check_file, + ) + + if not validation.schema_match: + validation.errors.append("Live schema does not match merged_target_schema.") + if not validation.doc_count_match: + validation.errors.append( + "Live document count does not match source num_docs." 
+ ) + if validation.indexing_failures_delta != 0: + validation.errors.append("Indexing failures increased during migration.") + if not validation.key_sample_exists: + validation.errors.append( + "One or more sampled source keys is missing after migration." + ) + if any(not query_check.passed for query_check in validation.query_checks): + validation.errors.append("One or more query checks failed.") + + return validation, target_info, round(time.perf_counter() - started, 3) + + def _run_query_checks( + self, + target_index: SearchIndex, + query_check_file: str, + ) -> list[QueryCheckResult]: + query_checks = load_yaml(query_check_file) + results: list[QueryCheckResult] = [] + + for doc_id in query_checks.get("fetch_ids", []): + fetched = target_index.fetch(doc_id) + results.append( + QueryCheckResult( + name=f"fetch:{doc_id}", + passed=fetched is not None, + details=( + "Document fetched successfully" + if fetched + else "Document not found" + ), + ) + ) + + for key in query_checks.get("keys_exist", []): + exists = bool(target_index.client.exists(key)) + results.append( + QueryCheckResult( + name=f"key:{key}", + passed=exists, + details="Key exists" if exists else "Key not found", + ) + ) + + return results diff --git a/redisvl/migration/wizard.py b/redisvl/migration/wizard.py new file mode 100644 index 00000000..feb4d3b0 --- /dev/null +++ b/redisvl/migration/wizard.py @@ -0,0 +1,502 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + +from redisvl.migration.models import FieldUpdate, SchemaPatch, SchemaPatchChanges +from redisvl.migration.planner import MigrationPlanner +from redisvl.migration.utils import list_indexes, write_yaml +from redisvl.schema.schema import IndexSchema + +SUPPORTED_FIELD_TYPES = ["text", "tag", "numeric", "geo"] +UPDATABLE_FIELD_TYPES = ["text", "tag", "numeric", "geo", "vector"] + + +class MigrationWizard: + def __init__(self, planner: Optional[MigrationPlanner] = 
None): + self.planner = planner or MigrationPlanner() + + def run( + self, + *, + index_name: Optional[str] = None, + redis_url: Optional[str] = None, + redis_client: Optional[Any] = None, + existing_patch_path: Optional[str] = None, + plan_out: str = "migration_plan.yaml", + patch_out: Optional[str] = None, + target_schema_out: Optional[str] = None, + ): + resolved_index_name = self._resolve_index_name( + index_name=index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + snapshot = self.planner.snapshot_source( + resolved_index_name, + redis_url=redis_url, + redis_client=redis_client, + ) + source_schema = IndexSchema.from_dict(snapshot.schema_snapshot) + + print(f"Building a migration plan for index '{resolved_index_name}'") + self._print_source_schema(source_schema.to_dict()) + + # Load existing patch if provided + existing_changes = None + if existing_patch_path: + existing_changes = self._load_existing_patch(existing_patch_path) + + schema_patch = self._build_patch( + source_schema.to_dict(), existing_changes=existing_changes + ) + plan = self.planner.create_plan_from_patch( + resolved_index_name, + schema_patch=schema_patch, + redis_url=redis_url, + redis_client=redis_client, + ) + self.planner.write_plan(plan, plan_out) + + if patch_out: + write_yaml(schema_patch.model_dump(exclude_none=True), patch_out) + if target_schema_out: + write_yaml(plan.merged_target_schema, target_schema_out) + + return plan + + def _load_existing_patch(self, patch_path: str) -> SchemaPatchChanges: + from redisvl.migration.utils import load_yaml + + data = load_yaml(patch_path) + patch = SchemaPatch.model_validate(data) + print(f"Loaded existing patch from {patch_path}") + print(f" Add fields: {len(patch.changes.add_fields)}") + print(f" Update fields: {len(patch.changes.update_fields)}") + print(f" Remove fields: {len(patch.changes.remove_fields)}") + return patch.changes + + def _resolve_index_name( + self, + *, + index_name: Optional[str], + redis_url: 
Optional[str], + redis_client: Optional[Any], + ) -> str: + if index_name: + return index_name + + indexes = list_indexes(redis_url=redis_url, redis_client=redis_client) + if not indexes: + raise ValueError("No indexes found in Redis") + + print("Available indexes:") + for position, name in enumerate(indexes, start=1): + print(f"{position}. {name}") + + while True: + choice = input("Select an index by number or name: ").strip() + if choice in indexes: + return choice + if choice.isdigit(): + offset = int(choice) - 1 + if 0 <= offset < len(indexes): + return indexes[offset] + print("Invalid selection. Please try again.") + + def _build_patch( + self, + source_schema: Dict[str, Any], + existing_changes: Optional[SchemaPatchChanges] = None, + ) -> SchemaPatch: + if existing_changes: + changes = existing_changes + else: + changes = SchemaPatchChanges() + done = False + while not done: + print("\nChoose an action:") + print("1. Add field (text, tag, numeric, geo)") + print("2. Update field (sortable, weight, separator, vector config)") + print("3. Remove field") + print("4. Preview patch (show pending changes as YAML)") + print("5. Finish") + action = input("Enter a number: ").strip() + + if action == "1": + field = self._prompt_add_field(source_schema) + if field: + changes.add_fields.append(field) + elif action == "2": + update = self._prompt_update_field(source_schema) + if update: + changes.update_fields.append(update) + elif action == "3": + field_name = self._prompt_remove_field(source_schema) + if field_name: + changes.remove_fields.append(field_name) + elif action == "4": + print( + yaml.safe_dump( + {"version": 1, "changes": changes.model_dump()}, sort_keys=False + ) + ) + elif action == "5": + done = True + else: + print("Invalid action. 
Please choose 1-5.") + + return SchemaPatch(version=1, changes=changes) + + def _prompt_add_field( + self, source_schema: Dict[str, Any] + ) -> Optional[Dict[str, Any]]: + field_name = input("Field name: ").strip() + existing_names = {field["name"] for field in source_schema["fields"]} + if not field_name: + print("Field name is required.") + return None + if field_name in existing_names: + print(f"Field '{field_name}' already exists in the source schema.") + return None + + field_type = self._prompt_from_choices( + "Field type", + SUPPORTED_FIELD_TYPES, + block_message="Vector fields cannot be added (requires embedding all documents). Only text, tag, numeric, and geo are supported.", + ) + if not field_type: + return None + + field: Dict[str, Any] = {"name": field_name, "type": field_type} + storage_type = source_schema["index"]["storage_type"] + if storage_type == "json": + print(" JSON path: location in document where this field is stored") + path = ( + input(f"JSON path [default $.{field_name}]: ").strip() + or f"$.{field_name}" + ) + field["path"] = path + + attrs = self._prompt_common_attrs(field_type) + if attrs: + field["attrs"] = attrs + return field + + def _prompt_update_field( + self, source_schema: Dict[str, Any] + ) -> Optional[FieldUpdate]: + fields = [ + field + for field in source_schema["fields"] + if field["type"] in UPDATABLE_FIELD_TYPES + ] + if not fields: + print("No updatable fields are available.") + return None + + print("Updatable fields:") + for position, field in enumerate(fields, start=1): + print(f"{position}. 
{field['name']} ({field['type']})") + + choice = input("Select a field to update by number or name: ").strip() + selected: Optional[Dict[str, Any]] = None + for position, field in enumerate(fields, start=1): + if choice == str(position) or choice == field["name"]: + selected = field + break + if not selected: + print("Invalid field selection.") + return None + + if selected["type"] == "vector": + attrs = self._prompt_vector_attrs(selected) + else: + attrs = self._prompt_common_attrs(selected["type"], allow_blank=True) + if not attrs: + print("No changes collected.") + return None + return FieldUpdate(name=selected["name"], attrs=attrs) + + def _prompt_remove_field(self, source_schema: Dict[str, Any]) -> Optional[str]: + removable_fields = [ + field["name"] + for field in source_schema["fields"] + if field["type"] != "vector" + ] + if not removable_fields: + print("No removable Phase 1 fields are available.") + return None + + print("Removable fields:") + for position, field_name in enumerate(removable_fields, start=1): + print(f"{position}. 
{field_name}") + + choice = input("Select a field to remove by number or name: ").strip() + if choice in removable_fields: + return choice + if choice.isdigit(): + offset = int(choice) - 1 + if 0 <= offset < len(removable_fields): + return removable_fields[offset] + print("Invalid field selection.") + return None + + def _prompt_common_attrs( + self, field_type: str, allow_blank: bool = False + ) -> Dict[str, Any]: + attrs: Dict[str, Any] = {} + + # Sortable - available for all non-vector types + print(" Sortable: enables sorting and aggregation on this field") + sortable = self._prompt_bool("Sortable", allow_blank=allow_blank) + if sortable is not None: + attrs["sortable"] = sortable + + # Index missing - available for all types (requires Redis Search 2.10+) + print( + " Index missing: enables ismissing() queries for documents without this field" + ) + index_missing = self._prompt_bool("Index missing", allow_blank=allow_blank) + if index_missing is not None: + attrs["index_missing"] = index_missing + + # Type-specific attributes + if field_type == "text": + self._prompt_text_attrs(attrs, allow_blank) + elif field_type == "tag": + self._prompt_tag_attrs(attrs, allow_blank) + elif field_type == "numeric": + self._prompt_numeric_attrs(attrs, allow_blank, sortable) + + # No index - only meaningful with sortable + if sortable or (allow_blank and attrs.get("sortable")): + print(" No index: store field for sorting only, not searchable") + no_index = self._prompt_bool("No index", allow_blank=allow_blank) + if no_index is not None: + attrs["no_index"] = no_index + + return attrs + + def _prompt_text_attrs(self, attrs: Dict[str, Any], allow_blank: bool) -> None: + """Prompt for text field specific attributes.""" + # No stem + print( + " Disable stemming: prevents word variations (running/runs) from matching" + ) + no_stem = self._prompt_bool("Disable stemming", allow_blank=allow_blank) + if no_stem is not None: + attrs["no_stem"] = no_stem + + # Weight + print(" Weight: 
relevance multiplier for full-text search (default: 1.0)") + weight_input = input("Weight [leave blank for default]: ").strip() + if weight_input: + try: + weight = float(weight_input) + if weight > 0: + attrs["weight"] = weight + else: + print("Weight must be positive.") + except ValueError: + print("Invalid weight value.") + + # Index empty (requires Redis Search 2.10+) + print(" Index empty: enables searching for empty string values") + index_empty = self._prompt_bool("Index empty", allow_blank=allow_blank) + if index_empty is not None: + attrs["index_empty"] = index_empty + + # UNF (only if sortable) + if attrs.get("sortable"): + print(" UNF: preserve original form (no lowercasing) for sorting") + unf = self._prompt_bool("UNF (un-normalized form)", allow_blank=allow_blank) + if unf is not None: + attrs["unf"] = unf + + def _prompt_tag_attrs(self, attrs: Dict[str, Any], allow_blank: bool) -> None: + """Prompt for tag field specific attributes.""" + # Separator + print(" Separator: character that splits multiple values (default: comma)") + separator = input("Separator [leave blank to keep existing/default]: ").strip() + if separator: + attrs["separator"] = separator + + # Case sensitive + print(" Case sensitive: match tags with exact case (default: false)") + case_sensitive = self._prompt_bool("Case sensitive", allow_blank=allow_blank) + if case_sensitive is not None: + attrs["case_sensitive"] = case_sensitive + + # Index empty (requires Redis Search 2.10+) + print(" Index empty: enables searching for empty tag values") + index_empty = self._prompt_bool("Index empty", allow_blank=allow_blank) + if index_empty is not None: + attrs["index_empty"] = index_empty + + def _prompt_numeric_attrs( + self, attrs: Dict[str, Any], allow_blank: bool, sortable: Optional[bool] + ) -> None: + """Prompt for numeric field specific attributes.""" + # UNF (only if sortable) + if sortable or attrs.get("sortable"): + print(" UNF: preserve exact numeric representation for sorting") + 
unf = self._prompt_bool("UNF (un-normalized form)", allow_blank=allow_blank) + if unf is not None: + attrs["unf"] = unf + + def _prompt_vector_attrs(self, field: Dict[str, Any]) -> Dict[str, Any]: + attrs: Dict[str, Any] = {} + current = field.get("attrs", {}) + field_name = field["name"] + + print(f"Current vector config for '{field_name}':") + print(f" algorithm: {current.get('algorithm', 'HNSW')}") + print(f" datatype: {current.get('datatype', 'float32')}") + print(f" distance_metric: {current.get('distance_metric', 'cosine')}") + print(f" dims: {current.get('dims')} (cannot be changed)") + if current.get("algorithm", "HNSW") == "HNSW": + print(f" m: {current.get('m', 16)}") + print(f" ef_construction: {current.get('ef_construction', 200)}") + + print("\nLeave blank to keep current value.") + + # Algorithm + print( + " Algorithm: vector search method (FLAT=brute force, HNSW=graph, SVS-VAMANA=compressed graph)" + ) + algo = ( + input(f"Algorithm [current: {current.get('algorithm', 'HNSW')}]: ") + .strip() + .upper() + .replace("_", "-") # Normalize SVS_VAMANA to SVS-VAMANA + ) + if algo and algo in ("FLAT", "HNSW", "SVS-VAMANA"): + attrs["algorithm"] = algo + + # Datatype (quantization) - show algorithm-specific options + effective_algo = attrs.get( + "algorithm", current.get("algorithm", "HNSW") + ).upper() + if effective_algo == "SVS-VAMANA": + # SVS-VAMANA only supports float16, float32 + print( + " Datatype for SVS-VAMANA: float16, float32 " + "(float16 reduces memory by ~50%)" + ) + valid_datatypes = ("float16", "float32") + else: + # FLAT/HNSW support: float16, float32, bfloat16, float64, int8, uint8 + print( + " Datatype: float16, float32, bfloat16, float64, int8, uint8\n" + " (float16 reduces memory ~50%, int8/uint8 reduce ~75%)" + ) + valid_datatypes = ( + "float16", + "float32", + "bfloat16", + "float64", + "int8", + "uint8", + ) + datatype = ( + input(f"Datatype [current: {current.get('datatype', 'float32')}]: ") + .strip() + .lower() + ) + if datatype 
and datatype in valid_datatypes: + attrs["datatype"] = datatype + + # Distance metric + print(" Distance metric: how similarity is measured (cosine, l2, ip)") + metric = ( + input( + f"Distance metric [current: {current.get('distance_metric', 'cosine')}]: " + ) + .strip() + .lower() + ) + if metric and metric in ("cosine", "l2", "ip"): + attrs["distance_metric"] = metric + + # Algorithm-specific params (effective_algo already computed above) + if effective_algo == "HNSW": + print( + " M: number of connections per node (higher=better recall, more memory)" + ) + m_input = input(f"M [current: {current.get('m', 16)}]: ").strip() + if m_input and m_input.isdigit(): + attrs["m"] = int(m_input) + + print( + " EF_CONSTRUCTION: build-time search depth (higher=better recall, slower build)" + ) + ef_input = input( + f"EF_CONSTRUCTION [current: {current.get('ef_construction', 200)}]: " + ).strip() + if ef_input and ef_input.isdigit(): + attrs["ef_construction"] = int(ef_input) + + elif effective_algo == "SVS-VAMANA": + print( + " GRAPH_MAX_DEGREE: max edges per node (higher=better recall, more memory)" + ) + gmd_input = input( + f"GRAPH_MAX_DEGREE [current: {current.get('graph_max_degree', 40)}]: " + ).strip() + if gmd_input and gmd_input.isdigit(): + attrs["graph_max_degree"] = int(gmd_input) + + print(" COMPRESSION: optional vector compression for memory savings") + print(" Options: LVQ4, LVQ8, LVQ4x4, LVQ4x8, LeanVec4x8, LeanVec8x8") + compression = input("COMPRESSION [leave blank for none]: ").strip().upper() + if compression and compression in ( + "LVQ4", + "LVQ8", + "LVQ4X4", + "LVQ4X8", + "LEANVEC4X8", + "LEANVEC8X8", + ): + attrs["compression"] = compression + + return attrs + + def _prompt_bool(self, label: str, allow_blank: bool = False) -> Optional[bool]: + suffix = " [y/n]" if not allow_blank else " [y/n/skip]" + while True: + value = input(f"{label}{suffix}: ").strip().lower() + if value in ("y", "yes"): + return True + if value in ("n", "no"): + return False + if 
allow_blank and value in ("", "skip", "s"): + return None + if not allow_blank and value == "": + return False + print("Please answer y, n, or skip.") + + def _prompt_from_choices( + self, + label: str, + choices: List[str], + *, + block_message: str, + ) -> Optional[str]: + print(f"{label} options: {', '.join(choices)}") + value = input(f"{label}: ").strip().lower() + if value not in choices: + print(block_message) + return None + return value + + def _print_source_schema(self, schema_dict: Dict[str, Any]) -> None: + print("Current schema:") + print(f"- Index name: {schema_dict['index']['name']}") + print(f"- Storage type: {schema_dict['index']['storage_type']}") + for field in schema_dict["fields"]: + path = field.get("path") + suffix = f" path={path}" if path else "" + print(f" - {field['name']} ({field['type']}){suffix}") diff --git a/tests/integration/test_field_modifier_ordering_integration.py b/tests/integration/test_field_modifier_ordering_integration.py index b26463df..1eae743a 100644 --- a/tests/integration/test_field_modifier_ordering_integration.py +++ b/tests/integration/test_field_modifier_ordering_integration.py @@ -399,6 +399,233 @@ def test_indexmissing_enables_ismissing_query(self, client, redis_url, worker_id index.delete(drop=True) +class TestIndexEmptyIntegration: + """Integration tests for INDEXEMPTY functionality.""" + + def test_text_field_index_empty_creates_successfully( + self, client, redis_url, worker_id + ): + """Test that INDEXEMPTY on text field allows index creation.""" + skip_if_search_version_below_for_indexmissing(client) + schema_dict = { + "index": { + "name": f"test_text_empty_{worker_id}", + "prefix": f"textempty_{worker_id}:", + "storage_type": "hash", + }, + "fields": [ + { + "name": "description", + "type": "text", + "attrs": {"index_empty": True}, + } + ], + } + + schema = IndexSchema.from_dict(schema_dict) + index = SearchIndex(schema=schema, redis_url=redis_url) + index.create(overwrite=True) + + # Verify index was 
created + info = client.execute_command("FT.INFO", f"test_text_empty_{worker_id}") + assert info is not None + + # Create documents with empty and non-empty values + client.hset(f"textempty_{worker_id}:1", "description", "has content") + client.hset(f"textempty_{worker_id}:2", "description", "") + client.hset(f"textempty_{worker_id}:3", "description", "more content") + + # Search should work, empty string doc should be indexed + result = client.execute_command( + "FT.SEARCH", + f"test_text_empty_{worker_id}", + "*", + ) + # All 3 docs should be found + assert result[0] == 3 + + # Cleanup + client.delete( + f"textempty_{worker_id}:1", + f"textempty_{worker_id}:2", + f"textempty_{worker_id}:3", + ) + index.delete(drop=True) + + def test_tag_field_index_empty_creates_successfully( + self, client, redis_url, worker_id + ): + """Test that INDEXEMPTY on tag field allows index creation.""" + skip_if_search_version_below_for_indexmissing(client) + schema_dict = { + "index": { + "name": f"test_tag_empty_{worker_id}", + "prefix": f"tagempty_{worker_id}:", + "storage_type": "hash", + }, + "fields": [ + { + "name": "category", + "type": "tag", + "attrs": {"index_empty": True}, + } + ], + } + + schema = IndexSchema.from_dict(schema_dict) + index = SearchIndex(schema=schema, redis_url=redis_url) + index.create(overwrite=True) + + # Verify index was created + info = client.execute_command("FT.INFO", f"test_tag_empty_{worker_id}") + assert info is not None + + # Create documents with empty and non-empty values + client.hset(f"tagempty_{worker_id}:1", "category", "electronics") + client.hset(f"tagempty_{worker_id}:2", "category", "") + client.hset(f"tagempty_{worker_id}:3", "category", "books") + + # Search should work + result = client.execute_command( + "FT.SEARCH", + f"test_tag_empty_{worker_id}", + "*", + ) + # All 3 docs should be found + assert result[0] == 3 + + # Cleanup + client.delete( + f"tagempty_{worker_id}:1", + f"tagempty_{worker_id}:2", + f"tagempty_{worker_id}:3", 
+ ) + index.delete(drop=True) + + +class TestUnfModifierIntegration: + """Integration tests for UNF (un-normalized form) modifier.""" + + def test_text_field_unf_requires_sortable(self, client, redis_url, worker_id): + """Test that UNF on text field works only when sortable is also True.""" + skip_if_search_version_below_for_indexmissing(client) + schema_dict = { + "index": { + "name": f"test_text_unf_{worker_id}", + "prefix": f"textunf_{worker_id}:", + "storage_type": "hash", + }, + "fields": [ + { + "name": "title", + "type": "text", + "attrs": {"sortable": True, "unf": True}, + } + ], + } + + schema = IndexSchema.from_dict(schema_dict) + index = SearchIndex(schema=schema, redis_url=redis_url) + + # Should create successfully + index.create(overwrite=True) + + info = client.execute_command("FT.INFO", f"test_text_unf_{worker_id}") + assert info is not None + + index.delete(drop=True) + + def test_numeric_field_unf_with_sortable(self, client, redis_url, worker_id): + """Test that UNF on numeric field works when sortable is True.""" + skip_if_search_version_below_for_indexmissing(client) + schema_dict = { + "index": { + "name": f"test_num_unf_{worker_id}", + "prefix": f"numunf_{worker_id}:", + "storage_type": "hash", + }, + "fields": [ + { + "name": "price", + "type": "numeric", + "attrs": {"sortable": True, "unf": True}, + } + ], + } + + schema = IndexSchema.from_dict(schema_dict) + index = SearchIndex(schema=schema, redis_url=redis_url) + + # Should create successfully + index.create(overwrite=True) + + info = client.execute_command("FT.INFO", f"test_num_unf_{worker_id}") + assert info is not None + + index.delete(drop=True) + + +class TestNoIndexModifierIntegration: + """Integration tests for NOINDEX modifier.""" + + def test_noindex_with_sortable_allows_sorting_not_searching( + self, client, redis_url, worker_id + ): + """Test that NOINDEX field can be sorted but not searched.""" + schema_dict = { + "index": { + "name": f"test_noindex_{worker_id}", + "prefix": 
f"noindex_{worker_id}:", + "storage_type": "hash", + }, + "fields": [ + { + "name": "searchable", + "type": "text", + }, + { + "name": "sort_only", + "type": "numeric", + "attrs": {"sortable": True, "no_index": True}, + }, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + index = SearchIndex(schema=schema, redis_url=redis_url) + index.create(overwrite=True) + + # Add test documents + client.hset( + f"noindex_{worker_id}:1", mapping={"searchable": "hello", "sort_only": 10} + ) + client.hset( + f"noindex_{worker_id}:2", mapping={"searchable": "world", "sort_only": 5} + ) + client.hset( + f"noindex_{worker_id}:3", mapping={"searchable": "test", "sort_only": 15} + ) + + # Sorting by no_index field should work + result = client.execute_command( + "FT.SEARCH", + f"test_noindex_{worker_id}", + "*", + "SORTBY", + "sort_only", + "ASC", + ) + assert result[0] == 3 + + # Cleanup + client.delete( + f"noindex_{worker_id}:1", + f"noindex_{worker_id}:2", + f"noindex_{worker_id}:3", + ) + index.delete(drop=True) + + class TestFieldTypeModifierSupport: """Test that field types only support their documented modifiers.""" diff --git a/tests/integration/test_migration_v1.py b/tests/integration/test_migration_v1.py new file mode 100644 index 00000000..56a32d28 --- /dev/null +++ b/tests/integration/test_migration_v1.py @@ -0,0 +1,127 @@ +import uuid + +import yaml + +from redisvl.index import SearchIndex +from redisvl.migration import MigrationExecutor, MigrationPlanner, MigrationValidator +from redisvl.migration.utils import load_migration_plan, schemas_equal +from redisvl.redis.utils import array_to_buffer + + +def test_drop_recreate_plan_apply_validate_flow(redis_url, worker_id, tmp_path): + unique_id = str(uuid.uuid4())[:8] + index_name = f"migration_v1_{worker_id}_{unique_id}" + prefix = f"migration_v1:{worker_id}:{unique_id}" + + source_index = SearchIndex.from_dict( + { + "index": { + "name": index_name, + "prefix": prefix, + "storage_type": "hash", + }, + "fields": [ + 
{"name": "doc_id", "type": "tag"}, + {"name": "title", "type": "text"}, + {"name": "price", "type": "numeric"}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "hnsw", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + redis_url=redis_url, + ) + + docs = [ + { + "doc_id": "1", + "title": "alpha", + "price": 1, + "category": "news", + "embedding": array_to_buffer([0.1, 0.2, 0.3], "float32"), + }, + { + "doc_id": "2", + "title": "beta", + "price": 2, + "category": "sports", + "embedding": array_to_buffer([0.2, 0.1, 0.4], "float32"), + }, + ] + + source_index.create(overwrite=True) + source_index.load(docs, id_field="doc_id") + + patch_path = tmp_path / "schema_patch.yaml" + patch_path.write_text( + yaml.safe_dump( + { + "version": 1, + "changes": { + "add_fields": [ + { + "name": "category", + "type": "tag", + "attrs": {"separator": ","}, + } + ], + "remove_fields": ["price"], + "update_fields": [{"name": "title", "attrs": {"sortable": True}}], + }, + }, + sort_keys=False, + ) + ) + + plan_path = tmp_path / "migration_plan.yaml" + planner = MigrationPlanner() + plan = planner.create_plan( + index_name, + redis_url=redis_url, + schema_patch_path=str(patch_path), + ) + assert plan.diff_classification.supported is True + planner.write_plan(plan, str(plan_path)) + + query_check_path = tmp_path / "query_checks.yaml" + query_check_path.write_text( + yaml.safe_dump({"fetch_ids": ["1", "2"]}, sort_keys=False) + ) + + executor = MigrationExecutor() + report = executor.apply( + load_migration_plan(str(plan_path)), + redis_url=redis_url, + query_check_file=str(query_check_path), + ) + + assert report.result == "succeeded" + assert report.validation.schema_match is True + assert report.validation.doc_count_match is True + assert report.validation.key_sample_exists is True + assert report.validation.indexing_failures_delta == 0 + assert not report.validation.errors + assert 
report.benchmark_summary.documents_indexed_per_second is not None + + live_index = SearchIndex.from_existing(index_name, redis_url=redis_url) + assert schemas_equal(live_index.schema.to_dict(), plan.merged_target_schema) + + validator = MigrationValidator() + validation, _target_info, _duration = validator.validate( + load_migration_plan(str(plan_path)), + redis_url=redis_url, + query_check_file=str(query_check_path), + ) + assert validation.schema_match is True + assert validation.doc_count_match is True + assert validation.key_sample_exists is True + assert not validation.errors + + live_index.delete(drop=True) diff --git a/tests/unit/test_migration_planner.py b/tests/unit/test_migration_planner.py new file mode 100644 index 00000000..5464b5cc --- /dev/null +++ b/tests/unit/test_migration_planner.py @@ -0,0 +1,889 @@ +from fnmatch import fnmatch + +import yaml + +from redisvl.migration import MigrationPlanner +from redisvl.schema.schema import IndexSchema + + +class DummyClient: + def __init__(self, keys): + self.keys = keys + + def scan(self, cursor=0, match=None, count=None): + matched = [] + for key in self.keys: + decoded_key = key.decode() if isinstance(key, bytes) else str(key) + if match is None or fnmatch(decoded_key, match): + matched.append(key) + return 0, matched + + +class DummyIndex: + def __init__(self, schema, stats, keys): + self.schema = schema + self._stats = stats + self._client = DummyClient(keys) + + @property + def client(self): + return self._client + + def info(self): + return self._stats + + +def _make_source_schema(): + return IndexSchema.from_dict( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": 
"$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + } + ) + + +def test_create_plan_from_schema_patch_preserves_unspecified_config( + monkeypatch, tmp_path +): + source_schema = _make_source_schema() + dummy_index = DummyIndex( + source_schema, + {"num_docs": 2, "indexing": False}, + [b"docs:1", b"docs:2", b"docs:3"], + ) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + patch_path = tmp_path / "schema_patch.yaml" + patch_path.write_text( + yaml.safe_dump( + { + "version": 1, + "changes": { + "add_fields": [ + { + "name": "category", + "type": "tag", + "path": "$.category", + "attrs": {"separator": ","}, + } + ], + "remove_fields": ["price"], + "update_fields": [ + { + "name": "title", + "options": {"sortable": True}, + } + ], + }, + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner(key_sample_limit=2) + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + schema_patch_path=str(patch_path), + ) + + assert plan.diff_classification.supported is True + assert plan.source.index_name == "docs" + assert plan.source.keyspace.storage_type == "json" + assert plan.source.keyspace.prefixes == ["docs"] + assert plan.source.keyspace.key_separator == ":" + assert plan.source.keyspace.key_sample == ["docs:1", "docs:2"] + assert plan.warnings == ["Index downtime is required"] + + merged_fields = { + field["name"]: field for field in plan.merged_target_schema["fields"] + } + assert plan.merged_target_schema["index"]["prefix"] == "docs" + assert merged_fields["title"]["attrs"]["sortable"] is True + assert "price" not in merged_fields + assert merged_fields["category"]["type"] == "tag" + + plan_path = tmp_path / "migration_plan.yaml" + planner.write_plan(plan, str(plan_path)) + written_plan = yaml.safe_load(plan_path.read_text()) + assert written_plan["mode"] == "drop_recreate" + assert 
written_plan["validation"]["require_doc_count_match"] is True + assert written_plan["diff_classification"]["supported"] is True + + +def test_target_schema_vector_datatype_change_is_allowed(monkeypatch, tmp_path): + """Changing vector datatype (quantization) is allowed - executor will re-encode.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", # Same algorithm + "dims": 3, + "distance_metric": "cosine", + "datatype": "float16", # Changed from float32 + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + # Datatype change (quantization) should now be ALLOWED + assert plan.diff_classification.supported is True + assert len(plan.diff_classification.blocked_reasons) == 0 + + # Verify datatype changes are detected for the executor + datatype_changes = MigrationPlanner.get_vector_datatype_changes( + plan.source.schema_snapshot, plan.merged_target_schema + ) + assert "embedding" in datatype_changes + assert datatype_changes["embedding"]["source"] == "float32" + assert datatype_changes["embedding"]["target"] == "float16" + + +def test_target_schema_vector_algorithm_change_is_allowed(monkeypatch, 
tmp_path): + """Changing vector algorithm is allowed (index-only change).""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "hnsw", # Changed from flat + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", # Same datatype + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + # Algorithm change should be ALLOWED + assert plan.diff_classification.supported is True + assert len(plan.diff_classification.blocked_reasons) == 0 + + +# ============================================================================= +# BLOCKED CHANGES (Document-Dependent) - require iterative_shadow +# ============================================================================= + + +def test_target_schema_prefix_change_is_blocked(monkeypatch, tmp_path): + """Prefix change is blocked: documents are at wrong keys.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + 
target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs_v2", + "key_separator": ":", + "storage_type": "json", + }, + "fields": source_schema.to_dict()["fields"], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "prefix" in reason.lower() and "iterative_shadow" in reason + for reason in plan.diff_classification.blocked_reasons + ) + + +def test_key_separator_change_is_blocked(monkeypatch, tmp_path): + """Key separator change is blocked: document keys don't match new pattern.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": "/", # Changed from ":" + "storage_type": "json", + }, + "fields": source_schema.to_dict()["fields"], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "key_separator" in reason.lower() or "separator" in reason.lower() + for reason in plan.diff_classification.blocked_reasons + ) + + +def test_storage_type_change_is_blocked(monkeypatch, tmp_path): + """Storage type change is blocked: documents are in wrong format.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda 
*args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "hash", # Changed from "json" + }, + "fields": [ + {"name": "title", "type": "text", "attrs": {"sortable": False}}, + {"name": "price", "type": "numeric", "attrs": {"sortable": True}}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "storage" in reason.lower() + for reason in plan.diff_classification.blocked_reasons + ) + + +def test_vector_dimension_change_is_blocked(monkeypatch, tmp_path): + """Vector dimension change is blocked: stored vectors have wrong size.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 768, # Changed from 3 + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + sort_keys=False, + ) 
+ ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "dims" in reason and "iterative_shadow" in reason + for reason in plan.diff_classification.blocked_reasons + ) + + +def test_field_path_change_is_blocked(monkeypatch, tmp_path): + """JSON path change is blocked: stored data is at wrong path.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.metadata.title", # Changed from $.title + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "path" in reason.lower() for reason in plan.diff_classification.blocked_reasons + ) + + +def test_field_type_change_is_blocked(monkeypatch, tmp_path): + """Field type change is blocked: index expects different data format.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + 
"redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "tag", # Changed from text + "path": "$.title", + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "type" in reason.lower() for reason in plan.diff_classification.blocked_reasons + ) + + +def test_field_rename_is_detected_and_blocked(monkeypatch, tmp_path): + """Field rename is blocked: stored data uses old field name.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "document_title", # Renamed from "title" + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": 
"flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is False + assert any( + "rename" in reason.lower() + for reason in plan.diff_classification.blocked_reasons + ) + + +# ============================================================================= +# ALLOWED CHANGES (Index-Only) +# ============================================================================= + + +def test_add_non_vector_field_is_allowed(monkeypatch, tmp_path): + """Adding a non-vector field is allowed.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + patch_path = tmp_path / "schema_patch.yaml" + patch_path.write_text( + yaml.safe_dump( + { + "version": 1, + "changes": { + "add_fields": [ + {"name": "category", "type": "tag", "path": "$.category"} + ] + }, + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + schema_patch_path=str(patch_path), + ) + + assert plan.diff_classification.supported is True + + +def test_remove_field_is_allowed(monkeypatch, tmp_path): + """Removing a field from the index is allowed.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + patch_path = tmp_path / "schema_patch.yaml" + patch_path.write_text( + yaml.safe_dump( + {"version": 1, "changes": {"remove_fields": ["price"]}}, + sort_keys=False, + ) + ) + + planner = 
MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + schema_patch_path=str(patch_path), + ) + + assert plan.diff_classification.supported is True + + +def test_change_field_sortable_is_allowed(monkeypatch, tmp_path): + """Changing field sortable option is allowed.""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + patch_path = tmp_path / "schema_patch.yaml" + patch_path.write_text( + yaml.safe_dump( + { + "version": 1, + "changes": { + "update_fields": [{"name": "title", "options": {"sortable": True}}] + }, + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + schema_patch_path=str(patch_path), + ) + + assert plan.diff_classification.supported is True + + +def test_change_vector_distance_metric_is_allowed(monkeypatch, tmp_path): + """Changing vector distance metric is allowed (index-only).""" + source_schema = _make_source_schema() + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "title", + "type": "text", + "path": "$.title", + "attrs": {"sortable": False}, + }, + { + "name": "price", + "type": "numeric", + "path": "$.price", + "attrs": {"sortable": True}, + }, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "L2", # Changed from cosine + "datatype": "float32", + }, 
+ }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is True + assert len(plan.diff_classification.blocked_reasons) == 0 + + +def test_change_hnsw_tuning_params_is_allowed(monkeypatch, tmp_path): + """Changing HNSW tuning parameters is allowed (index-only).""" + source_schema = IndexSchema.from_dict( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "hnsw", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + "m": 16, + "ef_construction": 200, + }, + }, + ], + } + ) + dummy_index = DummyIndex(source_schema, {"num_docs": 2}, [b"docs:1"]) + monkeypatch.setattr( + "redisvl.migration.planner.SearchIndex.from_existing", + lambda *args, **kwargs: dummy_index, + ) + + target_schema_path = tmp_path / "target_schema.yaml" + target_schema_path.write_text( + yaml.safe_dump( + { + "index": { + "name": "docs", + "prefix": "docs", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "hnsw", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + "m": 32, # Changed from 16 + "ef_construction": 400, # Changed from 200 + }, + }, + ], + }, + sort_keys=False, + ) + ) + + planner = MigrationPlanner() + plan = planner.create_plan( + "docs", + redis_url="redis://localhost:6379", + target_schema_path=str(target_schema_path), + ) + + assert plan.diff_classification.supported is True + assert len(plan.diff_classification.blocked_reasons) == 0 diff --git a/tests/unit/test_migration_wizard.py b/tests/unit/test_migration_wizard.py new file mode 100644 index 00000000..56c3a6d3 --- 
/dev/null +++ b/tests/unit/test_migration_wizard.py @@ -0,0 +1,1142 @@ +import pytest + +from redisvl.migration.wizard import MigrationWizard + + +def _make_vector_source_schema(algorithm="hnsw", datatype="float32"): + """Helper to create a source schema with a vector field.""" + return { + "index": { + "name": "test_index", + "prefix": "test:", + "storage_type": "hash", + }, + "fields": [ + {"name": "title", "type": "text"}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": algorithm, + "dims": 384, + "distance_metric": "cosine", + "datatype": datatype, + "m": 16, + "ef_construction": 200, + }, + }, + ], + } + + +def test_wizard_builds_patch_from_interactive_inputs(monkeypatch): + source_schema = { + "index": { + "name": "docs", + "prefix": "docs", + "storage_type": "json", + }, + "fields": [ + {"name": "title", "type": "text", "path": "$.title"}, + {"name": "category", "type": "tag", "path": "$.category"}, + { + "name": "embedding", + "type": "vector", + "path": "$.embedding", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + }, + }, + ], + } + + answers = iter( + [ + # Add field + "1", + "status", # field name + "tag", # field type + "$.status", # JSON path + "y", # sortable + "n", # index_missing + "|", # separator + "n", # case_sensitive + "n", # index_empty + "n", # no_index (prompted since sortable=y) + # Update field + "2", + "title", # select field + "y", # sortable + "n", # index_missing + "n", # no_stem + "", # weight (blank to skip) + "n", # index_empty + "n", # unf (prompted since sortable=y) + "n", # no_index (prompted since sortable=y) + # Remove field + "3", + "category", + # Finish + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) # noqa: SLF001 + + assert patch.changes.add_fields == [ + { + "name": "status", + "type": "tag", + "path": "$.status", + 
"attrs": { + "sortable": True, + "index_missing": False, + "separator": "|", + "case_sensitive": False, + "index_empty": False, + "no_index": False, + }, + } + ] + assert patch.changes.remove_fields == ["category"] + assert len(patch.changes.update_fields) == 1 + assert patch.changes.update_fields[0].name == "title" + assert patch.changes.update_fields[0].attrs["sortable"] is True + assert patch.changes.update_fields[0].attrs["no_stem"] is False + + +# ============================================================================= +# Vector Algorithm Tests +# ============================================================================= + + +class TestVectorAlgorithmChanges: + """Test wizard handling of vector algorithm changes.""" + + def test_hnsw_to_flat(self, monkeypatch): + """Test changing from HNSW to FLAT algorithm.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", # Update field + "embedding", # Select vector field + "FLAT", # Change to FLAT + "", # datatype (keep current) + "", # distance_metric (keep current) + # No HNSW params prompted for FLAT + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 1 + update = patch.changes.update_fields[0] + assert update.name == "embedding" + assert update.attrs["algorithm"] == "FLAT" + + def test_flat_to_hnsw_with_params(self, monkeypatch): + """Test changing from FLAT to HNSW with custom M and EF_CONSTRUCTION.""" + source_schema = _make_vector_source_schema(algorithm="flat") + + answers = iter( + [ + "2", # Update field + "embedding", # Select vector field + "HNSW", # Change to HNSW + "", # datatype (keep current) + "", # distance_metric (keep current) + "32", # M + "400", # EF_CONSTRUCTION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = 
MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "HNSW" + assert update.attrs["m"] == 32 + assert update.attrs["ef_construction"] == 400 + + def test_hnsw_to_svs_vamana_with_underscore(self, monkeypatch): + """Test changing to SVS_VAMANA (underscore format) is normalized.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", # Update field + "embedding", # Select vector field + "SVS_VAMANA", # Underscore format (should be normalized) + "float16", # SVS only supports float16/float32 + "", # distance_metric (keep current) + "64", # GRAPH_MAX_DEGREE + "LVQ8", # COMPRESSION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" # Normalized to hyphen + assert update.attrs["datatype"] == "float16" + assert update.attrs["graph_max_degree"] == 64 + assert update.attrs["compression"] == "LVQ8" + + def test_hnsw_to_svs_vamana_with_hyphen(self, monkeypatch): + """Test changing to SVS-VAMANA (hyphen format) works directly.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", # Update field + "embedding", # Select vector field + "SVS-VAMANA", # Hyphen format + "", # datatype (keep current) + "", # distance_metric (keep current) + "", # GRAPH_MAX_DEGREE (keep default) + "", # COMPRESSION (none) + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" + + def test_svs_vamana_with_leanvec_compression(self, monkeypatch): + """Test SVS-VAMANA with LeanVec compression type.""" + 
source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", # Update field + "embedding", # Select vector field + "SVS-VAMANA", + "float16", + "", # distance_metric + "48", # GRAPH_MAX_DEGREE + "LEANVEC8X8", # COMPRESSION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" + assert update.attrs["compression"] == "LEANVEC8X8" + + +# ============================================================================= +# Vector Datatype (Quantization) Tests +# ============================================================================= + + +class TestVectorDatatypeChanges: + """Test wizard handling of vector datatype/quantization changes.""" + + def test_float32_to_float16(self, monkeypatch): + """Test quantization from float32 to float16.""" + source_schema = _make_vector_source_schema(datatype="float32") + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm (keep current) + "float16", # datatype + "", # distance_metric + "", # M (keep current) + "", # EF_CONSTRUCTION (keep current) + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["datatype"] == "float16" + + def test_float16_to_float32(self, monkeypatch): + """Test changing from float16 back to float32.""" + source_schema = _make_vector_source_schema(datatype="float16") + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm + "float32", # datatype + "", # distance_metric + "", # M + "", # EF_CONSTRUCTION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = 
wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["datatype"] == "float32" + + def test_int8_accepted_for_hnsw(self, monkeypatch): + """Test that int8 is accepted for HNSW/FLAT (but not SVS-VAMANA).""" + source_schema = _make_vector_source_schema(datatype="float32") + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm (keep HNSW) + "int8", # Valid for HNSW/FLAT + "", # distance_metric + "", # M + "", # EF_CONSTRUCTION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # int8 is now valid for HNSW/FLAT + update = patch.changes.update_fields[0] + assert update.attrs["datatype"] == "int8" + + +# ============================================================================= +# Distance Metric Tests +# ============================================================================= + + +class TestDistanceMetricChanges: + """Test wizard handling of distance metric changes.""" + + def test_cosine_to_l2(self, monkeypatch): + """Test changing distance metric from cosine to L2.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm + "", # datatype + "l2", # distance_metric + "", # M + "", # EF_CONSTRUCTION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["distance_metric"] == "l2" + + def test_cosine_to_ip(self, monkeypatch): + """Test changing distance metric from cosine to inner product.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm + "", # datatype + "ip", # distance_metric (inner product) + "", # M + "", # EF_CONSTRUCTION + "5", # 
Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["distance_metric"] == "ip" + + +# ============================================================================= +# Combined Changes Tests +# ============================================================================= + + +class TestCombinedVectorChanges: + """Test wizard handling of multiple vector attribute changes.""" + + def test_algorithm_datatype_and_metric_change(self, monkeypatch): + """Test changing algorithm, datatype, and distance metric together.""" + source_schema = _make_vector_source_schema(algorithm="flat", datatype="float32") + + answers = iter( + [ + "2", # Update field + "embedding", + "HNSW", # algorithm + "float16", # datatype + "l2", # distance_metric + "24", # M + "300", # EF_CONSTRUCTION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "HNSW" + assert update.attrs["datatype"] == "float16" + assert update.attrs["distance_metric"] == "l2" + assert update.attrs["m"] == 24 + assert update.attrs["ef_construction"] == 300 + + def test_svs_vamana_full_config(self, monkeypatch): + """Test SVS-VAMANA with all parameters configured.""" + source_schema = _make_vector_source_schema(algorithm="hnsw", datatype="float32") + + answers = iter( + [ + "2", # Update field + "embedding", + "SVS-VAMANA", # algorithm + "float16", # datatype (required for SVS) + "ip", # distance_metric + "50", # GRAPH_MAX_DEGREE + "LVQ4X8", # COMPRESSION + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = 
patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" + assert update.attrs["datatype"] == "float16" + assert update.attrs["distance_metric"] == "ip" + assert update.attrs["graph_max_degree"] == 50 + assert update.attrs["compression"] == "LVQ4X8" + + def test_no_changes_when_all_blank(self, monkeypatch): + """Test that blank inputs result in no changes.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", # Update field + "embedding", + "", # algorithm (keep current) + "", # datatype (keep current) + "", # distance_metric (keep current) + "", # M (keep current) + "", # EF_CONSTRUCTION (keep current) + "5", # Finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # No changes collected means no update_fields + assert len(patch.changes.update_fields) == 0 + + +# ============================================================================= +# Adversarial / Edge Case Tests +# ============================================================================= + + +class TestWizardAdversarialInputs: + """Test wizard robustness against malformed, malicious, or edge case inputs.""" + + # ------------------------------------------------------------------------- + # Invalid Algorithm Inputs + # ------------------------------------------------------------------------- + + def test_typo_in_algorithm_ignored(self, monkeypatch): + """Test that typos in algorithm name are ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW_TYPO", # Invalid algorithm + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # Invalid algorithm should be ignored, no changes + assert len(patch.changes.update_fields) == 0 + + def 
test_partial_algorithm_name_ignored(self, monkeypatch): + """Test that partial algorithm names are ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNS", # Partial name + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_algorithm_with_special_chars_ignored(self, monkeypatch): + """Test that algorithm with special characters is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW; DROP TABLE users;--", # SQL injection attempt + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_algorithm_lowercase_works(self, monkeypatch): + """Test that lowercase algorithm names work (case insensitive).""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "flat", # lowercase + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "FLAT" + + def test_algorithm_mixed_case_works(self, monkeypatch): + """Test that mixed case algorithm names work.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SvS_VaMaNa", # Mixed case with underscore + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + 
+ update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" + + # ------------------------------------------------------------------------- + # Invalid Numeric Inputs + # ------------------------------------------------------------------------- + + def test_negative_m_ignored(self, monkeypatch): + """Test that negative M value is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW", + "", + "", + "-16", # Negative M + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "m" not in update.attrs # Negative should be ignored + + def test_float_m_ignored(self, monkeypatch): + """Test that float M value is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW", + "", + "", + "16.5", # Float M + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "m" not in update.attrs + + def test_string_m_ignored(self, monkeypatch): + """Test that string M value is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW", + "", + "", + "sixteen", # String M + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "m" not in update.attrs + + def test_zero_m_accepted(self, monkeypatch): + """Test that zero M is accepted (validation happens at schema level).""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = 
iter( + [ + "2", + "embedding", + "HNSW", + "", + "", + "0", # Zero M + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # Zero is a valid digit, wizard accepts it (validation at apply time) + # "0".isdigit() is True, so the wizard parses the input as the integer 0 + update = patch.changes.update_fields[0] + # "0".isdigit() returns True, so it should be accepted + assert update.attrs.get("m") == 0 + + def test_very_large_ef_construction_accepted(self, monkeypatch): + """Test that very large EF_CONSTRUCTION is accepted by wizard.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "HNSW", + "", + "", + "", + "999999999", # Very large + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["ef_construction"] == 999999999 + + # ------------------------------------------------------------------------- + # Invalid Datatype Inputs + # ------------------------------------------------------------------------- + + def test_bfloat16_accepted_for_hnsw(self, monkeypatch): + """Test that bfloat16 is accepted for HNSW/FLAT.""" + source_schema = _make_vector_source_schema(datatype="float32") + + answers = iter( + [ + "2", + "embedding", + "", + "bfloat16", # Valid for HNSW/FLAT + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["datatype"] == "bfloat16" + + def test_uint8_accepted_for_hnsw(self, monkeypatch): + """Test that uint8 is accepted for HNSW/FLAT.""" + source_schema = _make_vector_source_schema(datatype="float32") + + answers = 
iter( + [ + "2", + "embedding", + "", + "uint8", # Valid for HNSW/FLAT + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["datatype"] == "uint8" + + def test_int8_rejected_for_svs_vamana(self, monkeypatch): + """Test that int8 is rejected for SVS-VAMANA (only float16/float32 allowed).""" + source_schema = _make_vector_source_schema(datatype="float32", algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SVS-VAMANA", # Switch to SVS-VAMANA + "int8", # Invalid for SVS-VAMANA + "", + "", + "", # graph_max_degree + "", # compression + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # Should have algorithm change but NOT datatype + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "SVS-VAMANA" + assert "datatype" not in update.attrs # int8 rejected + + # ------------------------------------------------------------------------- + # Invalid Distance Metric Inputs + # ------------------------------------------------------------------------- + + def test_invalid_distance_metric_ignored(self, monkeypatch): + """Test that invalid distance metric is ignored.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "embedding", + "", + "", + "euclidean", # Invalid (should be 'l2') + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_distance_metric_uppercase_works(self, monkeypatch): + """Test that uppercase distance metric works.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + 
"embedding", + "", + "", + "L2", # Uppercase + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["distance_metric"] == "l2" + + # ------------------------------------------------------------------------- + # Invalid Compression Inputs + # ------------------------------------------------------------------------- + + def test_invalid_compression_ignored(self, monkeypatch): + """Test that invalid compression type is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SVS-VAMANA", + "", + "", + "", + "INVALID_COMPRESSION", # Invalid + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "compression" not in update.attrs + + def test_compression_lowercase_works(self, monkeypatch): + """Test that lowercase compression works.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SVS-VAMANA", + "", + "", + "", + "lvq8", # lowercase + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["compression"] == "LVQ8" + + # ------------------------------------------------------------------------- + # Whitespace and Special Character Inputs + # ------------------------------------------------------------------------- + + def test_whitespace_only_treated_as_blank(self, monkeypatch): + """Test that whitespace-only input is treated as blank.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + 
"embedding", + " ", # Whitespace only + " ", + " ", + " ", + " ", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_algorithm_with_leading_trailing_whitespace(self, monkeypatch): + """Test that algorithm with whitespace is trimmed and works.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + " FLAT ", # Whitespace around + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert update.attrs["algorithm"] == "FLAT" + + def test_unicode_input_ignored(self, monkeypatch): + """Test that unicode/emoji inputs are ignored.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "embedding", + "HNSW🚀", # Unicode emoji + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_very_long_input_ignored(self, monkeypatch): + """Test that very long inputs are ignored.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "embedding", + "A" * 10000, # Very long string + "", + "", + "", + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + # ------------------------------------------------------------------------- + # Field Selection Edge Cases + # ------------------------------------------------------------------------- + + def test_nonexistent_field_selection(self, 
monkeypatch): + """Test selecting a nonexistent field.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "nonexistent_field", # Doesn't exist + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # Should print "Invalid field selection" and continue + assert len(patch.changes.update_fields) == 0 + + def test_field_selection_by_number_out_of_range(self, monkeypatch): + """Test selecting a field by out-of-range number.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "99", # Out of range + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + def test_field_selection_negative_number(self, monkeypatch): + """Test selecting a field with negative number.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "2", + "-1", # Negative + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + assert len(patch.changes.update_fields) == 0 + + # ------------------------------------------------------------------------- + # Menu Action Edge Cases + # ------------------------------------------------------------------------- + + def test_invalid_menu_action(self, monkeypatch): + """Test invalid menu action selection.""" + source_schema = _make_vector_source_schema() + + answers = iter( + [ + "99", # Invalid action + "abc", # Invalid action + "", # Empty + "5", # Finally finish + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + # Should handle invalid actions gracefully and eventually finish + assert patch 
is not None + + # ------------------------------------------------------------------------- + # SVS-VAMANA Specific Edge Cases + # ------------------------------------------------------------------------- + + def test_svs_vamana_negative_graph_max_degree_ignored(self, monkeypatch): + """Test that negative GRAPH_MAX_DEGREE is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SVS-VAMANA", + "", + "", + "-40", # Negative + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "graph_max_degree" not in update.attrs + + def test_svs_vamana_string_graph_max_degree_ignored(self, monkeypatch): + """Test that string GRAPH_MAX_DEGREE is ignored.""" + source_schema = _make_vector_source_schema(algorithm="hnsw") + + answers = iter( + [ + "2", + "embedding", + "SVS-VAMANA", + "", + "", + "forty", # String + "", + "5", + ] + ) + monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers)) + + wizard = MigrationWizard() + patch = wizard._build_patch(source_schema) + + update = patch.changes.update_fields[0] + assert "graph_max_degree" not in update.attrs