From 657ba5df71e3df49414bce64812064a5389aacfc Mon Sep 17 00:00:00 2001 From: michaelj Date: Sat, 7 Mar 2026 09:23:51 +0000 Subject: [PATCH 1/5] feat(perf): skip redundant DDL checks in SQLSampleStore.__init__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, SQLSampleStore.__init__ called _create_source_table() unconditionally on every construction. _create_source_table() calls meta.create_all(checkfirst=True), which issues a separate table-existence query for each of the four backing tables even when the tables already exist — which they always do on read-only CLI commands. Locally this cost was ~1.45s. The changes reduce this to 0.24s. The main fix is to probe for table existence instead of blindly issuing _create_source_table(). This removes the 1.45s and adds the probe. We use a direct SQL probe (taking 240ms locally measured) instead of sqlalchemy.inspect() which took 800ms locally. The PR also introduces a process-level _source_tables_verified that records tablename prefixes whose backing tables have already been confirmed to exist. On the second and subsequent SQLSampleStore constructions for the same store within the same process the probe is skipped entirely (0 round-trips). The main impact of this is in the tests. --- orchestrator/core/samplestore/sql.py | 34 ++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/orchestrator/core/samplestore/sql.py b/orchestrator/core/samplestore/sql.py index 6d51b7d6e..04b9c62da 100644 --- a/orchestrator/core/samplestore/sql.py +++ b/orchestrator/core/samplestore/sql.py @@ -53,6 +53,11 @@ import pandas as pd from rich.console import RenderableType +# Process-level cache of tablename prefixes for which the four DDL tables have +# already been verified to exist. Skips the four `CREATE TABLE IF NOT EXISTS` +# round-trips on every subsequent SQLSampleStore construction for the same store. 
+_source_tables_verified: set[str] = set() + class SQLSampleStoreConfiguration(pydantic.BaseModel): identifier: Annotated[ @@ -372,8 +377,33 @@ def __init__( self._tablename = f"sqlsource_{self._identifier}" self._engine = engine_for_sql_store(storageLocation) - # Create a table for this sample store - self._create_source_table() + # Create the four backing tables only when they do not yet exist. + # Use a single raw SQL probe (1 round-trip) as a fast path to avoid + # the 15+ DDL round-trips that create_all(checkfirst=True) + # issues when the tables are already present. + # The module level _source_table_verified enables skipping + # even the probe for subsequent constructions within the same process. + # + # We use a direct information_schema / sqlite_master query rather than + # sqlalchemy.inspect() to avoid the Inspector's internal connection + # overhead (it opens its own connection on top of the borrowed one). + if self._tablename not in _source_tables_verified: + if self.engine.dialect.name == "sqlite": + existence_query = sqlalchemy.text( + "SELECT 1 FROM sqlite_master WHERE type='table' AND name=:name" + ).bindparams(name=self._tablename) + else: + existence_query = sqlalchemy.text( + "SELECT 1 FROM information_schema.tables" + " WHERE table_schema = DATABASE() AND table_name = :name LIMIT 1" + ).bindparams(name=self._tablename) + + with self.engine.connect() as conn: + table_exists = conn.execute(existence_query).fetchone() is not None + + if not table_exists: + self._create_source_table() + _source_tables_verified.add(self._tablename) # Initialize entities cache as empty dict for lazy loading # Empty dict is falsy, so lazy loading check `if not self._entities:` still works From dc824bf4e044752d666e400c726f112833d3b7ff Mon Sep 17 00:00:00 2001 From: Michael Johnston <66301584+michael-johnston@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:50:32 +0000 Subject: [PATCH 2/5] Apply suggestion from @michael-johnston Signed-off-by: Michael Johnston 
<66301584+michael-johnston@users.noreply.github.com> --- orchestrator/core/samplestore/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestrator/core/samplestore/sql.py b/orchestrator/core/samplestore/sql.py index 04b9c62da..8892338a2 100644 --- a/orchestrator/core/samplestore/sql.py +++ b/orchestrator/core/samplestore/sql.py @@ -379,8 +379,8 @@ def __init__( # Create the four backing tables only when they do not yet exist. # Use a single raw SQL probe (1 round-trip) as a fast path to avoid - # the 15+ DDL round-trips that create_all(checkfirst=True) - # issues when the tables are already present. + # the ~4 SQL queries that create_all(checkfirst=True) issues when + # the tables are already present (4 table-existence checks) # The module level _source_table_verified enables skipping # even the probe for subsequent constructions within the same process. # From 43442c71596a71330a71c2c4d4badc79196714b4 Mon Sep 17 00:00:00 2001 From: michaelj Date: Mon, 9 Mar 2026 11:49:55 +0000 Subject: [PATCH 3/5] chore(black): formatting --- orchestrator/core/samplestore/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestrator/core/samplestore/sql.py b/orchestrator/core/samplestore/sql.py index 8892338a2..d4ca79662 100644 --- a/orchestrator/core/samplestore/sql.py +++ b/orchestrator/core/samplestore/sql.py @@ -379,8 +379,8 @@ def __init__( # Create the four backing tables only when they do not yet exist. # Use a single raw SQL probe (1 round-trip) as a fast path to avoid - # the ~4 SQL queries that create_all(checkfirst=True) issues when - # the tables are already present (4 table-existence checks) + # the ~4 SQL queries that create_all(checkfirst=True) issues when + # the tables are already present (4 table-existence checks) # The module level _source_table_verified enables skipping # even the probe for subsequent constructions within the same process. 
# From 27d0bc374f98c1714bd0d4c2ac6f754ef1d21e12 Mon Sep 17 00:00:00 2001 From: michaelj Date: Mon, 30 Mar 2026 19:13:14 +0100 Subject: [PATCH 4/5] feat(sql): add function for checking table existence. --- orchestrator/metastore/sql/statements.py | 28 ++++++++++++++++++++++++ orchestrator/metastore/sqlstore.py | 13 ++++------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/orchestrator/metastore/sql/statements.py b/orchestrator/metastore/sql/statements.py index 19a5e6689..3cbc73224 100644 --- a/orchestrator/metastore/sql/statements.py +++ b/orchestrator/metastore/sql/statements.py @@ -216,6 +216,34 @@ def _searchable_scalar_value_for_query_string(value: _ScalarType) -> str: return fragments +def table_exists_query( + tablename: str, + dialect: Literal["mysql", "sqlite"] = "mysql", +) -> sqlalchemy.TextClause: + """Return a bound SQL query that checks whether a table exists in the database. + + Uses dialect-specific system catalogue tables: ``sqlite_master`` for SQLite + and ``information_schema.tables`` for MySQL. The tablename is passed as a + bind parameter to prevent SQL injection. + + Args: + tablename: The name of the table to check for. + dialect: The SQL dialect — ``"sqlite"`` or ``"mysql"`` (default). + + Returns: + A bound :class:`sqlalchemy.TextClause` that returns one row when the + table exists and no rows when it does not. 
+ """ + if dialect == "sqlite": + return sqlalchemy.text( + "SELECT 1 FROM sqlite_master WHERE type='table' AND name=:name" + ).bindparams(name=tablename) + return sqlalchemy.text( + "SELECT 1 FROM information_schema.tables" + " WHERE table_schema = DATABASE() AND table_name = :name LIMIT 1" + ).bindparams(name=tablename) + + def resource_filter_by_arbitrary_selection( path: str, candidate: str, diff --git a/orchestrator/metastore/sqlstore.py b/orchestrator/metastore/sqlstore.py index cd5184789..080f68751 100644 --- a/orchestrator/metastore/sqlstore.py +++ b/orchestrator/metastore/sqlstore.py @@ -25,6 +25,7 @@ kind_custom_model_load, ) from orchestrator.metastore.project import ProjectContext +from orchestrator.metastore.sql.statements import table_exists_query from orchestrator.metastore.sql.utils import ( create_sql_resource_store, engine_for_sql_store, @@ -69,15 +70,9 @@ def __new__(cls, project_context: ProjectContext) -> "SQLResourceStore": # Use a direct SQL query rather than sqlalchemy.inspect() to avoid # the Inspector's internal connection overhead. 
log.debug("Checking if 'resources' table exists (network query)...") - if project_context.metadataStore.scheme == "sqlite": - existence_query = sqlalchemy.text( - "SELECT 1 FROM sqlite_master WHERE type='table' AND name='resources'" - ) - else: - existence_query = sqlalchemy.text( - "SELECT 1 FROM information_schema.tables" - " WHERE table_schema = DATABASE() AND table_name = 'resources' LIMIT 1" - ) + existence_query = table_exists_query( + "resources", dialect=project_context.metadataStore.scheme + ) with engine.connect() as conn: tables_exist = conn.execute(existence_query).fetchone() is not None log.debug(f"Table existence check complete: tables_exist={tables_exist}") From 622e275e43a3657347709a19a14d92657016b611 Mon Sep 17 00:00:00 2001 From: michaelj Date: Mon, 30 Mar 2026 19:20:52 +0100 Subject: [PATCH 5/5] fix(sql): scope SQLSampleStore table-existence cache by database URL The _source_tables_verified cache used only the table name as its key, so two SQLSampleStore instances with the same identifier but pointing to different databases would incorrectly share a cache entry. The second store would skip the table-existence check and attempt to query tables that don't exist in its database. Fix by keying the cache on (db_url, tablename), consistent with how _tables_exist_cache is already scoped in SQLStore (sqlstore.py). --- orchestrator/core/samplestore/sql.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/orchestrator/core/samplestore/sql.py b/orchestrator/core/samplestore/sql.py index 5d87bb912..b88e2c575 100644 --- a/orchestrator/core/samplestore/sql.py +++ b/orchestrator/core/samplestore/sql.py @@ -57,10 +57,12 @@ import pandas as pd from rich.console import RenderableType -# Process-level cache of tablename prefixes for which the four DDL tables have -# already been verified to exist. 
Skips the four `CREATE TABLE IF NOT EXISTS` +# Process-level cache of (db_url, tablename) pairs for which the four DDL tables +# have already been verified to exist. Skips the four `CREATE TABLE IF NOT EXISTS` # round-trips on every subsequent SQLSampleStore construction for the same store. -_source_tables_verified: set[str] = set() +# The db_url is included so that two stores with the same identifier but pointing +# to different databases are treated independently. +_source_tables_verified: set[tuple[str, str]] = set() class SQLSampleStoreConfiguration(pydantic.BaseModel): @@ -391,7 +393,8 @@ def __init__( # We use a direct information_schema / sqlite_master query rather than # sqlalchemy.inspect() to avoid the Inspector's internal connection # overhead (it opens its own connection on top of the borrowed one). - if self._tablename not in _source_tables_verified: + _cache_key = (str(self._engine.url), self._tablename) + if _cache_key not in _source_tables_verified: if self.engine.dialect.name == "sqlite": existence_query = sqlalchemy.text( "SELECT 1 FROM sqlite_master WHERE type='table' AND name=:name" @@ -407,7 +410,7 @@ def __init__( if not table_exists: self._create_source_table() - _source_tables_verified.add(self._tablename) + _source_tables_verified.add(_cache_key) # Initialize entities cache as empty dict for lazy loading # Empty dict is falsy, so lazy loading check `if not self._entities:` still works