diff --git a/orchestrator/core/samplestore/sql.py b/orchestrator/core/samplestore/sql.py index f50bc8c1b..b88e2c575 100644 --- a/orchestrator/core/samplestore/sql.py +++ b/orchestrator/core/samplestore/sql.py @@ -57,6 +57,13 @@ import pandas as pd from rich.console import RenderableType +# Process-level cache of (db_url, tablename) pairs for which the four DDL tables +# have already been verified to exist. Skips the four `CREATE TABLE IF NOT EXISTS` +# round-trips on every subsequent SQLSampleStore construction for the same store. +# The db_url is included so that two stores with the same identifier but pointing +# to different databases are treated independently. +_source_tables_verified: set[tuple[str, str]] = set() + class SQLSampleStoreConfiguration(pydantic.BaseModel): identifier: Annotated[ @@ -376,8 +383,34 @@ def __init__( self._tablename = f"sqlsource_{self._identifier}" self._engine = engine_for_sql_store(storageLocation) - # Create a table for this sample store - self._create_source_table() + # Create the four backing tables only when they do not yet exist. + # Use a single raw SQL probe (1 round-trip) as a fast path to avoid + # the ~4 SQL queries that create_all(checkfirst=True) issues when + # the tables are already present (4 table-existence checks). + # The module-level _source_tables_verified enables skipping + # even the probe for subsequent constructions within the same process. + # + # We use a direct information_schema / sqlite_master query rather than + # sqlalchemy.inspect() to avoid the Inspector's internal connection + # overhead (it opens its own connection on top of the borrowed one).
+ _cache_key = (str(self._engine.url), self._tablename) + if _cache_key not in _source_tables_verified: + if self._engine.dialect.name == "sqlite": + existence_query = sqlalchemy.text( + "SELECT 1 FROM sqlite_master WHERE type='table' AND name=:name" + ).bindparams(name=self._tablename) + else: + existence_query = sqlalchemy.text( + "SELECT 1 FROM information_schema.tables" + " WHERE table_schema = DATABASE() AND table_name = :name LIMIT 1" + ).bindparams(name=self._tablename) + + with self._engine.connect() as conn: + table_exists = conn.execute(existence_query).fetchone() is not None + + if not table_exists: + self._create_source_table() + _source_tables_verified.add(_cache_key) # Initialize entities cache as empty dict for lazy loading # Empty dict is falsy, so lazy loading check `if not self._entities:` still works diff --git a/orchestrator/metastore/sql/statements.py b/orchestrator/metastore/sql/statements.py index 19a5e6689..3cbc73224 100644 --- a/orchestrator/metastore/sql/statements.py +++ b/orchestrator/metastore/sql/statements.py @@ -216,6 +216,34 @@ def _searchable_scalar_value_for_query_string(value: _ScalarType) -> str: return fragments +def table_exists_query( + tablename: str, + dialect: Literal["mysql", "sqlite"] = "mysql", +) -> sqlalchemy.TextClause: + """Return a bound SQL query that checks whether a table exists in the database. + + Uses dialect-specific system catalogue tables: ``sqlite_master`` for SQLite + and ``information_schema.tables`` for MySQL. The tablename is passed as a + bind parameter to prevent SQL injection. + + Args: + tablename: The name of the table to check for. + dialect: The SQL dialect — ``"sqlite"`` or ``"mysql"`` (default). + + Returns: + A bound :class:`sqlalchemy.TextClause` that returns one row when the + table exists and no rows when it does not.
+ """ + if dialect == "sqlite": + return sqlalchemy.text( + "SELECT 1 FROM sqlite_master WHERE type='table' AND name=:name" + ).bindparams(name=tablename) + return sqlalchemy.text( + "SELECT 1 FROM information_schema.tables" + " WHERE table_schema = DATABASE() AND table_name = :name LIMIT 1" + ).bindparams(name=tablename) + + def resource_filter_by_arbitrary_selection( path: str, candidate: str, diff --git a/orchestrator/metastore/sqlstore.py b/orchestrator/metastore/sqlstore.py index cd5184789..080f68751 100644 --- a/orchestrator/metastore/sqlstore.py +++ b/orchestrator/metastore/sqlstore.py @@ -25,6 +25,7 @@ kind_custom_model_load, ) from orchestrator.metastore.project import ProjectContext +from orchestrator.metastore.sql.statements import table_exists_query from orchestrator.metastore.sql.utils import ( create_sql_resource_store, engine_for_sql_store, @@ -69,15 +70,9 @@ def __new__(cls, project_context: ProjectContext) -> "SQLResourceStore": # Use a direct SQL query rather than sqlalchemy.inspect() to avoid # the Inspector's internal connection overhead. log.debug("Checking if 'resources' table exists (network query)...") - if project_context.metadataStore.scheme == "sqlite": - existence_query = sqlalchemy.text( - "SELECT 1 FROM sqlite_master WHERE type='table' AND name='resources'" - ) - else: - existence_query = sqlalchemy.text( - "SELECT 1 FROM information_schema.tables" - " WHERE table_schema = DATABASE() AND table_name = 'resources' LIMIT 1" - ) + existence_query = table_exists_query( + "resources", dialect=project_context.metadataStore.scheme + ) with engine.connect() as conn: tables_exist = conn.execute(existence_query).fetchone() is not None log.debug(f"Table existence check complete: tables_exist={tables_exist}")