Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/cibuildwheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ env:
jobs:

build_wheels:
if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }}
runs-on: ${{ matrix.runs-on || matrix.os }}
permissions:
Expand Down Expand Up @@ -128,10 +129,9 @@ jobs:


upload_pypi:
if: ${{ (github.ref_name != 'ctable3' && github.head_ref != 'ctable3') && startsWith(github.event.ref, 'refs/tags') }}
needs: [ build_wheels]
runs-on: ubuntu-latest
# Only upload wheels when tagging (typically a release)
if: startsWith(github.event.ref, 'refs/tags')
steps:
- uses: actions/download-artifact@v8
with:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/wasm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ env:

jobs:
build_wheels_wasm:
if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
name: Build and test wheels for WASM on ${{ matrix.os }} for ${{ matrix.p_ver }}
runs-on: ubuntu-latest
permissions:
Expand Down
81 changes: 81 additions & 0 deletions bench/ctable/compact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring compact() time and memory gain after deletions
# of varying fractions of the table.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker attaching a NumPy dtype to a pydantic field.

    Instances are placed inside ``typing.Annotated`` on ``RowModel`` fields;
    presumably CTable reads ``.dtype`` back to build its columns — confirm
    against the CTable implementation.
    """

    def __init__(self, dtype):
        # Stored as-is; expected to be a NumPy scalar type such as np.int64.
        self.dtype = dtype

    def __repr__(self):
        # Debug-friendly representation (the default <object at 0x...> hides
        # which dtype an annotation carries).
        return f"{type(self).__name__}({self.dtype!r})"


# Row model declaring the table schema: one pydantic field per column.
class RowModel(BaseModel):
    """Pydantic row schema for the benchmark's CTable.

    Each field pairs a Python type with a ``NumpyDtype`` annotation;
    presumably CTable maps them to NumPy column dtypes — verify against the
    CTable implementation. ``Field`` constraints validate per-row values.
    """

    # Non-negative integer key (np.int64 column).
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload (np.complex128 column); defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100] (np.float64 column).
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag (np.bool_ column); defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


# Number of rows in the benchmarked table.
N = 1_000_000

print(f"compact() benchmark | N = {N:,}\n")

# Build base data once, vectorized (same construction style as
# ctable_v_panda.py; a per-row Python list comprehension at N = 1M is
# orders of magnitude slower and was inconsistent with that benchmark).
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
_idx = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _idx
DATA["c_val"] = _idx * 0.1 + 1j * (_idx * 0.01)  # == complex(i*0.1, i*0.01)
DATA["score"] = 10.0 + (_idx % 100) * 0.4
DATA["active"] = _idx % 3 == 0

# Fractions of the table to delete before timing compact().
delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9]


def _compressed_bytes(table):
    """Total compressed size: every column container plus the validity bitmap."""
    total = table._valid_rows.cbytes
    for column in table._cols.values():
        total += column.cbytes
    return total


print("=" * 75)
print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}")
print("-" * 75)

for fraction in delete_fractions:
    # Fresh, fully-populated table per case so each compact() starts clean.
    table = blosc2.CTable(RowModel, expected_size=N)
    table.extend(DATA)

    # Drop the leading ``fraction`` of rows, then measure compact().
    rows_to_drop = int(N * fraction)
    table.delete(list(range(rows_to_drop)))

    size_before = _compressed_bytes(table)

    start = time()
    table.compact()
    elapsed = time() - start

    size_after = _compressed_bytes(table)

    print(
        f"{fraction*100:>9.0f}%"
        f" {N - rows_to_drop:>10,}"
        f" {elapsed:>12.4f}"
        f" {size_before / 1024**2:>13.2f} MB"
        f" {size_after / 1024**2:>12.2f} MB"
    )

print("-" * 75)
127 changes: 127 additions & 0 deletions bench/ctable/ctable_v_panda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark comparing CTable vs pandas DataFrame for:
# 1. Creation from a NumPy structured array
# 2. Column access (full column)
# 3. Filtering (where/query)
# 4. Row iteration

from time import time
from typing import Annotated

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker attaching a NumPy dtype to a pydantic field.

    Instances are placed inside ``typing.Annotated`` on ``RowModel`` fields;
    presumably CTable reads ``.dtype`` back to build its columns — confirm
    against the CTable implementation.
    """

    def __init__(self, dtype):
        # Stored as-is; expected to be a NumPy scalar type such as np.int64.
        self.dtype = dtype

    def __repr__(self):
        # Debug-friendly representation (the default <object at 0x...> hides
        # which dtype an annotation carries).
        return f"{type(self).__name__}({self.dtype!r})"


# Row model declaring the table schema: one pydantic field per column.
class RowModel(BaseModel):
    """Pydantic row schema for the benchmark's CTable.

    Each field pairs a Python type with a ``NumpyDtype`` annotation;
    presumably CTable maps them to NumPy column dtypes — verify against the
    CTable implementation. ``Field`` constraints validate per-row values.
    """

    # Non-negative integer key (np.int64 column).
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload (np.complex128 column); defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100] (np.float64 column).
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag (np.bool_ column); defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


# Row count and a fixed seed so every run benchmarks identical data.
N = 1_000_000
rng = np.random.default_rng(42)

print(f"CTable vs pandas benchmark | N = {N:,}\n")

# Build the base structured array once; both libraries ingest the same data.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = np.arange(N, dtype=np.int64)
# Draw order is fixed (real part first, then imaginary) so the RNG stream —
# and therefore the data — stays reproducible.
real_part = rng.standard_normal(N)
imag_part = rng.standard_normal(N)
DATA["c_val"] = real_part + 1j * imag_part
DATA["score"] = rng.uniform(0, 100, N)
DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)

print("=" * 65)
print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
print("-" * 65)

# 1. Creation
t0 = time()
ct = blosc2.CTable(RowModel, expected_size=N)
ct.extend(DATA)
t_ct_create = time() - t0

t0 = time()
df = pd.DataFrame(DATA)
t_pd_create = time() - t0

print(f"{'Creation':<30} {t_ct_create:>12.4f} {t_pd_create:>12.4f} {t_pd_create/t_ct_create:>9.2f}x")

# 2. Column access (full column)
t0 = time()
arr = ct["score"]
t_ct_col = time() - t0

t0 = time()
arr = df["score"]
t_pd_col = time() - t0

print(f"{'Column access (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.2f}x")

# 2.5 Column access (full column)
t0 = time()
arr = ct["score"].to_numpy()
t_ct_col = time() - t0

t0 = time()
arr = df["score"].to_numpy()
t_pd_col = time() - t0

print(f"{'Column access to numpy (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.3f}x")

# 3. Filtering
t0 = time()
result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000))
t_ct_filter = time() - t0

t0 = time()
result_pd = df.query("250000 < id < 750000")
t_pd_filter = time() - t0

print(f"{'Filter (id 250k-750k)':<30} {t_ct_filter:>12.4f} {t_pd_filter:>12.4f} {t_pd_filter/t_ct_filter:>9.2f}x")

# 4. Row iteration
t0 = time()
for val in ct["score"]:
pass
t_ct_iter = time() - t0

t0 = time()
for val in df["score"]:
pass
t_pd_iter = time() - t0

print(f"{'Row iteration':<30} {t_ct_iter:>12.4f} {t_pd_iter:>12.4f} {t_pd_iter/t_ct_iter:>9.2f}x")

print("-" * 65)

# Memory footprint: CTable internals (per-column containers plus the
# validity bitmap) against pandas' deep memory usage.
containers = list(ct._cols.values()) + [ct._valid_rows]
ct_cbytes = sum(c.cbytes for c in containers)
ct_nbytes = sum(c.nbytes for c in containers)
pd_nbytes = df.memory_usage(deep=True).sum()

MB = 1024**2
print(f"\nMemory — CTable compressed: {ct_cbytes / MB:.2f} MB")
print(f"Memory — CTable uncompressed: {ct_nbytes / MB:.2f} MB")
print(f"Memory — pandas: {pd_nbytes / MB:.2f} MB")
print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x")
82 changes: 82 additions & 0 deletions bench/ctable/delete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring delete() performance with different index types:
# int, slice, and list — with varying sizes.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker attaching a NumPy dtype to a pydantic field.

    Instances are placed inside ``typing.Annotated`` on ``RowModel`` fields;
    presumably CTable reads ``.dtype`` back to build its columns — confirm
    against the CTable implementation.
    """

    def __init__(self, dtype):
        # Stored as-is; expected to be a NumPy scalar type such as np.int64.
        self.dtype = dtype

    def __repr__(self):
        # Debug-friendly representation (the default <object at 0x...> hides
        # which dtype an annotation carries).
        return f"{type(self).__name__}({self.dtype!r})"


# Row model declaring the table schema: one pydantic field per column.
class RowModel(BaseModel):
    """Pydantic row schema for the benchmark's CTable.

    Each field pairs a Python type with a ``NumpyDtype`` annotation;
    presumably CTable maps them to NumPy column dtypes — verify against the
    CTable implementation. ``Field`` constraints validate per-row values.
    """

    # Non-negative integer key (np.int64 column).
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload (np.complex128 column); defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100] (np.float64 column).
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag (np.bool_ column); defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


# Number of rows in the benchmarked table.
N = 1_000_000

print(f"delete() benchmark | N = {N:,}\n")

# Build base data once, vectorized (same construction style as
# ctable_v_panda.py; a per-row Python list comprehension at N = 1M is
# orders of magnitude slower and was inconsistent with that benchmark).
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
_idx = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _idx
DATA["c_val"] = _idx * 0.1 + 1j * (_idx * 0.01)  # == complex(i*0.1, i*0.01)
DATA["score"] = 10.0 + (_idx % 100) * 0.4
DATA["active"] = _idx % 3 == 0

# Index arguments exercised by delete(): a single int, slices of several
# sizes, and explicit index lists of several sizes.
delete_cases = [
    ("int", 0),
    ("slice small", slice(0, 100)),
    ("slice large", slice(0, 100_000)),
    ("slice full", slice(0, N)),
    ("list small", list(range(100))),
    ("list large", list(range(100_000))),
    ("list full", list(range(N))),
]

print("=" * 60)
print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}")
print("-" * 60)


def _rows_removed(index):
    """Number of rows a delete() with ``index`` removes from an N-row table."""
    if isinstance(index, int):
        return 1
    if isinstance(index, slice):
        return len(range(*index.indices(N)))
    return len(index)


for label, index in delete_cases:
    # Fresh, fully-populated table per case so each delete starts clean.
    table = blosc2.CTable(RowModel, expected_size=N)
    table.extend(DATA)

    removed = _rows_removed(index)

    start = time()
    table.delete(index)
    elapsed = time() - start
    print(f"{label:<20} {removed:>14,} {elapsed:>12.6f}")

print("-" * 60)
75 changes: 75 additions & 0 deletions bench/ctable/expected_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring the overhead of resize() when expected_size
# is too small (M rows) vs correctly sized (N rows) during extend().

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker attaching a NumPy dtype to a pydantic field.

    Instances are placed inside ``typing.Annotated`` on ``RowModel`` fields;
    presumably CTable reads ``.dtype`` back to build its columns — confirm
    against the CTable implementation.
    """

    def __init__(self, dtype):
        # Stored as-is; expected to be a NumPy scalar type such as np.int64.
        self.dtype = dtype

    def __repr__(self):
        # Debug-friendly representation (the default <object at 0x...> hides
        # which dtype an annotation carries).
        return f"{type(self).__name__}({self.dtype!r})"


# Row model declaring the table schema: one pydantic field per column.
class RowModel(BaseModel):
    """Pydantic row schema for the benchmark's CTable.

    Each field pairs a Python type with a ``NumpyDtype`` annotation;
    presumably CTable maps them to NumPy column dtypes — verify against the
    CTable implementation. ``Field`` constraints validate per-row values.
    """

    # Non-negative integer key (np.int64 column).
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload (np.complex128 column); defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100] (np.float64 column).
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag (np.bool_ column); defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True



# M: deliberately-too-small expected_size; N: starting row count (doubled
# each round by the benchmark loop); MAX_N: cap on the row count.
M = 779
N = 62_500
MAX_N = 1_000_000
print(f"expected_size benchmark | wrong expected_size = {M}")

# Pre-generate the full dataset once, vectorized (same construction style as
# ctable_v_panda.py; a per-row Python list comprehension at 1M rows is
# orders of magnitude slower and was inconsistent with that benchmark).
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
_idx = np.arange(MAX_N, dtype=np.int64)
DATA = np.empty(MAX_N, dtype=np_dtype)
DATA["id"] = _idx
DATA["c_val"] = _idx * 0.1 + 1j * (_idx * 0.01)  # == complex(i*0.1, i*0.01)
DATA["score"] = 10.0 + (_idx % 100) * 0.4
DATA["active"] = _idx % 3 == 0

while N <= MAX_N:
    print("-" * 80)
    print(f"N = {N:,} rows")

    # 1. extend() with correct expected_size = N
    well_sized = blosc2.CTable(RowModel, expected_size=N)
    start = time()
    well_sized.extend(DATA[:N])
    t_correct = time() - start
    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(well_sized):,}")

    # 2. extend() with wrong expected_size = M (forces resize)
    under_sized = blosc2.CTable(RowModel, expected_size=M)
    start = time()
    under_sized.extend(DATA[:N])
    t_wrong = time() - start
    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(under_sized):,}")

    # Summary: how much slower the under-sized table was this round.
    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")

    # Double the row count until the cap is passed.
    N *= 2
Loading
Loading