Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 25 additions & 24 deletions cupy_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@
# Try to import CuPy for GPU acceleration
try:
import cupy as cp

CUPY_AVAILABLE = True
except ImportError:
CUPY_AVAILABLE = False
cp = None
warnings.warn(
"CuPy not available. GPU functions will fall back to CPU. "
"Install CuPy with: pip install cupy-cuda11x (or cupy-cuda12x for CUDA 12+)",
ImportWarning
ImportWarning,
)


Expand Down Expand Up @@ -95,7 +96,7 @@ def get_array_module(array):
return np


def to_gpu(array: np.ndarray) -> Union[np.ndarray, 'cp.ndarray']:
def to_gpu(array: np.ndarray) -> Union[np.ndarray, "cp.ndarray"]:
"""
Transfer numpy array to GPU if available, otherwise return as-is.

Expand All @@ -114,7 +115,7 @@ def to_gpu(array: np.ndarray) -> Union[np.ndarray, 'cp.ndarray']:
return np.asarray(array, dtype=np.float64)


def to_cpu(array: Union[np.ndarray, 'cp.ndarray']) -> np.ndarray:
def to_cpu(array: Union[np.ndarray, "cp.ndarray"]) -> np.ndarray:
"""
Transfer array to CPU (numpy).

Expand All @@ -134,10 +135,10 @@ def to_cpu(array: Union[np.ndarray, 'cp.ndarray']) -> np.ndarray:


def lorentz_similarity_gpu(
u: Union[np.ndarray, 'cp.ndarray'],
v: Union[np.ndarray, 'cp.ndarray'],
u: Union[np.ndarray, "cp.ndarray"],
v: Union[np.ndarray, "cp.ndarray"],
epsilon: float = 1e-10,
return_cpu: bool = True
return_cpu: bool = True,
) -> float:
"""
Compute Lorentz-invariant cosine similarity on GPU.
Expand Down Expand Up @@ -227,11 +228,11 @@ def lorentz_similarity_gpu(


def lorentz_similarity_batch_gpu(
U: Union[np.ndarray, 'cp.ndarray'],
V: Union[np.ndarray, 'cp.ndarray'],
U: Union[np.ndarray, "cp.ndarray"],
V: Union[np.ndarray, "cp.ndarray"],
epsilon: float = 1e-10,
return_cpu: bool = True
) -> Union[np.ndarray, 'cp.ndarray']:
return_cpu: bool = True,
) -> Union[np.ndarray, "cp.ndarray"]:
"""
Compute Lorentz-invariant similarities for batches of vector pairs on GPU.

Expand Down Expand Up @@ -314,7 +315,9 @@ def lorentz_similarity_batch_gpu(
final_valid[temp_indices[valid_denom_mask]] = True

# Compute similarities
similarities[final_valid] = lorentz_products_uv[final_valid] / xp.sqrt(denominator_squared[final_valid])
similarities[final_valid] = lorentz_products_uv[final_valid] / xp.sqrt(
denominator_squared[final_valid]
)

# Clamp to valid range
similarities = xp.clip(similarities, -1.0, 1.0)
Expand All @@ -326,11 +329,11 @@ def lorentz_similarity_batch_gpu(


def lorentz_similarity_matrix_gpu(
U: Union[np.ndarray, 'cp.ndarray'],
V: Optional[Union[np.ndarray, 'cp.ndarray']] = None,
U: Union[np.ndarray, "cp.ndarray"],
V: Optional[Union[np.ndarray, "cp.ndarray"]] = None,
epsilon: float = 1e-10,
return_cpu: bool = True
) -> Union[np.ndarray, 'cp.ndarray']:
return_cpu: bool = True,
) -> Union[np.ndarray, "cp.ndarray"]:
"""
Compute pairwise Lorentz-invariant similarity matrix on GPU.

Expand Down Expand Up @@ -408,8 +411,8 @@ def lorentz_similarity_matrix_gpu(

# Self inner products (for normalization)
# These are all zeros due to lightlike condition, but we compute for consistency
lorentz_products_uu = (norms_U ** 2) - (norms_U ** 2) # shape: (N, 1)
lorentz_products_vv = (norms_V ** 2) - (norms_V ** 2) # shape: (M, 1)
lorentz_products_uu = (norms_U**2) - (norms_U**2) # shape: (N, 1)
lorentz_products_vv = (norms_V**2) - (norms_V**2) # shape: (M, 1)

# Denominators: sqrt(|<u,u>_L| * |<v,v>_L|)
# Broadcast: (N, 1) * (1, M) -> (N, M)
Expand Down Expand Up @@ -445,10 +448,10 @@ def lorentz_similarity_matrix_gpu(


def standard_cosine_similarity_gpu(
u: Union[np.ndarray, 'cp.ndarray'],
v: Union[np.ndarray, 'cp.ndarray'],
u: Union[np.ndarray, "cp.ndarray"],
v: Union[np.ndarray, "cp.ndarray"],
epsilon: float = 1e-10,
return_cpu: bool = True
return_cpu: bool = True,
) -> float:
"""
Compute standard cosine similarity on GPU.
Expand Down Expand Up @@ -504,10 +507,7 @@ def standard_cosine_similarity_gpu(

# Convenience function for automatic GPU/CPU selection
def lorentz_similarity_auto(
u: np.ndarray,
v: np.ndarray,
epsilon: float = 1e-10,
prefer_gpu: bool = True
u: np.ndarray, v: np.ndarray, epsilon: float = 1e-10, prefer_gpu: bool = True
) -> float:
"""
Automatically select GPU or CPU implementation based on availability.
Expand All @@ -533,4 +533,5 @@ def lorentz_similarity_auto(
else:
# Fall back to CPU implementation
from similarity import lorentz_similarity

return lorentz_similarity(u, v, epsilon=epsilon)
12 changes: 3 additions & 9 deletions eigen_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,7 @@ def _reshape_from_heads(self, x: torch.Tensor, B: int, L: int) -> torch.Tensor:
"""
(B*H, L, d_head) -> (B, L, D)
"""
return (
x.view(B, self.num_heads, L, self.head_dim)
.transpose(1, 2)
.reshape(B, L, self.dim)
)
return x.view(B, self.num_heads, L, self.head_dim).transpose(1, 2).reshape(B, L, self.dim)

def forward(
self,
Expand Down Expand Up @@ -107,7 +103,7 @@ def forward(
v = self._reshape_to_heads(v) # (B*H, L, d_head)

# Eigen similarity per head: (B*H, L_q, L_k)
sim = eigen_similarity(q, k) # expected in [-1, 1]
sim = eigen_similarity(q, k) # expected in [-1, 1]

# loop prevention: attenuate near-self/lightlike connections
sim = torch.where(
Expand Down Expand Up @@ -150,9 +146,7 @@ def forward(
attn_mask_expanded = attn_mask.repeat(1, self.num_heads, 1, 1)
else:
attn_mask_expanded = attn_mask
attn_mask_expanded = attn_mask_expanded.reshape(
B * self.num_heads, L, L
)
attn_mask_expanded = attn_mask_expanded.reshape(B * self.num_heads, L, L)
logits = logits + attn_mask_expanded
else:
raise ValueError(
Expand Down
10 changes: 5 additions & 5 deletions eigen_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,18 +152,18 @@ def forward(

# temporal decay: favor recent entries (age=0 newest)
# effective weight ∝ decay^age ∈ (0, 1]
decay_factor = (self.decay ** age).clamp(min=1e-6) # (N,)
sim = sim + decay_factor.log().unsqueeze(0) # add log-decay
decay_factor = (self.decay**age).clamp(min=1e-6) # (N,)
sim = sim + decay_factor.log().unsqueeze(0) # add log-decay

# top-k selection
k = min(self.k_top, N)
sim_topk, idx_topk = torch.topk(sim, k, dim=-1) # (B, k)
sim_topk, idx_topk = torch.topk(sim, k, dim=-1) # (B, k)

# attention weights over top-k
attn = F.softmax(sim_topk, dim=-1) # (B, k)
attn = F.softmax(sim_topk, dim=-1) # (B, k)

# gather memory vectors
mem_topk = mem[idx_topk] # (B, k, D)
mem_topk = mem[idx_topk] # (B, k, D)

retrieved = torch.sum(attn.unsqueeze(-1) * mem_topk, dim=1) # (B, D)

Expand Down
49 changes: 29 additions & 20 deletions examples_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def check_gpu_status():
print("✓ GPU (CUDA) is available!")
try:
import cupy as cp

device = cp.cuda.Device()
print(f" Device: {device}")
print(f" Compute Capability: {device.compute_capability}")
Expand Down Expand Up @@ -139,9 +140,7 @@ def example_attention_mechanism():

# Compute attention scores using GPU
start = time.time()
attention_scores = gpu_sim.lorentz_similarity_matrix_gpu(
embeddings, embeddings
)
attention_scores = gpu_sim.lorentz_similarity_matrix_gpu(embeddings, embeddings)
gpu_time = time.time() - start

print(f"GPU attention matrix computation: {gpu_time*1000:.2f} ms")
Expand All @@ -151,8 +150,10 @@ def example_attention_mechanism():
# Analyze self-attention (diagonal)
diagonal = np.diag(attention_scores)
print("Self-attention analysis:")
print(f" Diagonal values (self-similarity): mean={np.mean(diagonal):.6f}, "
f"std={np.std(diagonal):.6f}")
print(
f" Diagonal values (self-similarity): mean={np.mean(diagonal):.6f}, "
f"std={np.std(diagonal):.6f}"
)
print(f" All diagonal ~0.0? {np.allclose(diagonal, 0.0, atol=1e-6)}")
print()

Expand All @@ -162,8 +163,10 @@ def example_attention_mechanism():
off_diagonal = attention_scores[mask]

print("Cross-attention analysis:")
print(f" Off-diagonal values: mean={np.mean(off_diagonal):.6f}, "
f"std={np.std(off_diagonal):.6f}")
print(
f" Off-diagonal values: mean={np.mean(off_diagonal):.6f}, "
f"std={np.std(off_diagonal):.6f}"
)
print(f" Range: [{np.min(off_diagonal):.3f}, {np.max(off_diagonal):.3f}]")
print()

Expand Down Expand Up @@ -203,9 +206,7 @@ def example_semantic_search():

# GPU search
start = time.time()
similarity_matrix = gpu_sim.lorentz_similarity_matrix_gpu(
query_embeddings, doc_embeddings
)
similarity_matrix = gpu_sim.lorentz_similarity_matrix_gpu(query_embeddings, doc_embeddings)
gpu_time = time.time() - start

print(f"GPU similarity computation: {gpu_time*1000:.2f} ms")
Expand All @@ -219,8 +220,9 @@ def example_semantic_search():
for i in range(min(3, num_queries)): # Show first 3 queries
top_k_indices = np.argsort(similarity_matrix[i])[-k:][::-1]
top_k_scores = similarity_matrix[i][top_k_indices]
print(f" Query {i}: docs {top_k_indices} "
f"(scores: {[f'{s:.3f}' for s in top_k_scores]})")
print(
f" Query {i}: docs {top_k_indices} " f"(scores: {[f'{s:.3f}' for s in top_k_scores]})"
)


def example_loop_prevention_demo():
Expand All @@ -243,8 +245,10 @@ def example_loop_prevention_demo():
for i in range(iterations):
self_sim = standard_cosine_similarity(state, state)
standard_accumulation += self_sim
print(f" Iteration {i+1}: self-similarity = {self_sim:.6f}, "
f"accumulated = {standard_accumulation:.6f}")
print(
f" Iteration {i+1}: self-similarity = {self_sim:.6f}, "
f"accumulated = {standard_accumulation:.6f}"
)

print(f"\n Total accumulated: {standard_accumulation:.6f}")
print(f" Average per iteration: {standard_accumulation/iterations:.6f}")
Expand All @@ -257,8 +261,10 @@ def example_loop_prevention_demo():
for i in range(iterations):
self_sim = gpu_sim.lorentz_similarity_gpu(state, state)
lorentz_accumulation += self_sim
print(f" Iteration {i+1}: self-similarity = {self_sim:.6f}, "
f"accumulated = {lorentz_accumulation:.6f}")
print(
f" Iteration {i+1}: self-similarity = {self_sim:.6f}, "
f"accumulated = {lorentz_accumulation:.6f}"
)

print(f"\n Total accumulated: {lorentz_accumulation:.6f}")
print(f" Average per iteration: {lorentz_accumulation/iterations:.6f}")
Expand Down Expand Up @@ -313,12 +319,15 @@ def performance_comparison():

print("\n" + "-" * 70)
print("Summary:")
print(f"{'Configuration':<20} {'Problem Size':<15} {'GPU (ms)':<12} "
f"{'CPU (ms)':<12} {'Speedup':<10}")
print(
f"{'Configuration':<20} {'Problem Size':<15} {'GPU (ms)':<12} "
f"{'CPU (ms)':<12} {'Speedup':<10}"
)
print("-" * 70)
for label, size, gpu_t, cpu_t, speedup in results:
print(f"{label:<20} {size:<15,} {gpu_t*1000:<12.2f} "
f"{cpu_t*1000:<12.2f} {speedup:<10.2f}x")
print(
f"{label:<20} {size:<15,} {gpu_t*1000:<12.2f} " f"{cpu_t*1000:<12.2f} {speedup:<10.2f}x"
)


def main():
Expand Down
Loading
Loading