# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #
# RUN: %mojo %s | FileCheck %s
# This sample implements a simple reduction operation on a
# large array of values to produce a single result.
# Reductions and scans are common algorithm patterns in parallel computing.
from random import rand
from algorithm import sum
from benchmark import Unit, benchmark, keep
from buffer import NDBuffer
from python import Python
# Change these numbers to reduce on different sizes
alias size_small: Int = 1 << 21  # 2,097,152 elements (~8 MiB of float32)
alias size_large: Int = 1 << 27  # 134,217,728 elements (~512 MiB of float32)
# Datatype for Tensor/Array
alias type = DType.float32
alias scalar = Scalar[type]  # convenience alias: a single float32 value
# naive_reduce_sum sums the array elements serially, using the
# https://en.wikipedia.org/wiki/Kahan_summation_algorithm to compensate
# for floating-point rounding error.
fn naive_reduce_sum[
    size: Int
](buffer: NDBuffer[type, 1, _, size]) raises -> scalar:
    """Sum all elements of `buffer` with Kahan compensated summation.

    A serial, scalar baseline for the benchmark. The compensation term `c`
    tracks low-order bits lost to float rounding, so the result is more
    accurate than a plain running sum.

    Parameters:
        size: Number of elements in the buffer (compile-time).

    Args:
        buffer: 1-D buffer of float32 values to reduce.

    Returns:
        The compensated sum of all elements.
    """
    var my_sum: scalar = 0
    var c: scalar = 0  # running compensation for lost low-order bits
    for i in range(buffer.size()):
        var y = buffer[i] - c  # re-apply the bits lost on the previous step
        var t = my_sum + y  # big + small: low-order bits of y get rounded off
        # (t - my_sum) recovers the part of y that made it into t; subtracting
        # y leaves (negated) what was lost. Algebraically zero — the statement
        # order is the algorithm, so it must not be reassociated.
        c = (t - my_sum) - y
        my_sum = t
    return my_sum
fn stdlib_reduce_sum[
    size: Int
](array: NDBuffer[type, 1, _, size]) raises -> scalar:
    """Sum all elements of `array` using the stdlib `algorithm.sum` reduction.

    Parameters:
        size: Number of elements in the buffer (compile-time).

    Args:
        array: 1-D buffer of float32 values to reduce.

    Returns:
        The sum of all elements as computed by `algorithm.sum`.
    """
    return sum(array)
def pretty_print(name: String, elements: Int, time: Float64):
    """Print one aligned benchmark result row.

    Uses Python's `str.format` for column alignment and thousands separators,
    which Mojo's own formatting does not yet provide.

    Args:
        name: Benchmark label (e.g. "naive" or "stdlib").
        elements: Number of elements reduced.
        time: Mean runtime in milliseconds.
    """
    builtins = Python.import_module("builtins")
    template = builtins.str("{:<16} {:>11,} {:>8.2f}ms")
    row = template.format(name + " elements:", elements, time)
    builtins.print(row)
fn bench[
    func: fn[size: Int] (buffer: NDBuffer[type, 1, _, size]) raises -> scalar,
    size: Int,
    name: String,
](buffer: NDBuffer[type, 1, _, size]) raises:
    """Benchmark a reduction function over `buffer` and print the mean time.

    Parameters:
        func: The reduction to time (naive_reduce_sum or stdlib_reduce_sum).
        size: Number of elements in the buffer (compile-time).
        name: Label used in the printed result row.

    Args:
        buffer: 1-D buffer of float32 values to reduce.
    """
    # @parameter lets this nested fn capture `buffer` so it can be handed to
    # benchmark.run as a zero-argument compile-time closure.
    @parameter
    fn runner() raises:
        var result = func[size](buffer)
        # keep() stops the optimizer from eliminating the otherwise-unused
        # result (which would make the benchmark measure nothing).
        keep(result)
    # Repeat for up to 0.5s of runtime and report the mean in milliseconds.
    var ms = benchmark.run[runner](max_runtime_secs=0.5).mean(Unit.ms)
    pretty_print(name, size, ms)
fn main() raises:
    """Compare the naive Kahan reduction against stdlib `algorithm.sum`
    on a small and a large random float32 array."""
    print(
        "Sum all values in a small array and large array\n"
        "Shows algorithm.sum from stdlib with much better performance\n"
    )
    # Fill two raw allocations with random data and wrap them in
    # non-owning NDBuffer views.
    var small_data = UnsafePointer[Scalar[type]].alloc(size_small)
    var large_data = UnsafePointer[Scalar[type]].alloc(size_large)
    rand(small_data, size_small)
    rand(large_data, size_large)
    var small_view = NDBuffer[type, 1, _, size_small](small_data)
    var large_view = NDBuffer[type, 1, _, size_large](large_data)
    # Time both implementations on both sizes.
    bench[naive_reduce_sum, size_small, "naive"](small_view)
    bench[naive_reduce_sum, size_large, "naive"](large_view)
    bench[stdlib_reduce_sum, size_small, "stdlib"](small_view)
    # CHECK: stdlib elements
    bench[stdlib_reduce_sum, size_large, "stdlib"](large_view)
    # The views do not own the memory; release the raw allocations.
    small_data.free()
    large_data.free()