diff --git a/README.md b/README.md index 5324618..eff542d 100644 --- a/README.md +++ b/README.md @@ -65,66 +65,70 @@ fpm build fpm test ``` -## Profiling - -Use `profile_benchmark.sh` to profile the `benchmark_compare` benchmark on macOS: - -```bash -./profile_benchmark.sh xctrace # Time Profiler trace -./profile_benchmark.sh sample # CLI sampling report -``` - -Environment overrides: `REPEAT_COUNT`, `DATA_DIR`, `TRACE_OUT`. - ## Benchmarks Run with `fpm test --profile release --target benchmark_compare` on Apple Silicon (M1 Max). Data files from [simple_fastfloat_benchmark](https://github.com/lemire/simple_fastfloat_benchmark). Use `./run_benchmarks.sh` to run the full suite (random + data files) with C++ comparison. -### Random uniform [0,1) -- 100k floats, 2.19 MB +### Random uniform [0,1) -- 100k floats, 2.10 MB + +``` +netlib (C) : 403.76 MB/s (+/- 1.5 %) 19.24 Mfloat/s +strtod (C) : 786.63 MB/s (+/- 1.3 %) 37.49 Mfloat/s +abseil (C++) : 917.36 MB/s (+/- 0.9 %) 43.72 Mfloat/s +fastfloat (C++) : 1586.85 MB/s (+/- 1.3 %) 75.63 Mfloat/s +ffc (C) : 1650.41 MB/s (+/- 1.3 %) 78.66 Mfloat/s +fortran (fast_float) : 876.76 MB/s (+/- 0.7 %) 41.79 Mfloat/s +fortran (stdlib to_num) : 1066.10 MB/s (+/- 1.0 %) 50.81 Mfloat/s +fortran (str2real) : 609.20 MB/s (+/- 0.7 %) 29.04 Mfloat/s +fortran (read *) : 56.90 MB/s (+/- 0.4 %) 2.71 Mfloat/s +ffc via fortran interop : 1493.30 MB/s (+/- 1.4 %) 71.17 Mfloat/s +``` + +### canada_short.txt -- 111k lines, 0.70 MB ``` -netlib (C) : 405.14 MB/s (+/- 1.2 %) 19.31 Mfloat/s -strtod (C) : 787.20 MB/s (+/- 1.0 %) 37.52 Mfloat/s -abseil (C++) : 919.15 MB/s (+/- 1.0 %) 43.81 Mfloat/s -fastfloat (C++) : 1586.45 MB/s (+/- 1.3 %) 75.61 Mfloat/s -ffc (C) : 1653.72 MB/s (+/- 1.6 %) 78.82 Mfloat/s -fortran (fast_float) : 2394.60 MB/s (+/- 8.8 %) 109.17 Mfloat/s -fortran (stdlib to_num) : 1078.39 MB/s (+/- 2.7 %) 49.16 Mfloat/s -fortran (str2real) : 666.91 MB/s (+/- 0.5 %) 30.40 Mfloat/s -fortran (read *) : 58.58 MB/s (+/- 0.5 %) 2.67 Mfloat/s -ffc via fortran interop : 4284.08 MB/s (+/- 11.0 %) 195.31 Mfloat/s +netlib (C) : 546.31 MB/s (+/- 4.0 %) 101.50 Mfloat/s +strtod (C) : 367.00 MB/s (+/- 0.9 %) 68.19 Mfloat/s +abseil (C++) : 361.36 MB/s (+/- 1.2 %) 67.14 Mfloat/s +fastfloat (C++) : 566.31 MB/s (+/- 1.0 %) 105.22 Mfloat/s +ffc (C) : 704.02 MB/s (+/- 1.1 %) 130.81 Mfloat/s +fortran (fast_float) : 638.99 MB/s (+/- 1.4 %) 118.72 Mfloat/s +fortran (stdlib to_num) : 710.33 MB/s (+/- 3.1 %) 131.98 Mfloat/s +fortran (str2real) : 243.62 MB/s (+/- 0.4 %) 45.27 Mfloat/s +fortran (read *) : 22.14 MB/s (+/- 0.6 %) 4.11 Mfloat/s +ffc via fortran interop : 598.70 MB/s (+/- 4.8 %) 111.24 Mfloat/s ``` -### canada.txt -- 111k lines, 1.93 MB +### canada.txt -- 111k lines, 2.04 MB ``` -netlib (C) : 385.05 MB/s (+/- 1.3 %) 22.13 Mfloat/s -strtod (C) : 686.85 MB/s (+/- 1.4 %) 39.47 Mfloat/s -abseil (C++) : 868.24 MB/s (+/- 1.6 %) 49.89 Mfloat/s -fastfloat (C++) : 1095.24 MB/s (+/- 1.4 %) 62.94 Mfloat/s -ffc (C) : 1169.13 MB/s (+/- 1.8 %) 67.19 Mfloat/s -fortran (fast_float) : 1011.37 MB/s (+/- 1.3 %) 58.12 Mfloat/s -fortran (stdlib to_num) : 1074.30 MB/s (+/- 2.3 %) 61.74 Mfloat/s -fortran (str2real) : 463.62 MB/s (+/- 1.5 %) 26.64 Mfloat/s -fortran (read *) : 49.77 MB/s (+/- 0.5 %) 2.86 Mfloat/s -ffc via fortran interop : 1089.43 MB/s (+/- 0.9 %) 62.61 Mfloat/s +netlib (C) : 384.11 MB/s (+/- 0.9 %) 22.07 Mfloat/s +strtod (C) : 686.30 MB/s (+/- 1.0 %) 39.44 Mfloat/s +abseil (C++) : 868.22 MB/s (+/- 1.2 %) 49.89 Mfloat/s +fastfloat (C++) : 1095.50 MB/s (+/- 1.1 %) 62.95 Mfloat/s +ffc (C) : 1169.13 MB/s (+/- 1.5 %) 67.19 Mfloat/s +fortran (fast_float) : 1036.86 MB/s (+/- 0.7 %) 59.58 Mfloat/s +fortran (stdlib to_num) : 1038.53 MB/s (+/- 1.0 %) 59.68 Mfloat/s +fortran (str2real) : 452.02 MB/s (+/- 0.4 %) 25.98 Mfloat/s +fortran (read *) : 49.86 MB/s (+/- 0.7 %) 2.87 Mfloat/s +ffc via fortran interop : 1048.10 MB/s (+/- 1.6 %) 60.23 Mfloat/s ``` -### mesh.txt -- 73k lines, 0.54 MB +### mesh.txt -- 73k lines, 0.61 MB ``` -netlib (C) : 537.08 MB/s (+/- 2.7 %) 73.17 Mfloat/s -strtod (C) : 523.55 MB/s (+/- 1.5 %) 71.32 Mfloat/s -abseil (C++) : 415.44 MB/s (+/- 1.3 %) 56.59 Mfloat/s -fastfloat (C++) : 825.16 MB/s (+/- 1.5 %) 112.41 Mfloat/s -ffc (C) : 948.34 MB/s (+/- 2.5 %) 129.19 Mfloat/s -fortran (fast_float) : 846.78 MB/s (+/- 2.4 %) 115.35 Mfloat/s -fortran (stdlib to_num) : 802.41 MB/s (+/- 1.0 %) 109.31 Mfloat/s -fortran (str2real) : 301.81 MB/s (+/- 0.4 %) 41.11 Mfloat/s -fortran (read *) : 28.43 MB/s (+/- 1.2 %) 3.87 Mfloat/s -ffc via fortran interop : 853.52 MB/s (+/- 0.9 %) 116.27 Mfloat/s +netlib (C) : 537.91 MB/s (+/- 2.1 %) 73.28 Mfloat/s +strtod (C) : 522.81 MB/s (+/- 1.2 %) 71.22 Mfloat/s +abseil (C++) : 415.05 MB/s (+/- 1.3 %) 56.54 Mfloat/s +fastfloat (C++) : 825.10 MB/s (+/- 1.1 %) 112.40 Mfloat/s +ffc (C) : 947.22 MB/s (+/- 1.3 %) 129.04 Mfloat/s +fortran (fast_float) : 885.97 MB/s (+/- 0.9 %) 120.69 Mfloat/s +fortran (stdlib to_num) : 796.45 MB/s (+/- 1.2 %) 108.50 Mfloat/s +fortran (str2real) : 298.61 MB/s (+/- 2.4 %) 40.68 Mfloat/s +fortran (read *) : 28.21 MB/s (+/- 0.6 %) 3.84 Mfloat/s +ffc via fortran interop : 838.82 MB/s (+/- 2.1 %) 114.27 Mfloat/s ``` ## Acknowledgements diff --git a/benchmark/benchmark_compare.f90 b/benchmark/benchmark_compare.f90 index a246ba9..9deeb11 100644 --- a/benchmark/benchmark_compare.f90 +++ b/benchmark/benchmark_compare.f90 @@ -135,6 +135,7 @@ subroutine generate_random_lines(howmany, lines, packed_text, packed_data, offse do i = 1, nlines call random_number(x) write(buf, '(es23.16)') x + buf = adjustl(buf) n = len_trim(buf) lines(i)%text = trim(buf) allocate(lines(i)%c_text(n)) @@ -267,7 +268,9 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths "ffc interop bits = " ] real(real64) :: min_ns(NCASES), avg_ns(NCASES), checksum(NCASES) - real(real64) :: answer, elapsed_ns, volume_mb, x_f + integer(int64) :: xor_checksum(NCASES) + real(real64), volatile :: answer + real(real64) :: elapsed_ns, volume_mb, x_f real(real64) :: x_stdlib, x_str2real, x_read real(c_double) :: x_c integer :: i, k, r, ios_read @@ -280,6 +283,33 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths avg_ns = 0.0_real64 checksum = 0.0_real64 + ! Compute XOR checksums in a single untimed pass + xor_checksum = 0_int64 + do i = 1, nlines + call parse_double_range_sub( & + packed_text, & + int(offsets(i), kind=kind(i)) + 1, & + int(offsets(i) + lengths(i), kind=kind(i)), & + x_f, f_result, DEFAULT_PARSING) + if (f_result%outcome == outcomes%OK) & + xor_checksum(B_RSUB) = ieor(xor_checksum(B_RSUB), transfer(x_f, 0_int64)) + + x_stdlib = 0.0_real64 + x_stdlib = to_num(lines(i)%text, x_stdlib) + xor_checksum(B_STDLIB) = ieor(xor_checksum(B_STDLIB), transfer(x_stdlib, 0_int64)) + + x_str2real = str2real(lines(i)%text) + xor_checksum(B_S2R) = ieor(xor_checksum(B_S2R), transfer(x_str2real, 0_int64)) + + read(lines(i)%text, *, iostat=ios_read) x_read + if (ios_read == 0) & + xor_checksum(B_READ) = ieor(xor_checksum(B_READ), transfer(x_read, 0_int64)) + + call ffc_parse_double_c(lines(i)%c_text, len(lines(i)%text), x_c, c_outcome) + if (c_outcome == outcomes%OK%state) & + xor_checksum(B_CLOOP) = ieor(xor_checksum(B_CLOOP), transfer(real(x_c, real64), 0_int64)) + end do + call system_clock(count_rate=count_rate) do r = 1, repeat_count answer = 0.0_real64 @@ -358,7 +388,7 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths call print_result(labels(k), volume_mb, nlines, min_ns(k), avg_ns(k)) end do do k = 1, NCASES - write(output_unit, "(a,z16.16)") cksum_labels(k), transfer(checksum(k), 0_int64) + write(output_unit, "(a,z16.16)") cksum_labels(k), xor_checksum(k) end do end subroutine run_benchmark diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90 index 08f1658..f1a1d5c 100644 --- a/src/fast_float_module.F90 +++ b/src/fast_float_module.F90 @@ -57,7 +57,7 @@ module fast_float_module integer(i8), parameter :: FMT_ALLOW_PLUS = int(b'010000000', i8) integer(i8), parameter :: FMT_SKIP_WS = int(b'100000000', i8) - integer(i8), parameter :: PRESET_GENERAL = ior(FMT_FIXED, FMT_SCIENTIFIC) + integer(i8), parameter :: PRESET_GENERAL = ior(ior(FMT_FIXED, FMT_SCIENTIFIC), FMT_SKIP_WS) integer(i8), parameter :: PRESET_JSON = ior(ior(FMT_JSON, PRESET_GENERAL), FMT_NO_INFNAN) integer(i8), parameter :: PRESET_FORTRAN = ior(FMT_FORTRAN, PRESET_GENERAL) @@ -868,18 +868,6 @@ end subroutine mul_u64 ! ===== Character and digit utilities ===== - !> Reinterpret a double as its 64-bit integer representation. - pure elemental integer(i8) function get_double_bits(d) - real(dp), intent(in) :: d - get_double_bits = transfer(d, 0_i8) - end function get_double_bits - - !> Reinterpret a float as its 32-bit integer representation. - pure elemental integer(i4) function get_float_bits(f) - real(sp), intent(in) :: f - get_float_bits = transfer(f, 0_i4) - end function get_float_bits - !> Count leading zero bits in a 64-bit integer. pure elemental integer function count_leading_zeros(x) integer(i8), intent(in) :: x @@ -1886,11 +1874,9 @@ end subroutine bigint_hi64 pure elemental integer function bigint_compare(a, b) type(bigint), intent(in) :: a, b integer :: j - if (a%vec%ln > b%vec%ln) then - bigint_compare = 1 - return - else if (a%vec%ln < b%vec%ln) then - bigint_compare = -1 + j = a%vec%ln - b%vec%ln + if (j/=0) then + bigint_compare = sign(1,j) return end if bigint_compare = 0 @@ -1942,7 +1928,7 @@ pure elemental subroutine to_extended(vd, f, res) integer(i8) :: b integer(i4) :: bi bi = int(f%mantissa_bits - f%min_exponent, i4) - b = get_double_bits(vd) + b = transfer(vd, b) if (iand(b, f%exponent_mask) == 0) then res = adjusted_mantissa( iand(b, f%mantissa_mask), 1 - bi) else