From 084c6db1df343bdba14bc212fcc48dddbc88e7ee Mon Sep 17 00:00:00 2001 From: Federico Perini Date: Thu, 26 Mar 2026 22:05:19 +0100 Subject: [PATCH 1/5] Fix benchmark: strip leading spaces, add meaningful XOR checksums The es23.16 format produces a leading space for positive numbers. trim() only strips trailing blanks, so fast_float and ffc (which follow C from_chars convention: no whitespace skipping) failed every parse silently. Use adjustl() to left-justify before trimming. Add XOR-based checksums computed in a separate untimed pass over all parsed values, replacing the old max-only checksum that only captured the last repeat's maximum. Mark answer as volatile to prevent the compiler from eliminating the timed loops in release builds. --- benchmark/benchmark_compare.f90 | 34 +++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_compare.f90 b/benchmark/benchmark_compare.f90 index a246ba9..9deeb11 100644 --- a/benchmark/benchmark_compare.f90 +++ b/benchmark/benchmark_compare.f90 @@ -135,6 +135,7 @@ subroutine generate_random_lines(howmany, lines, packed_text, packed_data, offse do i = 1, nlines call random_number(x) write(buf, '(es23.16)') x + buf = adjustl(buf) n = len_trim(buf) lines(i)%text = trim(buf) allocate(lines(i)%c_text(n)) @@ -267,7 +268,9 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths "ffc interop bits = " ] real(real64) :: min_ns(NCASES), avg_ns(NCASES), checksum(NCASES) - real(real64) :: answer, elapsed_ns, volume_mb, x_f + integer(int64) :: xor_checksum(NCASES) + real(real64), volatile :: answer + real(real64) :: elapsed_ns, volume_mb, x_f real(real64) :: x_stdlib, x_str2real, x_read real(c_double) :: x_c integer :: i, k, r, ios_read @@ -280,6 +283,33 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths avg_ns = 0.0_real64 checksum = 0.0_real64 + ! Compute XOR checksums in a single untimed pass + xor_checksum = 0_int64 + do i = 1, nlines + call parse_double_range_sub( & + packed_text, & + int(offsets(i), kind=kind(i)) + 1, & + int(offsets(i) + lengths(i), kind=kind(i)), & + x_f, f_result, DEFAULT_PARSING) + if (f_result%outcome == outcomes%OK) & + xor_checksum(B_RSUB) = ieor(xor_checksum(B_RSUB), transfer(x_f, 0_int64)) + + x_stdlib = 0.0_real64 + x_stdlib = to_num(lines(i)%text, x_stdlib) + xor_checksum(B_STDLIB) = ieor(xor_checksum(B_STDLIB), transfer(x_stdlib, 0_int64)) + + x_str2real = str2real(lines(i)%text) + xor_checksum(B_S2R) = ieor(xor_checksum(B_S2R), transfer(x_str2real, 0_int64)) + + read(lines(i)%text, *, iostat=ios_read) x_read + if (ios_read == 0) & + xor_checksum(B_READ) = ieor(xor_checksum(B_READ), transfer(x_read, 0_int64)) + + call ffc_parse_double_c(lines(i)%c_text, len(lines(i)%text), x_c, c_outcome) + if (c_outcome == outcomes%OK%state) & + xor_checksum(B_CLOOP) = ieor(xor_checksum(B_CLOOP), transfer(real(x_c, real64), 0_int64)) + end do + call system_clock(count_rate=count_rate) do r = 1, repeat_count answer = 0.0_real64 @@ -358,7 +388,7 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths call print_result(labels(k), volume_mb, nlines, min_ns(k), avg_ns(k)) end do do k = 1, NCASES - write(output_unit, "(a,z16.16)") cksum_labels(k), transfer(checksum(k), 0_int64) + write(output_unit, "(a,z16.16)") cksum_labels(k), xor_checksum(k) end do end subroutine run_benchmark From 4bd4124b496c129b08d24a43babc3fb2b266c1d8 Mon Sep 17 00:00:00 2001 From: Federico Perini Date: Thu, 26 Mar 2026 22:23:29 +0100 Subject: [PATCH 2/5] Update README benchmarks with corrected results Previous random-uniform results were inflated for fast_float and ffc because leading spaces caused all parses to fail silently. These are the real numbers after the adjustl fix. Also add canada_short.txt results and remove stale profiling section (profile_benchmark.sh was deleted earlier). --- README.md | 92 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 5324618..eff542d 100644 --- a/README.md +++ b/README.md @@ -65,66 +65,70 @@ fpm build fpm test ``` -## Profiling - -Use `profile_benchmark.sh` to profile the `benchmark_compare` benchmark on macOS: - -```bash -./profile_benchmark.sh xctrace # Time Profiler trace -./profile_benchmark.sh sample # CLI sampling report -``` - -Environment overrides: `REPEAT_COUNT`, `DATA_DIR`, `TRACE_OUT`. - ## Benchmarks Run with `fpm test --profile release --target benchmark_compare` on Apple Silicon (M1 Max). Data files from [simple_fastfloat_benchmark](https://github.com/lemire/simple_fastfloat_benchmark). Use `./run_benchmarks.sh` to run the full suite (random + data files) with C++ comparison. -### Random uniform [0,1) -- 100k floats, 2.19 MB +### Random uniform [0,1) -- 100k floats, 2.10 MB + +``` +netlib (C) : 403.76 MB/s (+/- 1.5 %) 19.24 Mfloat/s +strtod (C) : 786.63 MB/s (+/- 1.3 %) 37.49 Mfloat/s +abseil (C++) : 917.36 MB/s (+/- 0.9 %) 43.72 Mfloat/s +fastfloat (C++) : 1586.85 MB/s (+/- 1.3 %) 75.63 Mfloat/s +ffc (C) : 1650.41 MB/s (+/- 1.3 %) 78.66 Mfloat/s +fortran (fast_float) : 876.76 MB/s (+/- 0.7 %) 41.79 Mfloat/s +fortran (stdlib to_num) : 1066.10 MB/s (+/- 1.0 %) 50.81 Mfloat/s +fortran (str2real) : 609.20 MB/s (+/- 0.7 %) 29.04 Mfloat/s +fortran (read *) : 56.90 MB/s (+/- 0.4 %) 2.71 Mfloat/s +ffc via fortran interop : 1493.30 MB/s (+/- 1.4 %) 71.17 Mfloat/s +``` + +### canada_short.txt -- 111k lines, 0.70 MB ``` -netlib (C) : 405.14 MB/s (+/- 1.2 %) 19.31 Mfloat/s -strtod (C) : 787.20 MB/s (+/- 1.0 %) 37.52 Mfloat/s -abseil (C++) : 919.15 MB/s (+/- 1.0 %) 43.81 Mfloat/s -fastfloat (C++) : 1586.45 MB/s (+/- 1.3 %) 75.61 Mfloat/s -ffc (C) : 1653.72 MB/s (+/- 1.6 %) 78.82 Mfloat/s -fortran (fast_float) : 2394.60 MB/s (+/- 8.8 %) 109.17 Mfloat/s -fortran (stdlib to_num) : 1078.39 MB/s (+/- 2.7 %) 49.16 Mfloat/s -fortran (str2real) : 666.91 MB/s (+/- 0.5 %) 30.40 Mfloat/s -fortran (read *) : 58.58 MB/s (+/- 0.5 %) 2.67 Mfloat/s -ffc via fortran interop : 4284.08 MB/s (+/- 11.0 %) 195.31 Mfloat/s +netlib (C) : 546.31 MB/s (+/- 4.0 %) 101.50 Mfloat/s +strtod (C) : 367.00 MB/s (+/- 0.9 %) 68.19 Mfloat/s +abseil (C++) : 361.36 MB/s (+/- 1.2 %) 67.14 Mfloat/s +fastfloat (C++) : 566.31 MB/s (+/- 1.0 %) 105.22 Mfloat/s +ffc (C) : 704.02 MB/s (+/- 1.1 %) 130.81 Mfloat/s +fortran (fast_float) : 638.99 MB/s (+/- 1.4 %) 118.72 Mfloat/s +fortran (stdlib to_num) : 710.33 MB/s (+/- 3.1 %) 131.98 Mfloat/s +fortran (str2real) : 243.62 MB/s (+/- 0.4 %) 45.27 Mfloat/s +fortran (read *) : 22.14 MB/s (+/- 0.6 %) 4.11 Mfloat/s +ffc via fortran interop : 598.70 MB/s (+/- 4.8 %) 111.24 Mfloat/s ``` -### canada.txt -- 111k lines, 1.93 MB +### canada.txt -- 111k lines, 2.04 MB ``` -netlib (C) : 385.05 MB/s (+/- 1.3 %) 22.13 Mfloat/s -strtod (C) : 686.85 MB/s (+/- 1.4 %) 39.47 Mfloat/s -abseil (C++) : 868.24 MB/s (+/- 1.6 %) 49.89 Mfloat/s -fastfloat (C++) : 1095.24 MB/s (+/- 1.4 %) 62.94 Mfloat/s -ffc (C) : 1169.13 MB/s (+/- 1.8 %) 67.19 Mfloat/s -fortran (fast_float) : 1011.37 MB/s (+/- 1.3 %) 58.12 Mfloat/s -fortran (stdlib to_num) : 1074.30 MB/s (+/- 2.3 %) 61.74 Mfloat/s -fortran (str2real) : 463.62 MB/s (+/- 1.5 %) 26.64 Mfloat/s -fortran (read *) : 49.77 MB/s (+/- 0.5 %) 2.86 Mfloat/s -ffc via fortran interop : 1089.43 MB/s (+/- 0.9 %) 62.61 Mfloat/s +netlib (C) : 384.11 MB/s (+/- 0.9 %) 22.07 Mfloat/s +strtod (C) : 686.30 MB/s (+/- 1.0 %) 39.44 Mfloat/s +abseil (C++) : 868.22 MB/s (+/- 1.2 %) 49.89 Mfloat/s +fastfloat (C++) : 1095.50 MB/s (+/- 1.1 %) 62.95 Mfloat/s +ffc (C) : 1169.13 MB/s (+/- 1.5 %) 67.19 Mfloat/s +fortran (fast_float) : 1036.86 MB/s (+/- 0.7 %) 59.58 Mfloat/s +fortran (stdlib to_num) : 1038.53 MB/s (+/- 1.0 %) 59.68 Mfloat/s +fortran (str2real) : 452.02 MB/s (+/- 0.4 %) 25.98 Mfloat/s +fortran (read *) : 49.86 MB/s (+/- 0.7 %) 2.87 Mfloat/s +ffc via fortran interop : 1048.10 MB/s (+/- 1.6 %) 60.23 Mfloat/s ``` -### mesh.txt -- 73k lines, 0.54 MB +### mesh.txt -- 73k lines, 0.61 MB ``` -netlib (C) : 537.08 MB/s (+/- 2.7 %) 73.17 Mfloat/s -strtod (C) : 523.55 MB/s (+/- 1.5 %) 71.32 Mfloat/s -abseil (C++) : 415.44 MB/s (+/- 1.3 %) 56.59 Mfloat/s -fastfloat (C++) : 825.16 MB/s (+/- 1.5 %) 112.41 Mfloat/s -ffc (C) : 948.34 MB/s (+/- 2.5 %) 129.19 Mfloat/s -fortran (fast_float) : 846.78 MB/s (+/- 2.4 %) 115.35 Mfloat/s -fortran (stdlib to_num) : 802.41 MB/s (+/- 1.0 %) 109.31 Mfloat/s -fortran (str2real) : 301.81 MB/s (+/- 0.4 %) 41.11 Mfloat/s -fortran (read *) : 28.43 MB/s (+/- 1.2 %) 3.87 Mfloat/s -ffc via fortran interop : 853.52 MB/s (+/- 0.9 %) 116.27 Mfloat/s +netlib (C) : 537.91 MB/s (+/- 2.1 %) 73.28 Mfloat/s +strtod (C) : 522.81 MB/s (+/- 1.2 %) 71.22 Mfloat/s +abseil (C++) : 415.05 MB/s (+/- 1.3 %) 56.54 Mfloat/s +fastfloat (C++) : 825.10 MB/s (+/- 1.1 %) 112.40 Mfloat/s +ffc (C) : 947.22 MB/s (+/- 1.3 %) 129.04 Mfloat/s +fortran (fast_float) : 885.97 MB/s (+/- 0.9 %) 120.69 Mfloat/s +fortran (stdlib to_num) : 796.45 MB/s (+/- 1.2 %) 108.50 Mfloat/s +fortran (str2real) : 298.61 MB/s (+/- 2.4 %) 40.68 Mfloat/s +fortran (read *) : 28.21 MB/s (+/- 0.6 %) 3.84 Mfloat/s +ffc via fortran interop : 838.82 MB/s (+/- 2.1 %) 114.27 Mfloat/s ``` ## Acknowledgements From c220f02a0607aa10430c2f12bb2fd49b02faa5ff Mon Sep 17 00:00:00 2001 From: Federico Perini Date: Thu, 26 Mar 2026 22:26:41 +0100 Subject: [PATCH 3/5] Enable FMT_SKIP_WS in DEFAULT_PARSING Add FMT_SKIP_WS to PRESET_GENERAL so all parse functions skip leading whitespace by default. This matches Fortran convention (read *, stdlib, str2real all skip whitespace) and the upstream ffc.h library which has FFC_FORMAT_FLAG_SKIP_WHITE_SPACE for the same purpose. Keep adjustl in benchmark data generation so the C interop path (which uses ffc_from_chars_double without options) also gets clean input. --- src/fast_float_module.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90 index 08f1658..ba84c60 100644 --- a/src/fast_float_module.F90 +++ b/src/fast_float_module.F90 @@ -57,7 +57,7 @@ module fast_float_module integer(i8), parameter :: FMT_ALLOW_PLUS = int(b'010000000', i8) integer(i8), parameter :: FMT_SKIP_WS = int(b'100000000', i8) - integer(i8), parameter :: PRESET_GENERAL = ior(FMT_FIXED, FMT_SCIENTIFIC) + integer(i8), parameter :: PRESET_GENERAL = ior(ior(FMT_FIXED, FMT_SCIENTIFIC), FMT_SKIP_WS) integer(i8), parameter :: PRESET_JSON = ior(ior(FMT_JSON, PRESET_GENERAL), FMT_NO_INFNAN) integer(i8), parameter :: PRESET_FORTRAN = ior(FMT_FORTRAN, PRESET_GENERAL) From 6abe43832643cd47bddda3d29b43c87b29ba8cd6 Mon Sep 17 00:00:00 2001 From: Federico Perini Date: Thu, 26 Mar 2026 22:34:35 +0100 Subject: [PATCH 4/5] Simplify bigint_compare with sign intrinsic --- src/fast_float_module.F90 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90 index ba84c60..1f9ce28 100644 --- a/src/fast_float_module.F90 +++ b/src/fast_float_module.F90 @@ -1886,11 +1886,9 @@ end subroutine bigint_hi64 pure elemental integer function bigint_compare(a, b) type(bigint), intent(in) :: a, b integer :: j - if (a%vec%ln > b%vec%ln) then - bigint_compare = 1 - return - else if (a%vec%ln < b%vec%ln) then - bigint_compare = -1 + j = a%vec%ln - b%vec%ln + if (j/=0) then + bigint_compare = sign(1,j) return end if bigint_compare = 0 From 5c7501c46670ca79d75177b937061c545c53332a Mon Sep 17 00:00:00 2001 From: Federico Perini Date: Thu, 26 Mar 2026 22:38:33 +0100 Subject: [PATCH 5/5] Remove get_double_bits/get_float_bits wrappers, use transfer directly --- src/fast_float_module.F90 | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90 index 1f9ce28..f1a1d5c 100644 --- a/src/fast_float_module.F90 +++ b/src/fast_float_module.F90 @@ -868,18 +868,6 @@ end subroutine mul_u64 ! ===== Character and digit utilities ===== - !> Reinterpret a double as its 64-bit integer representation. - pure elemental integer(i8) function get_double_bits(d) - real(dp), intent(in) :: d - get_double_bits = transfer(d, 0_i8) - end function get_double_bits - - !> Reinterpret a float as its 32-bit integer representation. - pure elemental integer(i4) function get_float_bits(f) - real(sp), intent(in) :: f - get_float_bits = transfer(f, 0_i4) - end function get_float_bits - !> Count leading zero bits in a 64-bit integer. pure elemental integer function count_leading_zeros(x) integer(i8), intent(in) :: x @@ -1940,7 +1928,7 @@ pure elemental subroutine to_extended(vd, f, res) integer(i8) :: b integer(i4) :: bi bi = int(f%mantissa_bits - f%min_exponent, i4) - b = get_double_bits(vd) + b = transfer(vd, b) if (iand(b, f%exponent_mask) == 0) then res = adjusted_mantissa( iand(b, f%mantissa_mask), 1 - bi) else