From 084c6db1df343bdba14bc212fcc48dddbc88e7ee Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Thu, 26 Mar 2026 22:05:19 +0100
Subject: [PATCH 1/5] Fix benchmark: strip leading spaces, add meaningful XOR
 checksums

The es23.16 format produces a leading space for positive numbers.
trim() only strips trailing blanks, so fast_float and ffc (which
follow the C++ std::from_chars convention: no whitespace skipping)
failed every parse silently. Use adjustl() to left-justify before
trimming.

Add XOR-based checksums, computed in a separate untimed pass over all
parsed values, and print them in place of the old max-based checksum,
which only captured the last repeat's maximum. Mark answer as volatile
to prevent the compiler from eliminating the timed loops in release
builds.
---
 benchmark/benchmark_compare.f90 | 34 +++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/benchmark/benchmark_compare.f90 b/benchmark/benchmark_compare.f90
index a246ba9..9deeb11 100644
--- a/benchmark/benchmark_compare.f90
+++ b/benchmark/benchmark_compare.f90
@@ -135,6 +135,7 @@ subroutine generate_random_lines(howmany, lines, packed_text, packed_data, offse
         do i = 1, nlines
             call random_number(x)
             write(buf, '(es23.16)') x
+            buf = adjustl(buf)
             n = len_trim(buf)
             lines(i)%text = trim(buf)
             allocate(lines(i)%c_text(n))
@@ -267,7 +268,9 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths
             "ffc interop bits      = " ]
 
         real(real64) :: min_ns(NCASES), avg_ns(NCASES), checksum(NCASES)
-        real(real64) :: answer, elapsed_ns, volume_mb, x_f
+        integer(int64) :: xor_checksum(NCASES)
+        real(real64), volatile :: answer
+        real(real64) :: elapsed_ns, volume_mb, x_f
         real(real64) :: x_stdlib, x_str2real, x_read
         real(c_double) :: x_c
         integer :: i, k, r, ios_read
@@ -280,6 +283,33 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths
         avg_ns = 0.0_real64
         checksum = 0.0_real64
 
+        ! Compute XOR checksums in a single untimed pass
+        xor_checksum = 0_int64
+        do i = 1, nlines
+            call parse_double_range_sub( &
+                packed_text, &
+                int(offsets(i), kind=kind(i)) + 1, &
+                int(offsets(i) + lengths(i), kind=kind(i)), &
+                x_f, f_result, DEFAULT_PARSING)
+            if (f_result%outcome == outcomes%OK) &
+                xor_checksum(B_RSUB) = ieor(xor_checksum(B_RSUB), transfer(x_f, 0_int64))
+
+            x_stdlib = 0.0_real64
+            x_stdlib = to_num(lines(i)%text, x_stdlib)
+            xor_checksum(B_STDLIB) = ieor(xor_checksum(B_STDLIB), transfer(x_stdlib, 0_int64))
+
+            x_str2real = str2real(lines(i)%text)
+            xor_checksum(B_S2R) = ieor(xor_checksum(B_S2R), transfer(x_str2real, 0_int64))
+
+            read(lines(i)%text, *, iostat=ios_read) x_read
+            if (ios_read == 0) &
+                xor_checksum(B_READ) = ieor(xor_checksum(B_READ), transfer(x_read, 0_int64))
+
+            call ffc_parse_double_c(lines(i)%c_text, len(lines(i)%text), x_c, c_outcome)
+            if (c_outcome == outcomes%OK%state) &
+                xor_checksum(B_CLOOP) = ieor(xor_checksum(B_CLOOP), transfer(real(x_c, real64), 0_int64))
+        end do
+
         call system_clock(count_rate=count_rate)
         do r = 1, repeat_count
             answer = 0.0_real64
@@ -358,7 +388,7 @@ subroutine run_benchmark(name, lines, packed_text, packed_data, offsets, lengths
             call print_result(labels(k), volume_mb, nlines, min_ns(k), avg_ns(k))
         end do
         do k = 1, NCASES
-            write(output_unit, "(a,z16.16)") cksum_labels(k), transfer(checksum(k), 0_int64)
+            write(output_unit, "(a,z16.16)") cksum_labels(k), xor_checksum(k)
         end do
     end subroutine run_benchmark
 

From 4bd4124b496c129b08d24a43babc3fb2b266c1d8 Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Thu, 26 Mar 2026 22:23:29 +0100
Subject: [PATCH 2/5] Update README benchmarks with corrected results

Previous random-uniform results were inflated for the two Fortran-side
rows, "fortran (fast_float)" and "ffc via fortran interop", because
leading spaces caused all parses to fail silently. These are the real
numbers after the adjustl fix.

Also add canada_short.txt results, refresh the data-set sizes in the
section headings, and remove the stale profiling section
(profile_benchmark.sh was deleted earlier).
---
 README.md | 92 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 5324618..eff542d 100644
--- a/README.md
+++ b/README.md
@@ -65,66 +65,70 @@ fpm build
 fpm test
 ```
 
-## Profiling
-
-Use `profile_benchmark.sh` to profile the `benchmark_compare` benchmark on macOS:
-
-```bash
-./profile_benchmark.sh xctrace   # Time Profiler trace
-./profile_benchmark.sh sample    # CLI sampling report
-```
-
-Environment overrides: `REPEAT_COUNT`, `DATA_DIR`, `TRACE_OUT`.
-
 ## Benchmarks
 
 Run with `fpm test --profile release --target benchmark_compare` on Apple Silicon (M1 Max).
 Data files from [simple_fastfloat_benchmark](https://github.com/lemire/simple_fastfloat_benchmark).
 Use `./run_benchmarks.sh` to run the full suite (random + data files) with C++ comparison.
 
-### Random uniform [0,1) -- 100k floats, 2.19 MB
+### Random uniform [0,1) -- 100k floats, 2.10 MB
+
+```
+netlib              (C)                 :   403.76 MB/s (+/- 1.5 %)    19.24 Mfloat/s
+strtod              (C)                 :   786.63 MB/s (+/- 1.3 %)    37.49 Mfloat/s
+abseil              (C++)               :   917.36 MB/s (+/- 0.9 %)    43.72 Mfloat/s
+fastfloat           (C++)               :  1586.85 MB/s (+/- 1.3 %)    75.63 Mfloat/s
+ffc                 (C)                 :  1650.41 MB/s (+/- 1.3 %)    78.66 Mfloat/s
+fortran (fast_float)                    :   876.76 MB/s (+/- 0.7 %)    41.79 Mfloat/s
+fortran (stdlib to_num)                 :  1066.10 MB/s (+/- 1.0 %)    50.81 Mfloat/s
+fortran (str2real)                      :   609.20 MB/s (+/- 0.7 %)    29.04 Mfloat/s
+fortran (read *)                        :    56.90 MB/s (+/- 0.4 %)     2.71 Mfloat/s
+ffc via fortran interop                 :  1493.30 MB/s (+/- 1.4 %)    71.17 Mfloat/s
+```
+
+### canada_short.txt -- 111k lines, 0.70 MB
 
 ```
-netlib              (C)                 :   405.14 MB/s (+/-  1.2 %)    19.31 Mfloat/s
-strtod              (C)                 :   787.20 MB/s (+/-  1.0 %)    37.52 Mfloat/s
-abseil              (C++)               :   919.15 MB/s (+/-  1.0 %)    43.81 Mfloat/s
-fastfloat           (C++)               :  1586.45 MB/s (+/-  1.3 %)    75.61 Mfloat/s
-ffc                 (C)                 :  1653.72 MB/s (+/-  1.6 %)    78.82 Mfloat/s
-fortran (fast_float)                    :  2394.60 MB/s (+/-  8.8 %)   109.17 Mfloat/s
-fortran (stdlib to_num)                 :  1078.39 MB/s (+/-  2.7 %)    49.16 Mfloat/s
-fortran (str2real)                      :   666.91 MB/s (+/-  0.5 %)    30.40 Mfloat/s
-fortran (read *)                        :    58.58 MB/s (+/-  0.5 %)     2.67 Mfloat/s
-ffc via fortran interop                 :  4284.08 MB/s (+/- 11.0 %)   195.31 Mfloat/s
+netlib              (C)                 :   546.31 MB/s (+/- 4.0 %)   101.50 Mfloat/s
+strtod              (C)                 :   367.00 MB/s (+/- 0.9 %)    68.19 Mfloat/s
+abseil              (C++)               :   361.36 MB/s (+/- 1.2 %)    67.14 Mfloat/s
+fastfloat           (C++)               :   566.31 MB/s (+/- 1.0 %)   105.22 Mfloat/s
+ffc                 (C)                 :   704.02 MB/s (+/- 1.1 %)   130.81 Mfloat/s
+fortran (fast_float)                    :   638.99 MB/s (+/- 1.4 %)   118.72 Mfloat/s
+fortran (stdlib to_num)                 :   710.33 MB/s (+/- 3.1 %)   131.98 Mfloat/s
+fortran (str2real)                      :   243.62 MB/s (+/- 0.4 %)    45.27 Mfloat/s
+fortran (read *)                        :    22.14 MB/s (+/- 0.6 %)     4.11 Mfloat/s
+ffc via fortran interop                 :   598.70 MB/s (+/- 4.8 %)   111.24 Mfloat/s
 ```
 
-### canada.txt -- 111k lines, 1.93 MB
+### canada.txt -- 111k lines, 2.04 MB
 
 ```
-netlib              (C)                 :   385.05 MB/s (+/-  1.3 %)    22.13 Mfloat/s
-strtod              (C)                 :   686.85 MB/s (+/-  1.4 %)    39.47 Mfloat/s
-abseil              (C++)               :   868.24 MB/s (+/-  1.6 %)    49.89 Mfloat/s
-fastfloat           (C++)               :  1095.24 MB/s (+/-  1.4 %)    62.94 Mfloat/s
-ffc                 (C)                 :  1169.13 MB/s (+/-  1.8 %)    67.19 Mfloat/s
-fortran (fast_float)                    :  1011.37 MB/s (+/-  1.3 %)    58.12 Mfloat/s
-fortran (stdlib to_num)                 :  1074.30 MB/s (+/-  2.3 %)    61.74 Mfloat/s
-fortran (str2real)                      :   463.62 MB/s (+/-  1.5 %)    26.64 Mfloat/s
-fortran (read *)                        :    49.77 MB/s (+/-  0.5 %)     2.86 Mfloat/s
-ffc via fortran interop                 :  1089.43 MB/s (+/-  0.9 %)    62.61 Mfloat/s
+netlib              (C)                 :   384.11 MB/s (+/- 0.9 %)    22.07 Mfloat/s
+strtod              (C)                 :   686.30 MB/s (+/- 1.0 %)    39.44 Mfloat/s
+abseil              (C++)               :   868.22 MB/s (+/- 1.2 %)    49.89 Mfloat/s
+fastfloat           (C++)               :  1095.50 MB/s (+/- 1.1 %)    62.95 Mfloat/s
+ffc                 (C)                 :  1169.13 MB/s (+/- 1.5 %)    67.19 Mfloat/s
+fortran (fast_float)                    :  1036.86 MB/s (+/- 0.7 %)    59.58 Mfloat/s
+fortran (stdlib to_num)                 :  1038.53 MB/s (+/- 1.0 %)    59.68 Mfloat/s
+fortran (str2real)                      :   452.02 MB/s (+/- 0.4 %)    25.98 Mfloat/s
+fortran (read *)                        :    49.86 MB/s (+/- 0.7 %)     2.87 Mfloat/s
+ffc via fortran interop                 :  1048.10 MB/s (+/- 1.6 %)    60.23 Mfloat/s
 ```
 
-### mesh.txt -- 73k lines, 0.54 MB
+### mesh.txt -- 73k lines, 0.61 MB
 
 ```
-netlib              (C)                 :   537.08 MB/s (+/-  2.7 %)    73.17 Mfloat/s
-strtod              (C)                 :   523.55 MB/s (+/-  1.5 %)    71.32 Mfloat/s
-abseil              (C++)               :   415.44 MB/s (+/-  1.3 %)    56.59 Mfloat/s
-fastfloat           (C++)               :   825.16 MB/s (+/-  1.5 %)   112.41 Mfloat/s
-ffc                 (C)                 :   948.34 MB/s (+/-  2.5 %)   129.19 Mfloat/s
-fortran (fast_float)                    :   846.78 MB/s (+/-  2.4 %)   115.35 Mfloat/s
-fortran (stdlib to_num)                 :   802.41 MB/s (+/-  1.0 %)   109.31 Mfloat/s
-fortran (str2real)                      :   301.81 MB/s (+/-  0.4 %)    41.11 Mfloat/s
-fortran (read *)                        :    28.43 MB/s (+/-  1.2 %)     3.87 Mfloat/s
-ffc via fortran interop                 :   853.52 MB/s (+/-  0.9 %)   116.27 Mfloat/s
+netlib              (C)                 :   537.91 MB/s (+/- 2.1 %)    73.28 Mfloat/s
+strtod              (C)                 :   522.81 MB/s (+/- 1.2 %)    71.22 Mfloat/s
+abseil              (C++)               :   415.05 MB/s (+/- 1.3 %)    56.54 Mfloat/s
+fastfloat           (C++)               :   825.10 MB/s (+/- 1.1 %)   112.40 Mfloat/s
+ffc                 (C)                 :   947.22 MB/s (+/- 1.3 %)   129.04 Mfloat/s
+fortran (fast_float)                    :   885.97 MB/s (+/- 0.9 %)   120.69 Mfloat/s
+fortran (stdlib to_num)                 :   796.45 MB/s (+/- 1.2 %)   108.50 Mfloat/s
+fortran (str2real)                      :   298.61 MB/s (+/- 2.4 %)    40.68 Mfloat/s
+fortran (read *)                        :    28.21 MB/s (+/- 0.6 %)     3.84 Mfloat/s
+ffc via fortran interop                 :   838.82 MB/s (+/- 2.1 %)   114.27 Mfloat/s
 ```
 
 ## Acknowledgements

From c220f02a0607aa10430c2f12bb2fd49b02faa5ff Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Thu, 26 Mar 2026 22:26:41 +0100
Subject: [PATCH 3/5] Enable FMT_SKIP_WS in DEFAULT_PARSING

Add FMT_SKIP_WS to PRESET_GENERAL so all parse functions skip leading
whitespace by default. PRESET_JSON and PRESET_FORTRAN are built from
PRESET_GENERAL, so they inherit the flag as well. This matches Fortran
convention (read *, stdlib, str2real all skip whitespace) and the
upstream ffc.h library, which has FFC_FORMAT_FLAG_SKIP_WHITE_SPACE for
the same purpose.

Keep adjustl in benchmark data generation so the C interop path (which
uses ffc_from_chars_double without options) also gets clean input.
---
 src/fast_float_module.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90
index 08f1658..ba84c60 100644
--- a/src/fast_float_module.F90
+++ b/src/fast_float_module.F90
@@ -57,7 +57,7 @@ module fast_float_module
     integer(i8), parameter :: FMT_ALLOW_PLUS = int(b'010000000', i8)
     integer(i8), parameter :: FMT_SKIP_WS    = int(b'100000000', i8)
 
-    integer(i8), parameter :: PRESET_GENERAL = ior(FMT_FIXED, FMT_SCIENTIFIC)
+    integer(i8), parameter :: PRESET_GENERAL = ior(ior(FMT_FIXED, FMT_SCIENTIFIC), FMT_SKIP_WS)
     integer(i8), parameter :: PRESET_JSON    = ior(ior(FMT_JSON, PRESET_GENERAL), FMT_NO_INFNAN)
     integer(i8), parameter :: PRESET_FORTRAN = ior(FMT_FORTRAN, PRESET_GENERAL)
 

From 6abe43832643cd47bddda3d29b43c87b29ba8cd6 Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Thu, 26 Mar 2026 22:34:35 +0100
Subject: [PATCH 4/5] Simplify bigint_compare with sign intrinsic

---
 src/fast_float_module.F90 | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90
index ba84c60..1f9ce28 100644
--- a/src/fast_float_module.F90
+++ b/src/fast_float_module.F90
@@ -1886,11 +1886,9 @@ end subroutine bigint_hi64
     pure elemental integer function bigint_compare(a, b)
         type(bigint), intent(in) :: a, b
         integer :: j
-        if (a%vec%ln > b%vec%ln) then
-            bigint_compare = 1
-            return
-        else if (a%vec%ln < b%vec%ln) then
-            bigint_compare = -1
+        j = a%vec%ln - b%vec%ln
+        if (j/=0) then
+            bigint_compare = sign(1,j)
             return
         end if
         bigint_compare = 0

From 5c7501c46670ca79d75177b937061c545c53332a Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Thu, 26 Mar 2026 22:38:33 +0100
Subject: [PATCH 5/5] Remove get_double_bits/get_float_bits wrappers, use
 transfer directly

---
 src/fast_float_module.F90 | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/fast_float_module.F90 b/src/fast_float_module.F90
index 1f9ce28..f1a1d5c 100644
--- a/src/fast_float_module.F90
+++ b/src/fast_float_module.F90
@@ -868,18 +868,6 @@ end subroutine mul_u64
 
     ! ===== Character and digit utilities =====
 
-    !> Reinterpret a double as its 64-bit integer representation.
-    pure elemental integer(i8) function get_double_bits(d)
-        real(dp), intent(in) :: d
-        get_double_bits = transfer(d, 0_i8)
-    end function get_double_bits
-
-    !> Reinterpret a float as its 32-bit integer representation.
-    pure elemental integer(i4) function get_float_bits(f)
-        real(sp), intent(in) :: f
-        get_float_bits = transfer(f, 0_i4)
-    end function get_float_bits
-
     !> Count leading zero bits in a 64-bit integer.
     pure elemental integer function count_leading_zeros(x)
         integer(i8), intent(in) :: x
@@ -1940,7 +1928,7 @@ pure elemental subroutine to_extended(vd, f, res)
         integer(i8) :: b
         integer(i4) :: bi
         bi = int(f%mantissa_bits - f%min_exponent, i4)
-        b = get_double_bits(vd)
+        b = transfer(vd, b)
         if (iand(b, f%exponent_mask) == 0) then
             res = adjusted_mantissa( iand(b, f%mantissa_mask), 1 - bi)
         else