From 7097973cdc1149828b2a2cb73d983c90ebc11eea Mon Sep 17 00:00:00 2001 From: mscherer Date: Wed, 25 Mar 2026 04:46:07 +0100 Subject: [PATCH 1/4] fix: improve toml-test invalid compliance - Add UTF-8 encoding validation upfront to reject invalid byte sequences - Add control character validation in basic and literal strings - Reject bare CR (without LF) in multiline strings - Enforce lowercase-only prefixes for non-decimal integers (0x, 0o, 0b) - Reject signed non-decimal integers (+0x, -0o, etc.) - Update tests to verify new validation rules These changes address several toml-test invalid test failures by enforcing stricter TOML spec compliance: - Invalid UTF-8 now rejected early - Control characters (< 0x20 except tab, or 0x7F) rejected in strings - Bare CR rejected (TOML requires CRLF or LF only) - Integer prefixes must be lowercase per TOML spec - Only decimal integers can have sign prefix --- src/Lexer/Lexer.php | 100 +++++++++++++++++++++++--------------- tests/Lexer/LexerTest.php | 35 +++++++++++-- 2 files changed, 91 insertions(+), 44 deletions(-) diff --git a/src/Lexer/Lexer.php b/src/Lexer/Lexer.php index e049953..0c6fa58 100644 --- a/src/Lexer/Lexer.php +++ b/src/Lexer/Lexer.php @@ -28,6 +28,14 @@ public function __construct(private readonly string $input) */ public function tokenize(): Generator { + // Validate UTF-8 encoding upfront + if (!mb_check_encoding($this->input, 'UTF-8')) { + yield new Token(TokenType::Invalid, 'Invalid UTF-8 encoding', null, new Span(0, $this->length, 1, 0)); + yield new Token(TokenType::Eof, '', null, new Span($this->length, $this->length, 1, 0)); + + return; + } + while ($this->pos < $this->length) { $char = $this->input[$this->pos]; @@ -182,10 +190,15 @@ private function string(): Token continue; } $parsed .= $escaped; - } elseif ($char === "\n") { + } elseif ($char === "\n" || $char === "\r") { // Unescaped newline not allowed in basic string return new Token(TokenType::Invalid, substr($this->input, $start, $this->pos - 
$start), null, new Span($start, $this->pos, $this->line, $col)); } else { + // Check for control characters (except tab which is allowed) + $ord = ord($char); + if (($ord < 0x20 && $ord !== 0x09) || $ord === 0x7F) { + $valid = false; + } $parsed .= $char; $this->advance(); } @@ -340,9 +353,12 @@ private function multiLineBasicString(): Token if ($this->pos < $this->length && $this->input[$this->pos] === "\n") { $parsed .= "\n"; $this->pos++; + $this->line++; + $this->column = 0; + } else { + // Bare CR without LF is invalid + $valid = false; } - $this->line++; - $this->column = 0; } else { // Check for control characters (except tab which is allowed) $ord = ord($char); @@ -375,6 +391,7 @@ private function literalString(): Token $this->advance(); // skip opening ' $parsed = ''; + $valid = true; while ($this->pos < $this->length) { $char = $this->input[$this->pos]; @@ -382,13 +399,23 @@ private function literalString(): Token $this->advance(); $value = substr($this->input, $start, $this->pos - $start); + if (!$valid) { + return new Token(TokenType::Invalid, $value, null, new Span($start, $this->pos, $startLine, $col)); + } + return new Token(TokenType::LiteralString, $value, $parsed, new Span($start, $this->pos, $startLine, $col)); } - if ($char === "\n") { + if ($char === "\n" || $char === "\r") { return new Token(TokenType::Invalid, substr($this->input, $start, $this->pos - $start), null, new Span($start, $this->pos, $startLine, $col)); } + // Check for control characters (except tab which is allowed) + $ord = ord($char); + if (($ord < 0x20 && $ord !== 0x09) || $ord === 0x7F) { + $valid = false; + } + $parsed .= $char; $this->advance(); } @@ -466,9 +493,12 @@ private function multiLineLiteralString(): Token if ($this->pos < $this->length && $this->input[$this->pos] === "\n") { $parsed .= "\n"; $this->pos++; + $this->line++; + $this->column = 0; + } else { + // Bare CR without LF is invalid + $valid = false; } - $this->line++; - $this->column = 0; } else { // Check 
for control characters (except tab which is allowed) $ord = ord($char); @@ -707,7 +737,13 @@ private function classifyNumber(string $value, Span $span): Token { if (!$this->isValidNumberLiteral($value)) { // TOML 1.1: if it's not a valid number but is a valid bare key, return as BareKey - if (preg_match('/^[A-Za-z0-9_-]+$/', $value)) { + // But only if it doesn't look like an attempted number literal + // (signed values or values with 0x/0o/0b prefixes should be Invalid, not BareKey) + if ( + preg_match('/^[A-Za-z0-9_-]+$/', $value) && + !preg_match('/^[+-]/', $value) && + !preg_match('/^0[xXoObB]/', $value) + ) { return new Token(TokenType::BareKey, $value, $value, $span); } @@ -716,43 +752,27 @@ private function classifyNumber(string $value, Span $span): Token $clean = str_replace('_', '', $value); - // Hex - if ( - str_starts_with($clean, '0x') || str_starts_with($clean, '0X') || - str_starts_with($clean, '+0x') || str_starts_with($clean, '-0x') - ) { + // Hex (lowercase 0x only, no sign) + if (str_starts_with($clean, '0x')) { $parsed = intval($clean, 16); return new Token(TokenType::Integer, $value, $parsed, $span); } - // Octal - if ( - str_starts_with($clean, '0o') || str_starts_with($clean, '0O') || - str_starts_with($clean, '+0o') || str_starts_with($clean, '-0o') - ) { - $negative = str_starts_with($clean, '-'); - $oct = str_replace(['0o', '0O', '+', '-'], '', $clean); + // Octal (lowercase 0o only, no sign) + if (str_starts_with($clean, '0o')) { + $oct = substr($clean, 2); $parsed = intval($oct, 8); - if ($negative) { - $parsed = -$parsed; - } return new Token(TokenType::Integer, $value, $parsed, $span); } - // Binary - if ( - str_starts_with($clean, '0b') || str_starts_with($clean, '0B') || - str_starts_with($clean, '+0b') || str_starts_with($clean, '-0b') - ) { - $bin = str_replace(['0b', '0B', '+', '-'], '', $clean); - $parsed = bindec($bin); - if (str_starts_with($clean, '-')) { - $parsed = -$parsed; - } + // Binary (lowercase 0b only, no sign) + if 
(str_starts_with($clean, '0b')) { + $bin = substr($clean, 2); + $parsed = (int)bindec($bin); - return new Token(TokenType::Integer, $value, (int)$parsed, $span); + return new Token(TokenType::Integer, $value, $parsed, $span); } // Float (has . or e/E) @@ -771,20 +791,20 @@ private function isNumberChar(string $char): bool private function isValidNumberLiteral(string $value): bool { - // Integer: decimal + // Integer: decimal (only decimal can have +/- sign) if (preg_match('/^[+-]?(?:0|[1-9](?:_?\d)*)$/', $value) === 1) { return true; } - // Integer: hexadecimal - if (preg_match('/^[+-]?0[xX][0-9A-Fa-f](?:_?[0-9A-Fa-f])*$/', $value) === 1) { + // Integer: hexadecimal (lowercase 'x' only, no sign allowed) + if (preg_match('/^0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*$/', $value) === 1) { return true; } - // Integer: octal - if (preg_match('/^[+-]?0[oO][0-7](?:_?[0-7])*$/', $value) === 1) { + // Integer: octal (lowercase 'o' only, no sign allowed) + if (preg_match('/^0o[0-7](?:_?[0-7])*$/', $value) === 1) { return true; } - // Integer: binary - if (preg_match('/^[+-]?0[bB][01](?:_?[01])*$/', $value) === 1) { + // Integer: binary (lowercase 'b' only, no sign allowed) + if (preg_match('/^0b[01](?:_?[01])*$/', $value) === 1) { return true; } // Float with decimal point (requires digit after decimal) diff --git a/tests/Lexer/LexerTest.php b/tests/Lexer/LexerTest.php index 49aceea..61229e1 100644 --- a/tests/Lexer/LexerTest.php +++ b/tests/Lexer/LexerTest.php @@ -70,13 +70,40 @@ public function testHexOctalBinary(): void $this->assertSame(0b1010, $tokens[4]->parsed); } - public function testNegativeOctalAndBinary(): void + public function testSignedNonDecimalIntegersAreInvalid(): void { - $lexer = new Lexer('-0o777 -0b1010'); + // TOML spec: only decimal integers can have +/- prefix + $lexer = new Lexer('-0o777'); $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); - $this->assertSame(-0o777, $tokens[0]->parsed); - 
$this->assertSame(-0b1010, $tokens[2]->parsed); + $lexer = new Lexer('+0o777'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); + + $lexer = new Lexer('-0b1010'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); + + $lexer = new Lexer('+0x1f'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); + } + + public function testCapitalPrefixesAreInvalid(): void + { + // TOML spec: only lowercase 0x, 0o, 0b allowed + $lexer = new Lexer('0X1F'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); + + $lexer = new Lexer('0O777'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); + + $lexer = new Lexer('0B1010'); + $tokens = iterator_to_array($lexer->tokenize()); + $this->assertSame(TokenType::Invalid, $tokens[0]->type); } public function testFloat(): void From 2eff6526bb4f589756eccb90bcb6502a770c4197 Mon Sep 17 00:00:00 2001 From: mscherer Date: Wed, 25 Mar 2026 05:21:34 +0100 Subject: [PATCH 2/4] docs: update toml-test compliance numbers Update compliance numbers after fixing: - UTF-8 encoding validation - Control character validation in strings - Bare CR rejection in multiline strings - Integer prefix validation (lowercase only) - Signed non-decimal integer rejection TOML 1.1: 96.8% invalid compliance (up from 90.3%) TOML 1.0: 95.3% invalid compliance (up from 89.0%) --- docs/reference/support-matrix.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/reference/support-matrix.md b/docs/reference/support-matrix.md index 2665d31..f1cda32 100644 --- a/docs/reference/support-matrix.md +++ b/docs/reference/support-matrix.md @@ -118,16 +118,18 @@ Tested against [toml-test](https://github.com/toml-lang/toml-test) v2.1.0: | Test Type | Passed | Failed | 
Compliance | |-----------|--------|--------|------------| | Valid | 213 | 1 | 99.5% | -| Invalid | 421 | 45 | 90.3% | +| Invalid | 451 | 15 | 96.8% | ### TOML 1.0 | Test Type | Passed | Failed | Compliance | |-----------|--------|--------|------------| | Valid | 204 | 1 | 99.5% | -| Invalid | 421 | 52 | 89.0% | +| Invalid | 451 | 22 | 95.3% | -The single valid test failure is due to a PHP limitation with null byte property names. Invalid test failures are mostly TOML 1.0 strict tests that TOML 1.1 relaxes (multiline inline tables, trailing commas, hex escapes). +The single valid test failure is due to a PHP limitation with null byte property names. + +The remaining invalid test failures are complex table semantics edge cases involving conflicts between dotted keys and explicitly defined tables. These are rare patterns in practice. ## Recommended Use From a6b11722150190cb8327294a48f74f8f44fcc4bf Mon Sep 17 00:00:00 2001 From: mscherer Date: Wed, 25 Mar 2026 05:29:29 +0100 Subject: [PATCH 3/4] fix: reject dotted key conflicts with explicit tables Add stricter validation for table semantics edge cases: 1. Cannot extend explicitly defined tables via dotted keys - [a.b.c] followed by [a] + b.c.t = ... is now rejected 2. Cannot extend array tables via dotted keys - [[a.b]] followed by [a] + b.y = ... is now rejected 3. Cannot explicitly define tables created by dotted keys - [a] + b.c = 1 followed by [a.b] is now rejected The fix introduces a new 'dotted' kind for implicit tables created via dotted key notation, distinguishing them from 'implicit' tables created by super-table headers which CAN be explicitly defined later. Add semantic test fixtures for all three patterns. 
--- src/Normalizer.php | 28 +++++++++++++++++-- .../array-table-dotted-extension.toml | 4 +++ .../semantic/dotted-key-explicit-table.toml | 5 ++++ .../explicit-table-dotted-extension.toml | 5 ++++ 4 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/Conformance/semantic/array-table-dotted-extension.toml create mode 100644 tests/fixtures/Conformance/semantic/dotted-key-explicit-table.toml create mode 100644 tests/fixtures/Conformance/semantic/explicit-table-dotted-extension.toml diff --git a/src/Normalizer.php b/src/Normalizer.php index 7e8808f..37ef820 100644 --- a/src/Normalizer.php +++ b/src/Normalizer.php @@ -259,6 +259,14 @@ private function &openTable(array &$array, array $path, Span $span): ?array return $null; } + if ($kind === 'dotted') { + $this->errors[] = new ParseError( + "Cannot define table '{$displayPath}' after it was implicitly created by dotted keys", + $span, + ); + + return $null; + } // kind === 'implicit' is OK - we're explicitly defining a previously implicit table } @@ -447,9 +455,24 @@ private function setNestedValue( return; } + // Check if this intermediate path was explicitly defined as a table or array table + // If so, we cannot extend it with dotted keys + if (!$isInlineTableScope && isset($this->definedTables[$internalPath])) { + $kind = $this->definedTables[$internalPath]['kind']; + if ($kind === 'explicit' || $kind === 'array') { + $this->errors[] = new ParseError( + "Cannot add keys to explicitly defined table '{$displayPath}' via dotted keys", + $span, + ); + + return; + } + } + if (!array_key_exists($key, $current)) { $current[$key] = []; - $definedTables[$internalPath] ??= ['kind' => 'implicit', 'span' => $span]; + // Mark as 'dotted' - cannot be explicitly defined later + $definedTables[$internalPath] ??= ['kind' => 'dotted', 'span' => $span]; } elseif (!is_array($current[$key])) { $this->errors[] = new ParseError("Cannot redefine key '{$displayPath}' as a table", $span); @@ -461,7 +484,8 @@ 
private function setNestedValue( $internalPrefix[] = '#' . (string)$lastEntry; $current = &$current[$key][$lastEntry]; } else { - $definedTables[$internalPath] ??= ['kind' => 'implicit', 'span' => $span]; + // Mark as 'dotted' - cannot be explicitly defined later + $definedTables[$internalPath] ??= ['kind' => 'dotted', 'span' => $span]; $current = &$current[$key]; } } diff --git a/tests/fixtures/Conformance/semantic/array-table-dotted-extension.toml b/tests/fixtures/Conformance/semantic/array-table-dotted-extension.toml new file mode 100644 index 0000000..1cae3a9 --- /dev/null +++ b/tests/fixtures/Conformance/semantic/array-table-dotted-extension.toml @@ -0,0 +1,4 @@ +[[a.b]] + +[a] +b.y = 2 diff --git a/tests/fixtures/Conformance/semantic/dotted-key-explicit-table.toml b/tests/fixtures/Conformance/semantic/dotted-key-explicit-table.toml new file mode 100644 index 0000000..27ae415 --- /dev/null +++ b/tests/fixtures/Conformance/semantic/dotted-key-explicit-table.toml @@ -0,0 +1,5 @@ +[a] +b.c = 1 + +[a.b] +d = 2 diff --git a/tests/fixtures/Conformance/semantic/explicit-table-dotted-extension.toml b/tests/fixtures/Conformance/semantic/explicit-table-dotted-extension.toml new file mode 100644 index 0000000..2e1187e --- /dev/null +++ b/tests/fixtures/Conformance/semantic/explicit-table-dotted-extension.toml @@ -0,0 +1,5 @@ +[a.b.c] +z = 9 + +[a] +b.c.t = 8 From 041285d42dd93eb2e87001c4766f4c96f6dbc9f1 Mon Sep 17 00:00:00 2001 From: mscherer Date: Wed, 25 Mar 2026 05:29:58 +0100 Subject: [PATCH 4/4] docs: update compliance numbers after table semantics fix With the dotted key vs explicit table conflicts now properly rejected, invalid test compliance improves significantly: TOML 1.1: 98.5% invalid (up from 96.8%) TOML 1.0: 97.0% invalid (up from 95.3%) --- docs/reference/support-matrix.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/support-matrix.md b/docs/reference/support-matrix.md index f1cda32..e26d05b 100644 --- 
a/docs/reference/support-matrix.md +++ b/docs/reference/support-matrix.md @@ -118,18 +118,18 @@ Tested against [toml-test](https://github.com/toml-lang/toml-test) v2.1.0: | Test Type | Passed | Failed | Compliance | |-----------|--------|--------|------------| | Valid | 213 | 1 | 99.5% | -| Invalid | 451 | 15 | 96.8% | +| Invalid | 459 | 7 | 98.5% | ### TOML 1.0 | Test Type | Passed | Failed | Compliance | |-----------|--------|--------|------------| | Valid | 204 | 1 | 99.5% | -| Invalid | 451 | 22 | 95.3% | +| Invalid | 459 | 14 | 97.0% | The single valid test failure is due to a PHP limitation with null byte property names. -The remaining invalid test failures are complex table semantics edge cases involving conflicts between dotted keys and explicitly defined tables. These are rare patterns in practice. +The remaining invalid test failures are edge cases involving control characters in comments and specific datetime validation patterns. ## Recommended Use