Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions docs/reference/support-matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,18 @@ Tested against [toml-test](https://github.com/toml-lang/toml-test) v2.1.0:
| Test Type | Passed | Failed | Compliance |
|-----------|--------|--------|------------|
| Valid | 213 | 1 | 99.5% |
| Invalid | 421 | 45 | 90.3% |
| Invalid | 459 | 7 | 98.5% |

### TOML 1.0

| Test Type | Passed | Failed | Compliance |
|-----------|--------|--------|------------|
| Valid | 204 | 1 | 99.5% |
| Invalid | 421 | 52 | 89.0% |
| Invalid | 459 | 14 | 97.0% |

The single valid test failure is due to a PHP limitation with null byte property names. Invalid test failures are mostly TOML 1.0 strict tests that TOML 1.1 relaxes (multiline inline tables, trailing commas, hex escapes).
The single valid test failure is due to a PHP limitation with null byte property names.

The remaining invalid test failures are edge cases involving control characters in comments and specific datetime validation patterns.

## Recommended Use

Expand Down
100 changes: 60 additions & 40 deletions src/Lexer/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ public function __construct(private readonly string $input)
*/
public function tokenize(): Generator
{
// Validate UTF-8 encoding upfront
if (!mb_check_encoding($this->input, 'UTF-8')) {
yield new Token(TokenType::Invalid, 'Invalid UTF-8 encoding', null, new Span(0, $this->length, 1, 0));
yield new Token(TokenType::Eof, '', null, new Span($this->length, $this->length, 1, 0));

return;
}

while ($this->pos < $this->length) {
$char = $this->input[$this->pos];

Expand Down Expand Up @@ -182,10 +190,15 @@ private function string(): Token
continue;
}
$parsed .= $escaped;
} elseif ($char === "\n") {
} elseif ($char === "\n" || $char === "\r") {
// Unescaped newline not allowed in basic string
return new Token(TokenType::Invalid, substr($this->input, $start, $this->pos - $start), null, new Span($start, $this->pos, $this->line, $col));
} else {
// Check for control characters (except tab which is allowed)
$ord = ord($char);
if (($ord < 0x20 && $ord !== 0x09) || $ord === 0x7F) {
$valid = false;
}
$parsed .= $char;
$this->advance();
}
Expand Down Expand Up @@ -340,9 +353,12 @@ private function multiLineBasicString(): Token
if ($this->pos < $this->length && $this->input[$this->pos] === "\n") {
$parsed .= "\n";
$this->pos++;
$this->line++;
$this->column = 0;
} else {
// Bare CR without LF is invalid
$valid = false;
}
$this->line++;
$this->column = 0;
} else {
// Check for control characters (except tab which is allowed)
$ord = ord($char);
Expand Down Expand Up @@ -375,20 +391,31 @@ private function literalString(): Token
$this->advance(); // skip opening '

$parsed = '';
$valid = true;
while ($this->pos < $this->length) {
$char = $this->input[$this->pos];

if ($char === "'") {
$this->advance();
$value = substr($this->input, $start, $this->pos - $start);

if (!$valid) {
return new Token(TokenType::Invalid, $value, null, new Span($start, $this->pos, $startLine, $col));
}

return new Token(TokenType::LiteralString, $value, $parsed, new Span($start, $this->pos, $startLine, $col));
}

if ($char === "\n") {
if ($char === "\n" || $char === "\r") {
return new Token(TokenType::Invalid, substr($this->input, $start, $this->pos - $start), null, new Span($start, $this->pos, $startLine, $col));
}

// Check for control characters (except tab which is allowed)
$ord = ord($char);
if (($ord < 0x20 && $ord !== 0x09) || $ord === 0x7F) {
$valid = false;
}

$parsed .= $char;
$this->advance();
}
Expand Down Expand Up @@ -466,9 +493,12 @@ private function multiLineLiteralString(): Token
if ($this->pos < $this->length && $this->input[$this->pos] === "\n") {
$parsed .= "\n";
$this->pos++;
$this->line++;
$this->column = 0;
} else {
// Bare CR without LF is invalid
$valid = false;
}
$this->line++;
$this->column = 0;
} else {
// Check for control characters (except tab which is allowed)
$ord = ord($char);
Expand Down Expand Up @@ -707,7 +737,13 @@ private function classifyNumber(string $value, Span $span): Token
{
if (!$this->isValidNumberLiteral($value)) {
// TOML 1.1: if it's not a valid number but is a valid bare key, return as BareKey
if (preg_match('/^[A-Za-z0-9_-]+$/', $value)) {
// But only if it doesn't look like an attempted number literal
// (signed values or values with 0x/0o/0b prefixes should be Invalid, not BareKey)
if (
preg_match('/^[A-Za-z0-9_-]+$/', $value) &&
!preg_match('/^[+-]/', $value) &&
!preg_match('/^0[xXoObB]/', $value)
) {
return new Token(TokenType::BareKey, $value, $value, $span);
}

Expand All @@ -716,43 +752,27 @@ private function classifyNumber(string $value, Span $span): Token

$clean = str_replace('_', '', $value);

// Hex
if (
str_starts_with($clean, '0x') || str_starts_with($clean, '0X') ||
str_starts_with($clean, '+0x') || str_starts_with($clean, '-0x')
) {
// Hex (lowercase 0x only, no sign)
if (str_starts_with($clean, '0x')) {
$parsed = intval($clean, 16);

return new Token(TokenType::Integer, $value, $parsed, $span);
}

// Octal
if (
str_starts_with($clean, '0o') || str_starts_with($clean, '0O') ||
str_starts_with($clean, '+0o') || str_starts_with($clean, '-0o')
) {
$negative = str_starts_with($clean, '-');
$oct = str_replace(['0o', '0O', '+', '-'], '', $clean);
// Octal (lowercase 0o only, no sign)
if (str_starts_with($clean, '0o')) {
$oct = substr($clean, 2);
$parsed = intval($oct, 8);
if ($negative) {
$parsed = -$parsed;
}

return new Token(TokenType::Integer, $value, $parsed, $span);
}

// Binary
if (
str_starts_with($clean, '0b') || str_starts_with($clean, '0B') ||
str_starts_with($clean, '+0b') || str_starts_with($clean, '-0b')
) {
$bin = str_replace(['0b', '0B', '+', '-'], '', $clean);
$parsed = bindec($bin);
if (str_starts_with($clean, '-')) {
$parsed = -$parsed;
}
// Binary (lowercase 0b only, no sign)
if (str_starts_with($clean, '0b')) {
$bin = substr($clean, 2);
$parsed = (int)bindec($bin);

return new Token(TokenType::Integer, $value, (int)$parsed, $span);
return new Token(TokenType::Integer, $value, $parsed, $span);
}

// Float (has . or e/E)
Expand All @@ -771,20 +791,20 @@ private function isNumberChar(string $char): bool

private function isValidNumberLiteral(string $value): bool
{
// Integer: decimal
// Integer: decimal (only decimal can have +/- sign)
if (preg_match('/^[+-]?(?:0|[1-9](?:_?\d)*)$/', $value) === 1) {
return true;
}
// Integer: hexadecimal
if (preg_match('/^[+-]?0[xX][0-9A-Fa-f](?:_?[0-9A-Fa-f])*$/', $value) === 1) {
// Integer: hexadecimal (lowercase 'x' only, no sign allowed)
if (preg_match('/^0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*$/', $value) === 1) {
return true;
}
// Integer: octal
if (preg_match('/^[+-]?0[oO][0-7](?:_?[0-7])*$/', $value) === 1) {
// Integer: octal (lowercase 'o' only, no sign allowed)
if (preg_match('/^0o[0-7](?:_?[0-7])*$/', $value) === 1) {
return true;
}
// Integer: binary
if (preg_match('/^[+-]?0[bB][01](?:_?[01])*$/', $value) === 1) {
// Integer: binary (lowercase 'b' only, no sign allowed)
if (preg_match('/^0b[01](?:_?[01])*$/', $value) === 1) {
return true;
}
// Float with decimal point (requires digit after decimal)
Expand Down
28 changes: 26 additions & 2 deletions src/Normalizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,14 @@ private function &openTable(array &$array, array $path, Span $span): ?array

return $null;
}
if ($kind === 'dotted') {
$this->errors[] = new ParseError(
"Cannot define table '{$displayPath}' after it was implicitly created by dotted keys",
$span,
);

return $null;
}
// kind === 'implicit' is OK - we're explicitly defining a previously implicit table
}

Expand Down Expand Up @@ -447,9 +455,24 @@ private function setNestedValue(
return;
}

// Check if this intermediate path was explicitly defined as a table or array table
// If so, we cannot extend it with dotted keys
if (!$isInlineTableScope && isset($this->definedTables[$internalPath])) {
$kind = $this->definedTables[$internalPath]['kind'];
if ($kind === 'explicit' || $kind === 'array') {
$this->errors[] = new ParseError(
"Cannot add keys to explicitly defined table '{$displayPath}' via dotted keys",
$span,
);

return;
}
}

if (!array_key_exists($key, $current)) {
$current[$key] = [];
$definedTables[$internalPath] ??= ['kind' => 'implicit', 'span' => $span];
// Mark as 'dotted' - cannot be explicitly defined later
$definedTables[$internalPath] ??= ['kind' => 'dotted', 'span' => $span];
} elseif (!is_array($current[$key])) {
$this->errors[] = new ParseError("Cannot redefine key '{$displayPath}' as a table", $span);

Expand All @@ -461,7 +484,8 @@ private function setNestedValue(
$internalPrefix[] = '#' . (string)$lastEntry;
$current = &$current[$key][$lastEntry];
} else {
$definedTables[$internalPath] ??= ['kind' => 'implicit', 'span' => $span];
// Mark as 'dotted' - cannot be explicitly defined later
$definedTables[$internalPath] ??= ['kind' => 'dotted', 'span' => $span];
$current = &$current[$key];
}
}
Expand Down
35 changes: 31 additions & 4 deletions tests/Lexer/LexerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,40 @@ public function testHexOctalBinary(): void
$this->assertSame(0b1010, $tokens[4]->parsed);
}

public function testNegativeOctalAndBinary(): void
public function testSignedNonDecimalIntegersAreInvalid(): void
{
$lexer = new Lexer('-0o777 -0b1010');
// TOML spec: only decimal integers can have +/- prefix
$lexer = new Lexer('-0o777');
$tokens = iterator_to_array($lexer->tokenize());
$this->assertSame(TokenType::Invalid, $tokens[0]->type);

$this->assertSame(-0o777, $tokens[0]->parsed);
$this->assertSame(-0b1010, $tokens[2]->parsed);
$lexer = new Lexer('+0o777');
$tokens = iterator_to_array($lexer->tokenize());
$this->assertSame(TokenType::Invalid, $tokens[0]->type);

$lexer = new Lexer('-0b1010');
$tokens = iterator_to_array($lexer->tokenize());
$this->assertSame(TokenType::Invalid, $tokens[0]->type);

$lexer = new Lexer('+0x1f');
$tokens = iterator_to_array($lexer->tokenize());
$this->assertSame(TokenType::Invalid, $tokens[0]->type);
}

/**
 * Checks that an integer literal written with an upper-case radix prefix
 * makes the lexer emit an Invalid token as its first token.
 */
public function testCapitalPrefixesAreInvalid(): void
{
    // TOML spec: only lowercase 0x, 0o, 0b allowed.
    // One sample per radix: hexadecimal, octal, binary.
    foreach (['0X1F', '0O777', '0B1010'] as $literal) {
        $firstToken = iterator_to_array((new Lexer($literal))->tokenize())[0];

        $this->assertSame(TokenType::Invalid, $firstToken->type);
    }
}

public function testFloat(): void
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[[a.b]]

[a]
b.y = 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[a]
b.c = 1

[a.b]
d = 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[a.b.c]
z = 9

[a]
b.c.t = 8
Loading