From cc47e68a18622e27effc0709dbf77c067e9a6beb Mon Sep 17 00:00:00 2001 From: Anand Krishnamoorthi Date: Wed, 1 Apr 2026 09:12:50 -0500 Subject: [PATCH] feat: add Azure Policy core JSON parser and expression parser Add the foundational parsing infrastructure for Azure Policy JSON: - ExprParser: ARM template expression parser for "[...]" strings, supporting function calls, dot access, index access, and literals - Parser (core): recursive-descent JSON tokenizer-to-AST parser that reads directly from Lexer tokens with no intermediate serde_json step - ParseError: structured error types with span context for diagnostics - Helper functions: field classification, operator kind parsing, and ARM template expression detection These components are consumed by the policy-aware parsing modules (constraint, policy_rule, policy_definition) in a subsequent PR. --- src/languages/azure_policy/expr.rs | 589 +++++++++++++++++++++ src/languages/azure_policy/mod.rs | 2 + src/languages/azure_policy/parser/core.rs | 536 +++++++++++++++++++ src/languages/azure_policy/parser/error.rs | 148 ++++++ src/languages/azure_policy/parser/mod.rs | 193 +++++++ 5 files changed, 1468 insertions(+) create mode 100644 src/languages/azure_policy/expr.rs create mode 100644 src/languages/azure_policy/parser/core.rs create mode 100644 src/languages/azure_policy/parser/error.rs create mode 100644 src/languages/azure_policy/parser/mod.rs diff --git a/src/languages/azure_policy/expr.rs b/src/languages/azure_policy/expr.rs new file mode 100644 index 00000000..f09641a4 --- /dev/null +++ b/src/languages/azure_policy/expr.rs @@ -0,0 +1,589 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! ARM template expression sub-parser. +//! +//! Parses expressions found inside `"[...]"` strings in Azure Policy JSON. +//! Uses the regorus [`Lexer`] with `set_unknown_char_is_symbol(true)` so that +//! characters like `(`, `)`, `[`, `]`, `.`, `,` are emitted as [`TokenKind::Symbol`] tokens. 
+//! +//! Grammar (from `azurepolicy.ebnf`): +//! ```text +//! string-expr ::= NUMBER | STRING | complex-expr +//! complex-expr ::= IDENT +//! | complex-expr '.' IDENT +//! | complex-expr '(' ( string-expr ( ',' string-expr )* )? ')' +//! | complex-expr '[' string-expr ']' +//! ``` + +use alloc::boxed::Box; +use alloc::format; +use alloc::string::{String, ToString as _}; +use alloc::vec::Vec; + +use crate::lexer::{Lexer, Source, Span, Token, TokenKind}; + +use super::parser::json_unescape; + +use super::ast::{Expr, ExprLiteral}; + +/// Errors that can occur during ARM template expression parsing. +#[derive(Debug)] +pub enum ExprParseError { + /// Error from the lexer. + Lexer(String), + /// Unexpected token encountered. + UnexpectedToken { span: Span, expected: &'static str }, + /// Invalid numeric literal. + InvalidNumber(Span, String), +} + +impl core::fmt::Display for ExprParseError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + ExprParseError::Lexer(ref msg) => write!(f, "{}", msg), + ExprParseError::UnexpectedToken { ref span, expected } => { + write!(f, "{}", span.error(&format!("expected {}", expected))) + } + ExprParseError::InvalidNumber(ref span, ref msg) => { + write!(f, "{}", span.error(&format!("invalid number: {}", msg))) + } + } + } +} + +/// Parser for ARM template expressions. +/// +/// Wraps a [`Lexer`] configured for expression tokenization (symbol chars enabled) +/// and provides recursive descent parsing methods. +#[derive(Debug)] +pub struct ExprParser<'source> { + lexer: Lexer<'source>, + tok: Token, +} + +impl<'source> ExprParser<'source> { + /// Create a new expression parser from a source. + /// + /// The source should contain just the expression text (without the surrounding + /// `[` and `]` delimiters). 
+ fn new(source: &'source Source) -> Self { + let mut lexer = Lexer::new(source); + lexer.set_unknown_char_is_symbol(true); + let tok = Token( + TokenKind::Eof, + Span { + source: source.clone(), + line: 0, + col: 0, + start: 0, + end: 0, + }, + ); + Self { lexer, tok } + } + + /// Advance to the next token. + fn advance(&mut self) -> Result<(), ExprParseError> { + self.tok = self + .lexer + .next_token() + .map_err(|e| ExprParseError::Lexer(e.to_string()))?; + Ok(()) + } + + /// Get the text of the current token (for Symbol, Number, Ident). + fn token_text(&self) -> &str { + match self.tok.0 { + TokenKind::Symbol | TokenKind::Number | TokenKind::Ident | TokenKind::Eof => { + self.tok.1.text() + } + // String tokens return empty here; access via self.tok.1.text() directly. + TokenKind::String | TokenKind::RawString => "", + #[cfg(feature = "azure-rbac")] + _ => "", + } + } + + /// Parse a complete expression from a `"[...]"` string. + /// + /// `expr_content` is the text between `[` and `]`. + /// ARM template expressions use single-quoted strings; this function converts + /// them to double-quoted JSON strings for the lexer, handling `''` (escaped + /// apostrophe) and embedded characters that need JSON-escaping. + /// `outer_span` is the span of the original string token (for error context). + pub fn parse_from_brackets( + expr_content: &str, + outer_span: &Span, + ) -> Result { + // Convert ARM single-quoted strings to double-quoted JSON strings. + let normalized = arm_single_to_double_quotes(expr_content) + .map_err(|e| ExprParseError::Lexer(outer_span.error(e).to_string()))?; + + let source = Source::from_contents("".into(), normalized) + .map_err(|e| ExprParseError::Lexer(e.to_string()))?; + + // Create the parser locally to keep the borrow of `source` scoped + // to this function. + parse_expr_from_source(&source, outer_span) + } + + /// Parse a single expression (handles left-recursive postfix operators). 
+ fn parse_expr(&mut self) -> Result { + let mut expr = self.parse_primary()?; + + // Handle postfix operations: `.field`, `(args)`, `[index]` + loop { + let text = self.token_text(); + match text { + "." => { + self.advance()?; + let (field_span, field_name) = self.expect_ident()?; + let span = Span { + source: expr.span().source.clone(), + line: expr.span().line, + col: expr.span().col, + start: expr.span().start, + end: field_span.end, + }; + expr = Expr::Dot { + span, + object: Box::new(expr), + field_span, + field: field_name, + }; + } + "(" => { + self.advance()?; + let mut args = Vec::new(); + if self.token_text() != ")" { + args.push(self.parse_expr()?); + while self.token_text() == "," { + self.advance()?; + args.push(self.parse_expr()?); + } + } + let close_span = self.tok.1.clone(); + if self.token_text() != ")" { + return Err(ExprParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "')'", + }); + } + self.advance()?; + let span = Span { + source: expr.span().source.clone(), + line: expr.span().line, + col: expr.span().col, + start: expr.span().start, + end: close_span.end, + }; + expr = Expr::Call { + span, + func: Box::new(expr), + args, + }; + } + "[" => { + self.advance()?; + let index = self.parse_expr()?; + let close_span = self.tok.1.clone(); + if self.token_text() != "]" { + return Err(ExprParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "']'", + }); + } + self.advance()?; + let span = Span { + source: expr.span().source.clone(), + line: expr.span().line, + col: expr.span().col, + start: expr.span().start, + end: close_span.end, + }; + expr = Expr::Index { + span, + object: Box::new(expr), + index: Box::new(index), + }; + } + _ => break, + } + } + + Ok(expr) + } + + /// Parse a primary expression (literal, identifier, or unary minus). 
+ fn parse_primary(&mut self) -> Result { + match self.tok.0 { + TokenKind::Ident => { + let (span, name) = self.expect_ident()?; + Ok(Expr::Ident { span, name }) + } + TokenKind::Number => { + let span = self.tok.1.clone(); + let text = span.text().into(); + self.advance()?; + Ok(Expr::Literal { + span, + value: ExprLiteral::Number(text), + }) + } + TokenKind::String => { + let span = self.tok.1.clone(); + let text = json_unescape(span.text()); + self.advance()?; + Ok(Expr::Literal { + span, + value: ExprLiteral::String(text), + }) + } + // Unary minus: `-5`, `-3.14`, or `-expr` + TokenKind::Symbol if self.token_text() == "-" => { + let minus_span = self.tok.1.clone(); + self.advance()?; + // Fuse with a following numeric literal into a negative number. + if self.tok.0 == TokenKind::Number { + let num_span = self.tok.1.clone(); + let raw: String = num_span.text().into(); + self.advance()?; + let negated = format!("-{}", raw); + let span = Span { + source: minus_span.source.clone(), + line: minus_span.line, + col: minus_span.col, + start: minus_span.start, + end: num_span.end, + }; + Ok(Expr::Literal { + span, + value: ExprLiteral::Number(negated), + }) + } else { + // General unary minus: compile as `sub(0, expr)`. + let operand = self.parse_primary()?; + let span = Span { + source: minus_span.source.clone(), + line: minus_span.line, + col: minus_span.col, + start: minus_span.start, + end: operand.span().end, + }; + let zero = Expr::Literal { + span: minus_span, + value: ExprLiteral::Number("0".into()), + }; + Ok(Expr::Call { + span: span.clone(), + func: Box::new(Expr::Ident { + span, + name: "sub".into(), + }), + args: alloc::vec![zero, operand], + }) + } + } + _ => Err(ExprParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "identifier or literal", + }), + } + } + + /// Expect the current token to be an identifier, consume it, return (span, name). 
+ fn expect_ident(&mut self) -> Result<(Span, String), ExprParseError> { + if self.tok.0 != TokenKind::Ident { + return Err(ExprParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "identifier", + }); + } + let span = self.tok.1.clone(); + let name: String = span.text().into(); + self.advance()?; + Ok((span, name)) + } +} + +/// Internal helper: parse a complete expression from an already-constructed +/// [`Source`]. Exists as a free function so that the borrow of `source` +/// can have its own (shorter) lifetime independent of any `ExprParser` +/// type-level lifetime parameter. +fn parse_expr_from_source(source: &Source, outer_span: &Span) -> Result { + let mut parser = ExprParser::new(source); + parser.advance()?; + let expr = parser.parse_expr()?; + + if parser.tok.0 != TokenKind::Eof { + return Err(ExprParseError::UnexpectedToken { + span: if parser.tok.1.text().is_empty() { + outer_span.clone() + } else { + parser.tok.1.clone() + }, + expected: "end of expression", + }); + } + + Ok(expr) +} + +/// Convert ARM template expression single-quoted strings to double-quoted JSON strings. +/// +/// ARM expressions use `'...'` for strings and `''` to escape a literal apostrophe. +/// The JSON lexer expects `"..."` strings with backslash escapes, so this function: +/// - Converts `'...'` delimiters to `"..."` +/// - Converts `''` (ARM escape) to a single `'` character inside the `"` string (no backslash escape) +/// - Escapes `"` and `\` inside the string for the JSON lexer +/// - Leaves characters outside of strings unchanged +fn arm_single_to_double_quotes(input: &str) -> Result { + let mut result = String::with_capacity(input.len()); + let mut chars = input.chars().peekable(); + + while let Some(ch) = chars.next() { + if ch == '\'' { + // Start of a single-quoted ARM string → emit as double-quoted. 
+ result.push('"'); + loop { + match chars.next() { + Some('\'') => { + if chars.peek() == Some(&'\'') { + // '' → escaped apostrophe, emit a literal ' + chars.next(); + result.push('\''); + } else { + // End of string + result.push('"'); + break; + } + } + Some('"') => { + // Escape for JSON lexer + result.push('\\'); + result.push('"'); + } + Some('\\') => { + // Escape for JSON lexer + result.push('\\'); + result.push('\\'); + } + Some(c) => result.push(c), + None => { + return Err("unterminated string"); + } + } + } + } else { + result.push(ch); + } + } + + Ok(result) +} + +#[cfg(test)] +#[allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::as_conversions, + clippy::indexing_slicing +)] +mod tests { + use super::*; + + // ==================================================================== + // arm_single_to_double_quotes tests + // ==================================================================== + + #[test] + fn test_simple_string() { + assert_eq!(arm_single_to_double_quotes("'hello'").unwrap(), "\"hello\""); + } + + #[test] + fn test_no_quotes() { + assert_eq!(arm_single_to_double_quotes("field(x)").unwrap(), "field(x)"); + } + + #[test] + fn test_concat_two_strings() { + assert_eq!( + arm_single_to_double_quotes("concat('a', 'b')").unwrap(), + "concat(\"a\", \"b\")" + ); + } + + #[test] + fn test_escaped_apostrophe() { + // ARM: 'it''s' → string value "it's" + assert_eq!(arm_single_to_double_quotes("'it''s'").unwrap(), "\"it's\""); + } + + #[test] + fn test_doubled_apostrophe_in_concat() { + // concat('a''b', 'c') → concat("a'b", "c") + assert_eq!( + arm_single_to_double_quotes("concat('a''b', 'c')").unwrap(), + "concat(\"a'b\", \"c\")" + ); + } + + #[test] + fn test_embedded_double_quote() { + // ARM: 'say "hi"' → JSON: "say \"hi\"" + assert_eq!( + arm_single_to_double_quotes("'say \"hi\"'").unwrap(), + "\"say \\\"hi\\\"\"" + ); + } + + #[test] + fn test_embedded_backslash() { + // ARM: 'a\b' → JSON: "a\\b" + 
assert_eq!(arm_single_to_double_quotes("'a\\b'").unwrap(), "\"a\\\\b\""); + } + + #[test] + fn test_empty_string() { + assert_eq!(arm_single_to_double_quotes("''").unwrap(), "\"\""); + } + + #[test] + fn test_unterminated_string() { + arm_single_to_double_quotes("'hello").unwrap_err(); + } + + #[test] + fn test_unterminated_string_in_expr() { + let source = Source::from_contents("".into(), "'hello".into()).unwrap(); + let span = Span { + source, + line: 1, + col: 1, + start: 0, + end: 6, + }; + ExprParser::parse_from_brackets("'hello", &span).unwrap_err(); + } + + // ==================================================================== + // ExprParser integration tests + // ==================================================================== + + fn parse_expr_str(input: &str) -> Expr { + let source = Source::from_contents("".into(), input.into()).unwrap(); + let span = Span { + source, + line: 1, + col: 1, + start: 0, + end: input.len() as u32, + }; + ExprParser::parse_from_brackets(input, &span).expect("parse should succeed") + } + + #[test] + fn test_parse_simple_function_call() { + let expr = parse_expr_str("field('type')"); + match expr { + Expr::Call { func, args, .. } => { + assert!(matches!(*func, Expr::Ident { ref name, .. } if name == "field")); + assert_eq!(args.len(), 1); + assert!( + matches!(&args[0], Expr::Literal { value: ExprLiteral::String(s), .. } if s == "type") + ); + } + _ => panic!("expected Call, got {:?}", expr), + } + } + + #[test] + fn test_parse_string_true_not_bool() { + // 'true' in an ARM expression is a string, not a boolean. + let expr = parse_expr_str("'true'"); + match expr { + Expr::Literal { + value: ExprLiteral::String(s), + .. + } => assert_eq!(s, "true"), + _ => panic!("expected String literal, got {:?}", expr), + } + } + + #[test] + fn test_parse_string_false_not_bool() { + let expr = parse_expr_str("'false'"); + match expr { + Expr::Literal { + value: ExprLiteral::String(s), + .. 
+ } => assert_eq!(s, "false"), + _ => panic!("expected String literal, got {:?}", expr), + } + } + + #[test] + fn test_bare_true_is_ident() { + // Bare `true` is an identifier (for true() function call), not a boolean literal. + let expr = parse_expr_str("true"); + match expr { + Expr::Ident { name, .. } => assert_eq!(name, "true"), + _ => panic!("expected Ident, got {:?}", expr), + } + } + + #[test] + fn test_parse_dot_access() { + let expr = parse_expr_str("resourceGroup().location"); + match expr { + Expr::Dot { field, .. } => assert_eq!(field, "location"), + _ => panic!("expected Dot, got {:?}", expr), + } + } + + #[test] + fn test_parse_apostrophe_in_string() { + // concat('it''s') → Call(concat, [String("it's")]) + let expr = parse_expr_str("concat('it''s')"); + match expr { + Expr::Call { args, .. } => { + assert_eq!(args.len(), 1); + assert!( + matches!(&args[0], Expr::Literal { value: ExprLiteral::String(s), .. } if s == "it's") + ); + } + _ => panic!("expected Call, got {:?}", expr), + } + } + + #[test] + fn test_string_with_embedded_double_quote_unescaped() { + // ARM: 'say "hi"' → after normalization: "say \"hi\"" + // ExprLiteral::String should contain the unescaped value: say "hi" + let expr = parse_expr_str("'say \"hi\"'"); + match expr { + Expr::Literal { + value: ExprLiteral::String(s), + .. + } => assert_eq!(s, "say \"hi\""), + _ => panic!("expected String literal, got {:?}", expr), + } + } + + #[test] + fn test_string_with_backslash_unescaped() { + // ARM: 'a\b' → after normalization: "a\\b" + // ExprLiteral::String should contain: a\b + let expr = parse_expr_str("'a\\b'"); + match expr { + Expr::Literal { + value: ExprLiteral::String(s), + .. 
+ } => assert_eq!(s, "a\\b"), + _ => panic!("expected String literal, got {:?}", expr), + } + } +} diff --git a/src/languages/azure_policy/mod.rs b/src/languages/azure_policy/mod.rs index bc64f71c..c46508ce 100644 --- a/src/languages/azure_policy/mod.rs +++ b/src/languages/azure_policy/mod.rs @@ -6,4 +6,6 @@ #[allow(clippy::pattern_type_mismatch)] pub mod aliases; pub mod ast; +pub mod expr; +pub mod parser; pub mod strings; diff --git a/src/languages/azure_policy/parser/core.rs b/src/languages/azure_policy/parser/core.rs new file mode 100644 index 00000000..433fedbc --- /dev/null +++ b/src/languages/azure_policy/parser/core.rs @@ -0,0 +1,536 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Core parser struct and low-level JSON parsing methods. + +use alloc::boxed::Box; +use alloc::string::{String, ToString as _}; +use alloc::vec::Vec; + +use crate::lexer::{Lexer, Source, Span, Token, TokenKind}; + +use crate::languages::azure_policy::ast::{Constraint, JsonValue, ObjectEntry, ValueOrExpr}; +use crate::languages::azure_policy::expr::ExprParser; + +use super::error::ParseError; +use super::{is_template_expr, tail, unwrap}; + +/// Unescape a JSON string body (the content between the outer `"` delimiters). +/// +/// The lexer returns the raw source text for strings which preserves backslash +/// escape sequences (e.g., `\"`, `\\`, `\n`, `\uXXXX`). This function +/// converts them to the actual characters they represent so that runtime +/// `Value::String` comparisons work correctly. +pub(in crate::languages::azure_policy) fn json_unescape(raw: &str) -> String { + // Fast path: if there is no backslash, the text is already unescaped. 
+ if !raw.contains('\\') { + return raw.into(); + } + + let mut result = String::with_capacity(raw.len()); + let mut chars = raw.chars(); + + while let Some(ch) = chars.next() { + if ch == '\\' { + match chars.next() { + Some('"') => result.push('"'), + Some('\\') => result.push('\\'), + Some('/') => result.push('/'), + Some('b') => result.push('\u{0008}'), + Some('f') => result.push('\u{000C}'), + Some('n') => result.push('\n'), + Some('r') => result.push('\r'), + Some('t') => result.push('\t'), + Some('u') => { + let hex: String = chars.by_ref().take(4).collect(); + if let Ok(code_point) = u32::from_str_radix(&hex, 16) { + // Handle UTF-16 surrogate pairs: \uD800-\uDBFF followed by \uDC00-\uDFFF + if (0xD800..=0xDBFF).contains(&code_point) { + // High surrogate — expect \uXXXX low surrogate next. + let low = decode_low_surrogate(&mut chars); + if let Some(low_point) = low { + #[allow(clippy::arithmetic_side_effects)] + // Values are range-checked above. + let combined = + ((code_point - 0xD800) << 10) + (low_point - 0xDC00) + 0x10000; + if let Some(c) = char::from_u32(combined) { + result.push(c); + } + } + // If low surrogate is missing/invalid, silently skip. + // (The lexer pre-validates strings via serde_json, so + // unpaired surrogates should not reach here in practice.) + } else if let Some(c) = char::from_u32(code_point) { + result.push(c); + } + } + } + Some(c) => { + // Unknown escape — preserve as-is. + result.push('\\'); + result.push(c); + } + None => result.push('\\'), + } + } else { + result.push(ch); + } + } + + result +} + +/// Try to consume `\uXXXX` from the iterator and return the low surrogate code point. +fn decode_low_surrogate(chars: &mut core::str::Chars<'_>) -> Option { + // We need exactly `\uXXXX` next. 
+ let mut peekable = chars.clone(); + if peekable.next() != Some('\\') { + return None; + } + if peekable.next() != Some('u') { + return None; + } + let hex: String = peekable.by_ref().take(4).collect(); + if hex.len() != 4 { + return None; + } + let low = u32::from_str_radix(&hex, 16).ok()?; + if !(0xDC00..=0xDFFF).contains(&low) { + return None; + } + // Actually consume from the real iterator. + chars.next(); // '\' + chars.next(); // 'u' + for _ in 0..4 { + chars.next(); + } + Some(low) +} + +// ============================================================================ +// Intermediate types for constraint building +// ============================================================================ + +/// Intermediate value type used during constraint object parsing. +/// +/// Different keys produce different value types; we track them separately +/// to avoid re-parsing. +pub(super) enum EntryValue { + Json(JsonValue), + ConstraintArray(Vec), + SingleConstraint(Constraint), + CountInner(Box), +} + +/// Intermediate representation of a parsed count sub-object. +pub(super) struct CountInner { + pub span: Span, + pub field: Option<(Span, JsonValue)>, + pub value: Option<(Span, JsonValue)>, + pub name: Option<(Span, JsonValue)>, + pub where_: Option, +} + +// ============================================================================ +// Parser struct +// ============================================================================ + +/// Recursive-descent parser for Azure Policy JSON. +/// +/// Wraps a [`Lexer`] configured for JSON tokenization (`set_unknown_char_is_symbol(true)`) +/// and maintains the current look-ahead token. +pub(super) struct Parser<'source> { + lexer: Lexer<'source>, + /// Current look-ahead token. + pub tok: Token, +} + +impl<'source> Parser<'source> { + /// Create a new parser for the given source. 
+ pub fn new(source: &'source Source) -> Result { + let mut lexer = Lexer::new(source); + lexer.set_unknown_char_is_symbol(true); + + let tok = lexer + .next_token() + .map_err(|e| ParseError::Lexer(e.to_string()))?; + + Ok(Self { lexer, tok }) + } + + /// Advance to the next token. + pub fn advance(&mut self) -> Result<(), ParseError> { + self.tok = self + .lexer + .next_token() + .map_err(|e| ParseError::Lexer(e.to_string()))?; + Ok(()) + } + + /// Get the text of the current token (for Symbol, Number, Ident). + pub fn token_text(&self) -> &str { + match self.tok.0 { + TokenKind::Symbol | TokenKind::Number | TokenKind::Ident | TokenKind::Eof => { + self.tok.1.text() + } + TokenKind::String | TokenKind::RawString => "", + #[cfg(feature = "azure-rbac")] + _ => "", + } + } + + /// Expect and consume a specific symbol character (e.g., `{`, `}`, `:`, `,`). + pub fn expect_symbol(&mut self, ch: &str) -> Result { + if self.tok.0 != TokenKind::Symbol || self.token_text() != ch { + return Err(ParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: match ch { + "{" => "'{'", + "}" => "'}'", + "[" => "'['", + "]" => "']'", + ":" => "':'", + "," => "','", + _ => "symbol", + }, + }); + } + let span = self.tok.1.clone(); + self.advance()?; + Ok(span) + } + + /// Consume a string token and return its (span, text). + /// + /// The returned text has JSON escape sequences resolved (e.g., `\"` → `"`). + pub fn expect_string(&mut self) -> Result<(Span, String), ParseError> { + if self.tok.0 != TokenKind::String { + return Err(ParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "string", + }); + } + let span = self.tok.1.clone(); + let text = json_unescape(span.text()); + self.advance()?; + Ok((span, text)) + } + + // ======================================================================== + // Generic JSON value parsing + // ======================================================================== + + /// Parse any JSON value. 
+ pub fn parse_json_value(&mut self) -> Result { + match self.tok.0 { + TokenKind::String => { + let span = self.tok.1.clone(); + let text = json_unescape(span.text()); + self.advance()?; + Ok(JsonValue::Str(span, text)) + } + TokenKind::Number => { + let span = self.tok.1.clone(); + let text: String = span.text().into(); + self.advance()?; + Ok(JsonValue::Number(span, text)) + } + TokenKind::Ident => { + let span = self.tok.1.clone(); + let text = span.text(); + let value = match text { + "true" => JsonValue::Bool(span.clone(), true), + "false" => JsonValue::Bool(span.clone(), false), + "null" => JsonValue::Null(span.clone()), + _ => { + return Err(ParseError::UnexpectedToken { + span, + expected: "JSON value", + }); + } + }; + self.advance()?; + Ok(value) + } + TokenKind::Symbol if self.token_text() == "[" => self.parse_json_array(), + TokenKind::Symbol if self.token_text() == "{" => self.parse_json_object(), + TokenKind::Symbol if self.token_text() == "-" => { + let start_span = self.tok.1.clone(); + self.advance()?; + if self.tok.0 != TokenKind::Number { + return Err(ParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "number after '-'", + }); + } + let num_span = self.tok.1.clone(); + // JSON does not permit whitespace between '-' and the digit. + if start_span.end != num_span.start { + return Err(ParseError::UnexpectedToken { + span: num_span, + expected: "number immediately after '-'", + }); + } + let mut text = String::from("-"); + text.push_str(num_span.text()); + self.advance()?; + let span = Span { + source: start_span.source.clone(), + line: start_span.line, + col: start_span.col, + start: start_span.start, + end: num_span.end, + }; + Ok(JsonValue::Number(span, text)) + } + _ => Err(ParseError::UnexpectedToken { + span: self.tok.1.clone(), + expected: "JSON value", + }), + } + } + + /// Parse a JSON array `[...]`. 
+ fn parse_json_array(&mut self) -> Result { + let open = self.expect_symbol("[")?; + let mut items = Vec::new(); + if self.token_text() != "]" { + items.push(self.parse_json_value()?); + while self.token_text() == "," { + self.advance()?; + items.push(self.parse_json_value()?); + } + } + let close = self.expect_symbol("]")?; + let span = Span { + source: open.source.clone(), + line: open.line, + col: open.col, + start: open.start, + end: close.end, + }; + Ok(JsonValue::Array(span, items)) + } + + /// Parse a JSON object `{...}` as a generic `JsonValue::Object`. + fn parse_json_object(&mut self) -> Result { + let open = self.expect_symbol("{")?; + let mut entries = Vec::new(); + if self.token_text() != "}" { + entries.push(self.parse_object_entry()?); + while self.token_text() == "," { + self.advance()?; + entries.push(self.parse_object_entry()?); + } + } + let close = self.expect_symbol("}")?; + let span = Span { + source: open.source.clone(), + line: open.line, + col: open.col, + start: open.start, + end: close.end, + }; + Ok(JsonValue::Object(span, entries)) + } + + /// Parse a single `"key": value` entry in a JSON object. + fn parse_object_entry(&mut self) -> Result { + let (key_span, key) = self.expect_string()?; + self.expect_symbol(":")?; + let value = self.parse_json_value()?; + Ok(ObjectEntry { + key_span, + key, + value, + }) + } + + // ======================================================================== + // Conversion helpers + // ======================================================================== + + /// Convert a JSON value into a [`ValueOrExpr`]. 
+ pub fn json_to_value_or_expr(jv: JsonValue) -> Result { + match jv { + JsonValue::Str(ref span, ref s) if is_template_expr(s) => { + let inner = unwrap(s, 1, 1); + let expr = ExprParser::parse_from_brackets(inner, span).map_err(|e| { + ParseError::ExprParse { + span: span.clone(), + message: e.to_string(), + } + })?; + Ok(ValueOrExpr::Expr { + span: span.clone(), + raw: s.clone(), + expr, + }) + } + // Handle `[[...` escaped literals: strip the leading `[`. + JsonValue::Str(span, ref s) if s.starts_with("[[") => { + let unescaped = alloc::string::String::from(tail(s, 1)); + Ok(ValueOrExpr::Value(JsonValue::Str(span, unescaped))) + } + _ => Ok(ValueOrExpr::Value(jv)), + } + } +} + +#[cfg(test)] +#[allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::as_conversions +)] +mod tests { + use super::*; + + // ==================================================================== + // json_unescape tests + // ==================================================================== + + #[test] + fn test_no_escapes() { + assert_eq!(json_unescape("hello world"), "hello world"); + } + + #[test] + fn test_basic_escapes() { + assert_eq!(json_unescape(r#"a\"b"#), "a\"b"); + assert_eq!(json_unescape(r"a\\b"), "a\\b"); + assert_eq!(json_unescape(r"a\/b"), "a/b"); + assert_eq!(json_unescape(r"a\nb"), "a\nb"); + assert_eq!(json_unescape(r"a\tb"), "a\tb"); + assert_eq!(json_unescape(r"a\rb"), "a\rb"); + } + + #[test] + fn test_unicode_escape_bmp() { + // \u0041 = 'A' + assert_eq!(json_unescape(r"\u0041"), "A"); + // \u00e9 = 'é' + assert_eq!(json_unescape(r"\u00e9"), "é"); + } + + #[test] + fn test_unicode_surrogate_pair() { + // \uD83D\uDE00 = '😀' (U+1F600) + assert_eq!(json_unescape(r"\uD83D\uDE00"), "😀"); + } + + #[test] + fn test_unicode_surrogate_pair_in_context() { + assert_eq!(json_unescape(r"hi \uD83D\uDE00 there"), "hi 😀 there"); + } + + #[test] + fn test_high_surrogate_without_low_is_dropped() { + // High surrogate alone — silently dropped + 
assert_eq!(json_unescape(r"\uD83D abc"), " abc"); + } + + #[test] + fn test_unknown_escape_preserved() { + assert_eq!(json_unescape(r"\x"), "\\x"); + } + + // ==================================================================== + // parse_json_value tests — negative number adjacency + // ==================================================================== + + fn parse_json(input: &str) -> Result { + let source = Source::from_contents("".into(), input.into()).unwrap(); + let mut parser = Parser::new(&source)?; + parser.parse_json_value() + } + + #[test] + fn test_negative_number_adjacent() { + let val = parse_json("-42").unwrap(); + match val { + JsonValue::Number(_, text) => assert_eq!(text, "-42"), + other => panic!("expected Number, got {:?}", other), + } + } + + #[test] + fn test_negative_number_with_space_rejected() { + // JSON forbids whitespace between '-' and digits. + let result = parse_json("- 1"); + assert!(result.is_err(), "expected error for '- 1'"); + } + + // ==================================================================== + // json_to_value_or_expr tests + // ==================================================================== + + fn make_str_value(s: &str) -> JsonValue { + let source = Source::from_contents("".into(), s.into()).unwrap(); + let span = Span { + source, + line: 1, + col: 1, + start: 0, + end: s.len() as u32, + }; + JsonValue::Str(span, s.into()) + } + + #[test] + fn test_template_expr_parsed() { + let jv = make_str_value("[field('type')]"); + let result = Parser::json_to_value_or_expr(jv).unwrap(); + match result { + ValueOrExpr::Expr { raw, .. 
} => assert_eq!(raw, "[field('type')]"), + other => panic!("expected Expr, got {:?}", other), + } + } + + #[test] + fn test_escaped_bracket_becomes_value() { + // "[[foo]" → plain string "[foo]" + let jv = make_str_value("[[foo]"); + let result = Parser::json_to_value_or_expr(jv).unwrap(); + match result { + ValueOrExpr::Value(JsonValue::Str(_, s)) => assert_eq!(s, "[foo]"), + other => panic!("expected Value with unescaped string, got {:?}", other), + } + } + + #[test] + fn test_plain_string_unchanged() { + let jv = make_str_value("hello"); + let result = Parser::json_to_value_or_expr(jv).unwrap(); + match result { + ValueOrExpr::Value(JsonValue::Str(_, s)) => assert_eq!(s, "hello"), + other => panic!("expected Value, got {:?}", other), + } + } + + #[test] + fn test_non_string_passthrough() { + let source = Source::from_contents("".into(), "42".into()).unwrap(); + let span = Span { + source, + line: 1, + col: 1, + start: 0, + end: 2, + }; + let jv = JsonValue::Number(span, "42".into()); + let result = Parser::json_to_value_or_expr(jv).unwrap(); + match result { + ValueOrExpr::Value(JsonValue::Number(_, n)) => assert_eq!(n, "42"), + other => panic!("expected Value(Number), got {:?}", other), + } + } + + #[test] + fn test_invalid_expr_returns_error() { + // "[!!!]" is a template expression but contains invalid tokens + let jv = make_str_value("[!!!]"); + Parser::json_to_value_or_expr(jv).unwrap_err(); + } +} diff --git a/src/languages/azure_policy/parser/error.rs b/src/languages/azure_policy/parser/error.rs new file mode 100644 index 00000000..bba454cb --- /dev/null +++ b/src/languages/azure_policy/parser/error.rs @@ -0,0 +1,148 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Error types for Azure Policy JSON parsing. + +use alloc::format; +use alloc::string::String; + +use crate::lexer::Span; + +/// Errors that can occur during Azure Policy JSON parsing. +#[derive(Debug)] +pub enum ParseError { + /// Error from the lexer. 
+    Lexer(String),
+    /// Unexpected token encountered.
+    UnexpectedToken { span: Span, expected: &'static str },
+    /// A required key is missing from a JSON object.
+    MissingKey { span: Span, key: &'static str },
+    /// An unrecognized key was found in a policy object.
+    UnrecognizedKey { span: Span, key: String },
+    /// Logical operator (`allOf`/`anyOf`/`not`) has extra keys.
+    ExtraKeysInLogical { span: Span, operator: String },
+    /// The `allOf`/`anyOf` value is not an array.
+    LogicalOperatorNotArray { span: Span, operator: String },
+    /// No operator specified in a condition.
+    MissingOperator { span: Span },
+    /// Multiple LHS operands (field + value, etc.).
+    MultipleLhsOperands { span: Span },
+    /// Missing LHS operand in a condition.
+    MissingLhsOperand { span: Span },
+    /// Error parsing an ARM template expression.
+    ExprParse { span: Span, message: String },
+    /// Both `field` and `value` in a count block.
+    MultipleCountCollections { span: Span },
+    /// Neither `field` nor `value` in a count block.
+    MissingCountCollection { span: Span },
+    /// `name` must be a string in count-value.
+    InvalidCountName { span: Span },
+    /// `name` used without `value` in count.
+    MisplacedCountName { span: Span },
+    /// A custom error message (e.g., from sub-parsing).
+    Custom { span: Span, message: String },
+}
+
+impl core::fmt::Display for ParseError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // Every variant renders through `Span::error` so the text carries
+        // source-location context; only the detail message differs.
+        match *self {
+            ParseError::Lexer(ref msg) => write!(f, "{}", msg),
+            ParseError::UnexpectedToken { ref span, expected } => {
+                let msg = format!("expected {}", expected);
+                write!(f, "{}", span.error(&msg))
+            }
+            ParseError::MissingKey { ref span, key } => {
+                let msg = format!("missing required key \"{}\"", key);
+                write!(f, "{}", span.error(&msg))
+            }
+            ParseError::UnrecognizedKey { ref span, ref key } => {
+                let msg = format!("unrecognized key \"{}\"", key);
+                write!(f, "{}", span.error(&msg))
+            }
+            ParseError::ExtraKeysInLogical {
+                ref span,
+                ref operator,
+            } => {
+                let msg = format!("\"{}\" must be the only key in its object", operator);
+                write!(f, "{}", span.error(&msg))
+            }
+            ParseError::LogicalOperatorNotArray {
+                ref span,
+                ref operator,
+            } => {
+                let msg = format!("\"{}\" must be an array", operator);
+                write!(f, "{}", span.error(&msg))
+            }
+            ParseError::MissingOperator { ref span } => {
+                write!(f, "{}", span.error("no operator specified in condition"))
+            }
+            ParseError::MultipleLhsOperands { ref span } => {
+                write!(
+                    f,
+                    "{}",
+                    span.error("multiple LHS operands (field/value/count)")
+                )
+            }
+            ParseError::MissingLhsOperand { ref span } => {
+                write!(
+                    f,
+                    "{}",
+                    span.error("missing LHS operand (field, value, or count)")
+                )
+            }
+            ParseError::ExprParse {
+                ref span,
+                ref message,
+            } => {
+                // Two-part output: location context first, then the nested
+                // expression-parser error on its own line.
+                write!(
+                    f,
+                    "{}\n expression parse error: {}",
+                    span.error("in template expression"),
+                    message
+                )
+            }
+            ParseError::MultipleCountCollections { ref span } => {
+                write!(
+                    f,
+                    "{}",
+                    span.error("both 'field' and 'value' specified in count")
+                )
+            }
+            ParseError::MissingCountCollection { ref span } => {
+                write!(
+                    f,
+                    "{}",
+                    span.error("neither 'field' nor 'value' specified in count")
+                )
+            }
+            ParseError::InvalidCountName { ref span } => {
+                write!(f, "{}", span.error("'name' in count must be a string"))
+            }
ParseError::MisplacedCountName { ref span } => { + write!( + f, + "{}", + span.error("'name' can only be used with count-value") + ) + } + ParseError::Custom { + ref span, + ref message, + } => { + write!(f, "{}", span.error(message)) + } + } + } +} diff --git a/src/languages/azure_policy/parser/mod.rs b/src/languages/azure_policy/parser/mod.rs new file mode 100644 index 00000000..011e3216 --- /dev/null +++ b/src/languages/azure_policy/parser/mod.rs @@ -0,0 +1,193 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Core recursive-descent JSON parser for Azure Policy. +//! +//! Provides the low-level token-driven parser (`core::Parser`) that reads JSON from +//! [`Lexer`] tokens, building span-annotated AST values in a single pass. +//! No intermediate `serde_json::Value` is created. +//! +//! Higher-level policy-aware parsing (constraints, policy rules, policy +//! definitions) is layered on top by sibling modules. + +// Parser internals are consumed by constraint/policy_rule/policy_definition +// modules added in a subsequent PR. +#[allow(dead_code)] +pub(crate) mod core; +mod error; + +pub(super) use self::core::json_unescape; +pub use error::ParseError; + +use alloc::string::ToString as _; + +use super::ast::{FieldKind, OperatorKind}; +use super::expr::ExprParser; + +// ============================================================================ +// Helper functions (used by constraint/policy_rule/policy_definition modules) +// ============================================================================ + +/// Check if a string is an ARM template expression (`[...]` but not `[[...`). +#[allow(dead_code)] +pub(super) fn is_template_expr(s: &str) -> bool { + s.starts_with('[') && s.ends_with(']') && !s.starts_with("[[") +} + +/// Checked `s[prefix_len..]`. +/// +/// Callers guarantee bounds via a prior `starts_with` check on an ASCII +/// prefix whose byte-length equals `prefix_len`. 
+fn tail(s: &str, prefix_len: usize) -> &str { + s.get(prefix_len..).unwrap_or_default() +} + +/// Checked `s[prefix_len .. s.len() - suffix_len]`. +/// +/// Callers guarantee bounds via prior `starts_with` / `ends_with` checks. +fn unwrap(s: &str, prefix_len: usize, suffix_len: usize) -> &str { + let end = s.len().saturating_sub(suffix_len); + s.get(prefix_len..end).unwrap_or_default() +} + +/// Classify a field string into a [`FieldKind`]. +#[allow(dead_code)] +pub(super) fn classify_field( + text: &str, + span: &crate::lexer::Span, +) -> Result { + let lower = text.to_lowercase(); + match lower.as_str() { + "type" => Ok(FieldKind::Type), + "id" => Ok(FieldKind::Id), + "kind" => Ok(FieldKind::Kind), + "name" => Ok(FieldKind::Name), + "location" => Ok(FieldKind::Location), + "fullname" => Ok(FieldKind::FullName), + "tags" => Ok(FieldKind::Tags), + "identity.type" => Ok(FieldKind::IdentityType), + _ if lower.starts_with("identity.") => Ok(FieldKind::IdentityField( + tail(text, "identity.".len()).into(), + )), + "apiversion" => Ok(FieldKind::ApiVersion), + _ if lower.starts_with("tags.") => Ok(FieldKind::Tag(tail(text, "tags.".len()).into())), + _ if lower.starts_with("tags['") && text.ends_with("']") => Ok(FieldKind::Tag( + unwrap(text, "tags['".len(), "']".len()).into(), + )), + // Tags[tagName] — bracket notation without quotes. + // Tag name is everything between the first '[' and the LAST ']'. + // e.g. Tags[Dept.Name]] → tag name "Dept.Name]" + // Exclude tags[*] which is a wildcard iteration, not a tag name. 
+        _ if lower.starts_with("tags[") && text.ends_with(']') && !text.contains("[*]") => {
+            Ok(FieldKind::Tag(unwrap(text, "tags[".len(), "]".len()).into()))
+        }
+        _ if is_template_expr(text) => {
+            // Strip the outer brackets and hand the inner text to the ARM
+            // expression sub-parser; surface any failure with span context.
+            let inner = unwrap(text, 1, 1);
+            let expr =
+                ExprParser::parse_from_brackets(inner, span).map_err(|e| ParseError::ExprParse {
+                    span: span.clone(),
+                    message: e.to_string(),
+                })?;
+            Ok(FieldKind::Expr(expr))
+        }
+        // Anything else is treated as a resource property alias.
+        _ => Ok(FieldKind::Alias(text.into())),
+    }
+}
+
+/// Try to parse a lowercase key as an operator kind.
+// NOTE(review): return type reconstructed from a mangled diff — confirm
+// against the original (generic arguments were stripped in transit).
+#[allow(dead_code)]
+pub(super) fn parse_operator_kind(key: &str) -> Option<OperatorKind> {
+    match key {
+        "contains" => Some(OperatorKind::Contains),
+        "containskey" => Some(OperatorKind::ContainsKey),
+        "equals" => Some(OperatorKind::Equals),
+        "greater" => Some(OperatorKind::Greater),
+        "greaterorequals" => Some(OperatorKind::GreaterOrEquals),
+        "exists" => Some(OperatorKind::Exists),
+        "in" => Some(OperatorKind::In),
+        "less" => Some(OperatorKind::Less),
+        "lessorequals" => Some(OperatorKind::LessOrEquals),
+        "like" => Some(OperatorKind::Like),
+        "match" => Some(OperatorKind::Match),
+        "matchinsensitively" => Some(OperatorKind::MatchInsensitively),
+        "notcontains" => Some(OperatorKind::NotContains),
+        "notcontainskey" => Some(OperatorKind::NotContainsKey),
+        "notequals" => Some(OperatorKind::NotEquals),
+        "notin" => Some(OperatorKind::NotIn),
+        "notlike" => Some(OperatorKind::NotLike),
+        "notmatch" => Some(OperatorKind::NotMatch),
+        "notmatchinsensitively" => Some(OperatorKind::NotMatchInsensitively),
+        // Not an operator keyword.
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+mod tests {
+    use super::*;
+
+    /// Zero-width span over an empty source, for APIs that require a span.
+    fn dummy_span() -> crate::lexer::Span {
+        let source =
+            crate::lexer::Source::from_contents("".into(), alloc::string::String::new()).unwrap();
+        crate::lexer::Span {
+            source,
+            line: 1,
+            col: 1,
+            start: 0,
+            end: 0,
+        }
+    }
+
+    #[test]
+    fn test_is_template_expr() {
+        assert!(is_template_expr("[field('type')]"));
+        assert!(is_template_expr("[concat('a', 'b')]"));
+        assert!(!is_template_expr("[[escaped]"));
+        assert!(!is_template_expr("not-an-expr"));
+        assert!(!is_template_expr("[no-end-bracket"));
+    }
+
+    #[test]
+    fn test_classify_field_builtin_names() {
+        // Built-in names classify case-insensitively.
+        let span = dummy_span();
+        assert!(matches!(classify_field("type", &span), Ok(FieldKind::Type)));
+        assert!(matches!(classify_field("Type", &span), Ok(FieldKind::Type)));
+        assert!(matches!(classify_field("name", &span), Ok(FieldKind::Name)));
+        assert!(matches!(
+            classify_field("location", &span),
+            Ok(FieldKind::Location)
+        ));
+    }
+
+    #[test]
+    fn test_classify_field_alias() {
+        let span = dummy_span();
+        match classify_field("Microsoft.Compute/virtualMachines/storageProfile", &span) {
+            Ok(FieldKind::Alias(alias)) => {
+                assert_eq!(alias, "Microsoft.Compute/virtualMachines/storageProfile");
+            }
+            other => panic!("expected Alias, got {:?}", other),
+        }
+    }
+
+    #[test]
+    fn test_classify_field_expression() {
+        let span = dummy_span();
+        match classify_field("[field('type')]", &span) {
+            Ok(FieldKind::Expr(_)) => {}
+            other => panic!("expected Expr, got {:?}", other),
+        }
+    }
+
+    #[test]
+    fn test_classify_field_double_bracket_not_expr() {
+        // "[[escaped]" is the escape form and must stay an Alias.
+        let span = dummy_span();
+        match classify_field("[[escaped]", &span) {
+            Ok(FieldKind::Alias(alias)) => assert_eq!(alias, "[[escaped]"),
+            other => panic!("expected Alias, got {:?}", other),
+        }
+    }
+}