From ca224775119e06efd10d48cdce437dfb9526a57a Mon Sep 17 00:00:00 2001
From: Kira
Date: Fri, 19 Jul 2024 23:32:42 +0200
Subject: [PATCH 1/8] draft fix for datetimes in record values

---
 crates/nu-parser/src/lex.rs    | 44 ++++++++++++++++++++++++++++++++++
 crates/nu-parser/src/parser.rs |  5 ++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index bad9e94a10..d7b317310b 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -454,6 +454,26 @@ pub fn lex_signature(
         special_tokens,
         skip_comment,
         true,
+        false,
+    )
+}
+
+// temporary name because I can't decide on a better one
+pub fn lex_but_ignore_specials_after_special(
+    input: &[u8],
+    span_offset: usize,
+    additional_whitespace: &[u8],
+    special_tokens: &[u8],
+    skip_comment: bool,
+) -> (Vec<Token>, Option<ParseError>) {
+    lex_internal(
+        input,
+        span_offset,
+        additional_whitespace,
+        special_tokens,
+        skip_comment,
+        false,
+        true,
     )
 }
 
@@ -471,6 +491,7 @@ pub fn lex(
         special_tokens,
         skip_comment,
         false,
+        false,
     )
 }
 
@@ -482,7 +503,12 @@ fn lex_internal(
     skip_comment: bool,
     // within signatures we want to treat `<` and `>` specially
     in_signature: bool,
+    // after lexing a special item, disable special items when lexing the next item.
+    // necessary because colons are special in records, but datetime literals may contain colons
+    ignore_specials_after_special: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
+    let mut specials_disabled = false;
+
     let mut error = None;
 
     let mut curr_offset = 0;
@@ -612,7 +638,22 @@ fn lex_internal(
         } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
             // If the next character is non-newline whitespace, skip it.
             curr_offset += 1;
+        } else if ignore_specials_after_special && !specials_disabled && special_tokens.contains(&c)
+        {
+            // If disabling special items but they're not currently disabled, handle a special item
+            // character right here, bypassing lex_item
+            output.push(Token::new(
+                TokenContents::Item,
+                Span::new(span_offset + curr_offset, span_offset + curr_offset + 1),
+            ));
+            curr_offset += 1;
+            specials_disabled = true;
         } else {
+            let special_tokens = if specials_disabled {
+                &[]
+            } else {
+                special_tokens
+            };
             let (token, err) = lex_item(
                 input,
                 &mut curr_offset,
@@ -625,6 +666,9 @@ fn lex_internal(
             error = err;
         }
         is_complete = true;
+        if token.contents == TokenContents::Item {
+            specials_disabled = false;
+        }
         output.push(token);
     }
 }
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 8332d2f383..18c6367390 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lex::{is_assignment_operator, lex, lex_signature},
+    lex::{is_assignment_operator, lex, lex_but_ignore_specials_after_special, lex_signature},
     lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
     parse_keywords::*,
     parse_patterns::parse_pattern,
@@ -5601,7 +5601,8 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
     let inner_span = Span::new(start, end);
     let source = working_set.get_span_contents(inner_span);
 
-    let (tokens, err) = lex(source, start, &[b'\n', b'\r', b','], &[b':'], true);
+    let (tokens, err) =
+        lex_but_ignore_specials_after_special(source, start, &[b'\n', b'\r', b','], &[b':'], true);
     if let Some(err) = err {
         working_set.error(err);
     }
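Context for this first patch: inside a record, `:` is lexed as a special token, but once one special has been emitted, the next item is lexed with specials disabled, so a bare datetime such as `2024-07-19T23:32:42+02:00` comes out as a single token instead of being split at its colons. A minimal sketch of how the new entry point would be driven over one record entry; the expected token list in the comment is my reading of the patch, not output copied from it:

    // Sketch only: exercises the patch-1 lexer with the same byte sets parse_record uses.
    let source = b"a: 2024-07-19T23:32:42+02:00";
    let (tokens, err) = lex_but_ignore_specials_after_special(
        source,
        0,                     // span_offset: where this span starts in the engine state
        &[b'\n', b'\r', b','], // additional whitespace inside records
        &[b':'],               // special tokens: the key-value separator
        true,                  // skip comments
    );
    assert!(err.is_none());
    // expected: Item("a"), Item(":"), Item("2024-07-19T23:32:42+02:00")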
From 671ddb5a2f69f856d34eeb62888d2ed70334d513 Mon Sep 17 00:00:00 2001
From: Kira
Date: Sun, 21 Jul 2024 15:25:38 +0200
Subject: [PATCH 2/8] slightly better names

---
 crates/nu-parser/src/lex.rs    | 8 +++-----
 crates/nu-parser/src/parser.rs | 4 ++--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index d7b317310b..b7c8ccc118 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -458,8 +458,7 @@ pub fn lex_signature(
     )
 }
 
-// temporary name because I can't decide on a better one
-pub fn lex_but_ignore_specials_after_special(
+pub fn lex_alternating_special_tokens(
     input: &[u8],
     span_offset: usize,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
@@ -505,7 +504,7 @@ fn lex_internal(
     in_signature: bool,
     // after lexing a special item, disable special items when lexing the next item.
     // necessary because colons are special in records, but datetime literals may contain colons
-    ignore_specials_after_special: bool,
+    alternate_specials: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
     let mut specials_disabled = false;
@@ -638,8 +637,7 @@ fn lex_internal(
         } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
             // If the next character is non-newline whitespace, skip it.
             curr_offset += 1;
-        } else if ignore_specials_after_special && !specials_disabled && special_tokens.contains(&c)
-        {
+        } else if alternate_specials && !specials_disabled && special_tokens.contains(&c) {
             // If disabling special items but they're not currently disabled, handle a special item
             // character right here, bypassing lex_item
             output.push(Token::new(
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 18c6367390..ca6dc7dffd 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lex::{is_assignment_operator, lex, lex_but_ignore_specials_after_special, lex_signature},
+    lex::{is_assignment_operator, lex, lex_alternating_special_tokens, lex_signature},
     lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
     parse_keywords::*,
     parse_patterns::parse_pattern,
@@ -5602,7 +5602,7 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
     let source = working_set.get_span_contents(inner_span);
 
     let (tokens, err) =
-        lex_but_ignore_specials_after_special(source, start, &[b'\n', b'\r', b','], &[b':'], true);
+        lex_alternating_special_tokens(source, start, &[b'\n', b'\r', b','], &[b':'], true);
     if let Some(err) = err {
         working_set.error(err);
     }
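Before the next patch: with specials alternating, a bare word in value position that itself contains a colon now silently lexes as one item, so patch 3 turns that shape into an explicit error instead of letting it parse as something surprising. Illustrative inputs, with assumed behavior under this series (they mirror the test cases added later in patch 7):

    b"{ a: 2024-07-21T15:25:38+02:00 }" // intended to parse: datetime value
    b"{ a: b:c }"                       // intended to error: bare word value containing a colon
    b"{ a: 'b:c' }"                     // fine: quoted values may contain colons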
From b62d9119be5be25824fe494dcf638357543c5e49 Mon Sep 17 00:00:00 2001
From: Kira
Date: Sun, 21 Jul 2024 23:25:22 +0200
Subject: [PATCH 3/8] Explicitly reject record value position bare words
 containing colons

---
 crates/nu-parser/src/parser.rs | 45 ++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index ca6dc7dffd..1207088ecb 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -5694,6 +5694,51 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
             idx += 1;
 
+            let bareword_error = |string_value: &Expression| {
+                let string_span = working_set.get_span_contents(string_value.span);
+                let colon_position = string_span
+                    .iter()
+                    .find_position(|b| **b == b':')
+                    .map(|(i, _)| string_value.span.start + i);
+                if let Some(colon_position) = colon_position {
+                    Some(ParseError::InvalidLiteral(
+                        "colon".to_string(),
+                        "bare word specifying record value".to_string(),
+                        Span::new(colon_position, colon_position + 1),
+                    ))
+                } else {
+                    None
+                }
+            };
+            let value_span = working_set.get_span_contents(value.span);
+            let parse_error = match value.expr {
+                Expr::String(_) => {
+                    if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
+                        bareword_error(&value)
+                    } else {
+                        None
+                    }
+                }
+                Expr::StringInterpolation(ref expressions) => {
+                    if value_span[0] != b'$' {
+                        expressions
+                            .iter()
+                            .filter(|expr| matches!(expr.expr, Expr::String(_)))
+                            .filter_map(bareword_error)
+                            .next()
+                    } else {
+                        None
+                    }
+                }
+                _ => None,
+            };
+            let value = if let Some(parse_error) = parse_error {
+                working_set.error(parse_error);
+                garbage(working_set, value.span)
+            } else {
+                value
+            };
+
             if let Some(field) = field.as_string() {
                 if let Some(fields) = &mut field_types {
                     fields.push((field, value.ty.clone()));

From 26c4bbfa41d9fd5f45d5f2f99bf409415ee8b15c Mon Sep 17 00:00:00 2001
From: Kira
Date: Sun, 21 Jul 2024 23:48:05 +0200
Subject: [PATCH 4/8] oh thanks clippy

---
 crates/nu-parser/src/parser.rs | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 1207088ecb..380d8744f8 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -5695,20 +5695,18 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             idx += 1;
 
             let bareword_error = |string_value: &Expression| {
-                let string_span = working_set.get_span_contents(string_value.span);
-                let colon_position = string_span
+                working_set
+                    .get_span_contents(string_value.span)
                     .iter()
                     .find_position(|b| **b == b':')
-                    .map(|(i, _)| string_value.span.start + i);
-                if let Some(colon_position) = colon_position {
-                    Some(ParseError::InvalidLiteral(
-                        "colon".to_string(),
-                        "bare word specifying record value".to_string(),
-                        Span::new(colon_position, colon_position + 1),
-                    ))
-                } else {
-                    None
-                }
+                    .map(|(i, _)| {
+                        let colon_position = i + string_value.span.start;
+                        ParseError::InvalidLiteral(
+                            "colon".to_string(),
+                            "bare word specifying record value".to_string(),
+                            Span::new(colon_position, colon_position + 1),
+                        )
+                    })
             };
             let value_span = working_set.get_span_contents(value.span);
             let parse_error = match value.expr {
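Patches 1-4 toggled the specials inside the lexer itself. The "new attempt" below inverts control: the lexer becomes resumable (a `LexState` plus `lex_n_tokens`), and `parse_record` drives it, asking for two tokens with `:` special (key and colon) and then one token with no specials at all, so a datetime value's colons pass through untouched. This is the driving loop as it appears in the parser.rs half of the patch:

    loop {
        // key and colon: `:` is special here
        if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
            break;
        };
        // value: no specials, so bare datetimes lex as one item
        if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
            break;
        };
    }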
From 4da348f7ca9b6eca0fca55f515985b8da09ef578 Mon Sep 17 00:00:00 2001
From: Kira
Date: Tue, 23 Jul 2024 02:37:17 +0200
Subject: [PATCH 5/8] new attempt

---
 crates/nu-parser/src/lex.rs    | 159 ++++++++++++++++++---------------
 crates/nu-parser/src/parser.rs | 116 ++++++++++++++----------
 2 files changed, 155 insertions(+), 120 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index b7c8ccc118..628b84414a 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -447,33 +447,58 @@ pub fn lex_signature(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         true,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
-pub fn lex_alternating_special_tokens(
-    input: &[u8],
-    span_offset: usize,
+pub struct LexState<'a> {
+    pub input: &'a [u8],
+    pub output: Vec<Token>,
+    pub error: Option<ParseError>,
+    pub span_offset: usize,
+}
+
+// Lex until the output is max_tokens longer than before the call, or until the input is exhausted.
+// The behaviour here is non-obvious (maybe non-useful) iff your additional_whitespace doesn't include newline:
+// If you pass `output` in a state where the last token is an Eol, this might *remove* tokens.
+pub fn lex_n_tokens(
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
-) -> (Vec<Token>, Option<ParseError>) {
+    max_tokens: usize,
+) -> isize {
+    let n_tokens = state.output.len();
     lex_internal(
-        input,
-        span_offset,
+        state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        true,
-    )
+        Some(max_tokens),
+    );
+    // If this lex_internal call reached the end of the input, there may now be fewer tokens
+    // in the output than before.
+    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
+    let next_offset = state.output.last().map(|token| token.span.end);
+    if let Some(next_offset) = next_offset {
+        state.input = &state.input[next_offset - state.span_offset..];
+        state.span_offset = next_offset;
+    }
+    tokens_n_diff
 }
 
 pub fn lex(
     input: &[u8],
     span_offset: usize,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
 fn lex_internal(
-    input: &[u8],
-    span_offset: usize,
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
     // within signatures we want to treat `<` and `>` specially
     in_signature: bool,
-    // after lexing a special item, disable special items when lexing the next item.
-    // necessary because colons are special in records, but datetime literals may contain colons
-    alternate_specials: bool,
-) -> (Vec<Token>, Option<ParseError>) {
-    let mut specials_disabled = false;
-
-    let mut error = None;
+    max_tokens: Option<usize>,
+) {
+    let initial_output_len = state.output.len();
 
     let mut curr_offset = 0;
 
-    let mut output = vec![];
     let mut is_complete = true;
-
-    while let Some(c) = input.get(curr_offset) {
+    while let Some(c) = state.input.get(curr_offset) {
+        if max_tokens
+            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
+        {
+            break;
+        }
         let c = *c;
         if c == b'|' {
             // If the next character is `|`, it's either `|` or `||`.
             is_complete = false;
             let idx = curr_offset;
             let prev_idx = idx;
             curr_offset += 1;
 
             // If the next character is `|`, we're looking at a `||`.
-            if let Some(c) = input.get(curr_offset) {
+            if let Some(c) = state.input.get(curr_offset) {
                 if *c == b'|' {
                     let idx = curr_offset;
                     curr_offset += 1;
-                    output.push(Token::new(
+                    state.output.push(Token::new(
                         TokenContents::PipePipe,
-                        Span::new(span_offset + prev_idx, span_offset + idx + 1),
+                        Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
                     ));
                     continue;
                 }
             }
 
             // Before we push, check to see if the previous character was a newline.
             // If so, then this is a continuation of the previous line
-            if let Some(prev) = output.last_mut() {
+            if let Some(prev) = state.output.last_mut() {
                 match prev.contents {
                     TokenContents::Eol => {
                         *prev = Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         );
                         // And this is a continuation of the previous line if previous line is a
                         // comment line (combined with EOL + Comment)
                         //
                         // Initially, the last one token is TokenContents::Pipe, we don't need to
                         // check it, so the beginning offset is 2.
                         let mut offset = 2;
-                        while output.len() > offset {
-                            let index = output.len() - offset;
-                            if output[index].contents == TokenContents::Comment
-                                && output[index - 1].contents == TokenContents::Eol
+                        while state.output.len() > offset {
+                            let index = state.output.len() - offset;
+                            if state.output[index].contents == TokenContents::Comment
+                                && state.output[index - 1].contents == TokenContents::Eol
                             {
-                                output.remove(index - 1);
+                                state.output.remove(index - 1);
                                 offset += 1;
                             } else {
                                 break;
                             }
                         }
                     }
                     _ => {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         ));
                     }
                 }
             } else {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Pipe,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                 ));
             }
 
             is_complete = false;
         } else if c == b';' {
             // If the next character is a `;`, we're looking at a semicolon token.
 
-            if !is_complete && error.is_none() {
-                error = Some(ParseError::ExtraTokens(Span::new(
+            if !is_complete && state.error.is_none() {
+                state.error = Some(ParseError::ExtraTokens(Span::new(
                     curr_offset,
                     curr_offset + 1,
                 )));
             }
             let idx = curr_offset;
             curr_offset += 1;
-            output.push(Token::new(
+            state.output.push(Token::new(
                 TokenContents::Semicolon,
-                Span::new(span_offset + idx, span_offset + idx + 1),
+                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
             ));
         } else if c == b'\r' {
             // Ignore a stand-alone carriage return
             curr_offset += 1;
         } else if c == b'\n' {
             // If the next character is a newline, we're looking at an EOL (end of line) token.
             let idx = curr_offset;
             curr_offset += 1;
             if !additional_whitespace.contains(&c) {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Eol,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                 ));
             }
         } else if c == b'#' {
             // If the next character is a `#` this is the start of a
             // comment. The comment continues until the next newline.
             let mut start = curr_offset;
 
-            while let Some(input) = input.get(curr_offset) {
+            while let Some(input) = state.input.get(curr_offset) {
                 if *input == b'\n' {
                     if !skip_comment {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Comment,
-                            Span::new(span_offset + start, span_offset + curr_offset),
+                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
                         ));
                     }
                     start = curr_offset;
 
                     break;
                 } else {
                     curr_offset += 1;
                 }
             }
             if start != curr_offset && !skip_comment {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Comment,
-                    Span::new(span_offset + start, span_offset + curr_offset),
+                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
                 ));
             }
         } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
             // If the next character is non-newline whitespace, skip it.
             curr_offset += 1;
-        } else if alternate_specials && !specials_disabled && special_tokens.contains(&c) {
-            // If disabling special items but they're not currently disabled, handle a special item
-            // character right here, bypassing lex_item
-            output.push(Token::new(
-                TokenContents::Item,
-                Span::new(span_offset + curr_offset, span_offset + curr_offset + 1),
-            ));
-            curr_offset += 1;
-            specials_disabled = true;
         } else {
-            let special_tokens = if specials_disabled {
-                &[]
-            } else {
-                special_tokens
-            };
             let (token, err) = lex_item(
-                input,
+                state.input,
                 &mut curr_offset,
-                span_offset,
+                state.span_offset,
                 additional_whitespace,
                 special_tokens,
                 in_signature,
             );
-            if error.is_none() {
-                error = err;
+            if state.error.is_none() {
+                state.error = err;
             }
             is_complete = true;
-            if token.contents == TokenContents::Item {
-                specials_disabled = false;
-            }
-            output.push(token);
+            state.output.push(token);
         }
     }
-    (output, error)
 }
 
 /// True if this the start of a redirection. Does not match `>>` or `>|` forms.
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 380d8744f8..3350364b1d 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lex::{is_assignment_operator, lex, lex_alternating_special_tokens, lex_signature},
+    lex::{is_assignment_operator, lex, lex_n_tokens, lex_signature, LexState},
     lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
     parse_keywords::*,
     parse_patterns::parse_pattern,
@@ -5599,10 +5599,32 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
     }
 
     let inner_span = Span::new(start, end);
-    let source = working_set.get_span_contents(inner_span);
 
-    let (tokens, err) =
-        lex_alternating_special_tokens(source, start, &[b'\n', b'\r', b','], &[b':'], true);
+    let mut lex_state = LexState {
+        input: working_set.get_span_contents(inner_span),
+        output: Vec::new(),
+        error: None,
+        span_offset: start,
+    };
+    let mut lex_n = |additional_whitespace, special_tokens, max_tokens| {
+        lex_n_tokens(
+            &mut lex_state,
+            additional_whitespace,
+            special_tokens,
+            true,
+            max_tokens,
+        )
+    };
+    loop {
+        if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
+            break;
+        };
+        if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
+            break;
+        };
+    }
+    let (tokens, err) = (lex_state.output, lex_state.error);
+
     if let Some(err) = err {
         working_set.error(err);
     }
@@ -5694,48 +5716,50 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
             idx += 1;
 
-            let bareword_error = |string_value: &Expression| {
-                working_set
-                    .get_span_contents(string_value.span)
-                    .iter()
-                    .find_position(|b| **b == b':')
-                    .map(|(i, _)| {
-                        let colon_position = i + string_value.span.start;
-                        ParseError::InvalidLiteral(
-                            "colon".to_string(),
-                            "bare word specifying record value".to_string(),
-                            Span::new(colon_position, colon_position + 1),
-                        )
-                    })
-            };
-            let value_span = working_set.get_span_contents(value.span);
-            let parse_error = match value.expr {
-                Expr::String(_) => {
-                    if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
-                        bareword_error(&value)
-                    } else {
-                        None
-                    }
-                }
-                Expr::StringInterpolation(ref expressions) => {
-                    if value_span[0] != b'$' {
-                        expressions
-                            .iter()
-                            .filter(|expr| matches!(expr.expr, Expr::String(_)))
-                            .filter_map(bareword_error)
-                            .next()
-                    } else {
-                        None
-                    }
-                }
-                _ => None,
-            };
-            let value = if let Some(parse_error) = parse_error {
-                working_set.error(parse_error);
-                garbage(working_set, value.span)
-            } else {
-                value
-            };
+            // Disallow colons in bare word values
+
+            // let bareword_error = |string_value: &Expression| {
+            //     working_set
+            //         .get_span_contents(string_value.span)
+            //         .iter()
+            //         .find_position(|b| **b == b':')
+            //         .map(|(i, _)| {
+            //             let colon_position = i + string_value.span.start;
+            //             ParseError::InvalidLiteral(
+            //                 "colon".to_string(),
+            //                 "bare word specifying record value".to_string(),
+            //                 Span::new(colon_position, colon_position + 1),
+            //             )
+            //         })
+            // };
+            // let value_span = working_set.get_span_contents(value.span);
+            // let parse_error = match value.expr {
+            //     Expr::String(_) => {
+            //         if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
+            //             bareword_error(&value)
+            //         } else {
+            //             None
+            //         }
+            //     }
+            //     Expr::StringInterpolation(ref expressions) => {
+            //         if value_span[0] != b'$' {
+            //             expressions
+            //                 .iter()
+            //                 .filter(|expr| matches!(expr.expr, Expr::String(_)))
+            //                 .filter_map(bareword_error)
+            //                 .next()
+            //         } else {
+            //             None
+            //         }
+            //     }
+            //     _ => None,
+            // };
+            // let value = if let Some(parse_error) = parse_error {
+            //     working_set.error(parse_error);
+            //     garbage(working_set, value.span)
+            // } else {
+            //     value
+            // };
 
             if let Some(field) = field.as_string() {
                 if let Some(fields) = &mut field_types {
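Patch 6 below brings the colon check back as a helper shared by key and value position, and additionally rejects tokens that are not plain items in either position. These are the shapes the new checks are meant to refuse (the same inputs the test cases in patch 7 pin down):

    b"{ :: x }"    // key is a bare colon
    b"{ a: x:y }"  // value is a bare word containing a colon
    b"{ ;: x }"    // key is not an Item token
    b"{ a: || }"   // value is not an Item token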
From cc980115fca1df6f9caca008d9eb89cd4107ca9b Mon Sep 17 00:00:00 2001
From: Kira
Date: Tue, 23 Jul 2024 18:21:40 +0200
Subject: [PATCH 6/8] reject non-item-tokens and bare words containing colons
 as record keys and values

---
 crates/nu-parser/src/parser.rs | 127 +++++++++++++++++++++------------
 1 file changed, 80 insertions(+), 47 deletions(-)

diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 3350364b1d..92db4205c6 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -5579,6 +5579,49 @@ pub fn parse_builtin_commands(
     }
 }
 
+fn check_record_key_or_value(
+    working_set: &StateWorkingSet,
+    expr: &Expression,
+    position: &str,
+) -> Option<ParseError> {
+    let bareword_error = |string_value: &Expression| {
+        working_set
+            .get_span_contents(string_value.span)
+            .iter()
+            .find_position(|b| **b == b':')
+            .map(|(i, _)| {
+                let colon_position = i + string_value.span.start;
+                ParseError::InvalidLiteral(
+                    "colon".to_string(),
+                    format!("bare word specifying record {}", position),
+                    Span::new(colon_position, colon_position + 1),
+                )
+            })
+    };
+    let value_span = working_set.get_span_contents(expr.span);
+    match expr.expr {
+        Expr::String(_) => {
+            if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
+                bareword_error(expr)
+            } else {
+                None
+            }
+        }
+        Expr::StringInterpolation(ref expressions) => {
+            if value_span[0] != b'$' {
+                expressions
+                    .iter()
+                    .filter(|expr| matches!(expr.expr, Expr::String(_)))
+                    .filter_map(bareword_error)
+                    .next()
+            } else {
+                None
+            }
+        }
+        _ => None,
+    }
+}
+
 pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression {
     let bytes = working_set.get_span_contents(span);
 
@@ -5668,7 +5711,22 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             ));
         } else {
             // Normal key-value pair
-            let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
+            let field_token = &tokens[idx];
+            let field = if field_token.contents != TokenContents::Item {
+                working_set.error(ParseError::Expected(
+                    "item in record key position",
+                    Span::new(field_token.span.start, field_token.span.end),
+                ));
+                garbage(working_set, curr_span)
+            } else {
+                let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
+                if let Some(error) = check_record_key_or_value(working_set, &field, "key") {
+                    working_set.error(error);
+                    garbage(working_set, field.span)
+                } else {
+                    field
+                }
+            };
 
             idx += 1;
             if idx == tokens.len() {
@@ -5716,9 +5774,30 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
                 ));
                 break;
             }
-            let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
+
+            let value_token = &tokens[idx];
+            let value = if value_token.contents != TokenContents::Item {
+                working_set.error(ParseError::Expected(
+                    "item in record value position",
+                    Span::new(value_token.span.start, value_token.span.end),
+                ));
+                garbage(
+                    working_set,
+                    Span::new(value_token.span.start, value_token.span.end),
+                )
+            } else {
+                let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
+                if let Some(parse_error) = check_record_key_or_value(&working_set, &value, "value")
+                {
+                    working_set.error(parse_error);
+                    garbage(working_set, value.span)
+                } else {
+                    value
+                }
+            };
             idx += 1;
 
             if let Some(field) = field.as_string() {
                 if let Some(fields) = &mut field_types {
                     fields.push((field, value.ty.clone()));

From b3bdd4ac6477959baf526c3e092fcc8a530ad315 Mon Sep 17 00:00:00 2001
From: Kira
Date: Wed, 24 Jul 2024 17:14:32 +0200
Subject: [PATCH 7/8] I really don't know what I'm doing with these tests,
 sorry

---
 crates/nu-parser/src/lex.rs           |  1 +
 crates/nu-parser/src/lib.rs           |  2 +-
 crates/nu-parser/src/parser.rs        |  3 +--
 crates/nu-parser/tests/test_lex.rs    | 25 ++++++++++++-
 crates/nu-parser/tests/test_parser.rs | 53 +++++++++++++++++++++++++++
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index 628b84414a..adcb239f4e 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -464,6 +464,7 @@ pub fn lex_signature(
     (state.output, state.error)
 }
 
+#[derive(Debug)]
 pub struct LexState<'a> {
     pub input: &'a [u8],
     pub output: Vec<Token>,
     pub error: Option<ParseError>,
     pub span_offset: usize,
 }
diff --git a/crates/nu-parser/src/lib.rs b/crates/nu-parser/src/lib.rs
index 8f871aa815..c5d69cb270 100644
--- a/crates/nu-parser/src/lib.rs
+++ b/crates/nu-parser/src/lib.rs
@@ -16,7 +16,7 @@ pub use flatten::{
     flatten_block, flatten_expression, flatten_pipeline, flatten_pipeline_element, FlatShape,
 };
 pub use known_external::KnownExternal;
-pub use lex::{lex, lex_signature, Token, TokenContents};
+pub use lex::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
 pub use lite_parser::{lite_parse, LiteBlock, LiteCommand};
 pub use nu_protocol::parser_path::*;
 pub use parse_keywords::*;
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index 92db4205c6..22a0f6ab64 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -5784,8 +5784,7 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             } else {
                 let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
-                if let Some(parse_error) = check_record_key_or_value(&working_set, &value, "value")
-                {
+                if let Some(parse_error) = check_record_key_or_value(working_set, &value, "value") {
                     working_set.error(parse_error);
                     garbage(working_set, value.span)
                 } else {
diff --git a/crates/nu-parser/tests/test_lex.rs b/crates/nu-parser/tests/test_lex.rs
index 07470a310e..22fe4c4715 100644
--- a/crates/nu-parser/tests/test_lex.rs
+++ b/crates/nu-parser/tests/test_lex.rs
@@ -1,4 +1,4 @@
-use nu_parser::{lex, lex_signature, Token, TokenContents};
+use nu_parser::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
 use nu_protocol::{ParseError, Span};
 
 #[test]
@@ -281,3 +281,26 @@ fn lex_comments() {
         }
     );
 }
+
+#[test]
+fn lex_manually() {
+    let file = b"'a'\n#comment\n#comment again\n| continue";
+    let mut lex_state = LexState {
+        input: file,
+        output: Vec::new(),
+        error: None,
+        span_offset: 10,
+    };
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), 1);
+    assert_eq!(lex_state.output.len(), 1);
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 5), 5);
+    assert_eq!(lex_state.output.len(), 6);
+    // Next token is the pipe.
+    // This shortens the output because it exhausts the input before it can
+    // compensate for the EOL tokens lost to the line continuation
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), -1);
+    assert_eq!(lex_state.output.len(), 5);
+    assert_eq!(file.len(), lex_state.span_offset - 10);
+    let last_span = lex_state.output.last().unwrap().span;
+    assert_eq!(&file[last_span.start - 10..last_span.end - 10], b"continue");
+}
diff --git a/crates/nu-parser/tests/test_parser.rs b/crates/nu-parser/tests/test_parser.rs
index 44cd79a04a..591d783eef 100644
--- a/crates/nu-parser/tests/test_parser.rs
+++ b/crates/nu-parser/tests/test_parser.rs
@@ -2397,3 +2397,56 @@ mod operator {
         );
     }
 }
+
+mod record {
+    use super::*;
+
+    use nu_protocol::ast::RecordItem;
+
+    #[rstest]
+    #[case(b"{ :: x }", "Invalid literal")] // Key is bare colon
+    #[case(b"{ a: x:y }", "Invalid literal")] // Value is bare word with colon
+    #[case(b"{ a: x('y'):z }", "Invalid literal")] // Value is bare string interpolation with colon
+    #[case(b"{ ;: x }", "Parse mismatch during operation.")] // Key is a non-item token
+    #[case(b"{ a: || }", "Parse mismatch during operation.")] // Value is a non-item token
+    fn refuse_confusing_record(#[case] expr: &[u8], #[case] error: &str) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        parse(&mut working_set, None, expr, false);
+        assert_eq!(
+            working_set.parse_errors.first().map(|e| e.to_string()),
+            Some(error.to_string())
+        );
+    }
+
+    #[rstest]
+    #[case(b"{ a: 2024-07-23T22:54:54.532100627+02:00 b:xy }")]
+    fn parse_datetime_in_record(#[case] expr: &[u8]) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        let block = parse(&mut working_set, None, expr, false);
+        assert!(working_set.parse_errors.first().is_none());
+        let pipeline_el_expr = &block
+            .pipelines
+            .first()
+            .unwrap()
+            .elements
+            .first()
+            .unwrap()
+            .expr
+            .expr;
+        dbg!(pipeline_el_expr);
+        match pipeline_el_expr {
+            Expr::FullCellPath(v) => match &v.head.expr {
+                Expr::Record(fields) => assert!(matches!(
+                    fields[0],
+                    RecordItem::Pair(_, Expression { ty: Type::Date, .. })
+                )),
+                _ => panic!("Expected record head"),
+            },
+            _ => panic!("Expected full cell path"),
+        }
+    }
+}
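The lex_manually test above pins down the subtle part of the resumable API: the return value of `lex_n_tokens` can be negative, because lexing a `|` line continuation can fold a previously emitted Eol token away, leaving the output shorter than before the call. A caller therefore has to treat the result as a signed delta, roughly like this (sketch, not code from the patch):

    let diff = lex_n_tokens(&mut state, &[], &[], false, 1);
    if diff < 0 {
        // a `|` continuation folded away an earlier Eol token
    } else if diff == 0 {
        // input exhausted; nothing was added
    } else {
        // `diff` new tokens were appended to `state.output`
    }

Patch 8 promotes exactly this caveat into a doc comment.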
From b26dd50006571b11296b5af50769d06f474a9707 Mon Sep 17 00:00:00 2001
From: Kira
Date: Fri, 2 Aug 2024 23:26:07 +0200
Subject: [PATCH 8/8] Add doc comment to `lex_n_tokens`

---
 crates/nu-parser/src/lex.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index adcb239f4e..f0802fcd7a 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -472,9 +472,11 @@ pub struct LexState<'a> {
     pub span_offset: usize,
 }
 
-// Lex until the output is max_tokens longer than before the call, or until the input is exhausted.
-// The behaviour here is non-obvious (maybe non-useful) iff your additional_whitespace doesn't include newline:
-// If you pass `output` in a state where the last token is an Eol, this might *remove* tokens.
+/// Lex until the output is `max_tokens` longer than before the call, or until the input is exhausted.
+/// The return value indicates how many tokens the call added to / removed from the output.
+///
+/// The behaviour here is non-obvious when `additional_whitespace` doesn't include newline:
+/// If you pass a `state` where the last token in the output is an Eol, this might *remove* tokens.
 pub fn lex_n_tokens(
     state: &mut LexState,
     additional_whitespace: &[u8],
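End-to-end, the series makes the following hold, as exercised by the tests in patch 7 (illustrative inputs; the first mirrors parse_datetime_in_record, the second refuse_confusing_record):

    b"{ a: 2024-07-23T22:54:54.532100627+02:00 b:xy }" // parses; field `a` has Type::Date
    b"{ a: x:y }"                                      // rejected with "Invalid literal"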