diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index b7c8ccc118..628b84414a 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -447,33 +447,58 @@ pub fn lex_signature(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         true,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
-pub fn lex_alternating_special_tokens(
-    input: &[u8],
-    span_offset: usize,
+pub struct LexState<'a> {
+    pub input: &'a [u8],
+    pub output: Vec<Token>,
+    pub error: Option<ParseError>,
+    pub span_offset: usize,
+}
+
+// Lex until the output is max_tokens longer than before the call, or until the input is exhausted.
+// The behaviour here is non-obvious (maybe non-useful) iff your additional_whitespace doesn't include newline:
+// If you pass `output` in a state where the last token is an Eol, this might *remove* tokens.
+pub fn lex_n_tokens(
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
-) -> (Vec<Token>, Option<ParseError>) {
+    max_tokens: usize,
+) -> isize {
+    let n_tokens = state.output.len();
     lex_internal(
-        input,
-        span_offset,
+        state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        true,
-    )
+        Some(max_tokens),
+    );
+    // If this lex_internal call reached the end of the input, there may now be fewer tokens
+    // in the output than before.
+    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
+    let next_offset = state.output.last().map(|token| token.span.end);
+    if let Some(next_offset) = next_offset {
+        state.input = &state.input[next_offset - state.span_offset..];
+        state.span_offset = next_offset;
+    }
+    tokens_n_diff
 }
 
 pub fn lex(
@@ -483,39 +508,43 @@ pub fn lex(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
 fn lex_internal(
-    input: &[u8],
-    span_offset: usize,
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
     // within signatures we want to treat `<` and `>` specially
     in_signature: bool,
-    // after lexing a special item, disable special items when lexing the next item.
-    // necessary because colons are special in records, but datetime literals may contain colons
-    alternate_specials: bool,
-) -> (Vec<Token>, Option<ParseError>) {
-    let mut specials_disabled = false;
-
-    let mut error = None;
+    max_tokens: Option<usize>,
+) {
+    let initial_output_len = state.output.len();
 
     let mut curr_offset = 0;
 
-    let mut output = vec![];
     let mut is_complete = true;
-
-    while let Some(c) = input.get(curr_offset) {
+    while let Some(c) = state.input.get(curr_offset) {
+        if max_tokens
+            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
+        {
+            break;
+        }
         let c = *c;
         if c == b'|' {
             // If the next character is `|`, it's either `|` or `||`.
@@ -524,13 +553,13 @@ fn lex_internal(
             curr_offset += 1;
             // If the next character is `|`, we're looking at a `||`.
- if let Some(c) = input.get(curr_offset) { + if let Some(c) = state.input.get(curr_offset) { if *c == b'|' { let idx = curr_offset; curr_offset += 1; - output.push(Token::new( + state.output.push(Token::new( TokenContents::PipePipe, - Span::new(span_offset + prev_idx, span_offset + idx + 1), + Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1), )); continue; } @@ -540,12 +569,12 @@ fn lex_internal( // Before we push, check to see if the previous character was a newline. // If so, then this is a continuation of the previous line - if let Some(prev) = output.last_mut() { + if let Some(prev) = state.output.last_mut() { match prev.contents { TokenContents::Eol => { *prev = Token::new( TokenContents::Pipe, - Span::new(span_offset + idx, span_offset + idx + 1), + Span::new(state.span_offset + idx, state.span_offset + idx + 1), ); // And this is a continuation of the previous line if previous line is a // comment line (combined with EOL + Comment) @@ -553,12 +582,12 @@ fn lex_internal( // Initially, the last one token is TokenContents::Pipe, we don't need to // check it, so the beginning offset is 2. 
let mut offset = 2; - while output.len() > offset { - let index = output.len() - offset; - if output[index].contents == TokenContents::Comment - && output[index - 1].contents == TokenContents::Eol + while state.output.len() > offset { + let index = state.output.len() - offset; + if state.output[index].contents == TokenContents::Comment + && state.output[index - 1].contents == TokenContents::Eol { - output.remove(index - 1); + state.output.remove(index - 1); offset += 1; } else { break; @@ -566,16 +595,16 @@ fn lex_internal( } } _ => { - output.push(Token::new( + state.output.push(Token::new( TokenContents::Pipe, - Span::new(span_offset + idx, span_offset + idx + 1), + Span::new(state.span_offset + idx, state.span_offset + idx + 1), )); } } } else { - output.push(Token::new( + state.output.push(Token::new( TokenContents::Pipe, - Span::new(span_offset + idx, span_offset + idx + 1), + Span::new(state.span_offset + idx, state.span_offset + idx + 1), )); } @@ -583,17 +612,17 @@ fn lex_internal( } else if c == b';' { // If the next character is a `;`, we're looking at a semicolon token. 
- if !is_complete && error.is_none() { - error = Some(ParseError::ExtraTokens(Span::new( + if !is_complete && state.error.is_none() { + state.error = Some(ParseError::ExtraTokens(Span::new( curr_offset, curr_offset + 1, ))); } let idx = curr_offset; curr_offset += 1; - output.push(Token::new( + state.output.push(Token::new( TokenContents::Semicolon, - Span::new(span_offset + idx, span_offset + idx + 1), + Span::new(state.span_offset + idx, state.span_offset + idx + 1), )); } else if c == b'\r' { // Ignore a stand-alone carriage return @@ -603,9 +632,9 @@ fn lex_internal( let idx = curr_offset; curr_offset += 1; if !additional_whitespace.contains(&c) { - output.push(Token::new( + state.output.push(Token::new( TokenContents::Eol, - Span::new(span_offset + idx, span_offset + idx + 1), + Span::new(state.span_offset + idx, state.span_offset + idx + 1), )); } } else if c == b'#' { @@ -613,12 +642,12 @@ fn lex_internal( // comment. The comment continues until the next newline. let mut start = curr_offset; - while let Some(input) = input.get(curr_offset) { + while let Some(input) = state.input.get(curr_offset) { if *input == b'\n' { if !skip_comment { - output.push(Token::new( + state.output.push(Token::new( TokenContents::Comment, - Span::new(span_offset + start, span_offset + curr_offset), + Span::new(state.span_offset + start, state.span_offset + curr_offset), )); } start = curr_offset; @@ -629,48 +658,30 @@ fn lex_internal( } } if start != curr_offset && !skip_comment { - output.push(Token::new( + state.output.push(Token::new( TokenContents::Comment, - Span::new(span_offset + start, span_offset + curr_offset), + Span::new(state.span_offset + start, state.span_offset + curr_offset), )); } } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) { // If the next character is non-newline whitespace, skip it. 
curr_offset += 1; - } else if alternate_specials && !specials_disabled && special_tokens.contains(&c) { - // If disabling special items but if they're not currently disabled, handle a special item - // character right here, bypassing lex_item - output.push(Token::new( - TokenContents::Item, - Span::new(span_offset + curr_offset, span_offset + curr_offset + 1), - )); - curr_offset += 1; - specials_disabled = true; } else { - let special_tokens = if specials_disabled { - &[] - } else { - special_tokens - }; let (token, err) = lex_item( - input, + state.input, &mut curr_offset, - span_offset, + state.span_offset, additional_whitespace, special_tokens, in_signature, ); - if error.is_none() { - error = err; + if state.error.is_none() { + state.error = err; } is_complete = true; - if token.contents == TokenContents::Item { - specials_disabled = false; - } - output.push(token); + state.output.push(token); } } - (output, error) } /// True if this the start of a redirection. Does not match `>>` or `>|` forms. 
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs index 380d8744f8..3350364b1d 100644 --- a/crates/nu-parser/src/parser.rs +++ b/crates/nu-parser/src/parser.rs @@ -1,5 +1,5 @@ use crate::{ - lex::{is_assignment_operator, lex, lex_alternating_special_tokens, lex_signature}, + lex::{is_assignment_operator, lex, lex_n_tokens, lex_signature, LexState}, lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget}, parse_keywords::*, parse_patterns::parse_pattern, @@ -5599,10 +5599,32 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression } let inner_span = Span::new(start, end); - let source = working_set.get_span_contents(inner_span); - let (tokens, err) = - lex_alternating_special_tokens(source, start, &[b'\n', b'\r', b','], &[b':'], true); + let mut lex_state = LexState { + input: working_set.get_span_contents(inner_span), + output: Vec::new(), + error: None, + span_offset: start, + }; + let mut lex_n = |additional_whitespace, special_tokens, max_tokens| { + lex_n_tokens( + &mut lex_state, + additional_whitespace, + special_tokens, + true, + max_tokens, + ) + }; + loop { + if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 { + break; + }; + if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 { + break; + }; + } + let (tokens, err) = (lex_state.output, lex_state.error); + if let Some(err) = err { working_set.error(err); } @@ -5694,48 +5716,50 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any); idx += 1; - let bareword_error = |string_value: &Expression| { - working_set - .get_span_contents(string_value.span) - .iter() - .find_position(|b| **b == b':') - .map(|(i, _)| { - let colon_position = i + string_value.span.start; - ParseError::InvalidLiteral( - "colon".to_string(), - "bare word specifying record value".to_string(), - Span::new(colon_position, colon_position + 1), - ) - }) - }; - 
let value_span = working_set.get_span_contents(value.span); - let parse_error = match value.expr { - Expr::String(_) => { - if ![b'"', b'\'', b'`'].contains(&value_span[0]) { - bareword_error(&value) - } else { - None - } - } - Expr::StringInterpolation(ref expressions) => { - if value_span[0] != b'$' { - expressions - .iter() - .filter(|expr| matches!(expr.expr, Expr::String(_))) - .filter_map(bareword_error) - .next() - } else { - None - } - } - _ => None, - }; - let value = if let Some(parse_error) = parse_error { - working_set.error(parse_error); - garbage(working_set, value.span) - } else { - value - }; + // Disallow colons in bare word values + + // let bareword_error = |string_value: &Expression| { + // working_set + // .get_span_contents(string_value.span) + // .iter() + // .find_position(|b| **b == b':') + // .map(|(i, _)| { + // let colon_position = i + string_value.span.start; + // ParseError::InvalidLiteral( + // "colon".to_string(), + // "bare word specifying record value".to_string(), + // Span::new(colon_position, colon_position + 1), + // ) + // }) + // }; + // let value_span = working_set.get_span_contents(value.span); + // let parse_error = match value.expr { + // Expr::String(_) => { + // if ![b'"', b'\'', b'`'].contains(&value_span[0]) { + // bareword_error(&value) + // } else { + // None + // } + // } + // Expr::StringInterpolation(ref expressions) => { + // if value_span[0] != b'$' { + // expressions + // .iter() + // .filter(|expr| matches!(expr.expr, Expr::String(_))) + // .filter_map(bareword_error) + // .next() + // } else { + // None + // } + // } + // _ => None, + // }; + // let value = if let Some(parse_error) = parse_error { + // working_set.error(parse_error); + // garbage(working_set, value.span) + // } else { + // value + // }; if let Some(field) = field.as_string() { if let Some(fields) = &mut field_types {