new attempt

Kira committed 2024-07-23 02:37:17 +02:00
commit 4da348f7ca, parent 26c4bbfa41
2 changed files with 155 additions and 120 deletions

crates/nu-parser/src/lex.rs

@@ -447,33 +447,58 @@ pub fn lex_signature(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         true,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
-pub fn lex_alternating_special_tokens(
-    input: &[u8],
-    span_offset: usize,
+pub struct LexState<'a> {
+    pub input: &'a [u8],
+    pub output: Vec<Token>,
+    pub error: Option<ParseError>,
+    pub span_offset: usize,
+}
+
+// Lex until the output is max_tokens longer than before the call, or until the input is exhausted.
+// The behaviour here is non-obvious (maybe non-useful) iff your additional_whitespace doesn't include newline:
+// If you pass `output` in a state where the last token is an Eol, this might *remove* tokens.
+pub fn lex_n_tokens(
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
-) -> (Vec<Token>, Option<ParseError>) {
+    max_tokens: usize,
+) -> isize {
+    let n_tokens = state.output.len();
     lex_internal(
-        input,
-        span_offset,
+        state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        true,
-    )
+        Some(max_tokens),
+    );
+    // If this lex_internal call reached the end of the input, there may now be fewer tokens
+    // in the output than before.
+    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
+    let next_offset = state.output.last().map(|token| token.span.end);
+    if let Some(next_offset) = next_offset {
+        state.input = &state.input[next_offset - state.span_offset..];
+        state.span_offset = next_offset;
+    }
+    tokens_n_diff
 }
 
 pub fn lex(
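A minimal sketch of how the new incremental API is meant to be driven (not part of the commit; the import path and the exact token expectations are assumptions based on the code above). Each call lexes at most max_tokens more tokens, then advances state.input and state.span_offset past the last token produced:

    use nu_parser::{lex_n_tokens, LexState}; // assumed re-exports; the commit only shows them as pub in the lex module

    fn main() {
        let mut state = LexState {
            input: b"created: 2024-07-23T02:37:17+02:00",
            output: Vec::new(),
            error: None,
            span_offset: 0,
        };
        // Alternate: lex two tokens (key and `:`) with `:` special, then one
        // token (the value) with no specials, so colons inside the value do
        // not split it. This mirrors the parse_record loop further below.
        loop {
            if lex_n_tokens(&mut state, &[b'\n', b'\r', b','], &[b':'], true, 2) < 2 {
                break;
            }
            if lex_n_tokens(&mut state, &[b'\n', b'\r', b','], &[], true, 1) < 1 {
                break;
            }
        }
        // Expected: Item("created"), Item(":"), and a single Item spanning the
        // whole datetime, since `:` was not special while the value was lexed.
        assert_eq!(state.output.len(), 3);
    }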
@@ -483,39 +508,43 @@ pub fn lex(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-        false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
 fn lex_internal(
-    input: &[u8],
-    span_offset: usize,
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
     // within signatures we want to treat `<` and `>` specially
     in_signature: bool,
-    // after lexing a special item, disable special items when lexing the next item.
-    // necessary because colons are special in records, but datetime literals may contain colons
-    alternate_specials: bool,
-) -> (Vec<Token>, Option<ParseError>) {
-    let mut specials_disabled = false;
-    let mut error = None;
+    max_tokens: Option<usize>,
+) {
+    let initial_output_len = state.output.len();
     let mut curr_offset = 0;
-    let mut output = vec![];
     let mut is_complete = true;
-    while let Some(c) = input.get(curr_offset) {
+    while let Some(c) = state.input.get(curr_offset) {
+        if max_tokens
+            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
+        {
+            break;
+        }
         let c = *c;
         if c == b'|' {
             // If the next character is `|`, it's either `|` or `||`.
@@ -524,13 +553,13 @@ fn lex_internal(
             curr_offset += 1;
 
             // If the next character is `|`, we're looking at a `||`.
-            if let Some(c) = input.get(curr_offset) {
+            if let Some(c) = state.input.get(curr_offset) {
                 if *c == b'|' {
                     let idx = curr_offset;
                     curr_offset += 1;
-                    output.push(Token::new(
+                    state.output.push(Token::new(
                         TokenContents::PipePipe,
-                        Span::new(span_offset + prev_idx, span_offset + idx + 1),
+                        Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
                     ));
                     continue;
                 }
@@ -540,12 +569,12 @@ fn lex_internal(
 
             // Before we push, check to see if the previous character was a newline.
             // If so, then this is a continuation of the previous line
-            if let Some(prev) = output.last_mut() {
+            if let Some(prev) = state.output.last_mut() {
                 match prev.contents {
                     TokenContents::Eol => {
                         *prev = Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         );
                         // And this is a continuation of the previous line if previous line is a
                         // comment line (combined with EOL + Comment)
@@ -553,12 +582,12 @@ fn lex_internal(
                        // Initially, the last one token is TokenContents::Pipe, we don't need to
                        // check it, so the beginning offset is 2.
                        let mut offset = 2;
-                        while output.len() > offset {
-                            let index = output.len() - offset;
-                            if output[index].contents == TokenContents::Comment
-                                && output[index - 1].contents == TokenContents::Eol
+                        while state.output.len() > offset {
+                            let index = state.output.len() - offset;
+                            if state.output[index].contents == TokenContents::Comment
+                                && state.output[index - 1].contents == TokenContents::Eol
                             {
-                                output.remove(index - 1);
+                                state.output.remove(index - 1);
                                 offset += 1;
                             } else {
                                 break;
@@ -566,16 +595,16 @@ fn lex_internal(
                            }
                        }
                    }
                    _ => {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         ));
                     }
                 }
             } else {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Pipe,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                 ));
             }
@@ -583,17 +612,17 @@ fn lex_internal(
         } else if c == b';' {
             // If the next character is a `;`, we're looking at a semicolon token.
 
-            if !is_complete && error.is_none() {
-                error = Some(ParseError::ExtraTokens(Span::new(
+            if !is_complete && state.error.is_none() {
+                state.error = Some(ParseError::ExtraTokens(Span::new(
                     curr_offset,
                     curr_offset + 1,
                 )));
             }
             let idx = curr_offset;
             curr_offset += 1;
-            output.push(Token::new(
+            state.output.push(Token::new(
                 TokenContents::Semicolon,
-                Span::new(span_offset + idx, span_offset + idx + 1),
+                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
             ));
         } else if c == b'\r' {
             // Ignore a stand-alone carriage return
@@ -603,9 +632,9 @@ fn lex_internal(
             let idx = curr_offset;
             curr_offset += 1;
             if !additional_whitespace.contains(&c) {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Eol,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                 ));
             }
         } else if c == b'#' {
@@ -613,12 +642,12 @@ fn lex_internal(
             // comment. The comment continues until the next newline.
             let mut start = curr_offset;
 
-            while let Some(input) = input.get(curr_offset) {
+            while let Some(input) = state.input.get(curr_offset) {
                 if *input == b'\n' {
                     if !skip_comment {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Comment,
-                            Span::new(span_offset + start, span_offset + curr_offset),
+                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
                         ));
                     }
                     start = curr_offset;
@@ -629,48 +658,30 @@ fn lex_internal(
                }
            }
            if start != curr_offset && !skip_comment {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Comment,
-                    Span::new(span_offset + start, span_offset + curr_offset),
+                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
                 ));
             }
         } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
             // If the next character is non-newline whitespace, skip it.
             curr_offset += 1;
-        } else if alternate_specials && !specials_disabled && special_tokens.contains(&c) {
-            // If disabling special items but if they're not currently disabled, handle a special item
-            // character right here, bypassing lex_item
-            output.push(Token::new(
-                TokenContents::Item,
-                Span::new(span_offset + curr_offset, span_offset + curr_offset + 1),
-            ));
-            curr_offset += 1;
-            specials_disabled = true;
         } else {
-            let special_tokens = if specials_disabled {
-                &[]
-            } else {
-                special_tokens
-            };
             let (token, err) = lex_item(
-                input,
+                state.input,
                 &mut curr_offset,
-                span_offset,
+                state.span_offset,
                 additional_whitespace,
                 special_tokens,
                 in_signature,
             );
-            if error.is_none() {
-                error = err;
+            if state.error.is_none() {
+                state.error = err;
             }
             is_complete = true;
-            if token.contents == TokenContents::Item {
-                specials_disabled = false;
-            }
-            output.push(token);
+            state.output.push(token);
         }
     }
-    (output, error)
 }
 
 /// True if this the start of a redirection. Does not match `>>` or `>|` forms.
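The Eol caveat in the lex_n_tokens comment comes from the line-continuation handling in the hunks above: a `|` at the start of a line rewrites the previous Eol token into a Pipe in place (and may remove Eol tokens in front of comment lines), so an incremental call can grow the output by fewer tokens than requested, or even shrink it. A small sketch with the plain lex entry point (not part of the commit; import paths assumed):

    use nu_parser::{lex, TokenContents}; // assumed re-exports

    fn main() {
        // The `|` opening the second line turns the previous Eol token into a
        // Pipe (line continuation) instead of pushing a new token.
        let (tokens, err) = lex(b"a\n| b", 0, &[], &[], true);
        assert!(err.is_none());
        let contents: Vec<_> = tokens.iter().map(|t| t.contents).collect();
        assert_eq!(
            contents,
            vec![TokenContents::Item, TokenContents::Pipe, TokenContents::Item]
        );
    }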

crates/nu-parser/src/parser.rs

@@ -1,5 +1,5 @@
 use crate::{
-    lex::{is_assignment_operator, lex, lex_alternating_special_tokens, lex_signature},
+    lex::{is_assignment_operator, lex, lex_n_tokens, lex_signature, LexState},
     lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
     parse_keywords::*,
     parse_patterns::parse_pattern,
@@ -5599,10 +5599,32 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
     }
 
     let inner_span = Span::new(start, end);
-    let source = working_set.get_span_contents(inner_span);
 
-    let (tokens, err) =
-        lex_alternating_special_tokens(source, start, &[b'\n', b'\r', b','], &[b':'], true);
+    let mut lex_state = LexState {
+        input: working_set.get_span_contents(inner_span),
+        output: Vec::new(),
+        error: None,
+        span_offset: start,
+    };
+    let mut lex_n = |additional_whitespace, special_tokens, max_tokens| {
+        lex_n_tokens(
+            &mut lex_state,
+            additional_whitespace,
+            special_tokens,
+            true,
+            max_tokens,
+        )
+    };
+    loop {
+        if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
+            break;
+        };
+        if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
+            break;
+        };
+    }
+    let (tokens, err) = (lex_state.output, lex_state.error);
     if let Some(err) = err {
         working_set.error(err);
     }
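The 2-then-1 alternation here replaces the old alternate_specials flag: `:` must be special to split a key from its value, but must not be special while the value itself is lexed, so that values such as datetime literals keep their colons. A hypothetical check of that difference (not part of the commit; import path and token counts assumed):

    use nu_parser::lex; // assumed re-export

    fn main() {
        // With `:` special, a time-like bare word splits at every colon ...
        let (tokens, _) = lex(b"02:37:17", 0, &[], &[b':'], true);
        assert_eq!(tokens.len(), 5); // "02", ":", "37", ":", "17"
        // ... which is why the loop above lexes each value with no specials.
        let (tokens, _) = lex(b"02:37:17", 0, &[], &[], true);
        assert_eq!(tokens.len(), 1);
    }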
@@ -5694,48 +5716,50 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
         let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
         idx += 1;
 
-        let bareword_error = |string_value: &Expression| {
-            working_set
-                .get_span_contents(string_value.span)
-                .iter()
-                .find_position(|b| **b == b':')
-                .map(|(i, _)| {
-                    let colon_position = i + string_value.span.start;
-                    ParseError::InvalidLiteral(
-                        "colon".to_string(),
-                        "bare word specifying record value".to_string(),
-                        Span::new(colon_position, colon_position + 1),
-                    )
-                })
-        };
-        let value_span = working_set.get_span_contents(value.span);
-        let parse_error = match value.expr {
-            Expr::String(_) => {
-                if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
-                    bareword_error(&value)
-                } else {
-                    None
-                }
-            }
-            Expr::StringInterpolation(ref expressions) => {
-                if value_span[0] != b'$' {
-                    expressions
-                        .iter()
-                        .filter(|expr| matches!(expr.expr, Expr::String(_)))
-                        .filter_map(bareword_error)
-                        .next()
-                } else {
-                    None
-                }
-            }
-            _ => None,
-        };
-        let value = if let Some(parse_error) = parse_error {
-            working_set.error(parse_error);
-            garbage(working_set, value.span)
-        } else {
-            value
-        };
+        // Disallow colons in bare word values
+
+        // let bareword_error = |string_value: &Expression| {
+        //     working_set
+        //         .get_span_contents(string_value.span)
+        //         .iter()
+        //         .find_position(|b| **b == b':')
+        //         .map(|(i, _)| {
+        //             let colon_position = i + string_value.span.start;
+        //             ParseError::InvalidLiteral(
+        //                 "colon".to_string(),
+        //                 "bare word specifying record value".to_string(),
+        //                 Span::new(colon_position, colon_position + 1),
+        //             )
+        //         })
+        // };
+        // let value_span = working_set.get_span_contents(value.span);
+        // let parse_error = match value.expr {
+        //     Expr::String(_) => {
+        //         if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
+        //             bareword_error(&value)
+        //         } else {
+        //             None
+        //         }
+        //     }
+        //     Expr::StringInterpolation(ref expressions) => {
+        //         if value_span[0] != b'$' {
+        //             expressions
+        //                 .iter()
+        //                 .filter(|expr| matches!(expr.expr, Expr::String(_)))
+        //                 .filter_map(bareword_error)
+        //                 .next()
+        //         } else {
+        //             None
+        //         }
+        //     }
+        //     _ => None,
+        // };
+        // let value = if let Some(parse_error) = parse_error {
+        //     working_set.error(parse_error);
+        //     garbage(working_set, value.span)
+        // } else {
+        //     value
+        // };
 
         if let Some(field) = field.as_string() {
             if let Some(fields) = &mut field_types {
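The block disabled above rejected bare-word record values containing a colon. For reference, a standalone restatement of the scan it performed (hypothetical helper, not part of the commit; the original used itertools' find_position over the value's span contents):

    // Hypothetical sketch: locate the first colon in a bare word's bytes and
    // report its absolute position, as colon_position did above.
    fn first_colon(bytes: &[u8], span_start: usize) -> Option<usize> {
        bytes
            .iter()
            .position(|b| *b == b':')
            .map(|i| i + span_start) // absolute offset within the source
    }

    fn main() {
        assert_eq!(first_colon(b"02:37:17", 100), Some(102));
    }

With the check commented out, a colon-bearing bare-word value such as a datetime literal is instead handled by the 2-then-1 lexing loop above, which keeps it as a single value token.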