From ea9ca8b4ed5d0e45be533c541bd5013c69ecf8c1 Mon Sep 17 00:00:00 2001 From: Leon Date: Fri, 20 Jan 2023 17:16:18 +1000 Subject: [PATCH] `str length`, `str substring`, `str index-of`, `split words` and `split chars` now use graphemes instead of UTF-8 bytes (#7752) Closes https://github.com/nushell/nushell/issues/7742 --- crates/nu-command/src/strings/mod.rs | 25 ++++ crates/nu-command/src/strings/split/chars.rs | 73 ++++++++---- crates/nu-command/src/strings/split/words.rs | 59 +++++++++- .../nu-command/src/strings/str_/index_of.rs | 108 ++++++++++++------ crates/nu-command/src/strings/str_/length.rs | 65 ++++++++++- .../nu-command/src/strings/str_/substring.rs | 62 +++++++++- 6 files changed, 323 insertions(+), 69 deletions(-) diff --git a/crates/nu-command/src/strings/mod.rs b/crates/nu-command/src/strings/mod.rs index 317bb43cb0..86e5e61c23 100644 --- a/crates/nu-command/src/strings/mod.rs +++ b/crates/nu-command/src/strings/mod.rs @@ -15,3 +15,28 @@ pub use parse::*; pub use size::Size; pub use split::*; pub use str_::*; + +use nu_protocol::{ast::Call, ShellError}; + +// For handling the grapheme_cluster related flags on some commands. +// This ensures the error messages are consistent. +pub fn grapheme_flags(call: &Call) -> Result { + let g_flag = call.has_flag("grapheme-clusters"); + // Check for the other flags and produce errors if they exist. + // Note that Nushell already prevents nonexistant flags from being used with commands, + // so this function can be reused for both the --utf-8-bytes commands and the --code-points commands. + if g_flag && call.has_flag("utf-8-bytes") { + Err(ShellError::IncompatibleParametersSingle( + "Incompatible flags: --grapheme-clusters (-g) and --utf-8-bytes (-b)".to_string(), + call.head, + ))? + } + if g_flag && call.has_flag("code-points") { + Err(ShellError::IncompatibleParametersSingle( + "Incompatible flags: --grapheme-clusters (-g) and --utf-8-bytes (-b)".to_string(), + call.head, + ))? + } + // Grapheme cluster usage is decided by the non-default -g flag + Ok(g_flag) +} diff --git a/crates/nu-command/src/strings/split/chars.rs b/crates/nu-command/src/strings/split/chars.rs index e2f82abe8a..4c7a95c57b 100644 --- a/crates/nu-command/src/strings/split/chars.rs +++ b/crates/nu-command/src/strings/split/chars.rs @@ -1,8 +1,10 @@ +use crate::grapheme_flags; use nu_protocol::{ ast::Call, engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, Type, Value, }; +use unicode_segmentation::UnicodeSegmentation; #[derive(Clone)] pub struct SubCommand; @@ -15,6 +17,12 @@ impl Command for SubCommand { fn signature(&self) -> Signature { Signature::build("split chars") .input_output_types(vec![(Type::String, Type::List(Box::new(Type::String)))]) + .switch("grapheme-clusters", "split on grapheme clusters", Some('g')) + .switch( + "code-points", + "split on code points (default; splits combined characters)", + Some('c'), + ) .vectorizes_over_list(true) .category(Category::Strings) } @@ -28,20 +36,34 @@ impl Command for SubCommand { } fn examples(&self) -> Vec { - vec![Example { - description: "Split the string into a list of characters", - example: "'hello' | split chars", - result: Some(Value::List { - vals: vec![ - Value::test_string("h"), - Value::test_string("e"), - Value::test_string("l"), - Value::test_string("l"), - Value::test_string("o"), - ], - span: Span::test_data(), - }), - }] + vec![ + Example { + description: "Split the string into a list of characters", + example: "'hello' | split chars", + result: Some(Value::List { + vals: vec![ + Value::test_string("h"), + Value::test_string("e"), + Value::test_string("l"), + Value::test_string("l"), + Value::test_string("o"), + ], + span: Span::test_data(), + }), + }, + Example { + description: "Split on grapheme clusters", + example: "'๐Ÿ‡ฏ๐Ÿ‡ตใปใ’' | split chars -g", + result: Some(Value::List { + vals: vec![ + Value::test_string("๐Ÿ‡ฏ๐Ÿ‡ต"), + Value::test_string("ใป"), + Value::test_string("ใ’"), + ], + span: Span::test_data(), + }), + }, + ] } fn run( @@ -62,21 +84,30 @@ fn split_chars( ) -> Result { let span = call.head; + let graphemes = grapheme_flags(call)?; input.flat_map( - move |x| split_chars_helper(&x, span), + move |x| split_chars_helper(&x, span, graphemes), engine_state.ctrlc.clone(), ) } -fn split_chars_helper(v: &Value, name: Span) -> Vec { +fn split_chars_helper(v: &Value, name: Span, graphemes: bool) -> Vec { match v.span() { Ok(v_span) => { if let Ok(s) = v.as_string() { - s.chars() - .collect::>() - .into_iter() - .map(move |x| Value::string(x, v_span)) - .collect() + if graphemes { + s.graphemes(true) + .collect::>() + .into_iter() + .map(move |x| Value::string(x, v_span)) + .collect() + } else { + s.chars() + .collect::>() + .into_iter() + .map(move |x| Value::string(x, v_span)) + .collect() + } } else { vec![Value::Error { error: ShellError::PipelineMismatch("string".into(), name, v_span), diff --git a/crates/nu-command/src/strings/split/words.rs b/crates/nu-command/src/strings/split/words.rs index 6439ba9bb0..dfc56e68f8 100644 --- a/crates/nu-command/src/strings/split/words.rs +++ b/crates/nu-command/src/strings/split/words.rs @@ -1,3 +1,4 @@ +use crate::grapheme_flags; use fancy_regex::Regex; use nu_engine::CallExt; use nu_protocol::{ @@ -5,6 +6,7 @@ use nu_protocol::{ engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, }; +use unicode_segmentation::UnicodeSegmentation; #[derive(Clone)] pub struct SubCommand; @@ -40,6 +42,16 @@ impl Command for SubCommand { "The minimum word length", Some('l'), ) + .switch( + "grapheme-clusters", + "measure word length in grapheme clusters (requires -l)", + Some('g'), + ) + .switch( + "utf-8-bytes", + "measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)", + Some('b'), + ) } fn usage(&self) -> &str { @@ -105,13 +117,34 @@ fn split_words( // let ignore_punctuation = call.has_flag("ignore-punctuation"); let word_length: Option = call.get_flag(engine_state, stack, "min-word-length")?; + if matches!(word_length, None) { + if call.has_flag("grapheme-clusters") { + return Err(ShellError::IncompatibleParametersSingle( + "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(), + span, + )); + } + if call.has_flag("utf-8-bytes") { + return Err(ShellError::IncompatibleParametersSingle( + "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(), + span, + )); + } + } + let graphemes = grapheme_flags(call)?; + input.flat_map( - move |x| split_words_helper(&x, word_length, span), + move |x| split_words_helper(&x, word_length, span, graphemes), engine_state.ctrlc.clone(), ) } -fn split_words_helper(v: &Value, word_length: Option, span: Span) -> Vec { +fn split_words_helper( + v: &Value, + word_length: Option, + span: Span, + graphemes: bool, +) -> Vec { // There are some options here with this regex. // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes @@ -132,7 +165,12 @@ fn split_words_helper(v: &Value, word_length: Option, span: Span) -> Vec< .filter_map(|s| { if s.trim() != "" { if let Some(len) = word_length { - if s.chars().count() >= len { + if if graphemes { + s.graphemes(true).count() + } else { + s.len() + } >= len + { Some(Value::string(s, v_span)) } else { None @@ -144,7 +182,7 @@ fn split_words_helper(v: &Value, word_length: Option, span: Span) -> Vec< None } }) - .collect() + .collect::>() } else { vec![Value::Error { error: ShellError::PipelineMismatch("string".into(), span, v_span), @@ -319,6 +357,19 @@ fn split_words_helper(v: &Value, word_length: Option, span: Span) -> Vec< #[cfg(test)] mod test { use super::*; + use nu_test_support::{nu, pipeline}; + + #[test] + fn test_incompat_flags() { + let out = nu!(cwd: ".", pipeline("'a' | split words -bg -l 2")); + assert!(out.err.contains("incompatible_parameters")); + } + + #[test] + fn test_incompat_flags_2() { + let out = nu!(cwd: ".", pipeline("'a' | split words -g")); + assert!(out.err.contains("incompatible_parameters")); + } #[test] fn test_examples() { diff --git a/crates/nu-command/src/strings/str_/index_of.rs b/crates/nu-command/src/strings/str_/index_of.rs index 991b1a5d8b..c08a45c93b 100644 --- a/crates/nu-command/src/strings/str_/index_of.rs +++ b/crates/nu-command/src/strings/str_/index_of.rs @@ -1,3 +1,4 @@ +use crate::grapheme_flags; use crate::input_handler::{operate, CmdArgument}; use nu_engine::CallExt; use nu_protocol::ast::Call; @@ -6,12 +7,14 @@ use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::Category; use nu_protocol::Spanned; use nu_protocol::{Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value}; +use unicode_segmentation::UnicodeSegmentation; struct Arguments { end: bool, substring: String, range: Option, cell_paths: Option>, + graphemes: bool, } impl CmdArgument for Arguments { @@ -35,7 +38,17 @@ impl Command for SubCommand { Signature::build("str index-of") .input_output_types(vec![(Type::String, Type::Int)]) .vectorizes_over_list(true) // TODO: no test coverage - .required("string", SyntaxShape::String, "the string to find index of") + .required("string", SyntaxShape::String, "the string to find in the input") + .switch( + "grapheme-clusters", + "count indexes using grapheme clusters (all visible chars have length 1)", + Some('g'), + ) + .switch( + "utf-8-bytes", + "count indexes using UTF-8 bytes (default; non-ASCII chars have length 2+)", + Some('b'), + ) .rest( "rest", SyntaxShape::CellPath, @@ -74,6 +87,7 @@ impl Command for SubCommand { range: call.get_flag(engine_state, stack, "range")?, end: call.has_flag("end"), cell_paths, + graphemes: grapheme_flags(call)?, }; operate(action, args, input, call.head, engine_state.ctrlc.clone()) } @@ -85,6 +99,11 @@ impl Command for SubCommand { example: " 'my_library.rb' | str index-of '.rb'", result: Some(Value::test_int(10)), }, + Example { + description: "Count length using grapheme clusters", + example: "'๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ' | str index-of -g 'ใตใŒ'", + result: Some(Value::test_int(4)), + }, Example { description: "Returns index of string in input with start index", example: " '.rb.rb' | str index-of '.rb' -r '1,'", @@ -120,6 +139,7 @@ fn action( ref substring, range, end, + graphemes, .. }: &Arguments, head: Span, @@ -138,14 +158,28 @@ fn action( Err(e) => return Value::Error { error: e }, }; - if *end { - if let Some(result) = s[start_index..end_index].rfind(&**substring) { - Value::int(result as i64 + start_index as i64, head) - } else { - Value::int(-1, head) - } - } else if let Some(result) = s[start_index..end_index].find(&**substring) { - Value::int(result as i64 + start_index as i64, head) + // When the -e flag is present, search using rfind instead of find.s + if let Some(result) = if *end { + s[start_index..end_index].rfind(&**substring) + } else { + s[start_index..end_index].find(&**substring) + } { + let result = result + start_index; + Value::int( + if *graphemes { + // Having found the substring's byte index, convert to grapheme index. + // grapheme_indices iterates graphemes alongside their UTF-8 byte indices, so .enumerate() + // is used to get the grapheme index alongside it. + s.grapheme_indices(true) + .enumerate() + .find(|e| e.1 .0 >= result) + .expect("No grapheme index for substring") + .0 + } else { + result + } as i64, + head, + ) } else { Value::int(-1, head) } @@ -244,10 +278,7 @@ mod tests { #[test] fn returns_index_of_substring() { - let word = Value::String { - val: String::from("Cargo.tomL"), - span: Span::test_data(), - }; + let word = Value::test_string("Cargo.tomL"); let options = Arguments { substring: String::from(".tomL"), @@ -258,6 +289,7 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); @@ -266,10 +298,7 @@ mod tests { } #[test] fn index_of_does_not_exist_in_string() { - let word = Value::String { - val: String::from("Cargo.tomL"), - span: Span::test_data(), - }; + let word = Value::test_string("Cargo.tomL"); let options = Arguments { substring: String::from("Lm"), @@ -280,6 +309,7 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); @@ -289,10 +319,7 @@ mod tests { #[test] fn returns_index_of_next_substring() { - let word = Value::String { - val: String::from("Cargo.Cargo"), - span: Span::test_data(), - }; + let word = Value::test_string("Cargo.Cargo"); let options = Arguments { substring: String::from("Cargo"), @@ -303,6 +330,7 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); @@ -311,10 +339,7 @@ mod tests { #[test] fn index_does_not_exist_due_to_end_index() { - let word = Value::String { - val: String::from("Cargo.Banana"), - span: Span::test_data(), - }; + let word = Value::test_string("Cargo.Banana"); let options = Arguments { substring: String::from("Banana"), @@ -325,6 +350,7 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); @@ -333,10 +359,7 @@ mod tests { #[test] fn returns_index_of_nums_in_middle_due_to_index_limit_from_both_ends() { - let word = Value::String { - val: String::from("123123123"), - span: Span::test_data(), - }; + let word = Value::test_string("123123123"); let options = Arguments { substring: String::from("123"), @@ -347,6 +370,7 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); @@ -355,10 +379,7 @@ mod tests { #[test] fn index_does_not_exists_due_to_strict_bounds() { - let word = Value::String { - val: String::from("123456"), - span: Span::test_data(), - }; + let word = Value::test_string("123456"); let options = Arguments { substring: String::from("1"), @@ -369,9 +390,30 @@ mod tests { }), cell_paths: None, end: false, + graphemes: false, }; let actual = action(&word, &options, Span::test_data()); assert_eq!(actual, Value::test_int(-1)); } + + #[test] + fn use_utf8_bytes() { + let word = Value::String { + val: String::from("๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ"), + span: Span::test_data(), + }; + + let options = Arguments { + substring: String::from("ใตใŒ"), + + range: None, + cell_paths: None, + end: false, + graphemes: false, + }; + + let actual = action(&word, &options, Span::test_data()); + assert_eq!(actual, Value::test_int(15)); + } } diff --git a/crates/nu-command/src/strings/str_/length.rs b/crates/nu-command/src/strings/str_/length.rs index 4dca859509..f5518ba2d5 100644 --- a/crates/nu-command/src/strings/str_/length.rs +++ b/crates/nu-command/src/strings/str_/length.rs @@ -1,10 +1,24 @@ -use crate::input_handler::{operate, CellPathOnlyArgs}; +use crate::grapheme_flags; +use crate::input_handler::{operate, CmdArgument}; use nu_engine::CallExt; use nu_protocol::ast::Call; use nu_protocol::ast::CellPath; use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::Category; use nu_protocol::{Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value}; +use unicode_segmentation::UnicodeSegmentation; + +struct Arguments { + cell_paths: Option>, + graphemes: bool, +} + +impl CmdArgument for Arguments { + fn take_cell_paths(&mut self) -> Option> { + self.cell_paths.take() + } +} + #[derive(Clone)] pub struct SubCommand; @@ -17,6 +31,16 @@ impl Command for SubCommand { Signature::build("str length") .input_output_types(vec![(Type::String, Type::Int)]) .vectorizes_over_list(true) + .switch( + "grapheme-clusters", + "count length using grapheme clusters (all visible chars have length 1)", + Some('g'), + ) + .switch( + "utf-8-bytes", + "count length using UTF-8 bytes (default; all non-ASCII chars have length 2+)", + Some('b'), + ) .rest( "rest", SyntaxShape::CellPath, @@ -41,17 +65,25 @@ impl Command for SubCommand { input: PipelineData, ) -> Result { let cell_paths: Vec = call.rest(engine_state, stack, 0)?; - let args = CellPathOnlyArgs::from(cell_paths); + let args = Arguments { + cell_paths: (!cell_paths.is_empty()).then_some(cell_paths), + graphemes: grapheme_flags(call)?, + }; operate(action, args, input, call.head, engine_state.ctrlc.clone()) } fn examples(&self) -> Vec { vec![ Example { - description: "Return the lengths of multiple strings", + description: "Return the lengths of a string", example: "'hello' | str length", result: Some(Value::test_int(5)), }, + Example { + description: "Count length using grapheme clusters", + example: "'๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ' | str length -g", + result: Some(Value::test_int(9)), + }, Example { description: "Return the lengths of multiple strings", example: "['hi' 'there'] | str length", @@ -64,9 +96,16 @@ impl Command for SubCommand { } } -fn action(input: &Value, _arg: &CellPathOnlyArgs, head: Span) -> Value { +fn action(input: &Value, arg: &Arguments, head: Span) -> Value { match input { - Value::String { val, .. } => Value::int(val.len() as i64, head), + Value::String { val, .. } => Value::int( + if arg.graphemes { + val.graphemes(true).count() + } else { + val.len() + } as i64, + head, + ), Value::Error { .. } => input.clone(), _ => Value::Error { error: ShellError::OnlySupportsThisInputType( @@ -83,6 +122,22 @@ fn action(input: &Value, _arg: &CellPathOnlyArgs, head: Span) -> Value { mod test { use super::*; + #[test] + fn use_utf8_bytes() { + let word = Value::String { + val: String::from("๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ"), + span: Span::test_data(), + }; + + let options = Arguments { + cell_paths: None, + graphemes: false, + }; + + let actual = action(&word, &options, Span::test_data()); + assert_eq!(actual, Value::test_int(28)); + } + #[test] fn test_examples() { use crate::test_examples; diff --git a/crates/nu-command/src/strings/str_/substring.rs b/crates/nu-command/src/strings/str_/substring.rs index 3335a42fbc..d2f40595b3 100644 --- a/crates/nu-command/src/strings/str_/substring.rs +++ b/crates/nu-command/src/strings/str_/substring.rs @@ -1,3 +1,4 @@ +use crate::grapheme_flags; use crate::input_handler::{operate, CmdArgument}; use nu_engine::CallExt; use nu_protocol::ast::Call; @@ -5,6 +6,7 @@ use nu_protocol::ast::CellPath; use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::{Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value}; use std::cmp::Ordering; +use unicode_segmentation::UnicodeSegmentation; #[derive(Clone)] pub struct SubCommand; @@ -12,6 +14,7 @@ pub struct SubCommand; struct Arguments { indexes: Substring, cell_paths: Option>, + graphemes: bool, } impl CmdArgument for Arguments { @@ -40,6 +43,16 @@ impl Command for SubCommand { Signature::build("str substring") .input_output_types(vec![(Type::String, Type::String)]) .vectorizes_over_list(true) + .switch( + "grapheme-clusters", + "count indexes and split using grapheme clusters (all visible chars have length 1)", + Some('g'), + ) + .switch( + "utf-8-bytes", + "count indexes and split using UTF-8 bytes (default; non-ASCII chars have length 2+)", + Some('b'), + ) .required( "range", SyntaxShape::Any, @@ -74,6 +87,7 @@ impl Command for SubCommand { let args = Arguments { indexes, cell_paths, + graphemes: grapheme_flags(call)?, }; operate(action, args, input, call.head, engine_state.ctrlc.clone()) } @@ -111,6 +125,11 @@ impl Command for SubCommand { example: " 'good nushell' | str substring ',7'", result: Some(Value::test_string("good nu")), }, + Example { + description: "Count indexes and split using grapheme clusters", + example: " '๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ' | str substring -g 4..6", + result: Some(Value::test_string("ใตใŒ")), + }, ] } } @@ -144,10 +163,23 @@ fn action(input: &Value, args: &Arguments, head: Span) -> Value { Ordering::Less => Value::String { val: { if end == isize::max_value() { - String::from_utf8_lossy( - &s.bytes().skip(start as usize).collect::>(), - ) - .to_string() + if args.graphemes { + s.graphemes(true) + .skip(start as usize) + .collect::>() + .join("") + } else { + String::from_utf8_lossy( + &s.bytes().skip(start as usize).collect::>(), + ) + .to_string() + } + } else if args.graphemes { + s.graphemes(true) + .skip(start as usize) + .take((end - start) as usize) + .collect::>() + .join("") } else { String::from_utf8_lossy( &s.bytes() @@ -266,7 +298,7 @@ fn process_arguments(range: &Value, head: Span) -> Result<(isize, isize), ShellE #[cfg(test)] mod tests { - use super::{action, Span, SubCommand, Substring, Value}; + use super::{action, Arguments, Span, SubCommand, Substring, Value}; #[test] fn test_examples() { @@ -326,9 +358,10 @@ mod tests { let expected = expectation.expected; let actual = action( &word, - &super::Arguments { + &Arguments { indexes: expectation.options(), cell_paths: None, + graphemes: false, }, Span::test_data(), ); @@ -336,4 +369,21 @@ mod tests { assert_eq!(actual, Value::test_string(expected)); } } + + #[test] + fn use_utf8_bytes() { + let word = Value::String { + val: String::from("๐Ÿ‡ฏ๐Ÿ‡ตใปใ’ ใตใŒ ใดใ‚ˆ"), + span: Span::test_data(), + }; + + let options = Arguments { + cell_paths: None, + indexes: Substring(4, 5), + graphemes: false, + }; + + let actual = action(&word, &options, Span::test_data()); + assert_eq!(actual, Value::test_string("๏ฟฝ")); + } }