From 666e6a7b57e41822694a70c17965383ad12cb07e Mon Sep 17 00:00:00 2001 From: Chris Gillespie <6572184+gillespiecd@users.noreply.github.com> Date: Wed, 2 Sep 2020 09:54:00 -0700 Subject: [PATCH] Size: count unicode graphemes as single char (#2482) --- Cargo.lock | 1 + crates/nu-cli/Cargo.toml | 1 + crates/nu-cli/src/commands/size.rs | 44 ++++++++++++++++++++---------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 68d377d3ef..0fc87aff64 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3093,6 +3093,7 @@ dependencies = [ "trash", "typetag", "umask", + "unicode-segmentation", "unicode-xid", "url 2.1.1", "users", diff --git a/crates/nu-cli/Cargo.toml b/crates/nu-cli/Cargo.toml index 179648168c..fe706c247a 100644 --- a/crates/nu-cli/Cargo.toml +++ b/crates/nu-cli/Cargo.toml @@ -87,6 +87,7 @@ termcolor = "1.1.0" toml = "0.5.6" typetag = "0.1.5" umask = "1.0.0" +unicode-segmentation = "1.6.0" unicode-xid = "0.2.1" uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true} which = {version = "4.0.2", optional = true} diff --git a/crates/nu-cli/src/commands/size.rs b/crates/nu-cli/src/commands/size.rs index d1ebfc31d2..1fa033a634 100644 --- a/crates/nu-cli/src/commands/size.rs +++ b/crates/nu-cli/src/commands/size.rs @@ -1,8 +1,11 @@ +extern crate unicode_segmentation; + use crate::commands::WholeStreamCommand; use crate::prelude::*; use indexmap::indexmap; use nu_errors::ShellError; use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value}; +use unicode_segmentation::UnicodeSegmentation; pub struct Size; @@ -29,17 +32,30 @@ impl WholeStreamCommand for Size { } fn examples(&self) -> Vec<Example> { - vec![Example { - description: "Count the number of words in a string", - example: r#"echo "There are seven words in this sentence" | size"#, - result: Some(vec![UntaggedValue::row(indexmap! 
{ - "lines".to_string() => UntaggedValue::int(0).into(), - "words".to_string() => UntaggedValue::int(7).into(), - "chars".to_string() => UntaggedValue::int(38).into(), - "bytes".to_string() => UntaggedValue::int(38).into(), - }) - .into()]), - }] + vec![ + Example { + description: "Count the number of words in a string", + example: r#"echo "There are seven words in this sentence" | size"#, + result: Some(vec![UntaggedValue::row(indexmap! { + "lines".to_string() => UntaggedValue::int(0).into(), + "words".to_string() => UntaggedValue::int(7).into(), + "chars".to_string() => UntaggedValue::int(38).into(), + "bytes".to_string() => UntaggedValue::int(38).into(), + }) + .into()]), + }, + Example { + description: "Counts unicode characters correctly in a string", + example: r#"echo "Amélie Amelie" | size"#, + result: Some(vec![UntaggedValue::row(indexmap! { + "lines".to_string() => UntaggedValue::int(0).into(), + "words".to_string() => UntaggedValue::int(2).into(), + "chars".to_string() => UntaggedValue::int(13).into(), + "bytes".to_string() => UntaggedValue::int(15).into(), + }) + .into()]), + }, + ] } } @@ -72,15 +88,15 @@ fn count(contents: &str, tag: impl Into) -> Value { let bytes = contents.len() as i64; let mut end_of_word = true; - for c in contents.chars() { + for c in UnicodeSegmentation::graphemes(contents, true) { chars += 1; match c { - '\n' => { + "\n" => { lines += 1; end_of_word = true; } - ' ' => end_of_word = true, + " " => end_of_word = true, _ => { if end_of_word { words += 1;