From 666e6a7b57e41822694a70c17965383ad12cb07e Mon Sep 17 00:00:00 2001
From: Chris Gillespie <6572184+gillespiecd@users.noreply.github.com>
Date: Wed, 2 Sep 2020 09:54:00 -0700
Subject: [PATCH] Size: count unicode graphemes as single char (#2482)

---
 Cargo.lock                         |  1 +
 crates/nu-cli/Cargo.toml           |  1 +
 crates/nu-cli/src/commands/size.rs | 44 ++++++++++++++++++++----------
 3 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 68d377d3ef..0fc87aff64 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3093,6 +3093,7 @@ dependencies = [
  "trash",
  "typetag",
  "umask",
+ "unicode-segmentation",
  "unicode-xid",
  "url 2.1.1",
  "users",
diff --git a/crates/nu-cli/Cargo.toml b/crates/nu-cli/Cargo.toml
index 179648168c..fe706c247a 100644
--- a/crates/nu-cli/Cargo.toml
+++ b/crates/nu-cli/Cargo.toml
@@ -87,6 +87,7 @@ termcolor = "1.1.0"
 toml = "0.5.6"
 typetag = "0.1.5"
 umask = "1.0.0"
+unicode-segmentation = "1.6.0"
 unicode-xid = "0.2.1"
 uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true}
 which = {version = "4.0.2", optional = true}
diff --git a/crates/nu-cli/src/commands/size.rs b/crates/nu-cli/src/commands/size.rs
index d1ebfc31d2..1fa033a634 100644
--- a/crates/nu-cli/src/commands/size.rs
+++ b/crates/nu-cli/src/commands/size.rs
@@ -1,8 +1,11 @@
+extern crate unicode_segmentation;
+
 use crate::commands::WholeStreamCommand;
 use crate::prelude::*;
 use indexmap::indexmap;
 use nu_errors::ShellError;
 use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
+use unicode_segmentation::UnicodeSegmentation;
 
 pub struct Size;
 
@@ -29,17 +32,30 @@ impl WholeStreamCommand for Size {
     }
 
     fn examples(&self) -> Vec<Example> {
-        vec![Example {
-            description: "Count the number of words in a string",
-            example: r#"echo "There are seven words in this sentence" | size"#,
-            result: Some(vec![UntaggedValue::row(indexmap! {
-                "lines".to_string() => UntaggedValue::int(0).into(),
-                "words".to_string() => UntaggedValue::int(7).into(),
-                "chars".to_string() => UntaggedValue::int(38).into(),
-                "bytes".to_string() => UntaggedValue::int(38).into(),
-            })
-            .into()]),
-        }]
+        vec![
+            Example {
+                description: "Count the number of words in a string",
+                example: r#"echo "There are seven words in this sentence" | size"#,
+                result: Some(vec![UntaggedValue::row(indexmap! {
+                        "lines".to_string() => UntaggedValue::int(0).into(),
+                        "words".to_string() => UntaggedValue::int(7).into(),
+                        "chars".to_string() => UntaggedValue::int(38).into(),
+                        "bytes".to_string() => UntaggedValue::int(38).into(),
+                })
+                .into()]),
+            },
+            Example {
+                description: "Counts unicode characters correctly in a string",
+                example: r#"echo "Amélie Amelie" | size"#,
+                result: Some(vec![UntaggedValue::row(indexmap! {
+                        "lines".to_string() => UntaggedValue::int(0).into(),
+                        "words".to_string() => UntaggedValue::int(2).into(),
+                        "chars".to_string() => UntaggedValue::int(13).into(),
+                        "bytes".to_string() => UntaggedValue::int(15).into(),
+                })
+                .into()]),
+            },
+        ]
     }
 }
 
@@ -72,15 +88,15 @@ fn count(contents: &str, tag: impl Into<Tag>) -> Value {
     let bytes = contents.len() as i64;
     let mut end_of_word = true;
 
-    for c in contents.chars() {
+    for c in UnicodeSegmentation::graphemes(contents, true) {
         chars += 1;
 
         match c {
-            '\n' => {
+            "\n" => {
                 lines += 1;
                 end_of_word = true;
             }
-            ' ' => end_of_word = true,
+            " " => end_of_word = true,
             _ => {
                 if end_of_word {
                     words += 1;