From d6669d3f33bc50cbcb3cd21e0a2d1303e118f252 Mon Sep 17 00:00:00 2001 From: Fernando Herrera Date: Sat, 19 Mar 2022 11:13:34 +0000 Subject: [PATCH] Polars update (#4875) * update to polars 0.20 * add to date parser for series --- Cargo.lock | 219 +++++++----------- crates/nu-command/Cargo.toml | 2 +- .../src/dataframe/eager/describe.rs | 4 +- .../src/dataframe/eager/drop_duplicates.rs | 14 +- .../nu-command/src/dataframe/eager/to_csv.rs | 4 +- .../src/dataframe/eager/to_parquet.rs | 4 +- .../src/dataframe/series/arg_max.rs | 4 +- .../src/dataframe/series/arg_min.rs | 4 +- .../src/dataframe/series/date/as_date.rs | 87 +++++++ .../src/dataframe/series/date/as_datetime.rs | 7 +- .../src/dataframe/series/date/mod.rs | 2 + .../src/dataframe/series/indexes/arg_sort.rs | 13 +- crates/nu-command/src/dataframe/series/mod.rs | 1 + .../values/nu_dataframe/between_values.rs | 19 ++ .../values/nu_dataframe/conversion.rs | 6 +- .../src/dataframe/values/nu_dataframe/mod.rs | 2 +- .../src/dataframe/values/nu_groupby/mod.rs | 24 +- 17 files changed, 255 insertions(+), 161 deletions(-) create mode 100644 crates/nu-command/src/dataframe/series/date/as_date.rs diff --git a/Cargo.lock b/Cargo.lock index 54df0b32be..5c36fd5125 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -102,12 +102,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" -[[package]] -name = "arrayref" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" - [[package]] name = "arrayvec" version = "0.4.12" @@ -135,15 +129,16 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.9.2" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "185b18ba35287d4e6989e451ab3d184f4699befaa05cf95b0da74152b0c5c24a" +checksum = 
"2e387b20dd573a96f36b173d9027483898f944d696521afd74e2caa3c813d86e" dependencies = [ "arrow-format", "base64", "bytemuck", "chrono", - "csv", + "csv-core", + "either", "fallible-streaming-iterator", "futures", "hash_hasher", @@ -291,17 +286,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "blake2b_simd" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" -dependencies = [ - "arrayref", - "arrayvec 0.5.2", - "constant_time_eq", -] - [[package]] name = "block-buffer" version = "0.9.0" @@ -527,6 +511,18 @@ dependencies = [ "encoding_rs", ] +[[package]] +name = "comfy-table" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" +dependencies = [ + "crossterm", + "strum 0.23.0", + "strum_macros 0.23.1", + "unicode-width", +] + [[package]] name = "console" version = "0.15.0" @@ -548,12 +544,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb58b6451e8c2a812ad979ed1d83378caa5e927eef2622017a45f251457c2c9d" -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "convert_case" version = "0.4.0" @@ -821,17 +811,6 @@ dependencies = [ "crypto-common", ] -[[package]] -name = "dirs" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" -dependencies = [ - "libc", - "redox_users 0.3.5", - "winapi", -] - [[package]] name = "dirs" version = "4.0.0" @@ -858,7 +837,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" dependencies = [ "libc", - 
"redox_users 0.4.0", + "redox_users", "winapi", ] @@ -869,7 +848,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", - "redox_users 0.4.0", + "redox_users", "winapi", ] @@ -1420,6 +1399,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.4.0" @@ -2763,7 +2751,7 @@ dependencies = [ "cfg-if", "instant", "libc", - "redox_syscall 0.2.11", + "redox_syscall", "smallvec", "winapi", ] @@ -2776,7 +2764,7 @@ checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.11", + "redox_syscall", "smallvec", "windows-sys 0.32.0", ] @@ -2796,9 +2784,9 @@ dependencies = [ [[package]] name = "parquet2" -version = "0.9.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45476d276db539ec4076f6abe62392619460fb70a1a8edebcc06e11cd93c0ec3" +checksum = "6b085f9e78e4842865151b693f6d94bdf7b280af66daa6e3587adeb3106a07e9" dependencies = [ "async-stream", "bitpacking", @@ -2970,20 +2958,21 @@ dependencies = [ [[package]] name = "polars" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2620069231dd93a27f7fc982c10379394a540775057fc569a669a40c2ad7207d" +checksum = "656db3b86c338a8a717476eb29436a380ebdf74915a71cff6ecce78d52173e53" dependencies = [ "polars-core", "polars-io", "polars-lazy", + "polars-time", ] [[package]] name = "polars-arrow" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "863d723959237470af38fee06ca74c58e52ee3778e4796fbf66e05deb5f925f6" 
+checksum = "fcedf44a7b15b60c69e811c9d343ac459788e961dc4136f002ed1b68a1fada07" dependencies = [ "arrow2", "hashbrown 0.12.0", @@ -2993,20 +2982,21 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "006c8d5c4b057ec2287766c14293d70c76bfc65e949dd028a76a501098e3253b" +checksum = "6dfed0e21ac4d4c85df45b5864a68cfc5b2a97e9fba8a981be7b09c6f02a7eaa" dependencies = [ "ahash", "anyhow", "arrow2", + "chrono", + "comfy-table", "hashbrown 0.12.0", + "indexmap", "lazy_static", "num 0.4.0", "num_cpus", "polars-arrow", - "polars-time", - "prettytable-rs", "rand 0.8.5", "rand_distr", "rayon", @@ -3014,20 +3004,19 @@ dependencies = [ "serde", "serde_json", "thiserror", - "unsafe_unwrap", ] [[package]] name = "polars-io" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c99b04f1c31a6d0d121242e0b4f725c7c608665772e66621e42ed374a28ea6bc" +checksum = "d8770fb4233ab88affac80c410be090dc7a2c044a9e4e7b942132e94ceeb732b" dependencies = [ "ahash", "anyhow", "arrow2", "csv-core", - "dirs 4.0.0", + "dirs", "lazy_static", "lexical", "memchr", @@ -3039,41 +3028,46 @@ dependencies = [ "polars-utils", "rayon", "regex", + "serde_json", "simdutf8", ] [[package]] name = "polars-lazy" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3545b1e34365293d761e06d98d535da322ce04c02d638eefade1db61e0aa22b4" +checksum = "4eca1fed3b88ae1bb9b7f1d7b2958f1655d9c1aed33495d6ba30ff84a0c1e9e9" dependencies = [ "ahash", "glob", + "parking_lot 0.12.0", "polars-arrow", "polars-core", "polars-io", + "polars-time", "polars-utils", "rayon", ] [[package]] name = "polars-time" -version = "0.1.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c28b9d7a4badf14d54819ed1c49112e44348a3c7087854414a55b47bb46eea" +checksum = 
"0fe48c759ca778a8b6fb30f70e9a81b56f0987a82dc71e61c5b2d3c236b6b8d6" dependencies = [ "chrono", "polars-arrow", + "polars-core", ] [[package]] name = "polars-utils" -version = "0.1.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03f6f755d66fedfc506fb1df64aa8adb904cd0ef19fd4fb7f339e7ec3619aa65" +checksum = "71011e8ed52f123ce23d110b496c8704d0a59c5fd4115cd938e7ff19d4bcb7ca" dependencies = [ - "parking_lot 0.11.2", + "parking_lot 0.12.0", + "rayon", ] [[package]] @@ -3137,20 +3131,6 @@ dependencies = [ "log", ] -[[package]] -name = "prettytable-rs" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd04b170004fa2daccf418a7f8253aaf033c27760b5f225889024cf66d7ac2e" -dependencies = [ - "atty", - "csv", - "encode_unicode", - "lazy_static", - "term", - "unicode-width", -] - [[package]] name = "proc-macro-error" version = "1.0.4" @@ -3423,12 +3403,6 @@ dependencies = [ "rand_core 0.3.1", ] -[[package]] -name = "redox_syscall" -version = "0.1.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" - [[package]] name = "redox_syscall" version = "0.2.11" @@ -3438,17 +3412,6 @@ dependencies = [ "bitflags", ] -[[package]] -name = "redox_users" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0737333e7a9502c789a36d7c7fa6092a49895d4faa31ca5df163857ded2e9d" -dependencies = [ - "getrandom 0.1.16", - "redox_syscall 0.1.57", - "rust-argon2", -] - [[package]] name = "redox_users" version = "0.4.0" @@ -3456,7 +3419,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ "getrandom 0.2.5", - "redox_syscall 0.2.11", + "redox_syscall", ] [[package]] @@ -3471,8 +3434,8 @@ dependencies = [ "nu-ansi-term", "serde", "strip-ansi-escapes", - "strum", - 
"strum_macros", + "strum 0.24.0", + "strum_macros 0.24.0", "unicode-segmentation", "unicode-width", ] @@ -3573,18 +3536,6 @@ dependencies = [ "syn", ] -[[package]] -name = "rust-argon2" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb" -dependencies = [ - "base64", - "blake2b_simd", - "constant_time_eq", - "crossbeam-utils", -] - [[package]] name = "rust-embed" version = "6.3.0" @@ -4086,19 +4037,38 @@ dependencies = [ "vte", ] +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" + [[package]] name = "strum" version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e96acfc1b70604b8b2f1ffa4c57e59176c7dbb05d556c71ecd2f5498a1dee7f8" +[[package]] +name = "strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck 0.3.3", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "strum_macros" version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro2", "quote", "rustversion", @@ -4212,7 +4182,7 @@ dependencies = [ "cfg-if", "fastrand", "libc", - "redox_syscall 0.2.11", + "redox_syscall", "remove_dir_all", "winapi", ] @@ -4228,17 +4198,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "term" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" -dependencies = [ - "byteorder", - "dirs 1.0.5", - "winapi", -] - [[package]] name = "termcolor" version = 
"1.1.3" @@ -4533,12 +4492,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" -[[package]] -name = "unsafe_unwrap" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1230ec65f13e0f9b28d789da20d2d419511893ea9dac2c1f4ef67b8b14e5da80" - [[package]] name = "url" version = "2.2.2" @@ -4984,18 +4937,18 @@ dependencies = [ [[package]] name = "zstd" -version = "0.9.2+zstd.1.5.1" +version = "0.10.0+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2390ea1bf6c038c39674f22d95f0564725fc06034a47129179810b2fc58caa54" +checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "4.1.3+zstd.1.5.1" +version = "4.1.4+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e99d81b99fb3c2c2c794e3fe56c305c63d5173a16a46b5850b07c935ffc7db79" +checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee" dependencies = [ "libc", "zstd-sys", @@ -5003,9 +4956,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.6.2+zstd.1.5.1" +version = "1.6.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2daf2f248d9ea44454bfcb2516534e8b8ad2fc91bf818a1885495fc42bc8ac9f" +checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" dependencies = [ "cc", "libc", diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index d8c1688268..1745c7d4c6 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -84,7 +84,7 @@ umask = "1.0.0" users = "0.11.0" [dependencies.polars] -version = "0.19.1" +version = "0.20.0" optional = true features = [ "default", "parquet", "json", "serde", "object", diff --git a/crates/nu-command/src/dataframe/eager/describe.rs 
b/crates/nu-command/src/dataframe/eager/describe.rs index d866b8340e..dffc01445a 100644 --- a/crates/nu-command/src/dataframe/eager/describe.rs +++ b/crates/nu-command/src/dataframe/eager/describe.rs @@ -162,7 +162,7 @@ fn command( let df = NuDataFrame::try_from_pipeline(input, call.head)?; - let names = ChunkedArray::::new_from_opt_slice("descriptor", &labels).into_series(); + let names = ChunkedArray::::from_slice_options("descriptor", &labels).into_series(); let head = std::iter::once(names); @@ -235,7 +235,7 @@ fn command( descriptors.push(max); let name = format!("{} ({})", col.name(), col.dtype()); - ChunkedArray::::new_from_opt_slice(&name, &descriptors).into_series() + ChunkedArray::::from_slice_options(&name, &descriptors).into_series() }); let res = head.chain(tail).collect::>(); diff --git a/crates/nu-command/src/dataframe/eager/drop_duplicates.rs b/crates/nu-command/src/dataframe/eager/drop_duplicates.rs index ccf51661e6..7fecaccaff 100644 --- a/crates/nu-command/src/dataframe/eager/drop_duplicates.rs +++ b/crates/nu-command/src/dataframe/eager/drop_duplicates.rs @@ -4,6 +4,7 @@ use nu_protocol::{ engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, }; +use polars::prelude::DistinctKeepStrategy; use super::super::values::utils::convert_columns_string; use super::super::values::{Column, NuDataFrame}; @@ -28,6 +29,11 @@ impl Command for DropDuplicates { "subset of columns to drop duplicates", ) .switch("maintain", "maintain order", Some('m')) + .switch( + "last", + "keeps last duplicate value (by default keeps first)", + Some('l'), + ) .category(Category::Custom("dataframe".into())) } @@ -82,8 +88,14 @@ fn command( let subset_slice = subset.as_ref().map(|cols| &cols[..]); + let keep_strategy = if call.has_flag("last") { + DistinctKeepStrategy::Last + } else { + DistinctKeepStrategy::First + }; + df.as_ref() - .drop_duplicates(call.has_flag("maintain"), subset_slice) + 
.distinct(subset_slice, keep_strategy) .map_err(|e| { ShellError::SpannedLabeledError( "Error dropping duplicates".into(), diff --git a/crates/nu-command/src/dataframe/eager/to_csv.rs b/crates/nu-command/src/dataframe/eager/to_csv.rs index df87e4216d..af35fa3cf5 100644 --- a/crates/nu-command/src/dataframe/eager/to_csv.rs +++ b/crates/nu-command/src/dataframe/eager/to_csv.rs @@ -71,7 +71,7 @@ fn command( let delimiter: Option> = call.get_flag(engine_state, stack, "delimiter")?; let no_header: bool = call.has_flag("no-header"); - let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let mut df = NuDataFrame::try_from_pipeline(input, call.head)?; let mut file = File::create(&file_name.item).map_err(|e| { ShellError::SpannedLabeledError( @@ -109,7 +109,7 @@ fn command( } }; - writer.finish(df.as_ref()).map_err(|e| { + writer.finish(df.as_mut()).map_err(|e| { ShellError::SpannedLabeledError( "Error writing to file".into(), e.to_string(), diff --git a/crates/nu-command/src/dataframe/eager/to_parquet.rs b/crates/nu-command/src/dataframe/eager/to_parquet.rs index 12db49b361..1acf3d5524 100644 --- a/crates/nu-command/src/dataframe/eager/to_parquet.rs +++ b/crates/nu-command/src/dataframe/eager/to_parquet.rs @@ -55,7 +55,7 @@ fn command( ) -> Result { let file_name: Spanned = call.req(engine_state, stack, 0)?; - let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let mut df = NuDataFrame::try_from_pipeline(input, call.head)?; let file = File::create(&file_name.item).map_err(|e| { ShellError::SpannedLabeledError( @@ -65,7 +65,7 @@ fn command( ) })?; - ParquetWriter::new(file).finish(df.as_ref()).map_err(|e| { + ParquetWriter::new(file).finish(df.as_mut()).map_err(|e| { ShellError::SpannedLabeledError("Error saving file".into(), e.to_string(), file_name.span) })?; diff --git a/crates/nu-command/src/dataframe/series/arg_max.rs b/crates/nu-command/src/dataframe/series/arg_max.rs index 774ba401aa..b3307ca88a 100644 --- 
a/crates/nu-command/src/dataframe/series/arg_max.rs +++ b/crates/nu-command/src/dataframe/series/arg_max.rs @@ -60,8 +60,8 @@ fn command( let res = series.arg_max(); let chunked = match res { - Some(index) => UInt32Chunked::new_from_slice("arg_max", &[index as u32]), - None => UInt32Chunked::new_from_slice("arg_max", &[]), + Some(index) => UInt32Chunked::from_slice("arg_max", &[index as u32]), + None => UInt32Chunked::from_slice("arg_max", &[]), }; let res = chunked.into_series(); diff --git a/crates/nu-command/src/dataframe/series/arg_min.rs b/crates/nu-command/src/dataframe/series/arg_min.rs index bacdbf2768..8dccc8d750 100644 --- a/crates/nu-command/src/dataframe/series/arg_min.rs +++ b/crates/nu-command/src/dataframe/series/arg_min.rs @@ -60,8 +60,8 @@ fn command( let res = series.arg_min(); let chunked = match res { - Some(index) => UInt32Chunked::new_from_slice("arg_min", &[index as u32]), - None => UInt32Chunked::new_from_slice("arg_min", &[]), + Some(index) => UInt32Chunked::from_slice("arg_min", &[index as u32]), + None => UInt32Chunked::from_slice("arg_min", &[]), }; let res = chunked.into_series(); diff --git a/crates/nu-command/src/dataframe/series/date/as_date.rs b/crates/nu-command/src/dataframe/series/date/as_date.rs new file mode 100644 index 0000000000..aae3ee4164 --- /dev/null +++ b/crates/nu-command/src/dataframe/series/date/as_date.rs @@ -0,0 +1,87 @@ +use super::super::super::values::NuDataFrame; + +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, SyntaxShape, +}; +use polars::prelude::IntoSeries; + +#[derive(Clone)] +pub struct AsDate; + +impl Command for AsDate { + fn name(&self) -> &str { + "dfr as-date" + } + + fn usage(&self) -> &str { + r#"Converts string to date. 
+ Format example: + "%Y-%m-%d" => 2021-12-31 + "%d-%m-%Y" => 31-12-2021 + "%Y%m%d" => 20210319 (2021-03-19)"# + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("format", SyntaxShape::String, "formatting date string") + .switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n')) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec<Example> { + vec![Example { + description: "Converts string to date", + example: r#"["2021-12-30" "2021-12-31"] | dfr to-df | dfr as-date "%Y-%m-%d""#, + result: None, + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result<PipelineData, ShellError> { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result<PipelineData, ShellError> { + let format: String = call.req(engine_state, stack, 0)?; + let not_exact = call.has_flag("not-exact"); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let series = df.as_series(call.head)?; + let casted = series.utf8().map_err(|e| { + ShellError::SpannedLabeledError("Error casting to string".into(), e.to_string(), call.head) + })?; + + let res = if not_exact { + casted.as_date_not_exact(Some(format.as_str())) + } else { + casted.as_date(Some(format.as_str())) + }; + + let mut res = res + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error creating date".into(), + e.to_string(), + call.head, + ) + })?
+ .into_series(); + + res.rename("date"); + + NuDataFrame::try_from_series(vec![res], call.head) + .map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None)) +} diff --git a/crates/nu-command/src/dataframe/series/date/as_datetime.rs b/crates/nu-command/src/dataframe/series/date/as_datetime.rs index 4861483420..be279044a2 100644 --- a/crates/nu-command/src/dataframe/series/date/as_datetime.rs +++ b/crates/nu-command/src/dataframe/series/date/as_datetime.rs @@ -34,7 +34,7 @@ impl Command for AsDateTime { fn signature(&self) -> Signature { Signature::build(self.name()) - .required("format", SyntaxShape::String, "formating date string") + .required("format", SyntaxShape::String, "formating date time string") .switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n')) .category(Category::Custom("dataframe".into())) } @@ -45,7 +45,7 @@ impl Command for AsDateTime { example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | dfr to-df | dfr as-datetime "%Y-%m-%d %H:%M:%S""#, result: Some( NuDataFrame::try_from_columns(vec![Column::new( - "0".to_string(), + "datetime".to_string(), vec![ Value::Date { val: DateTime::parse_from_str( @@ -103,7 +103,7 @@ fn command( casted.as_datetime(Some(format.as_str()), TimeUnit::Milliseconds) }; - let res = res + let mut res = res .map_err(|e| { ShellError::SpannedLabeledError( "Error creating datetime".into(), @@ -113,6 +113,7 @@ fn command( })? 
.into_series(); + res.rename("datetime"); NuDataFrame::try_from_series(vec![res], call.head) .map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None)) } diff --git a/crates/nu-command/src/dataframe/series/date/mod.rs b/crates/nu-command/src/dataframe/series/date/mod.rs index b1c8029582..ed3895a172 100644 --- a/crates/nu-command/src/dataframe/series/date/mod.rs +++ b/crates/nu-command/src/dataframe/series/date/mod.rs @@ -1,3 +1,4 @@ +mod as_date; mod as_datetime; mod get_day; mod get_hour; @@ -10,6 +11,7 @@ mod get_week; mod get_weekday; mod get_year; +pub use as_date::AsDate; pub use as_datetime::AsDateTime; pub use get_day::GetDay; pub use get_hour::GetHour; diff --git a/crates/nu-command/src/dataframe/series/indexes/arg_sort.rs b/crates/nu-command/src/dataframe/series/indexes/arg_sort.rs index d1ff8f9963..879cc8e17d 100644 --- a/crates/nu-command/src/dataframe/series/indexes/arg_sort.rs +++ b/crates/nu-command/src/dataframe/series/indexes/arg_sort.rs @@ -5,7 +5,7 @@ use nu_protocol::{ engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, Value, }; -use polars::prelude::IntoSeries; +use polars::prelude::{IntoSeries, SortOptions}; #[derive(Clone)] pub struct ArgSort; @@ -22,6 +22,7 @@ impl Command for ArgSort { fn signature(&self) -> Signature { Signature::build(self.name()) .switch("reverse", "reverse order", Some('r')) + .switch("nulls-last", "nulls ordered last", Some('n')) .category(Category::Custom("dataframe".into())) } @@ -85,10 +86,12 @@ fn command( ) -> Result { let df = NuDataFrame::try_from_pipeline(input, call.head)?; - let mut res = df - .as_series(call.head)? 
- .argsort(call.has_flag("reverse")) - .into_series(); + let sort_options = SortOptions { + descending: call.has_flag("reverse"), + nulls_last: call.has_flag("nulls-last"), + }; + + let mut res = df.as_series(call.head)?.argsort(sort_options).into_series(); res.rename("arg_sort"); NuDataFrame::try_from_series(vec![res], call.head) diff --git a/crates/nu-command/src/dataframe/series/mod.rs b/crates/nu-command/src/dataframe/series/mod.rs index 33edf7a8c1..d55aeaccd2 100644 --- a/crates/nu-command/src/dataframe/series/mod.rs +++ b/crates/nu-command/src/dataframe/series/mod.rs @@ -57,6 +57,7 @@ pub fn add_series_decls(working_set: &mut StateWorkingSet) { ArgSort, ArgTrue, ArgUnique, + AsDate, AsDateTime, Concatenate, Contains, diff --git a/crates/nu-command/src/dataframe/values/nu_dataframe/between_values.rs b/crates/nu-command/src/dataframe/values/nu_dataframe/between_values.rs index e3cc451527..70449d833d 100644 --- a/crates/nu-command/src/dataframe/values/nu_dataframe/between_values.rs +++ b/crates/nu-command/src/dataframe/values/nu_dataframe/between_values.rs @@ -530,6 +530,25 @@ where )), } } + DataType::Date => { + let to_i64 = series.cast(&DataType::Int64); + + match to_i64 { + Ok(series) => { + let nanosecs_per_day: i64 = 24 * 60 * 60 * 1_000_000_000; + let casted = series + .i64() + .map(|chunked| chunked.mul(nanosecs_per_day)) + .expect("already checked for casting"); + compare_casted_i64(Ok(&casted), val, f, span) + } + Err(e) => Err(ShellError::SpannedLabeledError( + "Unable to cast to i64".into(), + e.to_string(), + span, + )), + } + } DataType::Int64 => { let casted = series.i64(); compare_casted_i64(casted, val, f, span) diff --git a/crates/nu-command/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu-command/src/dataframe/values/nu_dataframe/conversion.rs index 5d08375578..e50301660b 100644 --- a/crates/nu-command/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu-command/src/dataframe/values/nu_dataframe/conversion.rs @@ -430,7
+430,7 @@ pub fn create_column( Ok(Column::new(casted.name().into(), values)) } DataType::Time => { - let casted = series.time().map_err(|e| { + let casted = series.timestamp(TimeUnit::Nanoseconds).map_err(|e| { ShellError::LabeledError("Error casting column to time".into(), e.to_string()) })?; @@ -596,7 +596,7 @@ pub fn from_parsed_columns(column_values: ColumnMap) -> Result::new_from_opt_iter(&name, it) + ChunkedArray::::from_iter_options(&name, it) .into_datetime(TimeUnit::Milliseconds, None); df_series.push(res.into_series()) @@ -610,7 +610,7 @@ pub fn from_parsed_columns(column_values: ColumnMap) -> Result::new_from_opt_iter(&name, it); + let res = ChunkedArray::::from_iter_options(&name, it); df_series.push(res.into_series()) } diff --git a/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs b/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs index a902ae0b69..eb18660cdb 100644 --- a/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs +++ b/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs @@ -394,7 +394,7 @@ impl NuDataFrame { // Casting needed to compare other numeric types with nushell numeric type. 
// In nushell we only have i64 integer numeric types and any array created // with nushell untagged primitives will be of type i64 - DataType::UInt32 => match self_series.cast(&DataType::Int64) { + DataType::UInt32 | DataType::Int32 => match self_series.cast(&DataType::Int64) { Ok(series) => series, Err(_) => return None, }, diff --git a/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs b/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs index 53c8ff846c..ec3f2d4708 100644 --- a/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs +++ b/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs @@ -2,26 +2,42 @@ mod custom_value; use nu_protocol::{PipelineData, ShellError, Span, Value}; use polars::frame::groupby::{GroupBy, GroupsProxy}; -use polars::prelude::DataFrame; +use polars::prelude::{DataFrame, GroupsIdx}; use serde::{Deserialize, Serialize}; #[derive(Clone, Debug, Serialize, Deserialize)] pub enum NuGroupsProxy { - Idx(Vec<(u32, Vec<u32>)>), + Idx { + sorted: bool, + all: Vec<(u32, Vec<u32>)>, + }, Slice(Vec<[u32; 2]>), } impl NuGroupsProxy { fn from_polars(groups: &GroupsProxy) -> Self { match groups { - GroupsProxy::Idx(indexes) => NuGroupsProxy::Idx(indexes.clone()), + GroupsProxy::Idx(indexes) => NuGroupsProxy::Idx { + sorted: indexes.is_sorted(), + all: indexes + .iter() + .map(|(index, values)| (index, values.clone())) + .collect(), + }, GroupsProxy::Slice(slice) => NuGroupsProxy::Slice(slice.clone()), } } fn to_polars(&self) -> GroupsProxy { match self { - Self::Idx(indexes) => GroupsProxy::Idx(indexes.clone()), + Self::Idx { sorted, all } => { + let mut groups: GroupsIdx = all.clone().into(); + if *sorted { + groups.sort() + } + + GroupsProxy::Idx(groups) + } Self::Slice(slice) => GroupsProxy::Slice(slice.clone()), } }