From af2b2c668da4a73349b889eeeb82949d453127f4 Mon Sep 17 00:00:00 2001 From: Fernando Herrera Date: Mon, 5 Jul 2021 00:46:53 +0100 Subject: [PATCH] New take command (#3722) * Type in command description * filter name change * Clean column name * Clippy error and updated polars version * Lint correction in file * CSV Infer schema optional * Correct float operations * changes in series castings to allow other types * Clippy error correction * Removed lists from command signatures * Added not command for series * take command with args * set with idx command --- Cargo.lock | 43 +++-- crates/nu-command/Cargo.toml | 4 +- .../src/commands/dataframe/aggregate.rs | 4 +- .../nu-command/src/commands/dataframe/mod.rs | 3 + .../src/commands/dataframe/series/arg_max.rs | 22 +-- .../src/commands/dataframe/series/arg_min.rs | 22 +-- .../src/commands/dataframe/series/mod.rs | 2 + .../commands/dataframe/series/set_with_idx.rs | 153 ++++++++++++++++++ .../nu-command/src/commands/dataframe/take.rs | 107 ++++++++++++ crates/nu-command/src/commands/mod.rs | 6 +- crates/nu-command/src/default_context.rs | 2 + crates/nu-data/Cargo.toml | 4 +- crates/nu-protocol/Cargo.toml | 4 +- 13 files changed, 319 insertions(+), 57 deletions(-) create mode 100644 crates/nu-command/src/commands/dataframe/series/set_with_idx.rs create mode 100644 crates/nu-command/src/commands/dataframe/take.rs diff --git a/Cargo.lock b/Cargo.lock index dd18af2db3..125c163f9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -208,9 +208,11 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrow" -version = "5.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-rs?rev=9f56afb2d2347310184706f7d5e46af583557bea#9f56afb2d2347310184706f7d5e46af583557bea" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f3334cea4f209440350d00ae1dab237ced49d80b664cc4b0e984893d583890" dependencies = [ + "cfg_aliases", "chrono", "csv", "flatbuffers", @@ -753,6 +755,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" + [[package]] name = "chrono" version = "0.4.19" @@ -4352,8 +4360,9 @@ dependencies = [ [[package]] name = "parquet" -version = "5.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-rs?rev=9f56afb2d2347310184706f7d5e46af583557bea#9f56afb2d2347310184706f7d5e46af583557bea" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "265044e41d674fad4c7860a3e245e53138e926fe83cad8d45193a7a354c56a54" dependencies = [ "arrow", "base64 0.13.0", @@ -4364,7 +4373,6 @@ dependencies = [ "lz4", "num-bigint 0.4.0", "parquet-format", - "rand 0.8.4", "snap", "thrift", "zstd", @@ -4592,8 +4600,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.14.2" -source = "git+https://github.com/pola-rs/polars?rev=adc358b437f93bc7f844a94d68c064616e9d2ac2#adc358b437f93bc7f844a94d68c064616e9d2ac2" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f254b79757346a86a8371ea4a087ce6a56e604c82d61093a1b85bfd0df99aeb" dependencies = [ "polars-core", "polars-io", @@ -4602,8 +4611,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.14.2" -source = "git+https://github.com/pola-rs/polars?rev=adc358b437f93bc7f844a94d68c064616e9d2ac2#adc358b437f93bc7f844a94d68c064616e9d2ac2" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec1ef88e60b660c51644a5b098570519948d95f389b67ef690a0f1187395d7bf" dependencies = [ "arrow", "num 0.4.0", @@ -4612,8 +4622,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.14.2" -source = "git+https://github.com/pola-rs/polars?rev=adc358b437f93bc7f844a94d68c064616e9d2ac2#adc358b437f93bc7f844a94d68c064616e9d2ac2" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5e6ee23eb50845501c8c31368051af75801185cf4bedf9e7b3ec945a49af9c" dependencies = [ "ahash", "anyhow", @@ -4638,8 +4649,9 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.14.2" -source = "git+https://github.com/pola-rs/polars?rev=adc358b437f93bc7f844a94d68c064616e9d2ac2#adc358b437f93bc7f844a94d68c064616e9d2ac2" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e8719cdb70555e0492dd24e8f09f637cc112bac438be754bad8dca75f466ab" dependencies = [ "ahash", "anyhow", @@ -4661,8 +4673,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.14.2" -source = "git+https://github.com/pola-rs/polars?rev=adc358b437f93bc7f844a94d68c064616e9d2ac2#adc358b437f93bc7f844a94d68c064616e9d2ac2" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ca6b2fb59bbe6725a84c48df12f509b4655d173cd113e5fb51f971cff1f93bc" dependencies = [ "ahash", "itertools", diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index 383fdb057c..29f926cc2e 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -99,9 +99,7 @@ which = { version="4.1.0", optional=true } zip = { version="0.5.9", optional=true } [dependencies.polars] -git = "https://github.com/pola-rs/polars" -rev = "adc358b437f93bc7f844a94d68c064616e9d2ac2" -version = "0.14.2" +version = "0.14.5" optional = true features = ["parquet", "json", "random", "pivot", "strings", "is_in"] diff --git a/crates/nu-command/src/commands/dataframe/aggregate.rs b/crates/nu-command/src/commands/dataframe/aggregate.rs index cfb1cf6215..20790e57ea 100644 --- a/crates/nu-command/src/commands/dataframe/aggregate.rs +++ b/crates/nu-command/src/commands/dataframe/aggregate.rs @@ -165,8 +165,8 @@ fn command(mut args: CommandArgs) -> Result { Ok(OutputStream::one(value)) } _ => Err(ShellError::labeled_error( - "No groupby or dataframe", - "no groupby or found in input stream", + "No groupby, dataframe or series in stream", + "no groupby, dataframe or series found in input stream", &value.tag.span, )), } diff --git a/crates/nu-command/src/commands/dataframe/mod.rs b/crates/nu-command/src/commands/dataframe/mod.rs index bb33915548..48f33da6a7 100644 --- a/crates/nu-command/src/commands/dataframe/mod.rs +++ b/crates/nu-command/src/commands/dataframe/mod.rs @@ -21,6 +21,7 @@ pub mod select; pub mod show; pub mod slice; pub mod sort; +pub mod take; pub mod to_csv; pub mod to_df; pub mod to_parquet; @@ -52,6 +53,7 @@ pub use select::DataFrame as DataFrameSelect; pub use show::DataFrame as DataFrameShow; pub use slice::DataFrame as DataFrameSlice; pub use sort::DataFrame as DataFrameSort; +pub use take::DataFrame as DataFrameTake; pub use to_csv::DataFrame as DataFrameToCsv; pub use to_df::DataFrame as DataFrameToDF; pub use to_parquet::DataFrame as DataFrameToParquet; @@ -77,6 +79,7 @@ pub use series::DataFrameNUnique; pub use series::DataFrameNot; pub use series::DataFrameSeriesRename; pub use series::DataFrameSet; +pub use series::DataFrameSetWithIdx; pub use series::DataFrameShift; pub use series::DataFrameUnique; pub use series::DataFrameValueCounts; diff --git a/crates/nu-command/src/commands/dataframe/series/arg_max.rs b/crates/nu-command/src/commands/dataframe/series/arg_max.rs index 7fac28ca1b..626107850f 100644 --- a/crates/nu-command/src/commands/dataframe/series/arg_max.rs +++ b/crates/nu-command/src/commands/dataframe/series/arg_max.rs @@ -1,9 +1,9 @@ use crate::prelude::*; use nu_engine::WholeStreamCommand; use nu_errors::ShellError; -use nu_protocol::{ - dataframe::NuSeries, Primitive, Signature, TaggedDictBuilder, UntaggedValue, Value, -}; +use nu_protocol::{dataframe::NuSeries, Signature}; + +use polars::prelude::{IntoSeries, NewChunkedArray, UInt32Chunked}; pub struct DataFrame; @@ -40,18 +40,12 @@ fn command(mut args: CommandArgs) -> Result { let res = series.as_ref().arg_max(); - let value = match res { - Some(index) => UntaggedValue::Primitive(Primitive::Int(index as i64)), - None => UntaggedValue::Primitive(Primitive::Nothing), + let chunked = match res { + Some(index) => UInt32Chunked::new_from_slice("arg_max", &[index as u32]), + None => UInt32Chunked::new_from_slice("arg_max", &[]), }; - let value = Value { - value, - tag: tag.clone(), - }; + let res = chunked.into_series(); - let mut data = TaggedDictBuilder::new(tag); - data.insert_value("arg-max", value); - - Ok(OutputStream::one(data.into_value())) + Ok(OutputStream::one(NuSeries::series_to_value(res, tag))) } diff --git a/crates/nu-command/src/commands/dataframe/series/arg_min.rs b/crates/nu-command/src/commands/dataframe/series/arg_min.rs index c15ed1a91d..3db3d9f705 100644 --- a/crates/nu-command/src/commands/dataframe/series/arg_min.rs +++ b/crates/nu-command/src/commands/dataframe/series/arg_min.rs @@ -1,9 +1,9 @@ use crate::prelude::*; use nu_engine::WholeStreamCommand; use nu_errors::ShellError; -use nu_protocol::{ - dataframe::NuSeries, Primitive, Signature, TaggedDictBuilder, UntaggedValue, Value, -}; +use nu_protocol::{dataframe::NuSeries, Signature}; + +use polars::prelude::{IntoSeries, NewChunkedArray, UInt32Chunked}; pub struct DataFrame; @@ -40,18 +40,12 @@ fn command(mut args: CommandArgs) -> Result { let res = series.as_ref().arg_min(); - let value = match res { - Some(index) => UntaggedValue::Primitive(Primitive::Int(index as i64)), - None => UntaggedValue::Primitive(Primitive::Nothing), + let chunked = match res { + Some(index) => UInt32Chunked::new_from_slice("arg_min", &[index as u32]), + None => UInt32Chunked::new_from_slice("arg_min", &[]), }; - let value = Value { - value, - tag: tag.clone(), - }; + let res = chunked.into_series(); - let mut data = TaggedDictBuilder::new(tag); - data.insert_value("arg-min", value); - - Ok(OutputStream::one(data.into_value())) + Ok(OutputStream::one(NuSeries::series_to_value(res, tag))) } diff --git a/crates/nu-command/src/commands/dataframe/series/mod.rs b/crates/nu-command/src/commands/dataframe/series/mod.rs index d6181102ae..d7387be78b 100644 --- a/crates/nu-command/src/commands/dataframe/series/mod.rs +++ b/crates/nu-command/src/commands/dataframe/series/mod.rs @@ -15,6 +15,7 @@ pub mod n_unique; pub mod not; pub mod rename; pub mod set; +pub mod set_with_idx; pub mod shift; pub mod unique; pub mod value_counts; @@ -36,6 +37,7 @@ pub use n_unique::DataFrame as DataFrameNUnique; pub use not::DataFrame as DataFrameNot; pub use rename::DataFrame as DataFrameSeriesRename; pub use set::DataFrame as DataFrameSet; +pub use set_with_idx::DataFrame as DataFrameSetWithIdx; pub use shift::DataFrame as DataFrameShift; pub use unique::DataFrame as DataFrameUnique; pub use value_counts::DataFrame as DataFrameValueCounts; diff --git a/crates/nu-command/src/commands/dataframe/series/set_with_idx.rs b/crates/nu-command/src/commands/dataframe/series/set_with_idx.rs new file mode 100644 index 0000000000..6dea8cdf4a --- /dev/null +++ b/crates/nu-command/src/commands/dataframe/series/set_with_idx.rs @@ -0,0 +1,153 @@ +use crate::{commands::dataframe::utils::parse_polars_error, prelude::*}; +use nu_engine::WholeStreamCommand; +use nu_errors::ShellError; +use nu_protocol::{dataframe::NuSeries, Primitive, Signature, SyntaxShape, UntaggedValue, Value}; +use polars::prelude::{ChunkSet, DataType, IntoSeries}; + +pub struct DataFrame; + +impl WholeStreamCommand for DataFrame { + fn name(&self) -> &str { + "dataframe set-with-idx" + } + + fn usage(&self) -> &str { + "[Series] Sets value in the given index" + } + + fn signature(&self) -> Signature { + Signature::build("dataframe set-with-idx") + .required("value", SyntaxShape::Any, "value to be inserted in series") + .required_named( + "indices", + SyntaxShape::Any, + "list of indices indicating where to set the value", + Some('i'), + ) + } + + fn run(&self, args: CommandArgs) -> Result { + command(args) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Set value in selected rows from series", + example: r#"let series = ([4 1 5 2 4 3] | dataframe to-series); + let indices = ([0 2] | dataframe to-series); + $series | dataframe set-with-idx 6 -i $indices"#, + result: None, + }] + } +} + +fn command(mut args: CommandArgs) -> Result { + let tag = args.call_info.name_tag.clone(); + let value: Value = args.req(0)?; + let indices: Value = args.req_named("indices")?; + + let indices = match &indices.value { + UntaggedValue::DataFrame(nu_protocol::dataframe::PolarsData::Series(series)) => Ok(series), + _ => Err(ShellError::labeled_error( + "Incorrect type", + "can only use a series for set command", + value.tag.span, + )), + }?; + + let casted = match indices.as_ref().dtype() { + DataType::UInt32 | DataType::UInt64 | DataType::Int32 | DataType::Int64 => indices + .as_ref() + .cast_with_dtype(&DataType::UInt32) + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None)), + _ => Err(ShellError::labeled_error_with_secondary( + "Incorrect type", + "Series with incorrect type", + &value.tag.span, + "Consider using a Series with type int type", + &value.tag.span, + )), + }?; + + let indices = casted + .u32() + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None))? + .into_iter() + .filter_map(|val| val.map(|v| v as usize)); + + let series = NuSeries::try_from_stream(&mut args.input, &tag.span)?; + + match &value.value { + UntaggedValue::Primitive(Primitive::Int(val)) => { + let chunked = series.as_ref().i64().map_err(|e| { + parse_polars_error::<&str>( + &e, + &value.tag.span, + Some("The value has to match the set value type"), + ) + })?; + + let res = chunked + .set_at_idx(indices, Some(*val)) + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None))?; + + Ok(OutputStream::one(NuSeries::series_to_value( + res.into_series(), + tag, + ))) + } + UntaggedValue::Primitive(Primitive::Decimal(val)) => { + let chunked = series.as_ref().f64().map_err(|e| { + parse_polars_error::<&str>( + &e, + &value.tag.span, + Some("The value has to match the series type"), + ) + })?; + + let res = chunked + .set_at_idx( + indices, + Some( + val.to_f64() + .expect("internal error: expected f64-compatible decimal"), + ), + ) + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None))?; + + Ok(OutputStream::one(NuSeries::series_to_value( + res.into_series(), + tag, + ))) + } + UntaggedValue::Primitive(Primitive::String(val)) => { + let chunked = series.as_ref().utf8().map_err(|e| { + parse_polars_error::<&str>( + &e, + &value.tag.span, + Some("The value has to match the series type"), + ) + })?; + + let res = chunked + .set_at_idx(indices, Some(val.as_ref())) + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None))?; + + let mut res = res.into_series(); + res.rename("string"); + + Ok(OutputStream::one(NuSeries::series_to_value( + res.into_series(), + tag, + ))) + } + _ => Err(ShellError::labeled_error( + "Incorrect type", + format!( + "this value cannot be set in a series of type '{}'", + series.as_ref().dtype() + ), + value.tag.span, + )), + } +} diff --git a/crates/nu-command/src/commands/dataframe/take.rs b/crates/nu-command/src/commands/dataframe/take.rs new file mode 100644 index 0000000000..e90f6b948d --- /dev/null +++ b/crates/nu-command/src/commands/dataframe/take.rs @@ -0,0 +1,107 @@ +use crate::prelude::*; +use nu_engine::WholeStreamCommand; +use nu_errors::ShellError; +use nu_protocol::{ + dataframe::{NuDataFrame, NuSeries, PolarsData}, + Signature, SyntaxShape, UntaggedValue, Value, +}; +use polars::prelude::DataType; + +use super::utils::parse_polars_error; + +pub struct DataFrame; + +impl WholeStreamCommand for DataFrame { + fn name(&self) -> &str { + "dataframe take" + } + + fn usage(&self) -> &str { + "[DataFrame, Series] Creates new dataframe using the given indices" + } + + fn signature(&self) -> Signature { + Signature::build("dataframe take").required( + "indices", + SyntaxShape::Any, + "list of indices used to take data", + ) + } + + fn run(&self, args: CommandArgs) -> Result { + command(args) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Takes selected rows from dataframe", + example: r#"let df = ([[a b]; [4 1] [5 2] [4 3]] | dataframe to-df); + let indices = ([0 2] | dataframe to-series); + $df | dataframe take $indices"#, + result: None, + }, + Example { + description: "Takes selected rows from series", + example: r#"let series = ([4 1 5 2 4 3] | dataframe to-series); + let indices = ([0 2] | dataframe to-series); + $series | dataframe take $indices"#, + result: None, + }, + ] + } +} + +fn command(mut args: CommandArgs) -> Result { + let tag = args.call_info.name_tag.clone(); + let value: Value = args.req(0)?; + + let series = match &value.value { + UntaggedValue::DataFrame(PolarsData::Series(series)) => Ok(series), + _ => Err(ShellError::labeled_error( + "Incorrect type", + "can only use a series for take command", + value.tag.span, + )), + }?; + + let casted = match series.as_ref().dtype() { + DataType::UInt32 | DataType::UInt64 | DataType::Int32 | DataType::Int64 => series + .as_ref() + .cast_with_dtype(&DataType::UInt32) + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None)), + _ => Err(ShellError::labeled_error_with_secondary( + "Incorrect type", + "Series with incorrect type", + &value.tag.span, + "Consider using a Series with type int type", + &value.tag.span, + )), + }?; + + let indices = casted + .u32() + .map_err(|e| parse_polars_error::<&str>(&e, &value.tag.span, None))?; + + let value = args.input.next().ok_or_else(|| { + ShellError::labeled_error("Empty stream", "No value found in the stream", &tag) + })?; + + match value.value { + UntaggedValue::DataFrame(PolarsData::EagerDataFrame(df)) => { + let res = df.as_ref().take(indices); + + Ok(OutputStream::one(NuDataFrame::dataframe_to_value(res, tag))) + } + UntaggedValue::DataFrame(PolarsData::Series(series)) => { + let res = series.as_ref().take(indices); + + Ok(OutputStream::one(NuSeries::series_to_value(res, tag))) + } + _ => Err(ShellError::labeled_error( + "No dataframe or series in stream", + "no dataframe or series found in input stream", + &value.tag.span, + )), + } +} diff --git a/crates/nu-command/src/commands/mod.rs b/crates/nu-command/src/commands/mod.rs index 9325ae6377..46acd65d38 100644 --- a/crates/nu-command/src/commands/mod.rs +++ b/crates/nu-command/src/commands/mod.rs @@ -32,9 +32,9 @@ pub use dataframe::{ DataFrameIsIn, DataFrameIsNotNull, DataFrameIsNull, DataFrameIsUnique, DataFrameJoin, DataFrameLast, DataFrameList, DataFrameMelt, DataFrameNNull, DataFrameNUnique, DataFrameNot, DataFrameOpen, DataFramePivot, DataFrameSample, DataFrameSelect, DataFrameSeriesRename, - DataFrameSet, DataFrameShift, DataFrameShow, DataFrameSlice, DataFrameSort, DataFrameToCsv, - DataFrameToDF, DataFrameToParquet, DataFrameToSeries, DataFrameUnique, DataFrameValueCounts, - DataFrameWhere, DataFrameWithColumn, + DataFrameSet, DataFrameSetWithIdx, DataFrameShift, DataFrameShow, DataFrameSlice, + DataFrameSort, DataFrameTake, DataFrameToCsv, DataFrameToDF, DataFrameToParquet, + DataFrameToSeries, DataFrameUnique, DataFrameValueCounts, DataFrameWhere, DataFrameWithColumn, }; pub use env::*; pub use filesystem::*; diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs index 8533ad2004..8a591d449d 100644 --- a/crates/nu-command/src/default_context.rs +++ b/crates/nu-command/src/default_context.rs @@ -315,6 +315,8 @@ pub fn create_default_context(interactive: bool) -> Result