From 6a35e6b7b61223a0e24598e3e38e6a0b68ed3bc6 Mon Sep 17 00:00:00 2001 From: Fernando Herrera Date: Tue, 21 Dec 2021 18:32:09 +0000 Subject: [PATCH] Dataframe commands (#542) * groupby object * aggregate command * eager commands * rest of dataframe commands --- Cargo.lock | 19 + crates/nu-command/Cargo.toml | 2 +- .../src/dataframe/eager/aggregate.rs | 375 ++++++++++++++++++ .../src/dataframe/{ => eager}/append.rs | 4 +- .../src/dataframe/{ => eager}/column.rs | 4 +- .../src/dataframe/{ => eager}/command.rs | 0 .../src/dataframe/{ => eager}/describe.rs | 4 +- .../src/dataframe/{ => eager}/drop.rs | 6 +- .../src/dataframe/{ => eager}/drop_nulls.rs | 12 +- .../src/dataframe/{ => eager}/dtypes.rs | 4 +- .../nu-command/src/dataframe/eager/dummies.rs | 137 +++++++ .../src/dataframe/eager/filter_with.rs | 98 +++++ .../nu-command/src/dataframe/eager/first.rs | 80 ++++ crates/nu-command/src/dataframe/eager/get.rs | 88 ++++ .../nu-command/src/dataframe/eager/groupby.rs | 71 ++++ crates/nu-command/src/dataframe/eager/join.rs | 226 +++++++++++ crates/nu-command/src/dataframe/eager/last.rs | 80 ++++ crates/nu-command/src/dataframe/eager/melt.rs | 243 ++++++++++++ crates/nu-command/src/dataframe/eager/mod.rs | 105 +++++ .../src/dataframe/{ => eager}/open.rs | 2 +- .../nu-command/src/dataframe/eager/pivot.rs | 175 ++++++++ .../nu-command/src/dataframe/eager/rename.rs | 94 +++++ .../nu-command/src/dataframe/eager/sample.rs | 106 +++++ .../nu-command/src/dataframe/eager/shape.rs | 87 ++++ .../nu-command/src/dataframe/eager/slice.rs | 85 ++++ crates/nu-command/src/dataframe/eager/sort.rs | 142 +++++++ crates/nu-command/src/dataframe/eager/take.rs | 144 +++++++ .../nu-command/src/dataframe/eager/to_csv.rs | 132 ++++++ .../src/dataframe/{ => eager}/to_df.rs | 4 +- .../nu-command/src/dataframe/eager/to_nu.rs | 83 ++++ .../src/dataframe/eager/to_parquet.rs | 84 ++++ .../src/dataframe/{ => eager}/with_column.rs | 4 +- crates/nu-command/src/dataframe/mod.rs | 96 +---- crates/nu-command/src/dataframe/series/mod.rs | 59 +++ .../src/dataframe/series/rolling.rs | 2 +- .../nu-command/src/dataframe/series/shift.rs | 2 +- .../src/dataframe/test_dataframe.rs | 2 +- crates/nu-command/src/dataframe/values/mod.rs | 2 + .../values/nu_dataframe/custom_value.rs | 6 +- .../src/dataframe/values/nu_dataframe/mod.rs | 4 +- .../values/nu_groupby/custom_value.rs | 44 ++ .../src/dataframe/values/nu_groupby/mod.rs | 89 +++++ .../nu-command/src/dataframe/values/utils.rs | 41 +- crates/nu-protocol/src/value/custom_value.rs | 19 +- 44 files changed, 2936 insertions(+), 130 deletions(-) create mode 100644 crates/nu-command/src/dataframe/eager/aggregate.rs rename crates/nu-command/src/dataframe/{ => eager}/append.rs (97%) rename crates/nu-command/src/dataframe/{ => eager}/column.rs (95%) rename crates/nu-command/src/dataframe/{ => eager}/command.rs (100%) rename crates/nu-command/src/dataframe/{ => eager}/describe.rs (98%) rename crates/nu-command/src/dataframe/{ => eager}/drop.rs (94%) rename crates/nu-command/src/dataframe/{ => eager}/drop_nulls.rs (91%) rename crates/nu-command/src/dataframe/{ => eager}/dtypes.rs (96%) create mode 100644 crates/nu-command/src/dataframe/eager/dummies.rs create mode 100644 crates/nu-command/src/dataframe/eager/filter_with.rs create mode 100644 crates/nu-command/src/dataframe/eager/first.rs create mode 100644 crates/nu-command/src/dataframe/eager/get.rs create mode 100644 crates/nu-command/src/dataframe/eager/groupby.rs create mode 100644 crates/nu-command/src/dataframe/eager/join.rs create mode 100644 crates/nu-command/src/dataframe/eager/last.rs create mode 100644 crates/nu-command/src/dataframe/eager/melt.rs create mode 100644 crates/nu-command/src/dataframe/eager/mod.rs rename crates/nu-command/src/dataframe/{ => eager}/open.rs (99%) create mode 100644 crates/nu-command/src/dataframe/eager/pivot.rs create mode 100644 crates/nu-command/src/dataframe/eager/rename.rs create mode 100644 crates/nu-command/src/dataframe/eager/sample.rs create mode 100644 crates/nu-command/src/dataframe/eager/shape.rs create mode 100644 crates/nu-command/src/dataframe/eager/slice.rs create mode 100644 crates/nu-command/src/dataframe/eager/sort.rs create mode 100644 crates/nu-command/src/dataframe/eager/take.rs create mode 100644 crates/nu-command/src/dataframe/eager/to_csv.rs rename crates/nu-command/src/dataframe/{ => eager}/to_df.rs (97%) create mode 100644 crates/nu-command/src/dataframe/eager/to_nu.rs create mode 100644 crates/nu-command/src/dataframe/eager/to_parquet.rs rename crates/nu-command/src/dataframe/{ => eager}/with_column.rs (96%) create mode 100644 crates/nu-command/src/dataframe/values/nu_groupby/custom_value.rs create mode 100644 crates/nu-command/src/dataframe/values/nu_groupby/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 12baabdddb..1432dc1a52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1363,6 +1363,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libm" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7d73b3f436185384286bd8098d17ec07c9a7d2388a6599f824d8502b529702a" + [[package]] name = "libssh2-sys" version = "0.2.23" @@ -1982,6 +1988,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -2249,6 +2256,8 @@ dependencies = [ "num_cpus", "polars-arrow", "prettytable-rs", + "rand", + "rand_distr", "rayon", "regex", "serde", @@ -2434,6 +2443,16 @@ dependencies = [ "getrandom 0.2.3", ] +[[package]] +name = "rand_distr" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "964d548f8e7d12e102ef183a0de7e98180c9f8729f555897a857b96e48122d2f" +dependencies = [ + "num-traits", + "rand", +] + [[package]] name = "rand_hc" version = "0.3.1" diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index 853b426a34..a003a55962 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -74,7 +74,7 @@ optional = true features = [ "default", "parquet", "json", "serde", "object", "checked_arithmetic", "strings", "cum_agg", "is_in", - "rolling_window", "strings" + "rolling_window", "strings", "pivot", "random" ] [features] diff --git a/crates/nu-command/src/dataframe/eager/aggregate.rs b/crates/nu-command/src/dataframe/eager/aggregate.rs new file mode 100644 index 0000000000..66017c41ac --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/aggregate.rs @@ -0,0 +1,375 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + did_you_mean, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value, +}; +use polars::{frame::groupby::GroupBy, prelude::PolarsError}; + +use crate::dataframe::values::NuGroupBy; + +use super::super::values::{Column, NuDataFrame}; + +enum Operation { + Mean, + Sum, + Min, + Max, + First, + Last, + Nunique, + Quantile(f64), + Median, + Var, + Std, + Count, +} + +impl Operation { + fn from_tagged( + name: &Spanned, + quantile: Option>, + ) -> Result { + match name.item.as_ref() { + "mean" => Ok(Operation::Mean), + "sum" => Ok(Operation::Sum), + "min" => Ok(Operation::Min), + "max" => Ok(Operation::Max), + "first" => Ok(Operation::First), + "last" => Ok(Operation::Last), + "nunique" => Ok(Operation::Nunique), + "quantile" => match quantile { + None => Err(ShellError::SpannedLabeledError( + "Quantile value not fount".into(), + "Quantile operation requires quantile value".into(), + name.span, + )), + Some(value) => { + if (value.item < 0.0) | (value.item > 1.0) { + Err(ShellError::SpannedLabeledError( + "Inappropriate quantile".into(), + "Quantile value should be between 0.0 and 1.0".into(), + value.span, + )) + } else { + Ok(Operation::Quantile(value.item)) + } + } + }, + "median" => Ok(Operation::Median), + "var" => Ok(Operation::Var), + "std" => Ok(Operation::Std), + "count" => Ok(Operation::Count), + selection => { + let possibilities = [ + "mean".to_string(), + "sum".to_string(), + "min".to_string(), + "max".to_string(), + "first".to_string(), + "last".to_string(), + "nunique".to_string(), + "quantile".to_string(), + "median".to_string(), + "var".to_string(), + "std".to_string(), + "count".to_string(), + ]; + + match did_you_mean(&possibilities, selection) { + Some(suggestion) => Err(ShellError::DidYouMean(suggestion, name.span)), + None => Err(ShellError::SpannedLabeledErrorHelp( + "Operation not fount".into(), + "Operation does not exist".into(), + name.span, + "Perhaps you want: mean, sum, min, max, first, last, nunique, quantile, median, var, std, or count".into(), + )) + } + } + } + } + + fn to_str(&self) -> &'static str { + match self { + Self::Mean => "mean", + Self::Sum => "sum", + Self::Min => "min", + Self::Max => "max", + Self::First => "first", + Self::Last => "last", + Self::Nunique => "nunique", + Self::Quantile(_) => "quantile", + Self::Median => "median", + Self::Var => "var", + Self::Std => "std", + Self::Count => "count", + } + } +} + +#[derive(Clone)] +pub struct Aggregate; + +impl Command for Aggregate { + fn name(&self) -> &str { + "dfr aggregate" + } + + fn usage(&self) -> &str { + "Performs an aggregation operation on a dataframe and groupby object" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "operation-name", + SyntaxShape::String, + "\n\tDataframes: mean, sum, min, max, quantile, median, var, std +\tGroupBy: mean, sum, min, max, first, last, nunique, quantile, median, var, std, count", + ) + .named( + "quantile", + SyntaxShape::Number, + "quantile value for quantile operation", + Some('q'), + ) + .switch( + "explicit", + "returns explicit names for groupby aggregations", + Some('e'), + ) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Aggregate sum by grouping by column a and summing on col b", + example: + "[[a b]; [one 1] [one 2]] | dfr to-df | dfr group-by a | dfr aggregate sum", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_string("one")]), + Column::new("b".to_string(), vec![Value::test_int(3)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Aggregate sum in dataframe columns", + example: "[[a b]; [4 1] [5 2]] | dfr to-df | dfr aggregate sum", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_int(9)]), + Column::new("b".to_string(), vec![Value::test_int(3)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Aggregate sum in series", + example: "[4 1 5 6] | dfr to-df | dfr aggregate sum", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "0".to_string(), + vec![Value::test_int(16)], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let operation: Spanned = call.req(engine_state, stack, 0)?; + let quantile: Option> = call.get_flag(engine_state, stack, "quantile")?; + let op = Operation::from_tagged(&operation, quantile)?; + + match input { + PipelineData::Value(Value::CustomValue { val, span }, _) => { + let df = val.as_any().downcast_ref::(); + let groupby = val.as_any().downcast_ref::(); + + match (df, groupby) { + (Some(df), None) => { + let df = df.as_ref(); + let res = perform_dataframe_aggregation(df, op, operation.span)?; + + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, span), + None, + )) + } + (None, Some(nu_groupby)) => { + let groupby = nu_groupby.to_groupby()?; + + let res = perform_groupby_aggregation( + groupby, + op, + operation.span, + call.head, + call.has_flag("explicit"), + )?; + + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, span), + None, + )) + } + _ => Err(ShellError::SpannedLabeledError( + "Incorrect datatype".into(), + "no groupby or dataframe found in input stream".into(), + call.head, + )), + } + } + _ => Err(ShellError::SpannedLabeledError( + "Incorrect datatype".into(), + "no groupby or dataframe found in input stream".into(), + call.head, + )), + } +} + +fn perform_groupby_aggregation( + groupby: GroupBy, + operation: Operation, + operation_span: Span, + agg_span: Span, + explicit: bool, +) -> Result { + let mut res = match operation { + Operation::Mean => groupby.mean(), + Operation::Sum => groupby.sum(), + Operation::Min => groupby.min(), + Operation::Max => groupby.max(), + Operation::First => groupby.first(), + Operation::Last => groupby.last(), + Operation::Nunique => groupby.n_unique(), + Operation::Quantile(quantile) => groupby.quantile(quantile), + Operation::Median => groupby.median(), + Operation::Var => groupby.var(), + Operation::Std => groupby.std(), + Operation::Count => groupby.count(), + } + .map_err(|e| { + let span = match &e { + PolarsError::NotFound(_) => agg_span, + _ => operation_span, + }; + + ShellError::SpannedLabeledError("Error calculating aggregation".into(), e.to_string(), span) + })?; + + if !explicit { + let col_names = res + .get_column_names() + .iter() + .map(|name| name.to_string()) + .collect::>(); + + for col in col_names { + let from = match operation { + Operation::Mean => "_mean", + Operation::Sum => "_sum", + Operation::Min => "_min", + Operation::Max => "_max", + Operation::First => "_first", + Operation::Last => "_last", + Operation::Nunique => "_n_unique", + Operation::Quantile(_) => "_quantile", + Operation::Median => "_median", + Operation::Var => "_agg_var", + Operation::Std => "_agg_std", + Operation::Count => "_count", + }; + + let new_col = match col.find(from) { + Some(index) => &col[..index], + None => &col[..], + }; + + res.rename(&col, new_col) + .expect("Column is always there. Looping with known names"); + } + } + + Ok(res) +} + +fn perform_dataframe_aggregation( + dataframe: &polars::prelude::DataFrame, + operation: Operation, + operation_span: Span, +) -> Result { + match operation { + Operation::Mean => Ok(dataframe.mean()), + Operation::Sum => Ok(dataframe.sum()), + Operation::Min => Ok(dataframe.min()), + Operation::Max => Ok(dataframe.max()), + Operation::Quantile(quantile) => dataframe.quantile(quantile).map_err(|e| { + ShellError::SpannedLabeledError( + "Error calculating quantile".into(), + e.to_string(), + operation_span, + ) + }), + Operation::Median => Ok(dataframe.median()), + Operation::Var => Ok(dataframe.var()), + Operation::Std => Ok(dataframe.std()), + operation => { + let possibilities = [ + "mean".to_string(), + "sum".to_string(), + "min".to_string(), + "max".to_string(), + "quantile".to_string(), + "median".to_string(), + "var".to_string(), + "std".to_string(), + ]; + + match did_you_mean(&possibilities, operation.to_str()) { + Some(suggestion) => Err(ShellError::DidYouMean(suggestion, operation_span)), + None => Err(ShellError::SpannedLabeledErrorHelp( + "Operation not fount".into(), + "Operation does not exist".into(), + operation_span, + "Perhaps you want: mean, sum, min, max, quantile, median, var, or std".into(), + )), + } + } + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::super::CreateGroupBy; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(Aggregate {}), Box::new(CreateGroupBy {})]) + } +} diff --git a/crates/nu-command/src/dataframe/append.rs b/crates/nu-command/src/dataframe/eager/append.rs similarity index 97% rename from crates/nu-command/src/dataframe/append.rs rename to crates/nu-command/src/dataframe/eager/append.rs index fe1f40a3a5..a57ed9e990 100644 --- a/crates/nu-command/src/dataframe/append.rs +++ b/crates/nu-command/src/dataframe/eager/append.rs @@ -5,7 +5,7 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, }; -use super::values::{Axis, Column, NuDataFrame}; +use super::super::values::{Axis, Column, NuDataFrame}; #[derive(Clone)] pub struct AppendDF; @@ -120,7 +120,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/column.rs b/crates/nu-command/src/dataframe/eager/column.rs similarity index 95% rename from crates/nu-command/src/dataframe/column.rs rename to crates/nu-command/src/dataframe/eager/column.rs index 76d6bff024..9d3df4344d 100644 --- a/crates/nu-command/src/dataframe/column.rs +++ b/crates/nu-command/src/dataframe/eager/column.rs @@ -5,7 +5,7 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value, }; -use super::values::{Column, NuDataFrame}; +use super::super::values::{Column, NuDataFrame}; #[derive(Clone)] pub struct ColumnDF; @@ -71,7 +71,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/command.rs b/crates/nu-command/src/dataframe/eager/command.rs similarity index 100% rename from crates/nu-command/src/dataframe/command.rs rename to crates/nu-command/src/dataframe/eager/command.rs diff --git a/crates/nu-command/src/dataframe/describe.rs b/crates/nu-command/src/dataframe/eager/describe.rs similarity index 98% rename from crates/nu-command/src/dataframe/describe.rs rename to crates/nu-command/src/dataframe/eager/describe.rs index 50d1e4b317..2d9791b034 100644 --- a/crates/nu-command/src/dataframe/describe.rs +++ b/crates/nu-command/src/dataframe/eager/describe.rs @@ -1,4 +1,4 @@ -use super::values::{Column, NuDataFrame}; +use super::super::values::{Column, NuDataFrame}; use nu_protocol::{ ast::Call, @@ -231,7 +231,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/drop.rs b/crates/nu-command/src/dataframe/eager/drop.rs similarity index 94% rename from crates/nu-command/src/dataframe/drop.rs rename to crates/nu-command/src/dataframe/eager/drop.rs index 1338ad1522..df6946d86b 100644 --- a/crates/nu-command/src/dataframe/drop.rs +++ b/crates/nu-command/src/dataframe/eager/drop.rs @@ -5,8 +5,8 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, }; -use super::values::utils::convert_columns; -use super::values::{Column, NuDataFrame}; +use super::super::values::utils::convert_columns; +use super::super::values::{Column, NuDataFrame}; #[derive(Clone)] pub struct DropDF; @@ -101,7 +101,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/drop_nulls.rs b/crates/nu-command/src/dataframe/eager/drop_nulls.rs similarity index 91% rename from crates/nu-command/src/dataframe/drop_nulls.rs rename to crates/nu-command/src/dataframe/eager/drop_nulls.rs index c85873d8e9..e91b57edfa 100644 --- a/crates/nu-command/src/dataframe/drop_nulls.rs +++ b/crates/nu-command/src/dataframe/eager/drop_nulls.rs @@ -5,8 +5,8 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, }; -use super::values::utils::convert_columns; -use super::values::{Column, NuDataFrame}; +use super::super::values::utils::convert_columns_string; +use super::super::values::{Column, NuDataFrame}; #[derive(Clone)] pub struct DropNulls; @@ -101,11 +101,7 @@ fn command( let (subset, col_span) = match columns { Some(cols) => { - let (agg_string, col_span) = convert_columns(cols, call.head)?; - let agg_string = agg_string - .into_iter() - .map(|col| col.item) - .collect::>(); + let (agg_string, col_span) = convert_columns_string(cols, call.head)?; (Some(agg_string), col_span) } None => (None, call.head), @@ -123,7 +119,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::super::WithColumn; use super::*; diff --git a/crates/nu-command/src/dataframe/dtypes.rs b/crates/nu-command/src/dataframe/eager/dtypes.rs similarity index 96% rename from crates/nu-command/src/dataframe/dtypes.rs rename to crates/nu-command/src/dataframe/eager/dtypes.rs index 8138dd58b2..923a032bc4 100644 --- a/crates/nu-command/src/dataframe/dtypes.rs +++ b/crates/nu-command/src/dataframe/eager/dtypes.rs @@ -1,4 +1,4 @@ -use super::values::{Column, NuDataFrame}; +use super::super::values::{Column, NuDataFrame}; use nu_protocol::{ ast::Call, engine::{Command, EngineState, Stack}, @@ -96,7 +96,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/eager/dummies.rs b/crates/nu-command/src/dataframe/eager/dummies.rs new file mode 100644 index 0000000000..ee1744e6be --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/dummies.rs @@ -0,0 +1,137 @@ +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Value, +}; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct Dummies; + +impl Command for Dummies { + fn name(&self) -> &str { + "dfr to-dummies" + } + + fn usage(&self) -> &str { + "Creates a new dataframe with dummy variables" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()).category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Create new dataframe with dummy variables from a dataframe", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-dummies", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a_1".to_string(), + vec![Value::test_int(1), Value::test_int(0)], + ), + Column::new( + "a_3".to_string(), + vec![Value::test_int(0), Value::test_int(1)], + ), + Column::new( + "b_2".to_string(), + vec![Value::test_int(1), Value::test_int(0)], + ), + Column::new( + "b_4".to_string(), + vec![Value::test_int(0), Value::test_int(1)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Create new dataframe with dummy variables from a series", + example: "[1 2 2 3 3] | dfr to-df | dfr to-dummies", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "0_1".to_string(), + vec![ + Value::test_int(1), + Value::test_int(0), + Value::test_int(0), + Value::test_int(0), + Value::test_int(0), + ], + ), + Column::new( + "0_2".to_string(), + vec![ + Value::test_int(0), + Value::test_int(1), + Value::test_int(1), + Value::test_int(0), + Value::test_int(0), + ], + ), + Column::new( + "0_3".to_string(), + vec![ + Value::test_int(0), + Value::test_int(0), + Value::test_int(0), + Value::test_int(1), + Value::test_int(1), + ], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + _engine_state: &EngineState, + _stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + df.as_ref() + .to_dummies() + .map_err(|e| { + ShellError::SpannedLabeledErrorHelp( + "Error calculating dummies".into(), + e.to_string(), + call.head, + "The only allowed column types for dummies are String or Int".into(), + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(Dummies {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/filter_with.rs b/crates/nu-command/src/dataframe/eager/filter_with.rs new file mode 100644 index 0000000000..cdaf1cb456 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/filter_with.rs @@ -0,0 +1,98 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct FilterWith; + +impl Command for FilterWith { + fn name(&self) -> &str { + "dfr filter-with" + } + + fn usage(&self) -> &str { + "Filters dataframe using a mask as reference" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("mask", SyntaxShape::Any, "boolean mask used to filter data") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Filter dataframe using a bool mask", + example: r#"let mask = ([$true $false] | dfr to-df); + [[a b]; [1 2] [3 4]] | dfr to-df | dfr filter-with $mask"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_int(1)]), + Column::new("b".to_string(), vec![Value::test_int(2)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let mask_value: Value = call.req(engine_state, stack, 0)?; + + let mask_span = mask_value.span()?; + let mask = NuDataFrame::try_from_value(mask_value)?.as_series(mask_span)?; + let mask = mask.bool().map_err(|e| { + ShellError::SpannedLabeledErrorHelp( + "Error casting to bool".into(), + e.to_string(), + mask_span, + "Perhaps you want to use a series with booleans as mask".into(), + ) + })?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + df.as_ref() + .filter(mask) + .map_err(|e| { + ShellError::SpannedLabeledErrorHelp( + "Error calculating dummies".into(), + e.to_string(), + call.head, + "The only allowed column types for dummies are String or Int".into(), + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(FilterWith {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/first.rs b/crates/nu-command/src/dataframe/eager/first.rs new file mode 100644 index 0000000000..1371f869cc --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/first.rs @@ -0,0 +1,80 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use super::super::values::{utils::DEFAULT_ROWS, Column, NuDataFrame}; + +#[derive(Clone)] +pub struct FirstDF; + +impl Command for FirstDF { + fn name(&self) -> &str { + "dfr first" + } + + fn usage(&self) -> &str { + "Creates new dataframe with first rows" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .optional("rows", SyntaxShape::Int, "Number of rows for head") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Create new dataframe with head rows", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr first 1", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_int(1)]), + Column::new("b".to_string(), vec![Value::test_int(2)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let rows: Option = call.opt(engine_state, stack, 0)?; + let rows = rows.unwrap_or(DEFAULT_ROWS); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let res = df.as_ref().head(Some(rows)); + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, call.head), + None, + )) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(FirstDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/get.rs b/crates/nu-command/src/dataframe/eager/get.rs new file mode 100644 index 0000000000..8c1eab5897 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/get.rs @@ -0,0 +1,88 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use crate::dataframe::values::utils::convert_columns_string; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct GetDF; + +impl Command for GetDF { + fn name(&self) -> &str { + "dfr get" + } + + fn usage(&self) -> &str { + "Creates dataframe with the selected columns" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .rest("rest", SyntaxShape::Any, "column names to sort dataframe") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Creates dataframe with selected columns", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr get a", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "a".to_string(), + vec![Value::test_int(1), Value::test_int(3)], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let columns: Vec = call.rest(engine_state, stack, 0)?; + let (col_string, col_span) = convert_columns_string(columns, call.head)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + df.as_ref() + .select(&col_string) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error selecting columns".into(), + e.to_string(), + col_span, + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(GetDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/groupby.rs b/crates/nu-command/src/dataframe/eager/groupby.rs new file mode 100644 index 0000000000..e471b20f6b --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/groupby.rs @@ -0,0 +1,71 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Value, +}; + +use super::super::values::{utils::convert_columns_string, NuDataFrame, NuGroupBy}; + +#[derive(Clone)] +pub struct CreateGroupBy; + +impl Command for CreateGroupBy { + fn name(&self) -> &str { + "dfr group-by" + } + + fn usage(&self) -> &str { + "Creates a groupby object that can be used for other aggregations" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .rest("rest", SyntaxShape::Any, "groupby columns") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Grouping by column a", + example: "[[a b]; [one 1] [one 2]] | dfr to-df | dfr group-by a", + result: None, + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + // Extracting the names of the columns to perform the groupby + let columns: Vec = call.rest(engine_state, stack, 0)?; + let (col_string, col_span) = convert_columns_string(columns, call.head)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + // This is the expensive part of the groupby; to create the + // groups that will be used for grouping the data in the + // dataframe. Once it has been done these values can be stored + // in a NuGroupBy + let groupby = df.as_ref().groupby(&col_string).map_err(|e| { + ShellError::SpannedLabeledError("Error creating groupby".into(), e.to_string(), col_span) + })?; + + let groups = groupby.get_groups().to_vec(); + let groupby = NuGroupBy::new(df.as_ref().clone(), col_string, groups); + + Ok(PipelineData::Value(groupby.into_value(call.head), None)) +} diff --git a/crates/nu-command/src/dataframe/eager/join.rs b/crates/nu-command/src/dataframe/eager/join.rs new file mode 100644 index 0000000000..732b9c93ca --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/join.rs @@ -0,0 +1,226 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value, +}; +use polars::prelude::JoinType; + +use crate::dataframe::values::utils::convert_columns_string; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct JoinDF; + +impl Command for JoinDF { + fn name(&self) -> &str { + "dfr join" + } + + fn usage(&self) -> &str { + "Joins a dataframe using columns as reference" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("dataframe", SyntaxShape::Any, "right dataframe to join") + .required_named( + "left", + SyntaxShape::Table, + "left column names to perform join", + Some('l'), + ) + .required_named( + "right", + SyntaxShape::Table, + "right column names to perform join", + Some('r'), + ) + .named( + "type", + SyntaxShape::String, + "type of join. Inner by default", + Some('t'), + ) + .named( + "suffix", + SyntaxShape::String, + "suffix for the columns of the right dataframe", + Some('s'), + ) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "inner join dataframe", + example: r#"let right = ([[a b c]; [1 2 5] [3 4 5] [5 6 6]] | dfr to-df); + $right | dfr join $right -l [a b] -r [a b]"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a".to_string(), + vec![Value::test_int(1), Value::test_int(3), Value::test_int(5)], + ), + Column::new( + "b".to_string(), + vec![Value::test_int(2), Value::test_int(4), Value::test_int(6)], + ), + Column::new( + "c".to_string(), + vec![Value::test_int(5), Value::test_int(5), Value::test_int(6)], + ), + Column::new( + "c_right".to_string(), + vec![Value::test_int(5), Value::test_int(5), Value::test_int(6)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let r_df: Value = call.req(engine_state, stack, 0)?; + let l_col: Vec = call + .get_flag(engine_state, stack, "left")? + .expect("required value in syntax"); + let r_col: Vec = call + .get_flag(engine_state, stack, "right")? + .expect("required value in syntax"); + let suffix: Option = call.get_flag(engine_state, stack, "suffix")?; + let join_type_op: Option> = call.get_flag(engine_state, stack, "type")?; + + let join_type = match join_type_op { + None => JoinType::Inner, + Some(val) => match val.item.as_ref() { + "inner" => JoinType::Inner, + "outer" => JoinType::Outer, + "left" => JoinType::Left, + _ => { + return Err(ShellError::SpannedLabeledErrorHelp( + "Incorrect join type".into(), + "Invalid join type".into(), + val.span, + "Options: inner, outer or left".into(), + )) + } + }, + }; + + let (l_col_string, l_col_span) = convert_columns_string(l_col, call.head)?; + let (r_col_string, r_col_span) = convert_columns_string(r_col, call.head)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let r_df = NuDataFrame::try_from_value(r_df)?; + + check_column_datatypes( + df.as_ref(), + r_df.as_ref(), + &l_col_string, + l_col_span, + &r_col_string, + r_col_span, + )?; + + df.as_ref() + .join( + r_df.as_ref(), + &l_col_string, + &r_col_string, + join_type, + suffix, + ) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error joining dataframes".into(), + e.to_string(), + l_col_span, + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} + +fn check_column_datatypes>( + df_l: &polars::prelude::DataFrame, + df_r: &polars::prelude::DataFrame, + l_cols: &[T], + l_col_span: Span, + r_cols: &[T], + r_col_span: Span, +) -> Result<(), ShellError> { + if l_cols.len() != r_cols.len() { + return Err(ShellError::SpannedLabeledErrorHelp( + "Mismatched number of column names".into(), + format!( + "found {} left names vs {} right names", + l_cols.len(), + r_cols.len() + ), + l_col_span, + "perhaps you need to change the number of columns to join".into(), + )); + } + + for (l, r) in l_cols.iter().zip(r_cols) { + let l_series = df_l.column(l.as_ref()).map_err(|e| { + ShellError::SpannedLabeledError( + "Error selecting the columns".into(), + e.to_string(), + l_col_span, + ) + })?; + + let r_series = df_r.column(r.as_ref()).map_err(|e| { + ShellError::SpannedLabeledError( + "Error selecting the columns".into(), + e.to_string(), + r_col_span, + ) + })?; + + if l_series.dtype() != r_series.dtype() { + return Err(ShellError::SpannedLabeledErrorHelp( + "Mismatched datatypes".into(), + format!( + "left column type '{}' doesn't match '{}' right column match", + l_series.dtype(), + r_series.dtype() + ), + l_col_span, + "perhaps you need to select other column to match".into(), + )); + } + } + + Ok(()) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(JoinDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/last.rs b/crates/nu-command/src/dataframe/eager/last.rs new file mode 100644 index 0000000000..39294fae3c --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/last.rs @@ -0,0 +1,80 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use super::super::values::{utils::DEFAULT_ROWS, Column, NuDataFrame}; + +#[derive(Clone)] +pub struct LastDF; + +impl Command for LastDF { + fn name(&self) -> &str { + "dfr last" + } + + fn usage(&self) -> &str { + "Creates new dataframe with tail rows" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .optional("rows", SyntaxShape::Int, "Number of rows for tail") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Create new dataframe with last rows", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr last 1", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_int(3)]), + Column::new("b".to_string(), vec![Value::test_int(4)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let rows: Option = call.opt(engine_state, stack, 0)?; + let rows = rows.unwrap_or(DEFAULT_ROWS); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + let res = df.as_ref().tail(Some(rows)); + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, call.head), + None, + )) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(LastDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/melt.rs b/crates/nu-command/src/dataframe/eager/melt.rs new file mode 100644 index 0000000000..5956fbb416 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/melt.rs @@ -0,0 +1,243 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value, +}; + +use crate::dataframe::values::utils::convert_columns_string; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct MeltDF; + +impl Command for MeltDF { + fn name(&self) -> &str { + "dfr melt" + } + + fn usage(&self) -> &str { + "Unpivot a DataFrame from wide to long format" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required_named( + "columns", + SyntaxShape::Table, + "column names for melting", + Some('c'), + ) + .required_named( + "values", + SyntaxShape::Table, + "column names used as value columns", + Some('v'), + ) + .named( + "variable-name", + SyntaxShape::String, + "optional name for variable column", + Some('r'), + ) + .named( + "value-name", + SyntaxShape::String, + "optional name for value column", + Some('l'), + ) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "melt dataframe", + example: + "[[a b c d]; [x 1 4 a] [y 2 5 b] [z 3 6 c]] | dfr to-df | dfr melt -c [b c] -v [a d]", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "b".to_string(), + vec![ + Value::test_int(1), + Value::test_int(2), + Value::test_int(3), + Value::test_int(1), + Value::test_int(2), + Value::test_int(3), + ], + ), + Column::new( + "c".to_string(), + vec![ + Value::test_int(4), + Value::test_int(5), + Value::test_int(6), + Value::test_int(4), + Value::test_int(5), + Value::test_int(6), + ], + ), + Column::new( + "variable".to_string(), + vec![ + Value::test_string("a"), + Value::test_string("a"), + Value::test_string("a"), + Value::test_string("d"), + Value::test_string("d"), + Value::test_string("d"), + ], + ), + Column::new( + "value".to_string(), + vec![ + Value::test_string("x"), + Value::test_string("y"), + Value::test_string("z"), + Value::test_string("a"), + Value::test_string("b"), + Value::test_string("c"), + ], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let id_col: Vec = call + .get_flag(engine_state, stack, "columns")? + .expect("required value"); + let val_col: Vec = call + .get_flag(engine_state, stack, "values")? + .expect("required value"); + + let value_name: Option> = call.get_flag(engine_state, stack, "value-name")?; + let variable_name: Option> = + call.get_flag(engine_state, stack, "variable-name")?; + + let (id_col_string, id_col_span) = convert_columns_string(id_col, call.head)?; + let (val_col_string, val_col_span) = convert_columns_string(val_col, call.head)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + check_column_datatypes(df.as_ref(), &id_col_string, id_col_span)?; + check_column_datatypes(df.as_ref(), &val_col_string, val_col_span)?; + + let mut res = df + .as_ref() + .melt(&id_col_string, &val_col_string) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error calculating melt".into(), + e.to_string(), + call.head, + ) + })?; + + if let Some(name) = &variable_name { + res.rename("variable", &name.item).map_err(|e| { + ShellError::SpannedLabeledError( + "Error renaming column".into(), + e.to_string(), + name.span, + ) + })?; + } + + if let Some(name) = &value_name { + res.rename("value", &name.item).map_err(|e| { + ShellError::SpannedLabeledError( + "Error renaming column".into(), + e.to_string(), + name.span, + ) + })?; + } + + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, call.head), + None, + )) +} + +fn check_column_datatypes>( + df: &polars::prelude::DataFrame, + cols: &[T], + col_span: Span, +) -> Result<(), ShellError> { + if cols.is_empty() { + return Err(ShellError::SpannedLabeledError( + "Merge error".into(), + "empty column list".into(), + col_span, + )); + } + + // Checking if they are same type + if cols.len() > 1 { + for w in cols.windows(2) { + let l_series = df.column(w[0].as_ref()).map_err(|e| { + ShellError::SpannedLabeledError( + "Error selecting columns".into(), + e.to_string(), + col_span, + ) + })?; + + let r_series = df.column(w[1].as_ref()).map_err(|e| { + ShellError::SpannedLabeledError( + "Error selecting columns".into(), + e.to_string(), + col_span, + ) + })?; + + if l_series.dtype() != r_series.dtype() { + return Err(ShellError::SpannedLabeledErrorHelp( + "Merge error".into(), + "found different column types in list".into(), + col_span, + format!( + "datatypes {} and {} are incompatible", + l_series.dtype(), + r_series.dtype() + ), + )); + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(MeltDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/mod.rs b/crates/nu-command/src/dataframe/eager/mod.rs new file mode 100644 index 0000000000..493779916b --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/mod.rs @@ -0,0 +1,105 @@ +mod aggregate; +mod append; +mod column; +mod command; +mod describe; +mod drop; +mod drop_nulls; +mod dtypes; +mod dummies; +mod filter_with; +mod first; +mod get; +mod groupby; +mod join; +mod last; +mod melt; +mod open; +mod pivot; +mod rename; +mod sample; +mod shape; +mod slice; +mod sort; +mod take; +mod to_csv; +mod to_df; +mod to_nu; +mod to_parquet; +mod with_column; + +use nu_protocol::engine::StateWorkingSet; + +pub use aggregate::Aggregate; +pub use append::AppendDF; +pub use column::ColumnDF; +pub use command::Dataframe; +pub use describe::DescribeDF; +pub use drop::DropDF; +pub use drop_nulls::DropNulls; +pub use dtypes::DataTypes; +pub use dummies::Dummies; +pub use filter_with::FilterWith; +pub use first::FirstDF; +pub use get::GetDF; +pub use groupby::CreateGroupBy; +pub use join::JoinDF; +pub use last::LastDF; +pub use melt::MeltDF; +pub use open::OpenDataFrame; +pub use pivot::PivotDF; +pub use rename::RenameDF; +pub use sample::SampleDF; +pub use shape::ShapeDF; +pub use slice::SliceDF; +pub use sort::SortDF; +pub use take::TakeDF; +pub use to_csv::ToCSV; +pub use to_df::ToDataFrame; +pub use to_nu::ToNu; +pub use to_parquet::ToParquet; +pub use with_column::WithColumn; + +pub fn add_eager_decls(working_set: &mut StateWorkingSet) { + macro_rules! bind_command { + ( $command:expr ) => { + working_set.add_decl(Box::new($command)); + }; + ( $( $command:expr ),* ) => { + $( working_set.add_decl(Box::new($command)); )* + }; + } + + // Dataframe commands + bind_command!( + Aggregate, + AppendDF, + ColumnDF, + CreateGroupBy, + Dataframe, + DataTypes, + DescribeDF, + DropDF, + DropNulls, + Dummies, + FilterWith, + FirstDF, + GetDF, + JoinDF, + LastDF, + MeltDF, + OpenDataFrame, + PivotDF, + RenameDF, + SampleDF, + ShapeDF, + SliceDF, + SortDF, + TakeDF, + ToCSV, + ToDataFrame, + ToNu, + ToParquet, + WithColumn + ); +} diff --git a/crates/nu-command/src/dataframe/open.rs b/crates/nu-command/src/dataframe/eager/open.rs similarity index 99% rename from crates/nu-command/src/dataframe/open.rs rename to crates/nu-command/src/dataframe/eager/open.rs index 9f8ff47b59..0bc069a7fd 100644 --- a/crates/nu-command/src/dataframe/open.rs +++ b/crates/nu-command/src/dataframe/eager/open.rs @@ -1,4 +1,4 @@ -use super::values::NuDataFrame; +use super::super::values::NuDataFrame; use nu_engine::CallExt; use nu_protocol::{ ast::Call, diff --git a/crates/nu-command/src/dataframe/eager/pivot.rs b/crates/nu-command/src/dataframe/eager/pivot.rs new file mode 100644 index 0000000000..d2feb84618 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/pivot.rs @@ -0,0 +1,175 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, +}; +use polars::prelude::DataType; + +use crate::dataframe::values::NuGroupBy; + +use super::super::values::NuDataFrame; + +enum Operation { + First, + Sum, + Min, + Max, + Mean, + Median, +} + +impl Operation { + fn from_tagged(name: Spanned) -> Result { + match name.item.as_ref() { + "first" => Ok(Operation::First), + "sum" => Ok(Operation::Sum), + "min" => Ok(Operation::Min), + "max" => Ok(Operation::Max), + "mean" => Ok(Operation::Mean), + "median" => Ok(Operation::Median), + _ => Err(ShellError::SpannedLabeledErrorHelp( + "Operation not fount".into(), + "Operation does not exist for pivot".into(), + name.span, + "Options: first, sum, min, max, mean, median".into(), + )), + } + } +} + +#[derive(Clone)] +pub struct PivotDF; + +impl Command for PivotDF { + fn name(&self) -> &str { + "dfr pivot" + } + + fn usage(&self) -> &str { + "Performs a pivot operation on a groupby object" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "pivot-column", + SyntaxShape::String, + "pivot column to perform pivot", + ) + .required( + "value-column", + SyntaxShape::String, + "value column to perform pivot", + ) + .required("operation", SyntaxShape::String, "aggregate operation") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Pivot a dataframe on b and aggregation on col c", + example: + "[[a b c]; [one x 1] [two y 2]] | dfr to-df | dfr group-by a | dfr pivot b c sum", + result: None, + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let pivot_col: Spanned = call.req(engine_state, stack, 0)?; + let value_col: Spanned = call.req(engine_state, stack, 1)?; + let operation: Spanned = call.req(engine_state, stack, 2)?; + let op = Operation::from_tagged(operation)?; + + let nu_groupby = NuGroupBy::try_from_pipeline(input, call.head)?; + let df_ref = nu_groupby.as_ref(); + + check_pivot_column(df_ref, &pivot_col)?; + check_value_column(df_ref, &value_col)?; + + let mut groupby = nu_groupby.to_groupby()?; + + let pivot = groupby.pivot(&pivot_col.item, &value_col.item); + + match op { + Operation::Mean => pivot.mean(), + Operation::Sum => pivot.sum(), + Operation::Min => pivot.min(), + Operation::Max => pivot.max(), + Operation::First => pivot.first(), + Operation::Median => pivot.median(), + } + .map_err(|e| { + ShellError::SpannedLabeledError("Error creating pivot".into(), e.to_string(), call.head) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} + +fn check_pivot_column( + df: &polars::prelude::DataFrame, + col: &Spanned, +) -> Result<(), ShellError> { + let series = df.column(&col.item).map_err(|e| { + ShellError::SpannedLabeledError("Column not found".into(), e.to_string(), col.span) + })?; + + match series.dtype() { + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Utf8 => Ok(()), + _ => Err(ShellError::SpannedLabeledError( + "Pivot error".into(), + format!("Unsupported datatype {}", series.dtype()), + col.span, + )), + } +} + +fn check_value_column( + df: &polars::prelude::DataFrame, + col: &Spanned, +) -> Result<(), ShellError> { + let series = df.column(&col.item).map_err(|e| { + ShellError::SpannedLabeledError("Column not found".into(), e.to_string(), col.span) + })?; + + match series.dtype() { + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 => Ok(()), + _ => Err(ShellError::SpannedLabeledError( + "Pivot error".into(), + format!("Unsupported datatype {}", series.dtype()), + col.span, + )), + } +} diff --git a/crates/nu-command/src/dataframe/eager/rename.rs b/crates/nu-command/src/dataframe/eager/rename.rs new file mode 100644 index 0000000000..91815d2998 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/rename.rs @@ -0,0 +1,94 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use super::super::values::{Column, NuDataFrame}; + +#[derive(Clone)] +pub struct RenameDF; + +impl Command for RenameDF { + fn name(&self) -> &str { + "dfr rename-col" + } + + fn usage(&self) -> &str { + "rename a dataframe column" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("from", SyntaxShape::String, "column name to be renamed") + .required("to", SyntaxShape::String, "new column name") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Renames a dataframe column", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr rename-col a a_new", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a_new".to_string(), + vec![Value::test_int(1), Value::test_int(3)], + ), + Column::new( + "b".to_string(), + vec![Value::test_int(2), Value::test_int(4)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let from: String = call.req(engine_state, stack, 0)?; + let to: String = call.req(engine_state, stack, 1)?; + + let mut df = NuDataFrame::try_from_pipeline(input, call.head)?; + + df.as_mut() + .rename(&from, &to) + .map_err(|e| { + ShellError::SpannedLabeledError("Error renaming".into(), e.to_string(), call.head) + }) + .map(|df| { + PipelineData::Value( + NuDataFrame::dataframe_into_value(df.clone(), call.head), + None, + ) + }) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(RenameDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/sample.rs b/crates/nu-command/src/dataframe/eager/sample.rs new file mode 100644 index 0000000000..b0fcc54192 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/sample.rs @@ -0,0 +1,106 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, +}; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct SampleDF; + +impl Command for SampleDF { + fn name(&self) -> &str { + "dfr sample" + } + + fn usage(&self) -> &str { + "Create sample dataframe" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .named( + "n-rows", + SyntaxShape::Int, + "number of rows to be taken from dataframe", + Some('n'), + ) + .named( + "fraction", + SyntaxShape::Number, + "fraction of dataframe to be taken", + Some('f'), + ) + .switch("replace", "sample with replace", Some('e')) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Sample rows from dataframe", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr sample -n 1", + result: None, // No expected value because sampling is random + }, + Example { + description: "Shows sample row using fraction and replace", + example: "[[a b]; [1 2] [3 4] [5 6]] | dfr to-df | dfr sample -f 0.5 -e", + result: None, // No expected value because sampling is random + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let rows: Option> = call.get_flag(engine_state, stack, "n-rows")?; + let fraction: Option> = call.get_flag(engine_state, stack, "fraction")?; + let replace: bool = call.has_flag("replace"); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + match (rows, fraction) { + (Some(rows), None) => df.as_ref().sample_n(rows.item, replace).map_err(|e| { + ShellError::SpannedLabeledError( + "Error creating sample".into(), + e.to_string(), + rows.span, + ) + }), + (None, Some(frac)) => df.as_ref().sample_frac(frac.item, replace).map_err(|e| { + ShellError::SpannedLabeledError( + "Error creating sample".into(), + e.to_string(), + frac.span, + ) + }), + (Some(_), Some(_)) => Err(ShellError::SpannedLabeledError( + "Incompatible flags".into(), + "Only one selection criterion allowed".into(), + call.head, + )), + (None, None) => Err(ShellError::SpannedLabeledErrorHelp( + "No selection".into(), + "No selection criterion was found".into(), + call.head, + "Perhaps you want to use the flag -n or -f".into(), + )), + } + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) +} diff --git a/crates/nu-command/src/dataframe/eager/shape.rs b/crates/nu-command/src/dataframe/eager/shape.rs new file mode 100644 index 0000000000..32cda93a38 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/shape.rs @@ -0,0 +1,87 @@ +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Value, +}; + +use crate::dataframe::values::Column; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct ShapeDF; + +impl Command for ShapeDF { + fn name(&self) -> &str { + "dfr shape" + } + + fn usage(&self) -> &str { + "Shows column and row size for a dataframe" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()).category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Shows row and column shape", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr shape", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("rows".to_string(), vec![Value::test_int(2)]), + Column::new("columns".to_string(), vec![Value::test_int(2)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + _engine_state: &EngineState, + _stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let rows = Value::Int { + val: df.as_ref().height() as i64, + span: call.head, + }; + + let cols = Value::Int { + val: df.as_ref().width() as i64, + span: call.head, + }; + + let rows_col = Column::new("rows".to_string(), vec![rows]); + let cols_col = Column::new("columns".to_string(), vec![cols]); + + NuDataFrame::try_from_columns(vec![rows_col, cols_col]) + .map(|df| PipelineData::Value(df.into_value(call.head), None)) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(ShapeDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/slice.rs b/crates/nu-command/src/dataframe/eager/slice.rs new file mode 100644 index 0000000000..087fe23aac --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/slice.rs @@ -0,0 +1,85 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use crate::dataframe::values::Column; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct SliceDF; + +impl Command for SliceDF { + fn name(&self) -> &str { + "dfr slice" + } + + fn usage(&self) -> &str { + "Creates new dataframe from a slice of rows" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("offset", SyntaxShape::Int, "start of slice") + .required("size", SyntaxShape::Int, "size of slice") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Create new dataframe from a slice of the rows", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr slice 0 1", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("a".to_string(), vec![Value::test_int(1)]), + Column::new("b".to_string(), vec![Value::test_int(2)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let offset: i64 = call.req(engine_state, stack, 0)?; + let size: usize = call.req(engine_state, stack, 1)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let res = df.as_ref().slice(offset, size); + + Ok(PipelineData::Value( + NuDataFrame::dataframe_into_value(res, call.head), + None, + )) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(SliceDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/sort.rs b/crates/nu-command/src/dataframe/eager/sort.rs new file mode 100644 index 0000000000..d5bded8904 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/sort.rs @@ -0,0 +1,142 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; + +use crate::dataframe::values::{utils::convert_columns_string, Column}; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct SortDF; + +impl Command for SortDF { + fn name(&self) -> &str { + "dfr sort" + } + + fn usage(&self) -> &str { + "Creates new sorted dataframe or series" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .switch("reverse", "invert sort", Some('r')) + .rest("rest", SyntaxShape::Any, "column names to sort dataframe") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Create new sorted dataframe", + example: "[[a b]; [3 4] [1 2]] | dfr to-df | dfr sort a", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a".to_string(), + vec![Value::test_int(1), Value::test_int(3)], + ), + Column::new( + "b".to_string(), + vec![Value::test_int(2), Value::test_int(4)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Create new sorted series", + example: "[3 4 1 2] | dfr to-df | dfr sort", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "0".to_string(), + vec![ + Value::test_int(1), + Value::test_int(2), + Value::test_int(3), + Value::test_int(4), + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let reverse = call.has_flag("reverse"); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + if df.is_series() { + let columns = df.as_ref().get_column_names(); + + df.as_ref() + .sort(columns, reverse) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error sorting dataframe".into(), + e.to_string(), + call.head, + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) + } else { + let columns: Vec = call.rest(engine_state, stack, 0)?; + + if !columns.is_empty() { + let (col_string, col_span) = convert_columns_string(columns, call.head)?; + + df.as_ref() + .sort(&col_string, reverse) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error sorting dataframe".into(), + e.to_string(), + col_span, + ) + }) + .map(|df| { + PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None) + }) + } else { + Err(ShellError::SpannedLabeledError( + "Missing columns".into(), + "missing column name to perform sort".into(), + call.head, + )) + } + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(SortDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/take.rs b/crates/nu-command/src/dataframe/eager/take.rs new file mode 100644 index 0000000000..0b3a68cce2 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/take.rs @@ -0,0 +1,144 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value, +}; +use polars::prelude::DataType; + +use crate::dataframe::values::Column; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct TakeDF; + +impl Command for TakeDF { + fn name(&self) -> &str { + "dfr take" + } + + fn usage(&self) -> &str { + "Creates new dataframe using the given indices" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "indices", + SyntaxShape::Any, + "list of indices used to take data", + ) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Takes selected rows from dataframe", + example: r#"let df = ([[a b]; [4 1] [5 2] [4 3]] | dfr to-df); + let indices = ([0 2] | dfr to-df); + $df | dfr take $indices"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a".to_string(), + vec![Value::test_int(4), Value::test_int(4)], + ), + Column::new( + "b".to_string(), + vec![Value::test_int(1), Value::test_int(3)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Takes selected rows from series", + example: r#"let series = ([4 1 5 2 4 3] | dfr to-df); + let indices = ([0 2] | dfr to-df); + $series | dfr take $indices"#, + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "0".to_string(), + vec![Value::test_int(4), Value::test_int(5)], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let index_value: Value = call.req(engine_state, stack, 0)?; + let index_span = index_value.span()?; + let index = NuDataFrame::try_from_value(index_value)?.as_series(index_span)?; + + let casted = match index.dtype() { + DataType::UInt32 | DataType::UInt64 | DataType::Int32 | DataType::Int64 => { + index.cast(&DataType::UInt32).map_err(|e| { + ShellError::SpannedLabeledError( + "Error casting index list".into(), + e.to_string(), + index_span, + ) + }) + } + _ => Err(ShellError::SpannedLabeledErrorHelp( + "Incorrect type".into(), + "Series with incorrect type".into(), + call.head, + "Consider using a Series with type int type".into(), + )), + }?; + + let indices = casted.u32().map_err(|e| { + ShellError::SpannedLabeledError( + "Error casting index list".into(), + e.to_string(), + index_span, + ) + })?; + + NuDataFrame::try_from_pipeline(input, call.head).and_then(|df| { + df.as_ref() + .take(indices) + .map_err(|e| { + ShellError::SpannedLabeledError( + "Error taking values".into(), + e.to_string(), + call.head, + ) + }) + .map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)) + }) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(TakeDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/to_csv.rs b/crates/nu-command/src/dataframe/eager/to_csv.rs new file mode 100644 index 0000000000..5db04a11f9 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/to_csv.rs @@ -0,0 +1,132 @@ +use std::{fs::File, path::PathBuf}; + +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Value, +}; +use polars::prelude::{CsvWriter, SerWriter}; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct ToCSV; + +impl Command for ToCSV { + fn name(&self) -> &str { + "dfr to-csv" + } + + fn usage(&self) -> &str { + "Saves dataframe to csv file" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("file", SyntaxShape::Filepath, "file path to save dataframe") + .named( + "delimiter", + SyntaxShape::String, + "file delimiter character", + Some('d'), + ) + .switch("no-header", "Indicates if file doesn't have header", None) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Saves dataframe to csv file", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-csv test.csv", + result: None, + }, + Example { + description: "Saves dataframe to csv file using other delimiter", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-csv test.csv -d '|'", + result: None, + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let file_name: Spanned = call.req(engine_state, stack, 0)?; + let delimiter: Option> = call.get_flag(engine_state, stack, "delimiter")?; + let no_header: bool = call.has_flag("no_header"); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let mut file = File::create(&file_name.item).map_err(|e| { + ShellError::SpannedLabeledError( + "Error with file name".into(), + e.to_string(), + file_name.span, + ) + })?; + + let writer = CsvWriter::new(&mut file); + + let writer = if no_header { + writer.has_header(false) + } else { + writer.has_header(true) + }; + + let writer = match delimiter { + None => writer, + Some(d) => { + if d.item.len() != 1 { + return Err(ShellError::SpannedLabeledError( + "Incorrect delimiter".into(), + "Delimiter has to be one char".into(), + d.span, + )); + } else { + let delimiter = match d.item.chars().next() { + Some(d) => d as u8, + None => unreachable!(), + }; + + writer.with_delimiter(delimiter) + } + } + }; + + writer.finish(df.as_ref()).map_err(|e| { + ShellError::SpannedLabeledError( + "Error writing to file".into(), + e.to_string(), + file_name.span, + ) + })?; + + let file_value = Value::String { + val: format!("saved {:?}", &file_name.item), + span: file_name.span, + }; + + Ok(PipelineData::Value( + Value::List { + vals: vec![file_value], + span: call.head, + }, + None, + )) +} diff --git a/crates/nu-command/src/dataframe/to_df.rs b/crates/nu-command/src/dataframe/eager/to_df.rs similarity index 97% rename from crates/nu-command/src/dataframe/to_df.rs rename to crates/nu-command/src/dataframe/eager/to_df.rs index 8d4a8cecb8..4feee1856d 100644 --- a/crates/nu-command/src/dataframe/to_df.rs +++ b/crates/nu-command/src/dataframe/eager/to_df.rs @@ -1,4 +1,4 @@ -use super::values::{Column, NuDataFrame}; +use super::super::values::{Column, NuDataFrame}; use nu_protocol::{ ast::Call, @@ -117,7 +117,7 @@ impl Command for ToDataFrame { #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/eager/to_nu.rs b/crates/nu-command/src/dataframe/eager/to_nu.rs new file mode 100644 index 0000000000..2f8026999d --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/to_nu.rs @@ -0,0 +1,83 @@ +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Value, +}; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct ToNu; + +impl Command for ToNu { + fn name(&self) -> &str { + "dfr to-nu" + } + + fn usage(&self) -> &str { + "Converts a section of the dataframe to Nushell Table" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .named( + "n-rows", + SyntaxShape::Number, + "number of rows to be shown", + Some('n'), + ) + .switch("tail", "shows tail rows", Some('t')) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Shows head rows from dataframe", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-nu", + result: None, + }, + Example { + description: "Shows tail rows from dataframe", + example: "[[a b]; [1 2] [3 4] [5 6]] | dfr to-df | dfr to-nu -t -n 1", + result: None, + }, + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let rows: Option = call.get_flag(engine_state, stack, "n-rows")?; + let tail: bool = call.has_flag("tail"); + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let values = if tail { + df.tail(rows, call.head)? + } else { + df.head(rows, call.head)? + }; + + let value = Value::List { + vals: values, + span: call.head, + }; + + Ok(PipelineData::Value(value, None)) +} diff --git a/crates/nu-command/src/dataframe/eager/to_parquet.rs b/crates/nu-command/src/dataframe/eager/to_parquet.rs new file mode 100644 index 0000000000..12db49b361 --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/to_parquet.rs @@ -0,0 +1,84 @@ +use std::{fs::File, path::PathBuf}; + +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Value, +}; +use polars::prelude::ParquetWriter; + +use super::super::values::NuDataFrame; + +#[derive(Clone)] +pub struct ToParquet; + +impl Command for ToParquet { + fn name(&self) -> &str { + "dfr to-parquet" + } + + fn usage(&self) -> &str { + "Saves dataframe to parquet file" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("file", SyntaxShape::Filepath, "file path to save dataframe") + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Saves dataframe to csv file", + example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-parquet test.parquet", + result: None, + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +fn command( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let file_name: Spanned = call.req(engine_state, stack, 0)?; + + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let file = File::create(&file_name.item).map_err(|e| { + ShellError::SpannedLabeledError( + "Error with file name".into(), + e.to_string(), + file_name.span, + ) + })?; + + ParquetWriter::new(file).finish(df.as_ref()).map_err(|e| { + ShellError::SpannedLabeledError("Error saving file".into(), e.to_string(), file_name.span) + })?; + + let file_value = Value::String { + val: format!("saved {:?}", &file_name.item), + span: file_name.span, + }; + + Ok(PipelineData::Value( + Value::List { + vals: vec![file_value], + span: call.head, + }, + None, + )) +} diff --git a/crates/nu-command/src/dataframe/with_column.rs b/crates/nu-command/src/dataframe/eager/with_column.rs similarity index 96% rename from crates/nu-command/src/dataframe/with_column.rs rename to crates/nu-command/src/dataframe/eager/with_column.rs index f6890d1958..8be389e1bc 100644 --- a/crates/nu-command/src/dataframe/with_column.rs +++ b/crates/nu-command/src/dataframe/eager/with_column.rs @@ -5,7 +5,7 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value, }; -use super::values::{Column, NuDataFrame}; +use super::super::values::{Column, NuDataFrame}; #[derive(Clone)] pub struct WithColumn; @@ -99,7 +99,7 @@ fn command( #[cfg(test)] mod test { - use super::super::test_dataframe::test_dataframe; + use super::super::super::test_dataframe::test_dataframe; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/mod.rs b/crates/nu-command/src/dataframe/mod.rs index 8830987b1b..61abdea795 100644 --- a/crates/nu-command/src/dataframe/mod.rs +++ b/crates/nu-command/src/dataframe/mod.rs @@ -1,101 +1,15 @@ +mod eager; mod series; mod values; -mod append; -mod column; -mod command; -mod describe; -mod drop; -mod drop_nulls; -mod dtypes; -mod open; -mod to_df; -mod with_column; - -pub use series::*; - -pub use append::AppendDF; -pub use column::ColumnDF; -pub use command::Dataframe; -pub use describe::DescribeDF; -pub use drop::DropDF; -pub use drop_nulls::DropNulls; -pub use dtypes::DataTypes; -pub use open::OpenDataFrame; -pub use to_df::ToDataFrame; -pub use with_column::WithColumn; +pub use eager::add_eager_decls; +pub use series::add_series_decls; use nu_protocol::engine::StateWorkingSet; pub fn add_dataframe_decls(working_set: &mut StateWorkingSet) { - macro_rules! bind_command { - ( $command:expr ) => { - working_set.add_decl(Box::new($command)); - }; - ( $( $command:expr ),* ) => { - $( working_set.add_decl(Box::new($command)); )* - }; - } - - // Series commands - bind_command!( - AllFalse, - AllTrue, - ArgMax, - ArgMin, - ArgSort, - ArgTrue, - ArgUnique, - Concatenate, - Contains, - Cumulative, - GetDay, - GetHour, - GetMinute, - GetMonth, - GetNanosecond, - GetOrdinal, - GetSecond, - GetWeek, - GetWeekDay, - GetYear, - IsDuplicated, - IsIn, - IsNotNull, - IsNull, - IsUnique, - NNull, - NUnique, - NotSeries, - Rename, - Replace, - ReplaceAll, - Rolling, - SetSeries, - SetWithIndex, - Shift, - StrLengths, - StrSlice, - StrFTime, - ToLowerCase, - ToUpperCase, - Unique, - ValueCount - ); - - // Dataframe commands - bind_command!( - AppendDF, - ColumnDF, - Dataframe, - DataTypes, - DescribeDF, - DropDF, - DropNulls, - OpenDataFrame, - ToDataFrame, - WithColumn - ); + add_series_decls(working_set); + add_eager_decls(working_set); } #[cfg(test)] diff --git a/crates/nu-command/src/dataframe/series/mod.rs b/crates/nu-command/src/dataframe/series/mod.rs index 9fa3b2221c..fbe81ebcfa 100644 --- a/crates/nu-command/src/dataframe/series/mod.rs +++ b/crates/nu-command/src/dataframe/series/mod.rs @@ -23,6 +23,8 @@ mod shift; mod unique; mod value_counts; +use nu_protocol::engine::StateWorkingSet; + pub use all_false::AllFalse; pub use all_true::AllTrue; pub use arg_max::ArgMax; @@ -35,3 +37,60 @@ pub use rolling::Rolling; pub use shift::Shift; pub use unique::Unique; pub use value_counts::ValueCount; + +pub fn add_series_decls(working_set: &mut StateWorkingSet) { + macro_rules! bind_command { + ( $command:expr ) => { + working_set.add_decl(Box::new($command)); + }; + ( $( $command:expr ),* ) => { + $( working_set.add_decl(Box::new($command)); )* + }; + } + + // Series commands + bind_command!( + AllFalse, + AllTrue, + ArgMax, + ArgMin, + ArgSort, + ArgTrue, + ArgUnique, + Concatenate, + Contains, + Cumulative, + GetDay, + GetHour, + GetMinute, + GetMonth, + GetNanosecond, + GetOrdinal, + GetSecond, + GetWeek, + GetWeekDay, + GetYear, + IsDuplicated, + IsIn, + IsNotNull, + IsNull, + IsUnique, + NNull, + NUnique, + NotSeries, + Rename, + Replace, + ReplaceAll, + Rolling, + SetSeries, + SetWithIndex, + Shift, + StrLengths, + StrSlice, + StrFTime, + ToLowerCase, + ToUpperCase, + Unique, + ValueCount + ); +} diff --git a/crates/nu-command/src/dataframe/series/rolling.rs b/crates/nu-command/src/dataframe/series/rolling.rs index 438b8ef063..d2ee02abc2 100644 --- a/crates/nu-command/src/dataframe/series/rolling.rs +++ b/crates/nu-command/src/dataframe/series/rolling.rs @@ -162,8 +162,8 @@ fn command( #[cfg(test)] mod test { + use super::super::super::eager::DropNulls; use super::super::super::test_dataframe::test_dataframe; - use super::super::super::DropNulls; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/series/shift.rs b/crates/nu-command/src/dataframe/series/shift.rs index 6deca4f10b..cc35a2ca3f 100644 --- a/crates/nu-command/src/dataframe/series/shift.rs +++ b/crates/nu-command/src/dataframe/series/shift.rs @@ -68,8 +68,8 @@ fn command( #[cfg(test)] mod test { + use super::super::super::eager::DropNulls; use super::super::super::test_dataframe::test_dataframe; - use super::super::super::DropNulls; use super::*; #[test] diff --git a/crates/nu-command/src/dataframe/test_dataframe.rs b/crates/nu-command/src/dataframe/test_dataframe.rs index fa0fcdda34..5fc4cf79ad 100644 --- a/crates/nu-command/src/dataframe/test_dataframe.rs +++ b/crates/nu-command/src/dataframe/test_dataframe.rs @@ -5,7 +5,7 @@ use nu_protocol::{ PipelineData, Span, Value, CONFIG_VARIABLE_ID, }; -use super::ToDataFrame; +use super::eager::ToDataFrame; use crate::Let; pub fn test_dataframe(cmds: Vec>) { diff --git a/crates/nu-command/src/dataframe/values/mod.rs b/crates/nu-command/src/dataframe/values/mod.rs index f0aecd416e..b952137a0c 100644 --- a/crates/nu-command/src/dataframe/values/mod.rs +++ b/crates/nu-command/src/dataframe/values/mod.rs @@ -1,4 +1,6 @@ mod nu_dataframe; +mod nu_groupby; pub mod utils; pub use nu_dataframe::{Axis, Column, NuDataFrame}; +pub use nu_groupby::NuGroupBy; diff --git a/crates/nu-command/src/dataframe/values/nu_dataframe/custom_value.rs b/crates/nu-command/src/dataframe/values/nu_dataframe/custom_value.rs index a1c2329e87..67fd59fd85 100644 --- a/crates/nu-command/src/dataframe/values/nu_dataframe/custom_value.rs +++ b/crates/nu-command/src/dataframe/values/nu_dataframe/custom_value.rs @@ -1,5 +1,5 @@ use super::NuDataFrame; -use nu_protocol::{ast::Operator, Category, CustomValue, ShellError, Span, Value}; +use nu_protocol::{ast::Operator, CustomValue, ShellError, Span, Value}; // CustomValue implementation for NuDataFrame impl CustomValue for NuDataFrame { @@ -20,10 +20,6 @@ impl CustomValue for NuDataFrame { } } - fn category(&self) -> Category { - Category::Custom(self.typetag_name().into()) - } - fn value_string(&self) -> String { self.typetag_name().to_string() } diff --git a/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs b/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs index 7fc92bba41..ba7e246e0a 100644 --- a/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs +++ b/crates/nu-command/src/dataframe/values/nu_dataframe/mod.rs @@ -12,6 +12,8 @@ use polars::prelude::{DataFrame, DataType, PolarsObject, Series}; use serde::{Deserialize, Serialize}; use std::{cmp::Ordering, fmt::Display, hash::Hasher}; +use super::utils::DEFAULT_ROWS; + // DataFrameValue is an encapsulation of Nushell Value that can be used // to define the PolarsObject Trait. The polars object trait allows to // create dataframes with mixed datatypes @@ -283,7 +285,7 @@ impl NuDataFrame { pub fn tail(&self, rows: Option, span: Span) -> Result, ShellError> { let df = &self.0; let to_row = df.height(); - let size = rows.unwrap_or(5); + let size = rows.unwrap_or(DEFAULT_ROWS); let from_row = to_row.saturating_sub(size); let values = self.to_rows(from_row, to_row, span)?; diff --git a/crates/nu-command/src/dataframe/values/nu_groupby/custom_value.rs b/crates/nu-command/src/dataframe/values/nu_groupby/custom_value.rs new file mode 100644 index 0000000000..f60a6bff7a --- /dev/null +++ b/crates/nu-command/src/dataframe/values/nu_groupby/custom_value.rs @@ -0,0 +1,44 @@ +use super::NuGroupBy; +use nu_protocol::{CustomValue, ShellError, Span, Value}; + +// CustomValue implementation for NuDataFrame +impl CustomValue for NuGroupBy { + fn typetag_name(&self) -> &'static str { + "groupby" + } + + fn typetag_deserialize(&self) { + unimplemented!("typetag_deserialize") + } + + fn clone_value(&self, span: nu_protocol::Span) -> Value { + let cloned = NuGroupBy { + dataframe: self.dataframe.clone(), + by: self.by.clone(), + groups: self.groups.clone(), + }; + + Value::CustomValue { + val: Box::new(cloned), + span, + } + } + + fn value_string(&self) -> String { + self.typetag_name().to_string() + } + + fn to_base_value(&self, span: Span) -> Result { + let vals = self.print(span)?; + + Ok(Value::List { vals, span }) + } + + fn to_json(&self) -> nu_json::Value { + nu_json::Value::Null + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} diff --git a/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs b/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs new file mode 100644 index 0000000000..5d3dbb5e41 --- /dev/null +++ b/crates/nu-command/src/dataframe/values/nu_groupby/mod.rs @@ -0,0 +1,89 @@ +mod custom_value; + +use nu_protocol::{PipelineData, ShellError, Span, Value}; +use polars::frame::groupby::{GroupBy, GroupTuples}; +use polars::prelude::DataFrame; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct NuGroupBy { + dataframe: DataFrame, + by: Vec, + groups: GroupTuples, +} + +impl NuGroupBy { + pub fn new(dataframe: DataFrame, by: Vec, groups: GroupTuples) -> Self { + NuGroupBy { + dataframe, + by, + groups, + } + } + + pub fn into_value(self, span: Span) -> Value { + Value::CustomValue { + val: Box::new(self), + span, + } + } + + pub fn try_from_value(value: Value) -> Result { + match value { + Value::CustomValue { val, span } => match val.as_any().downcast_ref::() { + Some(groupby) => Ok(NuGroupBy { + dataframe: groupby.dataframe.clone(), + by: groupby.by.clone(), + groups: groupby.groups.clone(), + }), + None => Err(ShellError::CantConvert( + "groupby".into(), + "non-dataframe".into(), + span, + )), + }, + x => Err(ShellError::CantConvert( + "groupby".into(), + x.get_type().to_string(), + x.span()?, + )), + } + } + + pub fn try_from_pipeline(input: PipelineData, span: Span) -> Result { + let value = input.into_value(span); + NuGroupBy::try_from_value(value) + } + + pub fn to_groupby(&self) -> Result { + let by = self.dataframe.select_series(&self.by).map_err(|e| { + ShellError::LabeledError("Error creating groupby".into(), e.to_string()) + })?; + + Ok(GroupBy::new(&self.dataframe, by, self.groups.clone(), None)) + } + + pub fn print(&self, span: Span) -> Result, ShellError> { + let values = self + .by + .iter() + .map(|col| { + let cols = vec!["group by".to_string()]; + let vals = vec![Value::String { + val: col.into(), + span, + }]; + + Value::Record { cols, vals, span } + }) + .collect::>(); + + Ok(values) + } +} + +impl AsRef for NuGroupBy { + fn as_ref(&self) -> &polars::prelude::DataFrame { + &self.dataframe + } +} diff --git a/crates/nu-command/src/dataframe/values/utils.rs b/crates/nu-command/src/dataframe/values/utils.rs index ec2c0a345d..947c68a3dd 100644 --- a/crates/nu-command/src/dataframe/values/utils.rs +++ b/crates/nu-command/src/dataframe/values/utils.rs @@ -1,6 +1,9 @@ use nu_protocol::{span as span_join, ShellError, Span, Spanned, Value}; -// Converts a Vec to a Vec with a Span marking the whole +// Default value used when selecting rows from dataframe +pub const DEFAULT_ROWS: usize = 5; + +// Converts a Vec to a Vec> with a Span marking the whole // location of the columns for error referencing pub(crate) fn convert_columns( columns: Vec, @@ -35,3 +38,39 @@ pub(crate) fn convert_columns( Ok((res, col_span)) } + +// Converts a Vec to a Vec with a Span marking the whole +// location of the columns for error referencing +pub(crate) fn convert_columns_string( + columns: Vec, + span: Span, +) -> Result<(Vec, Span), ShellError> { + // First column span + let mut col_span = columns + .get(0) + .ok_or_else(|| { + ShellError::SpannedLabeledError( + "Empty column list".into(), + "Empty list found for command".into(), + span, + ) + }) + .and_then(|v| v.span())?; + + let res = columns + .into_iter() + .map(|value| match value { + Value::String { val, span } => { + col_span = span_join(&[col_span, span]); + Ok(val) + } + _ => Err(ShellError::SpannedLabeledError( + "Incorrect column format".into(), + "Only string as column name".into(), + span, + )), + }) + .collect::, _>>()?; + + Ok((res, col_span)) +} diff --git a/crates/nu-protocol/src/value/custom_value.rs b/crates/nu-protocol/src/value/custom_value.rs index 1859ee4dc6..35b4d34f84 100644 --- a/crates/nu-protocol/src/value/custom_value.rs +++ b/crates/nu-protocol/src/value/custom_value.rs @@ -1,13 +1,13 @@ use std::{cmp::Ordering, fmt}; -use crate::{ast::Operator, Category, ShellError, Span, Value}; +use crate::{ast::Operator, ShellError, Span, Value}; // Trait definition for a custom value #[typetag::serde(tag = "type")] pub trait CustomValue: fmt::Debug + Send + Sync { fn clone_value(&self, span: Span) -> Value; - fn category(&self) -> Category; + //fn category(&self) -> Category; // Define string representation of the custom value fn value_string(&self) -> String; @@ -26,8 +26,19 @@ pub trait CustomValue: fmt::Debug + Send + Sync { fn as_any(&self) -> &dyn std::any::Any; // Follow cell path functions - fn follow_path_int(&self, count: usize, span: Span) -> Result; - fn follow_path_string(&self, column_name: String, span: Span) -> Result; + fn follow_path_int(&self, _count: usize, span: Span) -> Result { + Err(ShellError::IncompatiblePathAccess( + format!("{} does't support path access", self.value_string()), + span, + )) + } + + fn follow_path_string(&self, _column_name: String, span: Span) -> Result { + Err(ShellError::IncompatiblePathAccess( + format!("{} does't support path access", self.value_string()), + span, + )) + } // ordering with other value fn partial_cmp(&self, _other: &Value) -> Option {