From 1de7c3d0332a6b98b424017ef3411438a819efc4 Mon Sep 17 00:00:00 2001 From: Luccas Mateus Date: Fri, 24 Sep 2021 10:08:13 -0300 Subject: [PATCH] Scraping multiple tables (#4036) * Output error when ls into a file without permission * math sqrt * added test to check fails when ls into prohibited dir * fix lint * math sqrt with tests and doc * trigger wasm build * Update filesystem_shell.rs * Fix Running echo .. starts printing integers forever * Allow for multiple table scraping * linting * Fix clippy * linting Co-authored-by: Jonathan Turner --- crates/nu_plugin_selector/src/selector.rs | 30 ++- crates/nu_plugin_selector/src/tables.rs | 267 +++++++++++++++++++++- 2 files changed, 286 insertions(+), 11 deletions(-) diff --git a/crates/nu_plugin_selector/src/selector.rs b/crates/nu_plugin_selector/src/selector.rs index 5503c7a000..918b1941f3 100644 --- a/crates/nu_plugin_selector/src/selector.rs +++ b/crates/nu_plugin_selector/src/selector.rs @@ -66,18 +66,39 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool) eprintln!("Passed in Column Headers = {:#?}", &cols,); } - let mut table = match Table::find_by_headers(html, &cols) { + let tables = match Table::find_by_headers(html, &cols) { Some(t) => { if inspect_mode { eprintln!("Table Found = {:#?}", &t); } t } - None => Table::empty(), + None => vec![Table::empty()], }; + if tables.len() == 1 { + return retrieve_table( + tables + .into_iter() + .next() + .expect("This should never trigger"), + columns, + ); + } + tables + .into_iter() + .map(move |table| { + UntaggedValue::Table(retrieve_table(table, columns)).into_value(Tag::unknown()) + }) + .collect() +} - let mut table_out = Vec::new(); - +fn retrieve_table(mut table: Table, columns: &Value) -> Vec { + let mut cols = Vec::new(); + if let UntaggedValue::Table(t) = &columns.value { + for x in t { + cols.push(x.convert_to_string()); + } + } // since cols was empty and headers is not, it means that headers were manually populated // so let's fake the data in order to build a proper table. this situation happens when // there are tables where the first column is actually the headers. kind of like a table @@ -95,6 +116,7 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool) table.data = vec![data2]; } + let mut table_out = Vec::new(); // if columns are still empty, let's just make a single column table with the data if cols.is_empty() { let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect(); diff --git a/crates/nu_plugin_selector/src/tables.rs b/crates/nu_plugin_selector/src/tables.rs index 3efd59cb0f..06c03c180c 100644 --- a/crates/nu_plugin_selector/src/tables.rs +++ b/crates/nu_plugin_selector/src/tables.rs @@ -18,9 +18,13 @@ impl Table { html.select(&css("table")).next().map(Table::new) } - pub fn find_all_tables(html: &str) -> Vec { + pub fn find_all_tables(html: &str) -> Option> { let html = Html::parse_fragment(html); - html.select(&css("table")).map(Table::new).collect() + let iter: Vec
= html.select(&css("table")).map(Table::new).collect(); + if iter.is_empty() { + return None; + } + Some(iter) } /// Finds the table in `html` with an id of `id`. @@ -40,12 +44,12 @@ impl Table { /// /// If `headers` is empty, this is the same as /// [`find_first`](#method.find_first). - pub fn find_by_headers(html: &str, headers: &[T]) -> Option
+ pub fn find_by_headers(html: &str, headers: &[T]) -> Option> where T: AsRef, { if headers.is_empty() { - return Table::find_first(html); + return Table::find_all_tables(html); } let sel_table = css("table"); @@ -53,14 +57,17 @@ impl Table { let sel_th = css("th"); let html = Html::parse_fragment(html); - html.select(&sel_table) - .find(|table| { + let mut tables = html + .select(&sel_table) + .filter(|table| { table.select(&sel_tr).next().map_or(false, |tr| { let cells = select_cells(tr, &sel_th, true); headers.iter().all(|h| contains_str(&cells, h.as_ref())) }) }) - .map(Table::new) + .peekable(); + tables.peek()?; + Some(tables.map(Table::new).collect()) } /// Returns the headers of the table. @@ -350,6 +357,15 @@ mod tests {
NameAge
+"#; + + const TWO_TABLES_TD: &'static str = r#" + + +
NameAge
+ + +
ProfessionCivil State
"#; const TABLE_TH_TD: &'static str = r#" @@ -357,6 +373,17 @@ mod tests { NameAge John20 +"#; + + const TWO_TABLES_TH_TD: &'static str = r#" + + + +
NameAge
John20
+ + + +
ProfessionCivil State
MechanicSingle
"#; const TABLE_TD_TD: &'static str = r#" @@ -381,6 +408,29 @@ mod tests { abcd +"#; + + const TWO_TABLES_COMPLEX: &'static str = r#" + + + foo + + + + + + + +
NameAgeExtra
John20
May30foo
abcd
+ + + + + + +
ProfessionCivil StateExtra
CarpenterSingle
MechanicMarriedbar
efgh
+ + "#; const HTML_NO_TABLE: &'static str = r#" @@ -775,6 +825,29 @@ mod tests { ); } + #[test] + fn test_row_len_two_tables() { + let tables = Table::find_all_tables(HTML_TWO_TABLES).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + assert_eq!(vec![2], table_1.iter().map(|r| r.len()).collect::>()); + assert_eq!(vec![2], table_2.iter().map(|r| r.len()).collect::>()); + + let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + assert_eq!( + vec![2, 3, 0, 4], + table_1.iter().map(|r| r.len()).collect::>() + ); + assert_eq!( + vec![2, 3, 0, 4], + table_2.iter().map(|r| r.len()).collect::>() + ); + } + #[test] fn test_row_get_without_headers() { let table = Table::find_first(TABLE_TD).unwrap(); @@ -831,6 +904,55 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_two_tables_row_get_complex() { + let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + let mut iter_1 = table_1.iter(); + let mut iter_2 = table_2.iter(); + + let row_table_1 = iter_1.next().unwrap(); + let row_table_2 = iter_2.next().unwrap(); + assert_eq!(Some("John"), row_table_1.get("Name")); + assert_eq!(Some("20"), row_table_1.get("Age")); + assert_eq!(None, row_table_1.get("Extra")); + assert_eq!(Some("Carpenter"), row_table_2.get("Profession")); + assert_eq!(Some("Single"), row_table_2.get("Civil State")); + assert_eq!(None, row_table_2.get("Extra")); + + let row_table_1 = iter_1.next().unwrap(); + let row_table_2 = iter_2.next().unwrap(); + assert_eq!(Some("May"), row_table_1.get("Name")); + assert_eq!(Some("30"), row_table_1.get("Age")); + assert_eq!(Some("foo"), row_table_1.get("Extra")); + assert_eq!(Some("Mechanic"), row_table_2.get("Profession")); + assert_eq!(Some("Married"), row_table_2.get("Civil State")); + assert_eq!(Some("bar"), row_table_2.get("Extra")); + + let row_table_1 = iter_1.next().unwrap(); + let row_table_2 = iter_2.next().unwrap(); + assert_eq!(None, row_table_1.get("Name")); + assert_eq!(None, row_table_1.get("Age")); + assert_eq!(None, row_table_1.get("Extra")); + assert_eq!(None, row_table_2.get("Name")); + assert_eq!(None, row_table_2.get("Age")); + assert_eq!(None, row_table_2.get("Extra")); + + let row_table_1 = iter_1.next().unwrap(); + let row_table_2 = iter_2.next().unwrap(); + assert_eq!(Some("a"), row_table_1.get("Name")); + assert_eq!(Some("b"), row_table_1.get("Age")); + assert_eq!(Some("c"), row_table_1.get("Extra")); + assert_eq!(Some("e"), row_table_2.get("Profession")); + assert_eq!(Some("f"), row_table_2.get("Civil State")); + assert_eq!(Some("g"), row_table_2.get("Extra")); + + assert_eq!(None, iter_1.next()); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_row_as_slice_without_headers() { let table = Table::find_first(TABLE_TD).unwrap(); @@ -840,6 +962,24 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_row_as_slice_without_headers_two_tables() { + let tables = Table::find_all_tables(TWO_TABLES_TD).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + let mut iter_1 = table_1.iter(); + let mut iter_2 = table_2.iter(); + + assert_eq!(&["Name", "Age"], iter_1.next().unwrap().as_slice()); + assert_eq!( + &["Profession", "Civil State"], + iter_2.next().unwrap().as_slice() + ); + assert_eq!(None, iter_1.next()); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_row_as_slice_with_headers() { let table = Table::find_first(TABLE_TH_TD).unwrap(); @@ -849,6 +989,21 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_row_as_slice_with_headers_two_tables() { + let tables = Table::find_all_tables(TWO_TABLES_TH_TD).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + let mut iter_1 = table_1.iter(); + let mut iter_2 = table_2.iter(); + + assert_eq!(&["John", "20"], iter_1.next().unwrap().as_slice()); + assert_eq!(&["Mechanic", "Single"], iter_2.next().unwrap().as_slice()); + assert_eq!(None, iter_1.next()); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_row_as_slice_complex() { let table = Table::find_first(TABLE_COMPLEX).unwrap(); @@ -862,6 +1017,31 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_row_as_slice_complex_two_tables() { + let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + let mut iter_1 = table_1.iter(); + let mut iter_2 = table_2.iter(); + let empty: [&str; 0] = []; + + assert_eq!(&["John", "20"], iter_1.next().unwrap().as_slice()); + assert_eq!(&["May", "30", "foo"], iter_1.next().unwrap().as_slice()); + assert_eq!(&empty, iter_1.next().unwrap().as_slice()); + assert_eq!(&["a", "b", "c", "d"], iter_1.next().unwrap().as_slice()); + assert_eq!(None, iter_1.next()); + assert_eq!(&["Carpenter", "Single"], iter_2.next().unwrap().as_slice()); + assert_eq!( + &["Mechanic", "Married", "bar"], + iter_2.next().unwrap().as_slice() + ); + assert_eq!(&empty, iter_2.next().unwrap().as_slice()); + assert_eq!(&["e", "f", "g", "h"], iter_2.next().unwrap().as_slice()); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_row_iter_simple() { let table = Table::find_first(TABLE_TD).unwrap(); @@ -873,6 +1053,25 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_row_iter_simple_two_tables() { + let tables = Table::find_all_tables(TWO_TABLES_TD).unwrap(); + let mut tables_iter = tables.iter(); + let table_1 = tables_iter.next().unwrap(); + let table_2 = tables_iter.next().unwrap(); + let row_1 = table_1.iter().next().unwrap(); + let row_2 = table_2.iter().next().unwrap(); + let mut iter_1 = row_1.iter(); + let mut iter_2 = row_2.iter(); + + assert_eq!(Some("Name"), iter_1.next().map(String::as_str)); + assert_eq!(Some("Age"), iter_1.next().map(String::as_str)); + assert_eq!(None, iter_1.next()); + assert_eq!(Some("Profession"), iter_2.next().map(String::as_str)); + assert_eq!(Some("Civil State"), iter_2.next().map(String::as_str)); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_row_iter_complex() { let table = Table::find_first(TABLE_COMPLEX).unwrap(); @@ -904,6 +1103,60 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_row_iter_complex_two_tables() { + let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap(); + let mut tables_iter = tables.iter(); + let mut table_1 = tables_iter.next().unwrap().iter(); + let mut table_2 = tables_iter.next().unwrap().iter(); + + let row_1 = table_1.next().unwrap(); + let row_2 = table_2.next().unwrap(); + let mut iter_1 = row_1.iter(); + let mut iter_2 = row_2.iter(); + assert_eq!(Some("John"), iter_1.next().map(String::as_str)); + assert_eq!(Some("20"), iter_1.next().map(String::as_str)); + assert_eq!(None, iter_1.next()); + assert_eq!(Some("Carpenter"), iter_2.next().map(String::as_str)); + assert_eq!(Some("Single"), iter_2.next().map(String::as_str)); + assert_eq!(None, iter_2.next()); + + let row_1 = table_1.next().unwrap(); + let row_2 = table_2.next().unwrap(); + let mut iter_1 = row_1.iter(); + let mut iter_2 = row_2.iter(); + assert_eq!(Some("May"), iter_1.next().map(String::as_str)); + assert_eq!(Some("30"), iter_1.next().map(String::as_str)); + assert_eq!(Some("foo"), iter_1.next().map(String::as_str)); + assert_eq!(None, iter_1.next()); + assert_eq!(Some("Mechanic"), iter_2.next().map(String::as_str)); + assert_eq!(Some("Married"), iter_2.next().map(String::as_str)); + assert_eq!(Some("bar"), iter_2.next().map(String::as_str)); + assert_eq!(None, iter_2.next()); + + let row_1 = table_1.next().unwrap(); + let row_2 = table_2.next().unwrap(); + let mut iter_1 = row_1.iter(); + let mut iter_2 = row_2.iter(); + assert_eq!(None, iter_1.next()); + assert_eq!(None, iter_2.next()); + + let row_1 = table_1.next().unwrap(); + let row_2 = table_2.next().unwrap(); + let mut iter_1 = row_1.iter(); + let mut iter_2 = row_2.iter(); + assert_eq!(Some("a"), iter_1.next().map(String::as_str)); + assert_eq!(Some("b"), iter_1.next().map(String::as_str)); + assert_eq!(Some("c"), iter_1.next().map(String::as_str)); + assert_eq!(Some("d"), iter_1.next().map(String::as_str)); + assert_eq!(None, iter_1.next()); + assert_eq!(Some("e"), iter_2.next().map(String::as_str)); + assert_eq!(Some("f"), iter_2.next().map(String::as_str)); + assert_eq!(Some("g"), iter_2.next().map(String::as_str)); + assert_eq!(Some("h"), iter_2.next().map(String::as_str)); + assert_eq!(None, iter_2.next()); + } + #[test] fn test_wikipedia_swapped_rows_columns() { // empty columns