,
{
if headers.is_empty() {
return Table::find_all_tables(html);
}
let sel_table = css("table");
let sel_tr = css("tr");
let sel_th = css("th");
let html = Html::parse_fragment(html);
let mut tables = html
.select(&sel_table)
.filter(|table| {
table.select(&sel_tr).next().map_or(false, |tr| {
let cells = select_cells(tr, &sel_th, true);
headers.iter().all(|h| contains_str(&cells, h.as_ref()))
})
})
.peekable();
tables.peek()?;
Some(tables.map(Table::new).collect())
}
/// Returns the headers of the table.
///
/// This will be empty if the table had no `` tags in its first row. See
/// [`Headers`](type.Headers.html) for more.
pub fn headers(&self) -> &Headers {
&self.headers
}
/// Returns an iterator over the [`Row`](struct.Row.html)s of the table.
///
/// Rows are built from `<td>` cells only. When the first row of the table
/// is a header row, meaning it contains at least one `<th>` cell, that row
/// is not part of the iteration and the iterator starts on the second row.
/// Use [`headers`](#method.headers) to reach the header row in that case.
pub fn iter(&self) -> Iter {
    let headers = &self.headers;
    let iter = self.data.iter();
    Iter { headers, iter }
}
/// Creates a placeholder table: no headers and a single row holding one
/// empty cell.
pub fn empty() -> Table {
    let blank_row = vec![String::new()];
    Table {
        headers: HashMap::new(),
        data: vec![blank_row],
    }
}
// fn new(element: ElementRef) -> Table {
// let sel_tr = css("tr");
// let sel_th = css("th");
// let sel_td = css("td");
// let mut headers = HashMap::new();
// let mut rows = element.select(&sel_tr).peekable();
// if let Some(tr) = rows.peek() {
// for (i, th) in tr.select(&sel_th).enumerate() {
// headers.insert(cell_content(th), i);
// }
// }
// if !headers.is_empty() {
// rows.next();
// }
// let data = rows.map(|tr| select_cells(tr, &sel_td, true)).collect();
// Table { headers, data }
// }
/// Builds a `Table` from a `<table>` element.
///
/// Headers are detected in two passes:
///
/// 1. Column headers: the `<th>` cells of the first `<tr>`, each mapped to
///    its position within that row. When any are found, the first row is
///    treated as the header row and left out of the data.
/// 2. Row headers (only when pass 1 found nothing): for every `<tr>`, the
///    texts of its `<th>` cells are joined with `", "` and mapped to the
///    position of that row. If every key produced this way is empty, the
///    header map is reset to empty.
///
/// The data rows are always built from `<td>` cells.
fn new(element: ElementRef) -> Table {
    let sel_tr = css("tr");
    let sel_th = css("th");
    let sel_td = css("td");

    let mut headers = HashMap::new();
    let mut rows = element.select(&sel_tr).peekable();

    // Pass 1: column headers taken from the first row, without consuming it.
    if let Some(tr) = rows.peek() {
        for (i, th) in tr.select(&sel_th).enumerate() {
            headers.insert(cell_content(th), i);
        }
    }

    if headers.is_empty() {
        // Pass 2: the `<th>` cells may be row headers rather than column
        // headers. Scan a clone of the iterator so that `rows` still starts
        // at the first row when the data is collected below.
        for (i, cells) in rows
            .clone()
            .map(|tr| select_cells(tr, &sel_th, true))
            .enumerate()
        {
            headers.insert(cells.join(", "), i);
        }
        // Header cells that exist but carry no text count as "no headers".
        if headers.keys().all(|header| header.is_empty()) {
            headers.clear();
        }
    } else {
        // The first row is the header row; it is not part of the data.
        rows.next();
    }

    let data = rows.map(|tr| select_cells(tr, &sel_td, true)).collect();
    Table { headers, data }
}
}
impl<'a> IntoIterator for &'a Table {
type Item = Row<'a>;
type IntoIter = Iter<'a>;
fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}
/// An iterator over the rows in a [`Table`](struct.Table.html).
pub struct Iter<'a> {
headers: &'a Headers,
iter: std::slice::Iter<'a, Vec>,
}
impl<'a> Iterator for Iter<'a> {
type Item = Row<'a>;
fn next(&mut self) -> Option {
let headers = self.headers;
self.iter.next().map(|cells| Row { headers, cells })
}
}
/// A row in a [`Table`](struct.Table.html).
///
/// A row consists of a number of data cells stored as strings. If the row
/// contains the same number of cells as the table's header row, its cells can
/// be safely accessed by header names using [`get`](#method.get). Otherwise,
/// the data should be accessed via [`as_slice`](#method.as_slice) or by
/// iterating over the row.
///
/// This struct can be thought of as a lightweight reference into a table. As
/// such, it implements the `Copy` trait.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Row<'a> {
headers: &'a Headers,
cells: &'a [String],
}
impl<'a> Row<'a> {
/// Returns the number of cells in the row.
pub fn len(&self) -> usize {
self.cells.len()
}
/// Returns `true` if the row contains no cells.
pub fn is_empty(&self) -> bool {
self.cells.is_empty()
}
/// Returns the cell underneath `header`.
///
/// Returns `None` if there is no such header, or if there is no cell at
/// that position in the row.
pub fn get(&self, header: &str) -> Option<&'a str> {
// eprintln!(
// "header={}, headers={:?}, cells={:?}",
// &header, &self.headers, &self.cells
// );
self.headers.get(header).and_then(|&i| {
// eprintln!("i={}", i);
self.cells.get(i).map(String::as_str)
})
}
pub fn get_header_at(&self, index: usize) -> Option<&'a str> {
let mut a_match = "";
for (key, val) in self.headers {
if *val == index {
a_match = key;
break;
}
}
if a_match.is_empty() {
None
} else {
Some(a_match)
}
}
/// Returns a slice containing all the cells.
pub fn as_slice(&self) -> &'a [String] {
self.cells
}
/// Returns an iterator over the cells of the row.
pub fn iter(&self) -> std::slice::Iter {
self.cells.iter()
}
}
impl<'a> IntoIterator for Row<'a> {
type Item = &'a String;
type IntoIter = std::slice::Iter<'a, String>;
fn into_iter(self) -> Self::IntoIter {
self.cells.iter()
}
}
/// Compiles a CSS selector given as a string literal.
///
/// # Panics
///
/// Panics with the message "Unable to parse selector with scraper" when
/// `selector` cannot be parsed.
fn css(selector: &'static str) -> ScraperSelector {
    let parsed = ScraperSelector::parse(selector);
    parsed.expect("Unable to parse selector with scraper")
}
fn select_cells(
element: ElementRef,
selector: &ScraperSelector,
remove_html_tags: bool,
) -> Vec {
if remove_html_tags {
let scraped = element.select(selector).map(cell_content);
let mut dehtmlized: Vec = Vec::new();
for item in scraped {
let frag = Html::parse_fragment(&item);
for node in frag.tree {
if let scraper::node::Node::Text(text) = node {
dehtmlized.push(text.text.to_string());
}
}
}
dehtmlized
} else {
element.select(selector).map(cell_content).collect()
}
}
/// Produces the textual content of a cell.
///
/// The element's inner HTML is trimmed and parsed as a fragment, and the
/// text nodes of that fragment are concatenated. When no text is found this
/// way, the trimmed inner HTML itself is returned.
fn cell_content(element: ElementRef) -> String {
    let raw = element.inner_html().trim().to_string();

    let mut text_only = String::new();
    let fragment = Html::parse_fragment(&raw);
    for node in fragment.tree {
        if let scraper::node::Node::Text(text) = node {
            text_only.push_str(&text.text);
        }
    }

    if text_only.is_empty() {
        raw
    } else {
        text_only
    }
}
/// Tells whether any string in `slice` occurs inside the text of `item`.
///
/// `item` is parsed as an HTML fragment and its text nodes are concatenated;
/// if that yields nothing, `item` is used verbatim. The test is a substring
/// test in one direction only: an entry of `slice` matches when the text of
/// `item` contains it. As a consequence, an empty entry in `slice` always
/// matches.
fn contains_str(slice: &[String], item: &str) -> bool {
    let mut item_text = String::new();
    let fragment = Html::parse_fragment(item);
    for node in fragment.tree {
        if let scraper::node::Node::Text(text) = node {
            item_text.push_str(&text.text);
        }
    }

    // Fall back to the raw input when no text node was found.
    let haystack = if item_text.is_empty() {
        item
    } else {
        item_text.as_str()
    };

    slice.iter().any(|cell| haystack.contains(cell.as_str()))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::selector::retrieve_tables;
use indexmap::indexmap;
use nu_protocol::UntaggedValue;
const TABLE_EMPTY: &'static str = r#"
"#;
const TABLE_TH: &'static str = r#"
"#;
const TABLE_TD: &'static str = r#"
"#;
const TWO_TABLES_TD: &'static str = r#"
"#;
const TABLE_TH_TD: &'static str = r#"
"#;
const TWO_TABLES_TH_TD: &'static str = r#"
Profession | Civil State |
Mechanic | Single |
"#;
const TABLE_TD_TD: &'static str = r#"
"#;
const TABLE_TH_TH: &'static str = r#"
"#;
const TABLE_COMPLEX: &'static str = r#"
Name | Age | Extra |
John | 20 |
May | 30 | foo |
a | b | c | d |
"#;
const TWO_TABLES_COMPLEX: &'static str = r#"
foo
Name | Age | Extra |
John | 20 |
May | 30 | foo |
a | b | c | d |
Profession | Civil State | Extra |
Carpenter | Single |
Mechanic | Married | bar |
e | f | g | h |
"#;
const HTML_NO_TABLE: &'static str = r#"
foo
Hi.
"#;
const HTML_TWO_TABLES: &'static str = r#"
foo
"#;
const HTML_TABLE_FRAGMENT: &'static str = r#"
|