This commit is contained in:
Andy Gayton 2024-06-27 15:16:42 -04:00
parent 5e29658266
commit b6b9e19612
4 changed files with 521 additions and 44 deletions

104
Cargo.lock generated
View File

@ -377,7 +377,7 @@ dependencies = [
"bitflags 2.5.0",
"cexpr",
"clang-sys",
"itertools 0.11.0",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"proc-macro2",
@ -1117,6 +1117,36 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "curl"
version = "0.4.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e2161dd6eba090ff1594084e95fd67aeccf04382ffea77999ea94ed42ec67b6"
dependencies = [
"curl-sys",
"libc",
"openssl-probe",
"openssl-sys",
"schannel",
"socket2",
"windows-sys 0.52.0",
]
[[package]]
name = "curl-sys"
version = "0.4.73+curl-8.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "450ab250ecf17227c39afb9a2dd9261dc0035cb80f2612472fc0c4aac2dcb84d"
dependencies = [
"cc",
"libc",
"libz-sys",
"openssl-sys",
"pkg-config",
"vcpkg",
"windows-sys 0.52.0",
]
[[package]]
name = "deranged"
version = "0.3.11"
@ -1854,12 +1884,26 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"markup5ever 0.11.0",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "html5ever"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.60",
]
[[package]]
name = "http"
version = "0.2.12"
@ -2517,6 +2561,32 @@ dependencies = [
"tendril",
]
[[package]]
name = "markup5ever"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
dependencies = [
"log",
"phf 0.11.2",
"phf_codegen 0.11.2",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18"
dependencies = [
"html5ever 0.27.0",
"markup5ever 0.12.1",
"tendril",
"xml5ever",
]
[[package]]
name = "md-5"
version = "0.10.6"
@ -3424,8 +3494,11 @@ dependencies = [
"nu-plugin",
"nu-protocol",
"scraper",
"serde",
"serde_json",
"sxd-document",
"sxd-xpath",
"webpage",
]
[[package]]
@ -5219,7 +5292,7 @@ dependencies = [
"ahash 0.8.11",
"cssparser",
"ego-tree",
"html5ever",
"html5ever 0.26.0",
"once_cell",
"selectors",
"tendril",
@ -6742,6 +6815,20 @@ dependencies = [
"pkg-config",
]
[[package]]
name = "webpage"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70862efc041d46e6bbaa82bb9c34ae0596d090e86cbd14bd9e93b36ee6802eac"
dependencies = [
"curl",
"html5ever 0.27.0",
"markup5ever_rcdom",
"serde",
"serde_json",
"url",
]
[[package]]
name = "which"
version = "6.0.1"
@ -7154,6 +7241,17 @@ dependencies = [
"rustix",
]
[[package]]
name = "xml5ever"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69"
dependencies = [
"log",
"mac",
"markup5ever 0.12.1",
]
[[package]]
name = "xxhash-rust"
version = "0.8.10"

View File

@ -90,7 +90,6 @@ filesize = "0.2"
filetime = "0.2"
fs_extra = "1.3"
fuzzy-matcher = "0.3"
hamcrest2 = "0.3"
heck = "0.5.0"
human-date-parser = "0.1.1"
indexmap = "2.2"
@ -310,4 +309,4 @@ bench = false
# Run individual benchmarks like `cargo bench -- <regex>` e.g. `cargo bench -- parse`
[[bench]]
name = "benchmarks"
harness = false
harness = false

View File

@ -22,4 +22,7 @@ nu-protocol = { path = "../nu-protocol", version = "0.95.1" }
gjson = "0.8"
scraper = { default-features = false, version = "0.19" }
sxd-document = "0.3"
sxd-xpath = "0.4"
sxd-xpath = "0.4"
webpage = { version = "2.0.1", features = ["serde"] }
serde_json.workspace = true
serde.workspace = true

View File

@ -1,6 +1,7 @@
use crate::Query;
use nu_plugin::{EngineInterface, EvaluatedCall, SimplePluginCommand};
use nu_protocol::{Category, Example, LabeledError, Signature, Value};
use nu_protocol::{Category, Example, LabeledError, Record, Signature, Span, Value};
use crate::Query;
pub struct QueryWebpageInfo;
@ -32,7 +33,7 @@ impl SimplePluginCommand for QueryWebpageInfo {
) -> Result<Value, LabeledError> {
let span = input.span();
match input {
Value::String { val, .. } => Ok(input.clone()),
Value::String { val, .. } => execute_webpage(val, span),
_ => Err(LabeledError::new("Requires text input")
.with_label("expected text from pipeline", span)),
}
@ -47,35 +48,16 @@ pub fn web_examples() -> Vec<Example<'static>> {
}]
}
/*
fn execute_webpage(
input_string: &str,
span: Span,
) -> Value {
let doc = Html::parse_fragment(input_string);
fn execute_webpage(html: &str, span: Span) -> Result<Value, LabeledError> {
let info = webpage::HTML::from_string(html.to_string(), None)
.map_err(|e| LabeledError::new(e.to_string()).with_label("error parsing html", span))?;
let vals: Vec<Value> = match as_html {
true => doc
.select(&css(query_string, inspect))
.map(|selection| Value::string(selection.html(), span))
.collect(),
false => doc
.select(&css(query_string, inspect))
.map(|selection| {
Value::list(
selection
.text()
.map(|text| Value::string(text, span))
.collect(),
span,
)
})
.collect(),
};
let value = to_value(info, span).map_err(|e| {
LabeledError::new(e.to_string()).with_label("error convert Value::Record", span)
})?;
Value::list(vals, span)
Ok(value)
}
*/
#[cfg(test)]
mod tests {
@ -86,14 +68,409 @@ mod tests {
"#;
#[test]
fn test_first_child_is_not_empty() {
assert!(!execute_selector_query(
SIMPLE_LIST,
"li:first-child",
false,
false,
Span::test_data()
)
.is_empty())
fn test_basics() {
// assert!(!execute_webpage(HTML, Span::test_data()).is_empty())
eprintln!("{:?}", execute_webpage(HTML, Span::test_data()));
}
}
// revive nu-serde sketch
use serde::Serialize;
/// Convert any serde:Serialize into a `nu_protocol::Value`
pub fn to_value<T>(value: T, span: Span) -> Result<Value, Error>
where
T: Serialize,
{
value.serialize(&ValueSerializer { span })
}
struct ValueSerializer {
span: Span,
}
struct MapSerializer<'a> {
record: Record,
serializer: &'a ValueSerializer,
current_key: Option<String>,
}
impl<'a> serde::Serializer for &'a ValueSerializer {
type Ok = Value;
type Error = Error;
type SerializeSeq = SeqSerializer<'a>;
type SerializeTuple = SeqSerializer<'a>;
type SerializeTupleStruct = SeqSerializer<'a>;
type SerializeTupleVariant = SeqSerializer<'a>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = MapSerializer<'a>;
type SerializeStructVariant = MapSerializer<'a>;
fn serialize_bool(self, v: bool) -> Result<Self::Ok, Self::Error> {
Ok(Value::bool(v, self.span))
}
fn serialize_i8(self, v: i8) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i16(self, v: i16) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i32(self, v: i32) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i64(self, v: i64) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v, self.span))
}
fn serialize_u8(self, v: u8) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u16(self, v: u16) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u32(self, v: u32) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u64(self, v: u64) -> Result<Self::Ok, Self::Error> {
Err(Error::new("the numbers are too big"))
// Ok(Value::int(v.into(), self.span))
}
fn serialize_f32(self, v: f32) -> Result<Self::Ok, Self::Error> {
Ok(Value::float(v.into(), self.span))
}
fn serialize_f64(self, v: f64) -> Result<Self::Ok, Self::Error> {
Ok(Value::float(v, self.span))
}
fn serialize_char(self, v: char) -> Result<Self::Ok, Self::Error> {
Ok(Value::string(String::from(v), self.span))
}
/*
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
Ok(self.value(UntaggedValue::Primitive(Primitive::String(v.into()))))
}
fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
Ok(self.value(UntaggedValue::Primitive(Primitive::Binary(v.into()))))
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Ok(self.value(UntaggedValue::Primitive(Primitive::Nothing)))
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(self.value(UntaggedValue::Primitive(Primitive::Nothing)))
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(self.value(UntaggedValue::Primitive(Primitive::Nothing)))
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(self.value(UntaggedValue::Primitive(Primitive::Nothing)))
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
*/
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer::new(self))
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(MapSerializer::new(self))
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Ok(MapSerializer::new(self))
}
}
pub struct Error {
message: String,
}
impl Error {
pub fn new<T: std::fmt::Display>(msg: T) -> Self {
Error {
message: msg.to_string(),
}
}
}
impl serde::ser::Error for Error {
fn custom<T: std::fmt::Display>(msg: T) -> Self {
Error::new(msg.to_string())
}
}
impl std::fmt::Debug for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.message)
}
}
impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.message)
}
}
impl std::error::Error for Error {}
//
// maps
impl<'a> MapSerializer<'a> {
fn new(serializer: &'a ValueSerializer) -> Self {
Self {
record: Record::new(),
current_key: None,
serializer,
}
}
}
impl<'a> serde::ser::SerializeStruct for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: Serialize,
{
self.record
.insert(key.to_owned(), value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeMap for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
let value = serde_json::to_value(&key).map_err(|e| Error::new(e))?;
let key = value
.as_str()
.ok_or(Error::new("key must be a string"))?
.to_string();
self.current_key = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
let key = self.current_key.take().ok_or(Error::new("key expected"))?;
self.record.insert(key, value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeStructVariant for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: Serialize,
{
self.record
.insert(key.to_owned(), value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
//
// sequences
struct SeqSerializer<'a> {
seq: Vec<Value>,
serializer: &'a ValueSerializer,
}
impl<'a> SeqSerializer<'a> {
fn new(serializer: &'a ValueSerializer) -> Self {
Self {
seq: Vec::new(),
serializer,
}
}
}
impl<'a> serde::ser::SerializeSeq for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTuple for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTupleStruct for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTupleVariant for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}