Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
"Andriy Rakhnin <a@rakhnin.com>",
]
edition = "2021"
description = "CLI tool for saving web pages as a single HTML file"
description = "CLI tool and library for saving web pages as a single HTML file"
homepage = "https://github.com/Y2Z/monolith"
repository = "https://github.com/Y2Z/monolith"
readme = "README.md"
Expand Down
12 changes: 10 additions & 2 deletions src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ use url::Url;
use crate::cache::Cache;
use crate::cookies::Cookie;
use crate::html::{
add_favicon, create_metadata_tag, get_base_url, get_charset, get_title, has_favicon,
html_to_dom, serialize_document, set_base_url, set_charset, walk_and_embed_assets,
add_favicon, create_metadata_tag, get_base_url, get_charset, get_robots, get_title,
has_favicon, html_to_dom, serialize_document, set_base_url, set_charset, set_robots,
walk_and_embed_assets,
};
use crate::url::{clean_url, create_data_url, get_referer_url, parse_data_url, resolve_url};

Expand Down Expand Up @@ -263,6 +264,13 @@ pub fn create_monolithic_document_from_data(
}
}

// Make sure the saved document carries <meta name="robots" content="none">.
// A missing robots META tag and any content value other than exactly "none"
// are treated the same way: the tag is (re)written via set_robots.
if get_robots(&dom.document).as_deref() != Some("none") {
    dom = set_robots(dom, "none");
}

// Save using specified charset, if given
if let Some(custom_encoding) = options.encoding.clone() {
document_encoding = custom_encoding;
Expand Down
70 changes: 63 additions & 7 deletions src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,20 @@ pub fn get_parent_node(child: &Handle) -> Handle {
parent.and_then(|node| node.upgrade()).unwrap()
}

/// Looks up the robots META tag of a document.
///
/// Scans the nodes returned by `find_nodes` for `html` > `head` > `meta` and
/// picks the first one whose `name` attribute equals "robots" (ASCII
/// case-insensitive). Only that first match is considered; any later robots
/// META tags are ignored.
///
/// Returns whatever `get_node_attr` yields for that node's `content`
/// attribute, or `None` when no robots META tag is found.
pub fn get_robots(handle: &Handle) -> Option<String> {
    find_nodes(handle, vec!["html", "head", "meta"])
        .iter()
        .find(|&meta_node| {
            get_node_attr(meta_node, "name")
                .unwrap_or_default()
                .eq_ignore_ascii_case("robots")
        })
        .and_then(|robots_node| get_node_attr(robots_node, "content"))
}

pub fn get_title(node: &Handle) -> Option<String> {
for title_node in find_nodes(node, vec!["html", "head", "title"]).iter() {
for child_node in title_node.children.borrow().iter() {
Expand Down Expand Up @@ -436,7 +450,7 @@ pub fn parse_srcset(srcset: &str) -> Vec<SrcSetItem> {
srcset_items
}

pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
pub fn set_base_url(document: &Handle, base_href_value: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
serialize(
&mut buf,
Expand All @@ -450,14 +464,14 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
if let Some(head_node) = get_child_node_by_name(&html_node, "head") {
// Check if BASE node already exists in the DOM tree
if let Some(base_node) = get_child_node_by_name(&head_node, "base") {
set_node_attr(&base_node, "href", Some(desired_base_href));
set_node_attr(&base_node, "href", Some(base_href_value));
} else {
let base_node = create_element(
&dom,
QualName::new(None, ns!(), LocalName::from("base")),
vec![Attribute {
name: QualName::new(None, ns!(), LocalName::from("href")),
value: format_tendril!("{}", desired_base_href),
value: format_tendril!("{}", base_href_value),
}],
);

Expand All @@ -470,10 +484,10 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
dom
}

pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
pub fn set_charset(dom: RcDom, charset: String) -> RcDom {
for meta_node in find_nodes(&dom.document, vec!["html", "head", "meta"]).iter() {
if get_node_attr(meta_node, "charset").is_some() {
set_node_attr(meta_node, "charset", Some(desired_charset));
set_node_attr(meta_node, "charset", Some(charset));
return dom;
}

Expand All @@ -485,7 +499,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
set_node_attr(
meta_node,
"content",
Some(format!("text/html;charset={}", desired_charset)),
Some(format!("text/html;charset={}", charset)),
);
return dom;
}
Expand All @@ -498,7 +512,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
QualName::new(None, ns!(), LocalName::from("meta")),
vec![Attribute {
name: QualName::new(None, ns!(), LocalName::from("charset")),
value: format_tendril!("{}", desired_charset),
value: format_tendril!("{}", charset),
}],
);

Expand All @@ -508,6 +522,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
.children
.borrow_mut()
.push(meta_charset_node.clone());
break;
}
}

Expand Down Expand Up @@ -551,6 +566,47 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
};
}

/// Sets the robots META tag of a document to `content_value`.
///
/// If a META node whose `name` attribute equals "robots" (ASCII
/// case-insensitive) is found under `html` > `head`, the `content` attribute
/// of the first such node is overwritten. Otherwise a new
/// `<meta name="robots" content="…">` node is appended to the children of the
/// first HEAD node found. When no HEAD node is found, the DOM is returned
/// untouched.
///
/// Takes ownership of `dom` and hands it back so calls can be chained.
pub fn set_robots(dom: RcDom, content_value: &str) -> RcDom {
    // Prefer updating an existing robots META node over adding a duplicate
    for meta_node in find_nodes(&dom.document, vec!["html", "head", "meta"]).iter() {
        if get_node_attr(meta_node, "name")
            .unwrap_or_default()
            .eq_ignore_ascii_case("robots")
        {
            set_node_attr(meta_node, "content", Some(content_value.to_string()));
            return dom;
        }
    }

    // No robots META node exists yet: append one to the first HEAD node
    if let Some(head_node) = find_nodes(&dom.document, vec!["html", "head"]).first() {
        let robots_meta_node: Handle = create_element(
            &dom,
            QualName::new(None, ns!(), LocalName::from("meta")),
            vec![
                Attribute {
                    name: QualName::new(None, ns!(), LocalName::from("name")),
                    value: format_tendril!("robots"),
                },
                Attribute {
                    name: QualName::new(None, ns!(), LocalName::from("content")),
                    value: format_tendril!("{}", content_value),
                },
            ],
        );

        head_node.children.borrow_mut().push(robots_meta_node);
    }

    dom
}

pub fn serialize_document(dom: RcDom, document_encoding: String, options: &Options) -> Vec<u8> {
let mut buf: Vec<u8> = Vec::new();

Expand Down
20 changes: 8 additions & 12 deletions tests/cli/base_url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@ mod passing {
// STDOUT should contain newly added base URL
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost:30701/\"></base>\
</head><body>Hello, World!</body></html>\n"
r#"<html><head><base href="http://localhost:30701/"></base><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
"#
);

// Exit code should be 0
Expand All @@ -52,9 +51,8 @@ mod passing {
// STDOUT should contain newly added base URL
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost:30701/\">\
</head><body>Hello, World!</body></html>\n"
r#"<html><head><base href="http://localhost:30701/"><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
"#
);

// Exit code should be 0
Expand All @@ -78,9 +76,8 @@ mod passing {
// STDOUT should contain newly added base URL
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost/\">\
</head><body>Hello, World!</body></html>\n"
r#"<html><head><base href="http://localhost/"><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
"#
);

// Exit code should be 0
Expand All @@ -104,9 +101,8 @@ mod passing {
// STDOUT should contain newly added base URL
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"\">\
</head><body>Hello, World!</body></html>\n"
r#"<html><head><base href=""><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
"#
);

// Exit code should be 0
Expand Down
4 changes: 2 additions & 2 deletions tests/cli/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ mod passing {
// STDOUT should contain HTML created out of STDIN
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head></head><body>Hello from STDIN
r#"<html><head><meta name="robots" content="none"></meta></head><body>Hello from STDIN
</body></html>
"#
);
Expand Down Expand Up @@ -116,7 +116,7 @@ mod passing {
@import url("data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K");

</style>
</head><body></body></html>
<meta name="robots" content="none"></meta></head><body></body></html>
"##
);

Expand Down
15 changes: 8 additions & 7 deletions tests/cli/data_url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ mod passing {
// STDOUT should contain isolated HTML
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head><meta http-equiv="Content-Security-Policy" content="default-src 'unsafe-eval' 'unsafe-inline' data:;"></meta></head><body>Hello, World!</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="default-src 'unsafe-eval' 'unsafe-inline' data:;"></meta><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
"#
);

Expand All @@ -53,7 +53,7 @@ mod passing {
// STDOUT should contain HTML with no CSS
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head><meta http-equiv="Content-Security-Policy" content="style-src 'none';"></meta><style></style></head><body>Hello</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="style-src 'none';"></meta><style></style><meta name="robots" content="none"></meta></head><body>Hello</body></html>
"#
);

Expand All @@ -77,7 +77,7 @@ mod passing {
// STDOUT should contain HTML with no web fonts
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head><meta http-equiv="Content-Security-Policy" content="font-src 'none';"></meta><style></style></head><body>Hi</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="font-src 'none';"></meta><style></style><meta name="robots" content="none"></meta></head><body>Hi</body></html>
"#
);

Expand All @@ -101,7 +101,7 @@ mod passing {
// STDOUT should contain HTML with no iframes
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head><meta http-equiv="Content-Security-Policy" content="frame-src 'none'; child-src 'none';"></meta></head><body><iframe src=""></iframe>Hi</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="frame-src 'none'; child-src 'none';"></meta><meta name="robots" content="none"></meta></head><body><iframe src=""></iframe>Hi</body></html>
"#
);

Expand All @@ -126,7 +126,7 @@ mod passing {
assert_eq!(
String::from_utf8_lossy(&out.stdout),
format!(
r#"<html><head><meta http-equiv="Content-Security-Policy" content="img-src data:;"></meta></head><body><img src="{empty_image}">Hi</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="img-src data:;"></meta><meta name="robots" content="none"></meta></head><body><img src="{empty_image}">Hi</body></html>
"#,
empty_image = EMPTY_IMAGE_DATA_URL,
)
Expand All @@ -152,7 +152,7 @@ mod passing {
// STDOUT should contain HTML with no JS
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r#"<html><head><meta http-equiv="Content-Security-Policy" content="script-src 'none';"></meta><script></script></head><body>Hi</body></html>
r#"<html><head><meta http-equiv="Content-Security-Policy" content="script-src 'none';"></meta><script></script><meta name="robots" content="none"></meta></head><body>Hi</body></html>
"#
);

Expand Down Expand Up @@ -204,7 +204,8 @@ mod failing {
// STDOUT should contain HTML without contents of local JS file
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head><script></script></head><body></body></html>\n"
r#"<html><head><script></script><meta name="robots" content="none"></meta></head><body></body></html>
"#
);

// Exit code should be 0
Expand Down
14 changes: 7 additions & 7 deletions tests/cli/local_files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ mod passing {
<title>Local HTML file</title>
<link href="data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNmZmY7Cn0K" rel="stylesheet" type="text/css">
<link rel="stylesheet" type="text/css">
</head>
<meta name="robots" content="none"></meta></head>

<body>
<img alt="">
Expand Down Expand Up @@ -107,7 +107,7 @@ document.body.style.color = "red";
<title>Local HTML file</title>
<link rel="stylesheet" type="text/css">
<link rel="stylesheet" type="text/css">
</head>
<meta name="robots" content="none"></meta></head>

<body>
<img src="{empty_image}" alt="">
Expand Down Expand Up @@ -166,7 +166,7 @@ document.body.style.color = "red";
<title>Local HTML file</title>
<link rel="stylesheet" type="text/css">
<link rel="stylesheet" type="text/css">
</head>
<meta name="robots" content="none"></meta></head>

<body>
<img src="{empty_image}" alt="">
Expand Down Expand Up @@ -209,7 +209,7 @@ document.body.style.color = "red";
// STDOUT should contain HTML with data URL for background-image in it
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r##"<html><head></head><body><div style="background-image: url(&quot;data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=&quot;)"></div>
r##"<html><head><meta name="robots" content="none"></meta></head><body><div style="background-image: url(&quot;data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=&quot;)"></div>
</body></html>
"##
);
Expand Down Expand Up @@ -241,7 +241,7 @@ document.body.style.color = "red";
// STDOUT should contain HTML with one symbol extracted from SVG file
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r##"<html><head></head><body>
r##"<html><head><meta name="robots" content="none"></meta></head><body>
<button class="tm-votes-lever__button" data-test-id="votes-lever-upvote-button" title="Like" type="button">
<svg class="tm-svg-img tm-votes-lever__icon" height="24" width="24">
<title>Like</title>
Expand Down Expand Up @@ -283,7 +283,7 @@ document.body.style.color = "red";
// STDOUT should contain HTML with data URL of SVG file
assert_eq!(
String::from_utf8_lossy(&out.stdout),
r##"<html><head></head><body>
r##"<html><head><meta name="robots" content="none"></meta></head><body>
<svg height="24" width="24">
<image href="data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=" width="24" height="24">
</image></svg>
Expand Down Expand Up @@ -348,7 +348,7 @@ document.body.style.color = "red";
<title>Local HTML file</title>
<link href="data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNGRkY7Cn0K" rel="stylesheet" type="text/css" crossorigin="anonymous">
<link href="style.css" rel="stylesheet" type="text/css" crossorigin="anonymous">
</head>
<meta name="robots" content="none"></meta></head>

<body>
<p>This page should have black background and white foreground, but only when served via http: (not via file:)</p>
Expand Down
Loading