anki/rslib/src/text.rs
2023-05-29 14:46:02 +10:00

648 lines
19 KiB
Rust

// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::borrow::Cow;
use lazy_static::lazy_static;
use percent_encoding_iri::percent_decode_str;
use percent_encoding_iri::utf8_percent_encode;
use percent_encoding_iri::AsciiSet;
use percent_encoding_iri::CONTROLS;
use regex::Captures;
use regex::Regex;
use unicase::eq as uni_eq;
use unicode_normalization::char::is_combining_mark;
use unicode_normalization::is_nfc;
use unicode_normalization::is_nfkd_quick;
use unicode_normalization::IsNormalized;
use unicode_normalization::UnicodeNormalization;
/// Whitespace trimming for values that are consumed and handed back.
pub trait Trimming {
    /// Consumes `self` and returns it without leading or trailing whitespace.
    fn trim(self) -> Self;
}

impl Trimming for Cow<'_, str> {
    /// Trims surrounding whitespace.
    ///
    /// A borrowed value stays borrowed (it is merely re-sliced). An owned
    /// value that has nothing to trim is returned as is, so its buffer is
    /// reused; otherwise the trimmed part is copied into a new `String`.
    fn trim(self) -> Self {
        let owned = match self {
            Cow::Borrowed(slice) => return Cow::Borrowed(slice.trim()),
            Cow::Owned(owned) => owned,
        };
        let trimmed = owned.trim();
        // Equal lengths mean nothing was cut off at either end.
        if trimmed.len() == owned.len() {
            Cow::Owned(owned)
        } else {
            Cow::Owned(trimmed.to_owned())
        }
    }
}
/// Helpers for chaining transformations on a [Cow] without losing the
/// information whether anything was actually changed.
pub(crate) trait CowMapping<'a, B: ?Sized + 'a + ToOwned> {
    /// Applies `f` to the current value and returns
    /// - `self` unchanged, if `f` returns [Cow::Borrowed]
    /// - the new owned value, if `f` returns [Cow::Owned]
    fn map_cow(self, f: impl FnOnce(&B) -> Cow<B>) -> Self;
    /// Returns the owned value, or [None] if `self` is [Cow::Borrowed].
    fn get_owned(self) -> Option<B::Owned>;
}

impl<'a, B: ?Sized + 'a + ToOwned> CowMapping<'a, B> for Cow<'a, B> {
    fn map_cow(self, f: impl FnOnce(&B) -> Cow<B>) -> Self {
        match f(&self) {
            // The function produced new data: it supersedes `self`.
            Cow::Owned(replacement) => Cow::Owned(replacement),
            // Nothing changed: keep whatever we had (borrowed or owned).
            Cow::Borrowed(_) => self,
        }
    }

    fn get_owned(self) -> Option<B::Owned> {
        if let Cow::Owned(value) = self {
            Some(value)
        } else {
            None
        }
    }
}
/// Returns `s` without a single leading byte order mark (U+FEFF), or `s`
/// itself if it does not start with one.
pub(crate) fn strip_utf8_bom(s: &str) -> &str {
    match s.strip_prefix('\u{feff}') {
        Some(without_bom) => without_bom,
        None => s,
    }
}
/// An audio/video item referenced from card text.
///
/// NOTE(review): nothing in this part of the file constructs an `AvTag`;
/// the descriptions below are inferred from the variant/field names and
/// should be confirmed against the code that builds these values.
#[derive(Debug, PartialEq)]
pub enum AvTag {
/// Presumably the filename taken from a `[sound:...]` tag (which is also
/// used for videos).
SoundOrVideo(String),
/// Presumably the parsed contents of an `[anki:tts]...[/anki:tts]` block.
TextToSpeech {
/// The text to be spoken.
field_text: String,
/// The requested language.
lang: String,
/// Candidate voices for the request.
voices: Vec<String>,
/// Playback speed.
speed: f32,
/// Any further arguments that have no dedicated field.
other_args: Vec<String>,
},
}
// Regular expressions shared by the functions in this file. Each one is
// compiled on first use; `unwrap()` panics if a pattern fails to compile.
lazy_static! {
// Matches HTML comments, whole <style>/<script> elements (including their
// contents) and any other tag. `s` lets `.` span newlines, `i` ignores case.
static ref HTML: Regex = Regex::new(concat!(
"(?si)",
// wrapped text
r"(<!--.*?-->)|(<style.*?>.*?</style>)|(<script.*?>.*?</script>)",
// html tags
r"|(<.*?>)",
))
.unwrap();
// Matches opening or closing tags of the elements listed in the pattern.
// The tag name must be followed directly by `>`, so tags carrying
// attributes are not matched.
static ref HTML_LINEBREAK_TAGS: Regex = Regex::new(
r#"(?xsi)
</?
(?:
br|address|article|aside|blockquote|canvas|dd|div
|dl|dt|fieldset|figcaption|figure|footer|form
|h[1-6]|header|hr|li|main|nav|noscript|ol
|output|p|pre|section|table|tfoot|ul|video
)
>
"#
).unwrap();
// Matches a complete <img>, <audio> or <object> tag with a `src` or `data`
// attribute. Exactly one of the groups 1-3 captures the attribute value,
// depending on how it is quoted.
pub static ref HTML_MEDIA_TAGS: Regex = Regex::new(
r#"(?xsi)
# the start of the image, audio, or object tag
<\b(?:img|audio|object)\b[^>]+\b(?:src|data)\b=
(?:
# 1: double-quoted filename
"
([^"]+?)
"
[^>]*>
|
# 2: single-quoted filename
'
([^']+?)
'
[^>]*>
|
# 3: unquoted filename
([^ >]+?)
(?:
# then either a space and the rest
\x20[^>]*>
|
# or the tag immediately ends
>
)
)
"#
).unwrap();
// videos are also in sound tags
// Group 1 participates for `[sound:...]` tags; groups 2 and 3 participate
// for `[anki:tts][args]text[/anki:tts]` blocks.
static ref AV_TAGS: Regex = Regex::new(
r#"(?xs)
\[sound:(.+?)\] # 1 - the filename in a sound tag
|
\[anki:tts\]
\[(.*?)\] # 2 - arguments to tts call
(.*?) # 3 - field text
\[/anki:tts\]
"#).unwrap();
// Matches `<br>` (optionally self-closing, any case), `<div>` without
// attributes, and newline characters.
static ref PERSISTENT_HTML_SPACERS: Regex = Regex::new(r#"(?i)<br\s*/?>|<div>|\n"#).unwrap();
// Matches `[[type:...]]` with a non-empty body that contains no `]`.
static ref TYPE_TAG: Regex = Regex::new(r"\[\[type:[^]]+\]\]").unwrap();
// Matches `[sound:...]`; group 1 is the non-empty text after the colon.
pub(crate) static ref SOUND_TAG: Regex = Regex::new(r"\[sound:([^]]+)\]").unwrap();
/// Files included in CSS with a leading underscore.
// NOTE(review): the `.` before `css` is not escaped, so it matches any
// character - confirm whether `\.css` was intended.
// NOTE(review): `|` binds loosest inside the second non-capturing group, so
// the single-quoted alternative is not preceded by `url(` and only the
// unquoted alternative is followed by `\s*\)` - confirm this is intended.
static ref UNDERSCORED_CSS_IMPORTS: Regex = Regex::new(
r#"(?xi)
(?:@import\s+ # import statement with a bare
"(_[^"]*.css)" # double quoted
| # or
'(_[^']*.css)' # single quoted css filename
)
| # or
(?:url\(\s* # a url function with a
"(_[^"]+)" # double quoted
| # or
'(_[^']+)' # single quoted
| # or
(_.+) # unquoted filename
\s*\))
"#).unwrap();
/// Strings, src and data attributes with a leading underscore.
// Exactly one of the four groups captures the underscored name.
static ref UNDERSCORED_REFERENCES: Regex = Regex::new(
r#"(?x)
\[sound:(_[^]]+)\] # a filename in an Anki sound tag
| # or
"(_[^"]+)" # a double quoted
| # or
'(_[^']+)' # single quoted string
| # or
\b(?:src|data) # a 'src' or 'data' attribute
= # followed by
(_[^ >]+) # an unquoted value
"#).unwrap();
}
/// Returns true if `text` contains anything the `HTML` regex matches: an
/// HTML comment, a `<style>`/`<script>` element, or any `<...>` tag.
pub fn is_html(text: impl AsRef<str>) -> bool {
HTML.is_match(text.as_ref())
}
/// Reduces `html` to a single line of plain text.
///
/// In order: `<br>`, `<div>` and newlines become spaces, `[[type:...]]`
/// tags are dropped, `[sound:...]` tags are either dropped or reduced to
/// their filename, the remaining HTML is stripped, and the result is
/// trimmed. With `preserve_media_filenames` set, filenames of sound tags and
/// HTML media tags are kept in the output.
pub fn html_to_text_line(html: &str, preserve_media_filenames: bool) -> Cow<str> {
    let strip_tags: fn(&str) -> Cow<str> = if preserve_media_filenames {
        strip_html_preserving_media_filenames
    } else {
        strip_html
    };
    // "$1" expands to the captured filename, "" removes the tag entirely.
    let sound_replacement = if preserve_media_filenames { "$1" } else { "" };

    let spaced = PERSISTENT_HTML_SPACERS.replace_all(html, " ");
    let without_type_tags = spaced.map_cow(|s| TYPE_TAG.replace_all(s, ""));
    let without_sound_tags =
        without_type_tags.map_cow(|s| SOUND_TAG.replace_all(s, sound_replacement));
    without_sound_tags.map_cow(strip_tags).trim()
}
/// Removes HTML comments, style/script elements and tags from `html`, then
/// decodes any HTML entities in what is left.
pub fn strip_html(html: &str) -> Cow<str> {
    let without_tags = strip_html_preserving_entities(html);
    without_tags.map_cow(decode_entities)
}
/// Removes everything matched by the `HTML` regex (comments, style/script
/// elements and tags) from `html`. Entities such as `&amp;` are left as is.
pub fn strip_html_preserving_entities(html: &str) -> Cow<str> {
HTML.replace_all(html, "")
}
/// Decodes HTML entities in `html` and turns non-breaking spaces into
/// regular spaces.
///
/// The input is returned unchanged (borrowed) if it contains no `&`, or if
/// the decoder reports an error.
pub fn decode_entities(html: &str) -> Cow<str> {
    // Without an ampersand there can be no entity, so skip the decoder.
    if !html.contains('&') {
        return Cow::Borrowed(html);
    }
    if let Ok(decoded) = htmlescape::decode_html(html) {
        Cow::Owned(decoded.replace('\u{a0}', " "))
    } else {
        Cow::Borrowed(html)
    }
}
/// Replaces every `\n` in `text` with a space. Text without newlines is
/// returned borrowed, so no allocation takes place.
pub(crate) fn newlines_to_spaces(text: &str) -> Cow<str> {
    match text.find('\n') {
        Some(_) => Cow::Owned(text.replace('\n', " ")),
        None => Cow::Borrowed(text),
    }
}
/// Strips HTML from `html` for text-to-speech use: attribute-less
/// line-breaking tags (`<br>`, `<div>`, `<p>`, ...) are first replaced with
/// spaces, then the remaining HTML is removed and entities are decoded.
pub fn strip_html_for_tts(html: &str) -> Cow<str> {
    let spaced = HTML_LINEBREAK_TAGS.replace_all(html, " ");
    spaced.map_cow(strip_html)
}
/// Shortens `s` to at most `max` bytes without splitting a character.
///
/// If `max` falls inside a multi-byte character, the cut is moved back to
/// the start of that character. Strings no longer than `max` bytes are left
/// untouched.
pub(crate) fn truncate_to_char_boundary(s: &mut String, max: usize) {
    if max >= s.len() {
        return;
    }
    // Index 0 is always a boundary, so the search cannot come up empty.
    let boundary = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    s.truncate(boundary);
}
/// A media reference found in a piece of text, as produced by
/// `extract_media_refs`. The string slices borrow from the searched text.
#[derive(Debug)]
pub(crate) struct MediaRef<'a> {
/// The complete matched tag, e.g. the whole `<img ...>` or `[sound:...]`.
pub full_ref: &'a str,
/// The filename exactly as written inside the tag.
pub fname: &'a str,
/// audio files may have things like &amp; that need decoding
pub fname_decoded: Cow<'a, str>,
}
/// Collects all media references in `text`.
///
/// References from HTML media tags (`<img>`, `<audio>`, `<object>`) come
/// first, followed by those from `[sound:...]` tags; each group is in order
/// of appearance. `[anki:tts]` blocks are matched by the regex but do not
/// yield a reference.
pub(crate) fn extract_media_refs(text: &str) -> Vec<MediaRef> {
    let html_refs = HTML_MEDIA_TAGS.captures_iter(text).map(|caps| {
        // Exactly one of the three filename groups takes part in a match.
        let fname = (1..=3).find_map(|idx| caps.get(idx)).unwrap().as_str();
        MediaRef {
            full_ref: caps.get(0).unwrap().as_str(),
            fname,
            fname_decoded: decode_entities(fname),
        }
    });
    let sound_refs = AV_TAGS.captures_iter(text).filter_map(|caps| {
        // Group 1 is only set for sound tags, not for tts blocks.
        let fname = caps.get(1)?.as_str();
        Some(MediaRef {
            full_ref: caps.get(0).unwrap().as_str(),
            fname,
            fname_decoded: decode_entities(fname),
        })
    });
    html_refs.chain(sound_refs).collect()
}
/// Calls `replacer` for every media reference in `text`, and optionally
/// replaces it with something else. [None] if no reference was found.
///
/// `replacer` receives the entity-decoded filename. Returning `Some(name)`
/// substitutes `name` for the filename inside the tag, returning `None`
/// keeps the tag as it is. If decoding changed the old filename (i.e. it
/// contained entities), the new name is HTML-escaped before insertion.
///
/// Only the captured filename itself is rewritten: other parts of the tag
/// that happen to contain the same text (the tag name, another attribute
/// such as `alt`, ...) are left untouched.
///
/// NOTE(review): for `[anki:tts]` matches the first participating group is
/// the tts argument list, so `replacer` is also invoked with that text -
/// confirm this is intended.
pub fn replace_media_refs(
    text: &str,
    mut replacer: impl FnMut(&str) -> Option<String>,
) -> Option<String> {
    let mut rep = |caps: &Captures| {
        let whole = caps.get(0).unwrap();
        let whole_str = whole.as_str();
        // Exactly one capture group holds the name; take the first one set.
        let name = caps.iter().skip(1).find_map(|g| g).unwrap();
        let old_name_decoded = decode_entities(name.as_str());
        match replacer(&old_name_decoded) {
            Some(mut new_name) => {
                // The old name was stored escaped, so store the new one
                // escaped as well.
                if matches!(old_name_decoded, Cow::Owned(_)) {
                    new_name = htmlescape::encode_minimal(&new_name);
                }
                // Splice at the capture's position relative to the start of
                // the tag, instead of replacing every textual occurrence.
                let start = name.start() - whole.start();
                let end = name.end() - whole.start();
                format!("{}{}{}", &whole_str[..start], new_name, &whole_str[end..])
            }
            None => whole_str.to_owned(),
        }
    };
    HTML_MEDIA_TAGS
        .replace_all(text, &mut rep)
        .map_cow(|s| AV_TAGS.replace_all(s, &mut rep))
        .get_owned()
}
pub(crate) fn extract_underscored_css_imports(text: &str) -> Vec<&str> {
UNDERSCORED_CSS_IMPORTS
.captures_iter(text)
.map(extract_match)
.collect()
}
pub(crate) fn extract_underscored_references(text: &str) -> Vec<&str> {
UNDERSCORED_REFERENCES
.captures_iter(text)
.map(extract_match)
.collect()
}
/// Returns the text of the first capture group that took part in the match
/// (group 0, the whole match, is not considered).
///
/// Meant for regexes in which exactly one group matches.
///
/// # Panics
///
/// Panics if none of the capture groups took part in the match.
fn extract_match(caps: Captures) -> &str {
    let first_participating = caps.iter().skip(1).flatten().next();
    first_participating.unwrap().as_str()
}
/// Like `strip_html`, but media tags (`<img>`, `<audio>`, `<object>`) are
/// first replaced with their filename surrounded by spaces, so that the
/// filename survives in the output.
pub fn strip_html_preserving_media_filenames(html: &str) -> Cow<str> {
    // Only one of the three groups is non-empty for any given match.
    let with_filenames = HTML_MEDIA_TAGS.replace_all(html, r" ${1}${2}${3} ");
    with_filenames.map_cow(strip_html)
}
/// Returns true if `html` contains an `<img>`, `<audio>` or `<object>` tag
/// with a `src` or `data` attribute, as matched by `HTML_MEDIA_TAGS`.
pub fn contains_media_tag(html: &str) -> bool {
HTML_MEDIA_TAGS.is_match(html)
}
/// Sanitizes `html` by passing it through `ammonia::clean`, i.e. with
/// ammonia's default settings.
#[allow(dead_code)]
pub(crate) fn sanitize_html(html: &str) -> String {
ammonia::clean(html)
}
/// Sanitizes `html` with ammonia's default settings, except that `img` is
/// removed from the set of allowed tags.
pub(crate) fn sanitize_html_no_images(html: &str) -> String {
    let mut cleaner = ammonia::Builder::default();
    cleaner.rm_tags(&["img"]);
    cleaner.clean(html).to_string()
}
/// Returns `s` in Unicode NFC form. Input that is already normalized is
/// returned borrowed.
pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
    if is_nfc(s) {
        Cow::Borrowed(s)
    } else {
        Cow::Owned(s.chars().nfc().collect::<String>())
    }
}
/// Replaces the contents of `s` with its Unicode NFC form. Strings that are
/// already normalized are not touched.
pub(crate) fn ensure_string_in_nfc(s: &mut String) {
    if is_nfc(s) {
        return;
    }
    let normalized: String = s.chars().nfc().collect();
    *s = normalized;
}
/// Converts `s` to NFKD form and drops all combining marks.
///
/// If the quick check confirms that `s` is already in NFKD form and it
/// contains no combining marks, it is returned borrowed.
pub(crate) fn without_combining(s: &str) -> Cow<str> {
    let already_nfkd = matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes);
    if already_nfkd && !s.chars().any(is_combining_mark) {
        // Nothing to decompose and nothing to strip.
        return Cow::Borrowed(s);
    }
    // Decompose first, so that precomposed characters give up their marks.
    let stripped: String = s
        .chars()
        .nfkd()
        .filter(|c| !is_combining_mark(*c))
        .collect();
    Cow::Owned(stripped)
}
/// Check if string contains an unescaped wildcard.
///
/// The wildcards are `*` and `_`. A wildcard counts as unescaped if it is
/// preceded by an even number of backslashes (including none), because
/// every pair of backslashes is an escaped backslash.
pub(crate) fn is_glob(txt: &str) -> bool {
// even number of \s followed by a wildcard
lazy_static! {
static ref RE: Regex = Regex::new(
r#"(?x)
(?:^|[^\\]) # not a backslash
(?:\\\\)* # even number of backslashes
[*_] # wildcard
"#
)
.unwrap();
}
RE.is_match(txt)
}
/// Convert to a RegEx respecting Anki wildcards.
///
/// Shorthand for `to_custom_re` with `.` as the wildcard, so `_` matches
/// any single character and `*` any run of characters.
pub(crate) fn to_re(txt: &str) -> Cow<str> {
to_custom_re(txt, ".")
}
/// Converts Anki-style search text into a regular expression, using
/// `wildcard` as the pattern for a single wildcard character.
///
/// - `_` becomes `wildcard`, and `*` becomes `wildcard` followed by `*`
/// - `\_` becomes a literal `_`
/// - `\\` and `\*` are already valid regex escapes and are kept
/// - everything else is escaped so that it matches literally
pub(crate) fn to_custom_re<'a>(txt: &'a str, wildcard: &str) -> Cow<'a, str> {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"\\?.").unwrap();
    }
    // Each match is one character, optionally preceded by a backslash.
    RE.replace_all(txt, |caps: &Captures| -> String {
        match &caps[0] {
            "_" => wildcard.to_owned(),
            "*" => format!("{wildcard}*"),
            r"\_" => String::from("_"),
            kept @ (r"\\" | r"\*") => kept.to_owned(),
            other => regex::escape(other),
        }
    })
}
/// Converts Anki-style search text into a SQL `LIKE` pattern.
///
/// An unescaped `*` becomes `%`, a literal `%` is escaped as `\%`, and `\*`
/// becomes a plain `*`. `\\` is kept as is. All other text, including `_`,
/// passes through untouched.
pub(crate) fn to_sql(txt: &str) -> Cow<str> {
    // Matches only the sequences listed in the `match` below.
    lazy_static! {
        static ref RE: Regex = Regex::new(r"\\[\\*]|[*%]").unwrap();
    }
    RE.replace_all(txt, |caps: &Captures| match &caps[0] {
        "*" => "%",
        "%" => r"\%",
        r"\*" => "*",
        r"\\" => r"\\",
        _ => unreachable!(),
    })
}
/// Unescape everything.
///
/// Every backslash followed by a character (other than a newline, which
/// `.` does not match) is replaced with just that character.
pub(crate) fn to_text(txt: &str) -> Cow<str> {
lazy_static! {
static ref RE: Regex = Regex::new(r"\\(.)").unwrap();
}
RE.replace_all(txt, "$1")
}
/// Puts a backslash in front of every `\`, `*` and `_` in `txt`, i.e. the
/// Anki wildcards and the character used for escaping them.
pub(crate) fn escape_anki_wildcards(txt: &str) -> String {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"[\\*_]").unwrap();
    }
    // `$0` expands to the matched character itself.
    let escaped = RE.replace_all(txt, r"\$0");
    escaped.into_owned()
}
/// Escapes Anki wildcards like `escape_anki_wildcards`, except that the
/// exact text `_*` is passed through unescaped.
pub(crate) fn escape_anki_wildcards_for_search_node(txt: &str) -> String {
    match txt {
        "_*" => txt.to_string(),
        _ => escape_anki_wildcards(txt),
    }
}
/// Return a function to match input against `search`,
/// which may contain wildcards.
pub(crate) fn glob_matcher(search: &str) -> impl Fn(&str) -> bool + '_ {
let mut regex = None;
let mut cow = None;
if is_glob(search) {
regex = Some(Regex::new(&format!("^(?i){}$", to_re(search))).unwrap());
} else {
cow = Some(to_text(search));
}
move |text| {
if let Some(r) = &regex {
r.is_match(text)
} else {
uni_eq(text, cow.as_ref().unwrap())
}
}
}
lazy_static! {
// Matches text that starts with `http://` or `https://` (case-insensitive).
pub(crate) static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap();
}
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
///
/// The characters that get percent-encoded: control characters, space,
/// `"`, `<`, `>`, `` ` `` and `#`. As the name suggests, this is the
/// fragment set from the link above combined with the query set of the same
/// specification, which contributes `#`.
const FRAGMENT_QUERY_UNION: &AsciiSet = &CONTROLS
.add(b' ')
.add(b'"')
.add(b'<')
.add(b'>')
.add(b'`')
.add(b'#');
/// IRI-encode unescaped local paths in HTML fragment.
///
/// The filename of every `<img>`, `<audio>` and `<object>` tag is
/// percent-encoded using `FRAGMENT_QUERY_UNION`. References starting with
/// `http://` or `https://` are left alone.
pub(crate) fn encode_iri_paths(unescaped_html: &str) -> Cow<str> {
transform_html_paths(unescaped_html, |fname| {
utf8_percent_encode(fname, FRAGMENT_QUERY_UNION).into()
})
}
/// URI-decode escaped local paths in HTML fragment.
///
/// The filename of every `<img>`, `<audio>` and `<object>` tag is
/// percent-decoded; byte sequences that are not valid UTF-8 are decoded
/// lossily. References starting with `http://` or `https://` are left
/// alone.
pub(crate) fn decode_iri_paths(escaped_html: &str) -> Cow<str> {
transform_html_paths(escaped_html, |fname| {
percent_decode_str(fname).decode_utf8_lossy()
})
}
/// Apply a transform to local filename references in tags like IMG.
/// Required at display time, as Anki unfortunately stores the references
/// in unencoded form in the database.
///
/// `transform` is called with the filename of every `<img>`, `<audio>` and
/// `<object>` tag whose reference is not remote (`http://` or `https://`),
/// and its result replaces the filename inside the tag. Only the captured
/// filename is rewritten: other parts of the tag that happen to contain the
/// same text (the tag name, an `alt` attribute, ...) are left untouched.
///
/// Returns the input borrowed if it contains no media tags.
fn transform_html_paths<F>(html: &str, transform: F) -> Cow<str>
where
    F: Fn(&str) -> Cow<str>,
{
    HTML_MEDIA_TAGS.replace_all(html, |caps: &Captures| {
        let whole = caps.get(0).unwrap();
        // Exactly one of the three filename groups takes part in a match.
        let name = caps
            .get(1)
            .or_else(|| caps.get(2))
            .or_else(|| caps.get(3))
            .unwrap();
        let full = whole.as_str();
        let fname = name.as_str();
        if REMOTE_FILENAME.is_match(fname) {
            full.to_owned()
        } else {
            // Splice at the capture's position relative to the start of the
            // tag, instead of replacing every textual occurrence.
            let start = name.start() - whole.start();
            let end = name.end() - whole.start();
            format!("{}{}{}", &full[..start], transform(fname), &full[end..])
        }
    })
}
// Unit tests for the text helpers in this file.
#[cfg(test)]
mod test {
use std::borrow::Cow;
use super::*;
// Tag stripping, with and without keeping media filenames.
#[test]
fn stripping() {
assert_eq!(strip_html("test"), "test");
assert_eq!(strip_html("t<b>e</b>st"), "test");
assert_eq!(strip_html("so<SCRIPT>t<b>e</b>st</script>me"), "some");
assert_eq!(
strip_html_preserving_media_filenames("<img src=foo.jpg>"),
" foo.jpg "
);
assert_eq!(
strip_html_preserving_media_filenames("<img src='foo.jpg'><html>"),
" foo.jpg "
);
assert_eq!(strip_html_preserving_media_filenames("<html>"), "");
}
// Text without combining marks must be returned without allocating.
#[test]
fn combining() {
assert!(matches!(without_combining("test"), Cow::Borrowed(_)));
assert!(matches!(without_combining("Über"), Cow::Owned(_)));
}
// Wildcard conversion to regex/SQL/plain text, and glob detection.
#[test]
fn conversion() {
assert_eq!(&to_re(r"[te\*st]"), r"\[te\*st\]");
assert_eq!(&to_custom_re("f_o*", r"\d"), r"f\do\d*");
assert_eq!(&to_sql("%f_o*"), r"\%f_o%");
assert_eq!(&to_text(r"\*\_*_"), "*_*_");
// an even number of backslashes leaves the wildcard unescaped
assert!(is_glob(r"\\\\_"));
assert!(!is_glob(r"\\\_"));
assert!(glob_matcher(r"foo\*bar*")("foo*bar123"));
}
// Underscored filenames in CSS and in HTML/script references.
#[test]
fn extracting() {
assert_eq!(
extract_underscored_css_imports(concat!(
"@IMPORT '_foo.css'\n",
"@import \"_bar.css\"\n",
"@import '_baz.css'\n",
"@import 'nope.css'\n",
"url(_foo.css)\n",
"URL(\"_bar.css\")\n",
"@import url('_baz.css')\n",
"url('nope.css')\n",
)),
vec!["_foo.css", "_bar.css", "_baz.css", "_foo.css", "_bar.css", "_baz.css",]
);
assert_eq!(
extract_underscored_references(concat!(
"<img src=\"_foo.jpg\">",
"<object data=\"_bar\">",
"\"_baz.js\"",
"\"nope.js\"",
"<img src=_foo.jpg>",
"<object data=_bar>",
"'_baz.js'",
)),
vec!["_foo.jpg", "_bar", "_baz.js", "_foo.jpg", "_bar", "_baz.js",]
);
}
// The replacer may rename some references and leave others alone.
#[test]
fn replacing() {
assert_eq!(
&replace_media_refs("<img src=foo.jpg>[sound:bar.mp3]<img src=baz.jpg>", |s| {
(s != "baz.jpg").then(|| "spam".to_string())
})
.unwrap(),
"<img src=spam>[sound:spam]<img src=baz.jpg>",
);
}
// Truncation must never split a multi-byte character.
#[test]
fn truncate() {
let mut s = "日本語".to_string();
truncate_to_char_boundary(&mut s, 6);
assert_eq!(&s, "日本");
let mut s = "日本語".to_string();
truncate_to_char_boundary(&mut s, 1);
assert_eq!(&s, "");
}
// Only characters from the encode set (e.g. space) are percent-encoded.
#[test]
fn iri_encoding() {
for (input, output) in [
("foo.jpg", "foo.jpg"),
("bar baz", "bar%20baz"),
("sub/path.jpg", "sub/path.jpg"),
("日本語", "日本語"),
("a=b", "a=b"),
("a&b", "a&b"),
] {
assert_eq!(
&encode_iri_paths(&format!("<img src=\"{input}\">")),
&format!("<img src=\"{output}\">")
);
}
}
}