// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html

use std::borrow::Cow;

use lazy_static::lazy_static;
use percent_encoding_iri::percent_decode_str;
use percent_encoding_iri::utf8_percent_encode;
use percent_encoding_iri::AsciiSet;
use percent_encoding_iri::CONTROLS;
use regex::Captures;
use regex::Regex;
use unicase::eq as uni_eq;
use unicode_normalization::char::is_combining_mark;
use unicode_normalization::is_nfc;
use unicode_normalization::is_nfkd_quick;
use unicode_normalization::IsNormalized;
use unicode_normalization::UnicodeNormalization;

pub trait Trimming {
    fn trim(self) -> Self;
}

impl Trimming for Cow<'_, str> {
    fn trim(self) -> Self {
        match self {
            Cow::Borrowed(text) => text.trim().into(),
            Cow::Owned(text) => {
                let trimmed = text.as_str().trim();
                if trimmed.len() == text.len() {
                    text.into()
                } else {
                    trimmed.to_string().into()
                }
            }
        }
    }
}

pub(crate) trait CowMapping<'a, B: ?Sized + 'a + ToOwned> {
    /// Returns [self]
    /// - unchanged, if the given function returns [Cow::Borrowed]
    /// - with the new value, if the given function returns [Cow::Owned]
    fn map_cow(self, f: impl FnOnce(&B) -> Cow<B>) -> Self;
    fn get_owned(self) -> Option<B::Owned>;
}

impl<'a, B: ?Sized + 'a + ToOwned> CowMapping<'a, B> for Cow<'a, B> {
    fn map_cow(self, f: impl FnOnce(&B) -> Cow<B>) -> Self {
        if let Cow::Owned(o) = f(&self) {
            Cow::Owned(o)
        } else {
            self
        }
    }

    fn get_owned(self) -> Option<B::Owned> {
        match self {
            Cow::Borrowed(_) => None,
            Cow::Owned(s) => Some(s),
        }
    }
}

pub(crate) fn strip_utf8_bom(s: &str) -> &str {
    s.strip_prefix('\u{feff}').unwrap_or(s)
}

#[derive(Debug, PartialEq)]
pub enum AvTag {
    SoundOrVideo(String),
    TextToSpeech {
        field_text: String,
        lang: String,
        voices: Vec<String>,
        speed: f32,
        other_args: Vec<String>,
    },
}

lazy_static! {
    static ref HTML: Regex = Regex::new(concat!(
        "(?si)",
        // wrapped text
        r"(<!--.*?-->)|(<style.*?>.*?</style>)|(<script.*?>.*?</script>)",
        // html tags
        r"|(<.*?>)",
    ))
    .unwrap();

    static ref HTML_LINEBREAK_TAGS: Regex = Regex::new(
        r#"(?xsi)
            </?
            (?:
                br|address|article|aside|blockquote|canvas|dd|div
                |dl|dt|fieldset|figcaption|figure|footer|form
                |h[1-6]|header|hr|li|main|nav|noscript|ol
                |output|p|pre|section|table|tfoot|ul|video
            )
            >
        "#
    ).unwrap();

    pub static ref HTML_MEDIA_TAGS: Regex = Regex::new(
        r#"(?xsi)
            # the start of the image, audio, or object tag
            <\b(?:img|audio|video|object)\b[^>]+\b(?:src|data)\b=
            (?:
                    # 1: double-quoted filename
                    "
                    ([^"]+?)
                    "
                    [^>]*>
                |
                    # 2: single-quoted filename
                    '
                    ([^']+?)
                    '
                    [^>]*>
                |
                    # 3: unquoted filename
                    ([^ >]+?)
                    (?:
                        # then either a space and the rest
                        \x20[^>]*>
                        |
                        # or the tag immediately ends
                        >
                    )
            )
        "#
    ).unwrap();

    // videos are also in sound tags
    static ref AV_TAGS: Regex = Regex::new(
        r"(?xs)
            \[sound:(.+?)\]     # 1 - the filename in a sound tag
            |
            \[anki:tts\]
                \[(.*?)\]       # 2 - arguments to tts call
                (.*?)           # 3 - field text
            \[/anki:tts\]
            "
    ).unwrap();

    static ref PERSISTENT_HTML_SPACERS: Regex = Regex::new(r"(?i)<br\s*/?>|<div>|\n").unwrap();
|\n").unwrap(); static ref TYPE_TAG: Regex = Regex::new(r"\[\[type:[^]]+\]\]").unwrap(); pub(crate) static ref SOUND_TAG: Regex = Regex::new(r"\[sound:([^]]+)\]").unwrap(); /// Files included in CSS with a leading underscore. static ref UNDERSCORED_CSS_IMPORTS: Regex = Regex::new( r#"(?xi) (?:@import\s+ # import statement with a bare "(_[^"]*.css)" # double quoted | # or '(_[^']*.css)' # single quoted css filename ) | # or (?:url\(\s* # a url function with a "(_[^"]+)" # double quoted | # or '(_[^']+)' # single quoted | # or (_.+) # unquoted filename \s*\)) "#).unwrap(); /// Strings, src and data attributes with a leading underscore. static ref UNDERSCORED_REFERENCES: Regex = Regex::new( r#"(?x) \[sound:(_[^]]+)\] # a filename in an Anki sound tag | # or "(_[^"]+)" # a double quoted | # or '(_[^']+)' # single quoted string | # or \b(?:src|data) # a 'src' or 'data' attribute = # followed by (_[^ >]+) # an unquoted value "#).unwrap(); } pub fn is_html(text: impl AsRef) -> bool { HTML.is_match(text.as_ref()) } pub fn html_to_text_line(html: &str, preserve_media_filenames: bool) -> Cow { let (html_stripper, sound_rep): (fn(&str) -> Cow, _) = if preserve_media_filenames { (strip_html_preserving_media_filenames, "$1") } else { (strip_html, "") }; PERSISTENT_HTML_SPACERS .replace_all(html, " ") .map_cow(|s| TYPE_TAG.replace_all(s, "")) .map_cow(|s| SOUND_TAG.replace_all(s, sound_rep)) .map_cow(html_stripper) .trim() } pub fn strip_html(html: &str) -> Cow { strip_html_preserving_entities(html).map_cow(decode_entities) } pub fn strip_html_preserving_entities(html: &str) -> Cow { HTML.replace_all(html, "") } pub fn decode_entities(html: &str) -> Cow { if html.contains('&') { match htmlescape::decode_html(html) { Ok(text) => text.replace('\u{a0}', " ").into(), Err(_) => html.into(), } } else { // nothing to do html.into() } } pub(crate) fn newlines_to_spaces(text: &str) -> Cow { if text.contains('\n') { text.replace('\n', " ").into() } else { text.into() } } pub fn strip_html_for_tts(html: &str) -> Cow { HTML_LINEBREAK_TAGS .replace_all(html, " ") .map_cow(strip_html) } /// Truncate a String on a valid UTF8 boundary. pub(crate) fn truncate_to_char_boundary(s: &mut String, mut max: usize) { if max >= s.len() { return; } while !s.is_char_boundary(max) { max -= 1; } s.truncate(max); } #[derive(Debug)] pub(crate) struct MediaRef<'a> { pub full_ref: &'a str, pub fname: &'a str, /// audio files may have things like & that need decoding pub fname_decoded: Cow<'a, str>, } pub(crate) fn extract_media_refs(text: &str) -> Vec { let mut out = vec![]; for caps in HTML_MEDIA_TAGS.captures_iter(text) { let fname = caps .get(1) .or_else(|| caps.get(2)) .or_else(|| caps.get(3)) .unwrap() .as_str(); let fname_decoded = decode_entities(fname); out.push(MediaRef { full_ref: caps.get(0).unwrap().as_str(), fname, fname_decoded, }); } for caps in AV_TAGS.captures_iter(text) { if let Some(m) = caps.get(1) { let fname = m.as_str(); let fname_decoded = decode_entities(fname); out.push(MediaRef { full_ref: caps.get(0).unwrap().as_str(), fname, fname_decoded, }); } } out } /// Calls `replacer` for every media reference in `text`, and optionally /// replaces it with something else. [None] if no reference was found. 
pub fn replace_media_refs(
    text: &str,
    mut replacer: impl FnMut(&str) -> Option<String>,
) -> Option<String> {
    let mut rep = |caps: &Captures| {
        let whole_match = caps.get(0).unwrap().as_str();
        let old_name = caps.iter().skip(1).find_map(|g| g).unwrap().as_str();
        let old_name_decoded = decode_entities(old_name);

        if let Some(mut new_name) = replacer(&old_name_decoded) {
            if matches!(old_name_decoded, Cow::Owned(_)) {
                new_name = htmlescape::encode_minimal(&new_name);
            }
            whole_match.replace(old_name, &new_name)
        } else {
            whole_match.to_owned()
        }
    };

    HTML_MEDIA_TAGS
        .replace_all(text, &mut rep)
        .map_cow(|s| AV_TAGS.replace_all(s, &mut rep))
        .get_owned()
}

pub(crate) fn extract_underscored_css_imports(text: &str) -> Vec<&str> {
    UNDERSCORED_CSS_IMPORTS
        .captures_iter(text)
        .map(extract_match)
        .collect()
}

pub(crate) fn extract_underscored_references(text: &str) -> Vec<&str> {
    UNDERSCORED_REFERENCES
        .captures_iter(text)
        .map(extract_match)
        .collect()
}

/// Returns the first matching group as a str. This is intended for regexes
/// where exactly one group matches, and will panic for matches without
/// matching groups.
fn extract_match(caps: Captures) -> &str {
    caps.iter().skip(1).find_map(|g| g).unwrap().as_str()
}

pub fn strip_html_preserving_media_filenames(html: &str) -> Cow<str> {
    HTML_MEDIA_TAGS
        .replace_all(html, r" ${1}${2}${3} ")
        .map_cow(strip_html)
}

pub fn contains_media_tag(html: &str) -> bool {
    HTML_MEDIA_TAGS.is_match(html)
}

#[allow(dead_code)]
pub(crate) fn sanitize_html(html: &str) -> String {
    ammonia::clean(html)
}

pub(crate) fn sanitize_html_no_images(html: &str) -> String {
    ammonia::Builder::default()
        .rm_tags(&["img"])
        .clean(html)
        .to_string()
}

pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
    if !is_nfc(s) {
        s.chars().nfc().collect::<String>().into()
    } else {
        s.into()
    }
}

pub(crate) fn ensure_string_in_nfc(s: &mut String) {
    if !is_nfc(s) {
        *s = s.chars().nfc().collect()
    }
}

/// Convert provided string to NFKD form and strip combining characters.
pub(crate) fn without_combining(s: &str) -> Cow<str> {
    // if the string is already normalized
    if matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes) {
        // and no combining characters found, return unchanged
        if !s.chars().any(is_combining_mark) {
            return s.into();
        }
    }

    // we need to create a new string without the combining marks
    s.chars()
        .nfkd()
        .filter(|c| !is_combining_mark(*c))
        .collect::<String>()
        .into()
}

/// Check if string contains an unescaped wildcard.
pub(crate) fn is_glob(txt: &str) -> bool {
    // even number of \s followed by a wildcard
    lazy_static! {
        static ref RE: Regex = Regex::new(
            r"(?x)
                (?:^|[^\\])     # not a backslash
                (?:\\\\)*       # even number of backslashes
                [*_]            # wildcard
            "
        )
        .unwrap();
    }

    RE.is_match(txt)
}

/// Convert to a RegEx respecting Anki wildcards.
pub(crate) fn to_re(txt: &str) -> Cow<str> {
    to_custom_re(txt, ".")
}

/// Convert Anki style to RegEx using the provided wildcard.
pub(crate) fn to_custom_re<'a>(txt: &'a str, wildcard: &str) -> Cow<'a, str> {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"\\?.").unwrap();
    }
    RE.replace_all(txt, |caps: &Captures| {
        let s = &caps[0];
        match s {
            r"\\" | r"\*" => s.to_string(),
            r"\_" => "_".to_string(),
            "*" => format!("{}*", wildcard),
            "_" => wildcard.to_string(),
            s => regex::escape(s),
        }
    })
}

/// Convert to SQL respecting Anki wildcards.
pub(crate) fn to_sql(txt: &str) -> Cow<str> {
    // escape sequences and unescaped special characters which need conversion
    lazy_static! {
        static ref RE: Regex = Regex::new(r"\\[\\*]|[*%]").unwrap();
    }
    RE.replace_all(txt, |caps: &Captures| {
        let s = &caps[0];
        match s {
            r"\\" => r"\\",
            r"\*" => "*",
            "*" => "%",
            "%" => r"\%",
            _ => unreachable!(),
        }
    })
}

/// Unescape everything.
pub(crate) fn to_text(txt: &str) -> Cow<str> {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"\\(.)").unwrap();
    }
    RE.replace_all(txt, "$1")
}

/// Escape Anki wildcards and the backslash for escaping them: \*_
pub(crate) fn escape_anki_wildcards(txt: &str) -> String {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"[\\*_]").unwrap();
    }
    RE.replace_all(txt, r"\$0").into()
}

/// Escape Anki wildcards unless it's _*
pub(crate) fn escape_anki_wildcards_for_search_node(txt: &str) -> String {
    if txt == "_*" {
        txt.to_string()
    } else {
        escape_anki_wildcards(txt)
    }
}

/// Return a function to match input against `search`,
/// which may contain wildcards.
pub(crate) fn glob_matcher(search: &str) -> impl Fn(&str) -> bool + '_ {
    let mut regex = None;
    let mut cow = None;

    if is_glob(search) {
        regex = Some(Regex::new(&format!("^(?i){}$", to_re(search))).unwrap());
    } else {
        cow = Some(to_text(search));
    }

    move |text| {
        if let Some(r) = &regex {
            r.is_match(text)
        } else {
            uni_eq(text, cow.as_ref().unwrap())
        }
    }
}

lazy_static! {
    pub(crate) static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap();
}

/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT_QUERY_UNION: &AsciiSet = &CONTROLS
    .add(b' ')
    .add(b'"')
    .add(b'<')
    .add(b'>')
    .add(b'`')
    .add(b'#');

/// IRI-encode unescaped local paths in HTML fragment.
pub(crate) fn encode_iri_paths(unescaped_html: &str) -> Cow<str> {
    transform_html_paths(unescaped_html, |fname| {
        utf8_percent_encode(fname, FRAGMENT_QUERY_UNION).into()
    })
}

/// URI-decode escaped local paths in HTML fragment.
pub(crate) fn decode_iri_paths(escaped_html: &str) -> Cow<str> {
    transform_html_paths(escaped_html, |fname| {
        percent_decode_str(fname).decode_utf8_lossy()
    })
}

/// Apply a transform to local filename references in tags like IMG.
/// Required at display time, as Anki unfortunately stores the references
/// in unencoded form in the database.
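///
/// A hedged illustration via the `encode_iri_paths` wrapper above (an added
/// example, marked `ignore` so it is not compiled as a doctest):
///
/// ```ignore
/// assert_eq!(
///     encode_iri_paths(r#"<img src="hello world.jpg">"#),
///     r#"<img src="hello%20world.jpg">"#
/// );
/// ```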
fn transform_html_paths<F>(html: &str, transform: F) -> Cow<str>
where
    F: Fn(&str) -> Cow<str>,
{
    HTML_MEDIA_TAGS.replace_all(html, |caps: &Captures| {
        let fname = caps
            .get(1)
            .or_else(|| caps.get(2))
            .or_else(|| caps.get(3))
            .unwrap()
            .as_str();
        let full = caps.get(0).unwrap().as_str();

        if REMOTE_FILENAME.is_match(fname) {
            full.into()
        } else {
            full.replace(fname, &transform(fname))
        }
    })
}

#[cfg(test)]
mod test {
    use std::borrow::Cow;

    use super::*;

    #[test]
    fn stripping() {
        assert_eq!(strip_html("test"), "test");
        assert_eq!(strip_html("t<b>e</b>st"), "test");
        assert_eq!(strip_html("so<SCRIPT>t<b>e</b>st</script>me"), "some");
        assert_eq!(
            strip_html_preserving_media_filenames("<img src=foo.jpg>"),
            " foo.jpg "
        );
        assert_eq!(
            strip_html_preserving_media_filenames("<img src='foo.jpg'><html>"),
            " foo.jpg "
        );
        assert_eq!(strip_html_preserving_media_filenames("<html>"), "");
    }

    #[test]
    fn combining() {
        assert!(matches!(without_combining("test"), Cow::Borrowed(_)));
        assert!(matches!(without_combining("Über"), Cow::Owned(_)));
    }

    #[test]
    fn conversion() {
        assert_eq!(&to_re(r"[te\*st]"), r"\[te\*st\]");
        assert_eq!(&to_custom_re("f_o*", r"\d"), r"f\do\d*");
        assert_eq!(&to_sql("%f_o*"), r"\%f_o%");
        assert_eq!(&to_text(r"\*\_*_"), "*_*_");
        assert!(is_glob(r"\\\\_"));
        assert!(!is_glob(r"\\\_"));
        assert!(glob_matcher(r"foo\*bar*")("foo*bar123"));
    }

    #[test]
    fn extracting() {
        assert_eq!(
            extract_underscored_css_imports(concat!(
                "@IMPORT '_foo.css'\n",
                "@import \"_bar.css\"\n",
                "@import '_baz.css'\n",
                "@import 'nope.css'\n",
                "url(_foo.css)\n",
                "URL(\"_bar.css\")\n",
                "@import url('_baz.css')\n",
                "url('nope.css')\n",
            )),
            vec!["_foo.css", "_bar.css", "_baz.css", "_foo.css", "_bar.css", "_baz.css",]
        );
        assert_eq!(
            extract_underscored_references(concat!(
                "<img src=\"_foo.jpg\">",
                "<img src=_bar>",
                "\"_baz.js\"",
                "\"nope.js\"",
                "<object data=\"_foo.jpg\">",
                "<object data=_bar>",
                "'_baz.js'",
            )),
            vec!["_foo.jpg", "_bar", "_baz.js", "_foo.jpg", "_bar", "_baz.js",]
        );
    }

    #[test]
    fn replacing() {
        assert_eq!(
            &replace_media_refs("<img src=foo.jpg>[sound:bar.mp3]<img src=baz.jpg>", |s| {
                (s != "baz.jpg").then(|| "spam".to_string())
            })
            .unwrap(),
            "<img src=spam>[sound:spam]<img src=baz.jpg>",
        );
    }

    #[test]
    fn truncate() {
        let mut s = "日本語".to_string();
        truncate_to_char_boundary(&mut s, 6);
        assert_eq!(&s, "日本");
        let mut s = "日本語".to_string();
        truncate_to_char_boundary(&mut s, 1);
        assert_eq!(&s, "");
    }

    #[test]
    fn iri_encoding() {
        for (input, output) in [
            ("foo.jpg", "foo.jpg"),
            ("bar baz", "bar%20baz"),
            ("sub/path.jpg", "sub/path.jpg"),
            ("日本語", "日本語"),
            ("a=b", "a=b"),
            ("a&b", "a&b"),
        ] {
            assert_eq!(
                &encode_iri_paths(&format!("<img src=\"{}\">", input)),
                &format!("<img src=\"{}\">", output)
            );
        }
    }
}
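
// The examples below are an illustrative addition rather than part of the
// original test suite; they sketch the expected behaviour of a couple of the
// helpers above, using made-up input strings.
#[cfg(test)]
mod examples {
    use super::*;

    #[test]
    fn html_to_text_line_flattens_breaks_and_sound_tags() {
        // <br> collapses to a space; sound tags keep or drop the filename
        // depending on `preserve_media_filenames`
        assert_eq!(
            html_to_text_line("a<br>b [sound:clip.mp3]", true),
            "a b clip.mp3"
        );
        assert_eq!(html_to_text_line("a<br>b [sound:clip.mp3]", false), "a b");
    }

    #[test]
    fn wildcard_escaping_round_trips_through_to_text() {
        // escaping prefixes \, * and _ with a backslash; to_text() removes it
        assert_eq!(escape_anki_wildcards(r"a*b_c\d"), r"a\*b\_c\\d");
        assert_eq!(&to_text(r"a\*b\_c\\d"), r"a*b_c\d");
    }
}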