diff --git a/rslib/src/lib.rs b/rslib/src/lib.rs index 285852037..5f63b11b6 100644 --- a/rslib/src/lib.rs +++ b/rslib/src/lib.rs @@ -7,3 +7,4 @@ pub mod backend; pub mod err; pub mod sched; pub mod template; +pub mod text; diff --git a/rslib/src/text.rs b/rslib/src/text.rs new file mode 100644 index 000000000..ef81be90c --- /dev/null +++ b/rslib/src/text.rs @@ -0,0 +1,98 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +use lazy_static::lazy_static; +use regex::Regex; +use std::borrow::Cow; +use std::collections::HashSet; +use std::ptr; + +lazy_static! { + static ref HTML: Regex = Regex::new(concat!( + "(?si)", + // wrapped text + r"()|(.*?)|(.*?)", + // html tags + r"|(<.*?>)", + )) + .unwrap(); + + static ref IMG_TAG: Regex = Regex::new( + // group 1 is filename + r#"(?i)]+src=["']?([^"'>]+)["']?[^>]*>"# + ).unwrap(); + + static ref SOUND_TAG: Regex = Regex::new( + r"\[sound:(.*?)\]" + ).unwrap(); + + static ref CLOZED_TEXT: Regex = Regex::new( + r"(?s)\{\{c(\d+)::.+?\}\}" + ).unwrap(); +} + +pub fn strip_html(html: &str) -> Cow { + HTML.replace_all(html, "") +} + +pub fn strip_sounds(html: &str) -> Cow { + SOUND_TAG.replace_all(html, "") +} + +pub fn strip_html_preserving_image_filenames(html: &str) -> Cow { + let without_fnames = IMG_TAG.replace_all(html, r" $1 "); + let without_html = HTML.replace_all(&without_fnames, ""); + // no changes? + if let Cow::Borrowed(b) = without_html { + if ptr::eq(b, html) { + return Cow::Borrowed(html); + } + } + // make borrow checker happy + without_html.into_owned().into() +} + +pub fn cloze_numbers_in_string(html: &str) -> HashSet { + let mut hash = HashSet::with_capacity(4); + for cap in CLOZED_TEXT.captures_iter(html) { + if let Ok(n) = cap[1].parse() { + hash.insert(n); + } + } + hash +} + +#[cfg(test)] +mod test { + use crate::text::{cloze_numbers_in_string, strip_html, strip_html_preserving_image_filenames}; + use std::collections::HashSet; + + #[test] + fn test_stripping() { + assert_eq!(strip_html("test"), "test"); + assert_eq!(strip_html("test"), "test"); + assert_eq!(strip_html("some"), "some"); + + assert_eq!( + strip_html_preserving_image_filenames(""), + " foo.jpg " + ); + assert_eq!( + strip_html_preserving_image_filenames(""), + " foo.jpg " + ); + assert_eq!(strip_html_preserving_image_filenames(""), ""); + } + + #[test] + fn test_cloze() { + assert_eq!( + cloze_numbers_in_string("test"), + vec![].into_iter().collect::>() + ); + assert_eq!( + cloze_numbers_in_string("{{c2::te}}{{c1::s}}t{{"), + vec![1, 2].into_iter().collect::>() + ); + } +}