add some text helpers
This commit is contained in:
parent
96c8784024
commit
d4553e9488
@ -7,3 +7,4 @@ pub mod backend;
|
|||||||
pub mod err;
|
pub mod err;
|
||||||
pub mod sched;
|
pub mod sched;
|
||||||
pub mod template;
|
pub mod template;
|
||||||
|
pub mod text;
|
||||||
|
98
rslib/src/text.rs
Normal file
98
rslib/src/text.rs
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
// Copyright: Ankitects Pty Ltd and contributors
|
||||||
|
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
||||||
|
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use regex::Regex;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::ptr;
|
||||||
|
|
||||||
|
lazy_static! {
    /// Matches everything that is markup rather than text: comments,
    /// `<style>`/`<script>` elements together with their contents, and any
    /// remaining tag. Case-insensitive, and `.` also matches newlines.
    static ref HTML: Regex = Regex::new(concat!(
        "(?si)",
        // wrapped text
        r"(<!--.*?-->)|(<style.*?>.*?</style>)|(<script.*?>.*?</script>)",
        // html tags
        r"|(<.*?>)",
    ))
    .unwrap();

    /// Matches an `<img>` tag (case-insensitive); capture group 1 is the
    /// value of its `src` attribute, i.e. the filename.
    static ref IMG_TAG: Regex =
        Regex::new(r#"(?i)<img[^>]+src=["']?([^"'>]+)["']?[^>]*>"#).unwrap();

    /// Matches a `[sound:...]` reference; capture group 1 is the text
    /// between the colon and the closing bracket.
    static ref SOUND_TAG: Regex = Regex::new(r"\[sound:(.*?)\]").unwrap();

    /// Matches a cloze deletion such as `{{c1::text}}`; capture group 1 is
    /// the cloze number. `.` also matches newlines.
    static ref CLOZED_TEXT: Regex = Regex::new(r"(?s)\{\{c(\d+)::.+?\}\}").unwrap();
}
|
||||||
|
|
||||||
|
pub fn strip_html(html: &str) -> Cow<str> {
|
||||||
|
HTML.replace_all(html, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn strip_sounds(html: &str) -> Cow<str> {
|
||||||
|
SOUND_TAG.replace_all(html, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
|
||||||
|
let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
|
||||||
|
let without_html = HTML.replace_all(&without_fnames, "");
|
||||||
|
// no changes?
|
||||||
|
if let Cow::Borrowed(b) = without_html {
|
||||||
|
if ptr::eq(b, html) {
|
||||||
|
return Cow::Borrowed(html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// make borrow checker happy
|
||||||
|
without_html.into_owned().into()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn cloze_numbers_in_string(html: &str) -> HashSet<u16> {
|
||||||
|
let mut hash = HashSet::with_capacity(4);
|
||||||
|
for cap in CLOZED_TEXT.captures_iter(html) {
|
||||||
|
if let Ok(n) = cap[1].parse() {
|
||||||
|
hash.insert(n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
hash
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use super::{cloze_numbers_in_string, strip_html, strip_html_preserving_image_filenames};
    use std::collections::HashSet;

    /// Plain stripping, and stripping that keeps image filenames.
    #[test]
    fn test_stripping() {
        // (input, expected) pairs for plain HTML stripping
        let plain = [
            ("test", "test"),
            ("t<b>e</b>st", "test"),
            ("so<SCRIPT>t<b>e</b>st</script>me", "some"),
        ];
        for &(input, expected) in plain.iter() {
            assert_eq!(strip_html(input), expected);
        }

        // (input, expected) pairs for stripping that keeps image filenames
        let with_images = [
            ("<img src=foo.jpg>", " foo.jpg "),
            ("<img src='foo.jpg'><html>", " foo.jpg "),
            ("<html>", ""),
        ];
        for &(input, expected) in with_images.iter() {
            assert_eq!(strip_html_preserving_image_filenames(input), expected);
        }
    }

    /// Cloze numbers are collected as a set; unterminated markers are ignored.
    #[test]
    fn test_cloze() {
        let none: HashSet<u16> = HashSet::new();
        assert_eq!(cloze_numbers_in_string("test"), none);

        let one_and_two: HashSet<u16> = [1, 2].iter().copied().collect();
        assert_eq!(
            cloze_numbers_in_string("{{c2::te}}{{c1::s}}t{{"),
            one_and_two
        );
    }
}
|
Loading…
Reference in New Issue
Block a user