Add html_to_text_line() on backend

This commit is contained in:
RumovZ 2021-03-20 12:00:45 +01:00
parent 1823c0dda4
commit e931a429b3

View File

@ -10,6 +10,26 @@ use unicode_normalization::{
char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
};
/// Whitespace trimming that consumes a value and hands back the same type.
pub trait Trimming {
    /// Returns `self` with leading and trailing whitespace removed.
    fn trim(self) -> Self;
}

impl Trimming for Cow<'_, str> {
    /// Trims the string while keeping the `Cow` variant it arrived in.
    ///
    /// A borrowed string is narrowed to a sub-slice without copying. An owned
    /// string is returned untouched when it has nothing to trim; otherwise a
    /// new `String` holding just the trimmed text is created.
    fn trim(self) -> Self {
        match self {
            Cow::Borrowed(slice) => Cow::Borrowed(slice.trim()),
            Cow::Owned(owned) => {
                let stripped = owned.as_str().trim();
                if stripped.len() < owned.len() {
                    // Something was cut off, so copy only the surviving part.
                    Cow::Owned(stripped.to_owned())
                } else {
                    // Nothing to trim: reuse the existing allocation.
                    Cow::Owned(owned)
                }
            }
        }
    }
}
#[derive(Debug, PartialEq)]
pub enum AVTag {
SoundOrVideo(String),
@ -72,6 +92,29 @@ lazy_static! {
(.*?) # 3 - field text
\[/anki:tts\]
"#).unwrap();
// Matches the literal markup `<br>`, `<br />`, `<div>` or a newline character.
// `html_to_text_line` replaces every match with a single space.
static ref PERSISTENT_HTML_SPACERS: Regex = Regex::new("<br>|<br />|<div>|\n").unwrap();
// Matches `[sound:...]` and `[[type:...]]` tags, where `...` is one or more
// characters other than `]`. `html_to_text_line` deletes every match.
static ref UNPRINTABLE_TAGS: Regex = Regex::new(
r"(?xs)
\[sound:[^]]+\]
|
\[\[type:[^]]+\]\]
").unwrap();
}
pub fn html_to_text_line(html: &str) -> Cow<str> {
let mut out: Cow<str> = html.into();
if let Cow::Owned(o) = PERSISTENT_HTML_SPACERS.replace_all(&out, " ") {
out = o.into();
}
if let Cow::Owned(o) = UNPRINTABLE_TAGS.replace_all(&out, "") {
out = o.into();
}
if let Cow::Owned(o) = strip_html_preserving_media_filenames(&out) {
out = o.into();
}
out.trim()
}
pub fn strip_html(html: &str) -> Cow<str> {