2020-01-10 12:01:23 +01:00
|
|
|
// Copyright: Ankitects Pty Ltd and contributors
|
|
|
|
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
|
|
|
|
|
|
|
use lazy_static::lazy_static;
|
2020-01-21 05:44:27 +01:00
|
|
|
use regex::{Captures, Regex};
|
2020-01-10 12:01:23 +01:00
|
|
|
use std::borrow::Cow;
|
|
|
|
use std::ptr;
|
2020-03-21 06:15:59 +01:00
|
|
|
use unicode_normalization::{
|
|
|
|
char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
|
|
|
|
};
|
2020-01-10 12:01:23 +01:00
|
|
|
|
2020-01-20 10:12:34 +01:00
|
|
|
/// An audio/video reference extracted from card text.
///
/// `Clone` is derived so callers can keep a copy of a tag list without
/// having to re-run extraction.
#[derive(Debug, PartialEq, Clone)]
pub enum AVTag {
    /// Filename taken from a `[sound:...]` tag (videos use the same tag).
    SoundOrVideo(String),
    /// A `[anki:tts][args]text[/anki:tts]` text-to-speech request.
    TextToSpeech {
        /// Text to speak, with HTML removed and entities decoded.
        field_text: String,
        /// First whitespace-separated argument of the tag; empty if absent.
        lang: String,
        /// Comma-separated values of the `voices=` argument; empty if absent.
        voices: Vec<String>,
        /// Value of the `speed=` argument; 1.0 when absent or unparsable.
        speed: f32,
        /// All remaining arguments, in the order they appeared.
        other_args: Vec<String>,
    },
}
|
|
|
|
|
2020-01-10 12:01:23 +01:00
|
|
|
lazy_static! {
    // Everything the HTML strippers remove: comments, <style>/<script>
    // elements together with their contents, and finally any other tag.
    // Case-insensitive, and `.` also matches newlines.
    static ref HTML: Regex = Regex::new(
        r"(?si)(<!--.*?-->)|(<style.*?>.*?</style>)|(<script.*?>.*?</script>)|(<.*?>)"
    )
    .unwrap();

    // An <img> tag; the filename ends up in capture group 1, 2 or 3
    // depending on how the src attribute was quoted.
    static ref IMG_TAG: Regex = Regex::new(
        r#"(?xsi)
            # the start of the image tag
            <img[^>]+src=
            (?:
                    # 1: double-quoted filename
                    "
                    ([^"]+?)
                    "
                    [^>]*>
                |
                    # 2: single-quoted filename
                    '
                    ([^']+?)
                    '
                    [^>]*>
                |
                    # 3: unquoted filename
                    ([^ >]+?)
                    (?:
                        # then either a space and the rest
                        \x20[^>]*>
                        |
                        # or the tag immediately ends
                        >
                    )
            )
            "#
    )
    .unwrap();

    // videos are also in sound tags
    static ref AV_TAGS: Regex = Regex::new(
        r#"(?xs)
            \[sound:(.+?)\]     # 1 - the filename in a sound tag
            |
            \[anki:tts\]
                \[(.*?)\]       # 2 - arguments to tts call
                (.*?)           # 3 - field text
            \[/anki:tts\]
            "#
    )
    .unwrap();
}
|
|
|
|
|
|
|
|
pub fn strip_html(html: &str) -> Cow<str> {
|
2020-02-11 04:11:20 +01:00
|
|
|
let mut out: Cow<str> = html.into();
|
|
|
|
|
2020-04-30 03:17:38 +02:00
|
|
|
if let Cow::Owned(o) = strip_html_preserving_entities(html) {
|
2020-02-11 04:11:20 +01:00
|
|
|
out = o.into();
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Cow::Owned(o) = decode_entities(out.as_ref()) {
|
|
|
|
out = o.into();
|
|
|
|
}
|
|
|
|
|
|
|
|
out
|
2020-01-10 12:01:23 +01:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:17:38 +02:00
|
|
|
pub fn strip_html_preserving_entities(html: &str) -> Cow<str> {
|
|
|
|
HTML.replace_all(html, "")
|
|
|
|
}
|
|
|
|
|
2020-01-20 10:12:34 +01:00
|
|
|
pub fn decode_entities(html: &str) -> Cow<str> {
|
|
|
|
if html.contains('&') {
|
|
|
|
match htmlescape::decode_html(html) {
|
2020-08-30 03:23:12 +02:00
|
|
|
Ok(text) => text.replace('\u{a0}', " "),
|
2020-01-20 10:12:34 +01:00
|
|
|
Err(e) => format!("{:?}", e),
|
|
|
|
}
|
|
|
|
.into()
|
|
|
|
} else {
|
|
|
|
// nothing to do
|
|
|
|
html.into()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn strip_html_for_tts(html: &str) -> Cow<str> {
|
|
|
|
match HTML.replace_all(html, " ") {
|
|
|
|
Cow::Borrowed(_) => decode_entities(html),
|
|
|
|
Cow::Owned(s) => decode_entities(&s).to_string().into(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn strip_av_tags(text: &str) -> Cow<str> {
|
|
|
|
AV_TAGS.replace_all(text, "")
|
|
|
|
}
|
|
|
|
|
2020-01-24 02:06:11 +01:00
|
|
|
/// Extract audio tags from string, replacing them with [anki:play] refs
|
|
|
|
pub fn extract_av_tags<'a>(text: &'a str, question_side: bool) -> (Cow<'a, str>, Vec<AVTag>) {
|
|
|
|
let mut tags = vec![];
|
|
|
|
let context = if question_side { 'q' } else { 'a' };
|
|
|
|
let replaced_text = AV_TAGS.replace_all(text, |caps: &Captures| {
|
|
|
|
// extract
|
|
|
|
let tag = if let Some(av_file) = caps.get(1) {
|
|
|
|
AVTag::SoundOrVideo(decode_entities(av_file.as_str()).into())
|
2020-01-20 10:12:34 +01:00
|
|
|
} else {
|
|
|
|
let args = caps.get(2).unwrap();
|
|
|
|
let field_text = caps.get(3).unwrap();
|
2020-01-21 03:41:37 +01:00
|
|
|
tts_tag_from_string(field_text.as_str(), args.as_str())
|
2020-01-24 02:06:11 +01:00
|
|
|
};
|
|
|
|
tags.push(tag);
|
|
|
|
|
|
|
|
// and replace with reference
|
|
|
|
format!("[anki:play:{}:{}]", context, tags.len() - 1)
|
|
|
|
});
|
|
|
|
|
|
|
|
(replaced_text, tags)
|
2020-01-10 12:01:23 +01:00
|
|
|
}
|
|
|
|
|
2020-02-10 05:19:39 +01:00
|
|
|
/// A reference to a media file found in note text.
#[derive(Debug)]
pub(crate) struct MediaRef<'a> {
    /// The complete matched tag (the whole `<img ...>` or `[sound:...]`).
    pub full_ref: &'a str,
    /// The filename exactly as it is written inside the tag.
    pub fname: &'a str,
    /// The filename to use for lookups: sound filenames may contain HTML
    /// entities such as `&amp;`, which are decoded here.
    pub fname_decoded: Cow<'a, str>,
}
|
|
|
|
|
|
|
|
pub(crate) fn extract_media_refs(text: &str) -> Vec<MediaRef> {
|
|
|
|
let mut out = vec![];
|
|
|
|
|
|
|
|
for caps in IMG_TAG.captures_iter(text) {
|
2020-09-04 01:26:21 +02:00
|
|
|
let fname = caps
|
|
|
|
.get(1)
|
|
|
|
.or_else(|| caps.get(2))
|
|
|
|
.or_else(|| caps.get(3))
|
|
|
|
.unwrap()
|
|
|
|
.as_str();
|
|
|
|
let fname_decoded = fname.into();
|
2020-02-10 05:19:39 +01:00
|
|
|
out.push(MediaRef {
|
|
|
|
full_ref: caps.get(0).unwrap().as_str(),
|
2020-09-04 01:26:21 +02:00
|
|
|
fname,
|
|
|
|
fname_decoded,
|
2020-02-10 05:19:39 +01:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
for caps in AV_TAGS.captures_iter(text) {
|
|
|
|
if let Some(m) = caps.get(1) {
|
2020-09-04 01:26:21 +02:00
|
|
|
let fname = m.as_str();
|
|
|
|
let fname_decoded = decode_entities(fname);
|
2020-02-10 05:19:39 +01:00
|
|
|
out.push(MediaRef {
|
|
|
|
full_ref: caps.get(0).unwrap().as_str(),
|
2020-09-04 01:26:21 +02:00
|
|
|
fname,
|
|
|
|
fname_decoded,
|
2020-02-10 05:19:39 +01:00
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out
|
|
|
|
}
|
|
|
|
|
2020-01-24 02:06:11 +01:00
|
|
|
fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
|
2020-01-21 03:41:37 +01:00
|
|
|
let mut other_args = vec![];
|
2020-01-26 05:28:17 +01:00
|
|
|
let mut split_args = args.split_ascii_whitespace();
|
2020-01-21 03:41:37 +01:00
|
|
|
let lang = split_args.next().unwrap_or("");
|
|
|
|
let mut voices = None;
|
2020-01-26 05:28:17 +01:00
|
|
|
let mut speed = 1.0;
|
2020-01-21 03:41:37 +01:00
|
|
|
|
|
|
|
for remaining_arg in split_args {
|
|
|
|
if remaining_arg.starts_with("voices=") {
|
|
|
|
voices = remaining_arg
|
|
|
|
.split('=')
|
|
|
|
.nth(1)
|
2020-01-24 02:06:11 +01:00
|
|
|
.map(|voices| voices.split(',').map(ToOwned::to_owned).collect());
|
2020-01-26 05:28:17 +01:00
|
|
|
} else if remaining_arg.starts_with("speed=") {
|
|
|
|
speed = remaining_arg
|
|
|
|
.split('=')
|
|
|
|
.nth(1)
|
|
|
|
.unwrap()
|
|
|
|
.parse()
|
|
|
|
.unwrap_or(1.0);
|
2020-01-21 03:41:37 +01:00
|
|
|
} else {
|
2020-01-24 02:06:11 +01:00
|
|
|
other_args.push(remaining_arg.to_owned());
|
2020-01-21 03:41:37 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
AVTag::TextToSpeech {
|
2020-01-24 02:06:11 +01:00
|
|
|
field_text: strip_html_for_tts(field_text).into(),
|
|
|
|
lang: lang.into(),
|
2020-01-21 03:41:37 +01:00
|
|
|
voices: voices.unwrap_or_else(Vec::new),
|
2020-01-26 05:28:17 +01:00
|
|
|
speed,
|
2020-01-21 03:41:37 +01:00
|
|
|
other_args,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-10 12:01:23 +01:00
|
|
|
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
|
2020-02-10 05:19:39 +01:00
|
|
|
let without_fnames = IMG_TAG.replace_all(html, r" ${1}${2}${3} ");
|
2020-01-10 12:01:23 +01:00
|
|
|
let without_html = HTML.replace_all(&without_fnames, "");
|
|
|
|
// no changes?
|
|
|
|
if let Cow::Borrowed(b) = without_html {
|
|
|
|
if ptr::eq(b, html) {
|
|
|
|
return Cow::Borrowed(html);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// make borrow checker happy
|
|
|
|
without_html.into_owned().into()
|
|
|
|
}
|
|
|
|
|
2020-02-08 11:56:30 +01:00
|
|
|
pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
|
|
|
|
if !is_nfc(s) {
|
|
|
|
s.chars().nfc().collect::<String>().into()
|
|
|
|
} else {
|
|
|
|
s.into()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-12 09:45:21 +02:00
|
|
|
pub(crate) fn ensure_string_in_nfc(s: &mut String) {
|
|
|
|
if !is_nfc(s) {
|
|
|
|
*s = s.chars().nfc().collect()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-21 06:15:59 +01:00
|
|
|
/// Convert provided string to NFKD form and strip combining characters.
|
|
|
|
pub(crate) fn without_combining(s: &str) -> Cow<str> {
|
|
|
|
// if the string is already normalized
|
|
|
|
if matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes) {
|
|
|
|
// and no combining characters found, return unchanged
|
|
|
|
if !s.chars().any(is_combining_mark) {
|
|
|
|
return s.into();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// we need to create a new string without the combining marks
|
|
|
|
s.chars()
|
|
|
|
.nfkd()
|
|
|
|
.filter(|c| !is_combining_mark(*c))
|
|
|
|
.collect::<String>()
|
|
|
|
.into()
|
|
|
|
}
|
|
|
|
|
2020-08-17 10:14:00 +02:00
|
|
|
/// Escape text, converting glob characters to regex syntax, then return.
|
|
|
|
pub(crate) fn text_to_re(glob: &str) -> String {
|
|
|
|
lazy_static! {
|
|
|
|
static ref ESCAPED: Regex = Regex::new(r"(\\\\)?\\\*").unwrap();
|
|
|
|
static ref GLOB: Regex = Regex::new(r"(\\\\)?[_%]").unwrap();
|
|
|
|
}
|
|
|
|
|
|
|
|
let escaped = regex::escape(glob);
|
|
|
|
|
|
|
|
let text = ESCAPED.replace_all(&escaped, |caps: &Captures| {
|
|
|
|
if caps.get(0).unwrap().as_str().len() == 2 {
|
|
|
|
".*"
|
|
|
|
} else {
|
|
|
|
r"\*"
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
let text2 = GLOB.replace_all(&text, |caps: &Captures| {
|
|
|
|
match caps.get(0).unwrap().as_str() {
|
|
|
|
"_" => ".",
|
|
|
|
"%" => ".*",
|
|
|
|
other => {
|
|
|
|
// strip off the escaping char
|
|
|
|
&other[2..]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
.to_string()
|
|
|
|
});
|
|
|
|
|
|
|
|
text2.into()
|
|
|
|
}
|
|
|
|
|
2020-01-10 12:01:23 +01:00
|
|
|
#[cfg(test)]
mod test {
    use crate::text::without_combining;
    use crate::text::{
        extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
    };
    use std::borrow::Cow;

    /// Tags, comments and script contents are removed; image filenames are
    /// kept (padded with spaces) by the image-preserving variant.
    #[test]
    fn stripping() {
        assert_eq!(strip_html("test"), "test");
        assert_eq!(strip_html("t<b>e</b>st"), "test");
        assert_eq!(strip_html("so<SCRIPT>t<b>e</b>st</script>me"), "some");

        assert_eq!(
            strip_html_preserving_image_filenames("<img src=foo.jpg>"),
            " foo.jpg "
        );
        assert_eq!(
            strip_html_preserving_image_filenames("<img src='foo.jpg'><html>"),
            " foo.jpg "
        );
        assert_eq!(strip_html_preserving_image_filenames("<html>"), "");
    }

    /// Sound and TTS tags are stripped or extracted, and entities in the
    /// filename and the spoken text are decoded.
    #[test]
    fn audio() {
        // The input uses real entities (`&amp;`, `&gt;`) so the decoding
        // path is exercised; a bare `&` is not a valid entity.
        let s =
            "abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane speed=1.2]foo<br>1&gt;2[/anki:tts]gh";
        assert_eq!(strip_av_tags(s), "abcdefgh");

        let (text, tags) = extract_av_tags(s, true);
        assert_eq!(text, "abc[anki:play:q:0]def[anki:play:q:1]gh");

        assert_eq!(
            tags,
            vec![
                AVTag::SoundOrVideo("fo&o.mp3".into()),
                AVTag::TextToSpeech {
                    field_text: "foo 1>2".into(),
                    lang: "en_US".into(),
                    voices: vec!["Bob".into(), "Jane".into()],
                    other_args: vec![],
                    speed: 1.2
                },
            ]
        );
    }

    /// Plain ASCII is returned borrowed; text with diacritics is rebuilt.
    #[test]
    fn combining() {
        assert!(matches!(without_combining("test"), Cow::Borrowed(_)));
        assert!(matches!(without_combining("Über"), Cow::Owned(_)));
    }
}
|