diff --git a/rslib/src/cloze.rs b/rslib/src/cloze.rs index d5c7f2f01..820a2077c 100644 --- a/rslib/src/cloze.rs +++ b/rslib/src/cloze.rs @@ -1,8 +1,9 @@ // Copyright: Ankitects Pty Ltd and contributors // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html +use crate::latex::contains_latex; use crate::template::RenderContext; -use crate::text::{contains_latex, strip_html}; +use crate::text::strip_html; use lazy_static::lazy_static; use regex::Captures; use regex::Regex; diff --git a/rslib/src/latex.rs b/rslib/src/latex.rs new file mode 100644 index 000000000..f2e5345ff --- /dev/null +++ b/rslib/src/latex.rs @@ -0,0 +1,122 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +use crate::media::files::sha1_of_data; +use crate::text::strip_html; +use lazy_static::lazy_static; +use regex::{Captures, Regex}; +use std::borrow::Cow; + +lazy_static! { + static ref LATEX: Regex = Regex::new( + r#"(?xsi) + \[latex\](.+?)\[/latex\] # 1 - standard latex + | + \[\$\](.+?)\[/\$\] # 2 - inline math + | + \[\$\$\](.+?)\[/\$\$\] # 3 - math environment + "# + ) + .unwrap(); + static ref LATEX_NEWLINES: Regex = Regex::new( + r#"(?xi) + + | +
+ "# + ) + .unwrap(); +} + +pub(crate) fn contains_latex(text: &str) -> bool { + LATEX.is_match(text) +} + +#[derive(Debug, PartialEq)] +pub struct ExtractedLatex { + pub fname: String, + pub latex: String, +} + +pub(crate) fn extract_latex(text: &str, svg: bool) -> (String, Vec) { + let mut extracted = vec![]; + + let new_text = LATEX.replace_all(text, |caps: &Captures| { + let latex = match (caps.get(1), caps.get(2), caps.get(3)) { + (Some(m), _, _) => m.as_str().into(), + (_, Some(m), _) => format!("${}$", m.as_str()), + (_, _, Some(m)) => format!(r"\begin{{displaymath}}{}\end{{displaymath}}", m.as_str()), + _ => unreachable!(), + }; + let latex_text = strip_html_for_latex(&latex); + let fname = fname_for_latex(&latex_text, svg); + let img_link = image_link_for_fname(&fname); + extracted.push(ExtractedLatex { + fname, + latex: latex_text.into(), + }); + + img_link + }); + + (new_text.into(), extracted) +} + +fn strip_html_for_latex(html: &str) -> Cow { + let mut out: Cow = html.into(); + if let Cow::Owned(o) = LATEX_NEWLINES.replace_all(html, "\n") { + out = o.into(); + } + if let Cow::Owned(o) = strip_html(out.as_ref()) { + out = o.into(); + } + + out +} + +fn fname_for_latex(latex: &str, svg: bool) -> String { + let ext = if svg { "svg" } else { "png" }; + let csum = hex::encode(sha1_of_data(latex.as_bytes())); + + format!("latex-{}.{}", csum, ext) +} + +fn image_link_for_fname(fname: &str) -> String { + format!("", fname) +} + +#[cfg(test)] +mod test { + use crate::latex::{extract_latex, ExtractedLatex}; + + #[test] + fn latex() { + let fname = "latex-ef30b3f4141c33a5bf7044b0d1961d3399c05d50.png"; + assert_eq!( + extract_latex("a[latex]one
and
two[/latex]b", false), + ( + format!("ab", fname), + vec![ExtractedLatex { + fname: fname.into(), + latex: "one\nand\ntwo".into() + }] + ) + ); + + assert_eq!( + extract_latex("[$]hello  world[/$]", true).1, + vec![ExtractedLatex { + fname: "latex-060219fbf3ddb74306abddaf4504276ad793b029.svg".to_string(), + latex: "$hello world$".to_string() + }] + ); + + assert_eq!( + extract_latex("[$$]math & stuff[/$$]", false).1, + vec![ExtractedLatex { + fname: "latex-8899f3f849ffdef6e4e9f2f34a923a1f608ebc07.png".to_string(), + latex: r"\begin{displaymath}math & stuff\end{displaymath}".to_string() + }] + ); + } +} diff --git a/rslib/src/lib.rs b/rslib/src/lib.rs index a888f812b..62acc5dc2 100644 --- a/rslib/src/lib.rs +++ b/rslib/src/lib.rs @@ -12,6 +12,7 @@ pub fn version() -> &'static str { pub mod backend; pub mod cloze; pub mod err; +pub mod latex; pub mod media; pub mod sched; pub mod template; diff --git a/rslib/src/media/check.rs b/rslib/src/media/check.rs index a27c0e30e..6e7aea4a7 100644 --- a/rslib/src/media/check.rs +++ b/rslib/src/media/check.rs @@ -1,7 +1,9 @@ // Copyright: Ankitects Pty Ltd and contributors // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html +use crate::cloze::expand_clozes_to_reveal_latex; use crate::err::{AnkiError, Result}; +use crate::latex::extract_latex; use crate::media::col::{ for_every_note, get_note_types, mark_collection_modified, open_or_create_collection_db, set_note, Note, @@ -223,20 +225,19 @@ where if self.checked % 10 == 0 { self.maybe_fire_progress_cb()?; } + let nt = note_types + .get(¬e.mid) + .ok_or_else(|| AnkiError::DBError { + info: "missing note type".to_string(), + })?; if fix_and_extract_media_refs(note, &mut referenced_files, renamed)? { // note was modified, needs saving - set_note( - &trx, - note, - note_types - .get(¬e.mid) - .ok_or_else(|| AnkiError::DBError { - info: "missing note type".to_string(), - })?, - )?; + set_note(&trx, note, nt)?; collection_modified = true; } + // extract latex + extract_latex_refs(note, &mut referenced_files, nt.latex_uses_svg()); Ok(()) })?; @@ -320,6 +321,20 @@ fn find_unused_and_missing( (unused, references.into_iter().collect()) } +fn extract_latex_refs(note: &Note, seen_files: &mut HashSet, svg: bool) { + for field in note.fields() { + let field_text: Cow = if field.contains("{{c") { + expand_clozes_to_reveal_latex(field).into() + } else { + field.into() + }; + let (_, extracted) = extract_latex(field_text.as_ref(), svg); + for e in extracted { + seen_files.insert(e.fname); + } + } +} + #[cfg(test)] mod test { use crate::err::Result; diff --git a/rslib/src/media/col.rs b/rslib/src/media/col.rs index 51bbb8796..20b0e7a88 100644 --- a/rslib/src/media/col.rs +++ b/rslib/src/media/col.rs @@ -64,6 +64,15 @@ pub(super) struct NoteType { id: ObjID, #[serde(rename = "sortf")] sort_field_idx: u16, + + #[serde(rename = "latexsvg", default)] + latex_svg: bool, +} + +impl NoteType { + pub fn latex_uses_svg(&self) -> bool { + self.latex_svg + } } pub(super) fn get_note_types(db: &Connection) -> Result> { diff --git a/rslib/src/text.rs b/rslib/src/text.rs index ae910577d..300ab6f90 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -70,25 +70,26 @@ lazy_static! { (.*?) # 3 - field text \[/anki:tts\] "#).unwrap(); - - static ref LATEX: Regex = Regex::new( - r#"(?xsi) - \[latex\](.+?)\[/latex\] # 1 - standard latex - | - \[\$\](.+?)\[/\$\] # 2 - inline math - | - \[\$\$\](.+?)\[/\$\$\] # 3 - math environment - "#).unwrap(); } pub fn strip_html(html: &str) -> Cow { - HTML.replace_all(html, "") + let mut out: Cow = html.into(); + + if let Cow::Owned(o) = HTML.replace_all(html, "") { + out = o.into(); + } + + if let Cow::Owned(o) = decode_entities(out.as_ref()) { + out = o.into(); + } + + out } pub fn decode_entities(html: &str) -> Cow { if html.contains('&') { match htmlescape::decode_html(html) { - Ok(text) => text, + Ok(text) => text.replace("\u{a0}", " "), Err(e) => format!("{:?}", e), } .into() @@ -211,10 +212,6 @@ pub fn strip_html_preserving_image_filenames(html: &str) -> Cow { without_html.into_owned().into() } -pub(crate) fn contains_latex(text: &str) -> bool { - LATEX.is_match(text) -} - pub(crate) fn normalize_to_nfc(s: &str) -> Cow { if !is_nfc(s) { s.chars().nfc().collect::().into()