include LaTeX png/svg files when checking for unused media

This commit is contained in:
Damien Elmes 2020-02-11 13:11:20 +10:00
parent 4cca3ecef5
commit c890ef871e
6 changed files with 170 additions and 25 deletions

View File

@ -1,8 +1,9 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use crate::latex::contains_latex;
use crate::template::RenderContext;
use crate::text::{contains_latex, strip_html};
use crate::text::strip_html;
use lazy_static::lazy_static;
use regex::Captures;
use regex::Regex;

122
rslib/src/latex.rs Normal file
View File

@ -0,0 +1,122 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use crate::media::files::sha1_of_data;
use crate::text::strip_html;
use lazy_static::lazy_static;
use regex::{Captures, Regex};
use std::borrow::Cow;
lazy_static! {
    // Matches any of the three LaTeX markup forms. Exactly one of the three
    // capture groups participates in a given match, identifying the form.
    // Flags: x = verbose, s = `.` also matches newlines, i = case-insensitive.
    static ref LATEX: Regex = Regex::new(
        r#"(?xsi)
            \[latex\](.+?)\[/latex\]     # 1 - standard latex
        |
            \[\$\](.+?)\[/\$\]           # 2 - inline math
        |
            \[\$\$\](.+?)\[/\$\$\]       # 3 - math environment
        "#
    )
    .unwrap();
    // Matches the HTML tags that are turned into a newline before the
    // remaining HTML is stripped from a LaTeX snippet.
    //
    // The pattern is compiled in verbose mode, where literal whitespace is
    // ignored by the regex parser. The optional " /" therefore has to spell
    // its space as \x20, otherwise the `<br />` form is never matched.
    static ref LATEX_NEWLINES: Regex = Regex::new(
        r#"(?xi)
            <br(\x20/)?>    # br, optionally written with a space and slash
        |
            <div>
        "#
    )
    .unwrap();
}
/// Returns true if `text` matches the `LATEX` pattern, i.e. it contains at
/// least one non-empty `[latex]…[/latex]`, `[$]…[/$]` or `[$$]…[/$$]` section.
pub(crate) fn contains_latex(text: &str) -> bool {
LATEX.is_match(text)
}
/// One LaTeX snippet found by `extract_latex`.
#[derive(Debug, PartialEq)]
pub struct ExtractedLatex {
/// Image file name produced by `fname_for_latex` for this snippet.
pub fname: String,
/// The snippet's LaTeX source after `strip_html_for_latex` has been applied.
pub latex: String,
}
/// Replaces every LaTeX section in `text` with an `<img>` link and collects
/// the snippets that were replaced.
///
/// `svg` selects the extension of the generated file names (`svg` when true,
/// `png` otherwise). Returns the rewritten text together with one
/// `ExtractedLatex` per match, in the order the matches occur.
pub(crate) fn extract_latex(text: &str, svg: bool) -> (String, Vec<ExtractedLatex>) {
    let mut found = Vec::new();

    let rewritten = LATEX.replace_all(text, |caps: &Captures| {
        // Exactly one group participates per match; wrap the body in the
        // LaTeX delimiters that correspond to the markup form used.
        let source = if let Some(standard) = caps.get(1) {
            standard.as_str().to_string()
        } else if let Some(inline) = caps.get(2) {
            format!("${}$", inline.as_str())
        } else if let Some(display) = caps.get(3) {
            format!(
                r"\begin{{displaymath}}{}\end{{displaymath}}",
                display.as_str()
            )
        } else {
            unreachable!()
        };

        // The file name is derived from the HTML-free form of the snippet.
        let cleaned = strip_html_for_latex(&source).into_owned();
        let fname = fname_for_latex(&cleaned, svg);
        let link = image_link_for_fname(&fname);
        found.push(ExtractedLatex {
            fname,
            latex: cleaned,
        });
        link
    });

    (rewritten.into_owned(), found)
}
/// Converts the tags matched by `LATEX_NEWLINES` into `\n`, then passes the
/// result through `strip_html`.
///
/// The input is returned borrowed when neither step changed anything.
fn strip_html_for_latex(html: &str) -> Cow<str> {
    let with_newlines = LATEX_NEWLINES.replace_all(html, "\n");
    if let Cow::Owned(stripped) = strip_html(&with_newlines) {
        return Cow::Owned(stripped);
    }
    // strip_html made no change: keep whatever the newline pass produced.
    with_newlines
}
/// Builds the image file name for a LaTeX snippet:
/// `latex-<hex of sha1_of_data(latex)>.<svg|png>`.
fn fname_for_latex(latex: &str, svg: bool) -> String {
    let digest = sha1_of_data(latex.as_bytes());
    let extension = if svg { "svg" } else { "png" };
    format!("latex-{}.{}", hex::encode(digest), extension)
}
/// Returns the `<img class=latex …>` tag that references `fname`.
/// `fname` is inserted verbatim, without any escaping.
fn image_link_for_fname(fname: &str) -> String {
    ["<img class=latex src=\"", fname, "\">"].concat()
}
#[cfg(test)]
mod test {
    use crate::latex::{extract_latex, ExtractedLatex};

    /// Shorthand for building the expected value of a single extraction.
    fn extracted(fname: &str, latex: &str) -> ExtractedLatex {
        ExtractedLatex {
            fname: fname.to_string(),
            latex: latex.to_string(),
        }
    }

    /// `[latex]` sections are replaced by an image link, and `<br>`/`<div>`
    /// inside them become newlines.
    #[test]
    fn latex_standard_block() {
        let fname = "latex-ef30b3f4141c33a5bf7044b0d1961d3399c05d50.png";
        let (text, found) = extract_latex("a[latex]one<br>and<div>two[/latex]b", false);
        assert_eq!(text, format!("a<img class=latex src=\"{}\">b", fname));
        assert_eq!(found, vec![extracted(fname, "one\nand\ntwo")]);
    }

    /// `[$]` sections are wrapped in `$…$`, with tags and entities removed.
    #[test]
    fn latex_inline_math() {
        let (_, found) = extract_latex("[$]<b>hello</b>&nbsp; world[/$]", true);
        assert_eq!(
            found,
            vec![extracted(
                "latex-060219fbf3ddb74306abddaf4504276ad793b029.svg",
                "$hello world$"
            )]
        );
    }

    /// `[$$]` sections are wrapped in a `displaymath` environment.
    #[test]
    fn latex_math_environment() {
        let (_, found) = extract_latex("[$$]math &amp; stuff[/$$]", false);
        assert_eq!(
            found,
            vec![extracted(
                "latex-8899f3f849ffdef6e4e9f2f34a923a1f608ebc07.png",
                r"\begin{displaymath}math & stuff\end{displaymath}"
            )]
        );
    }
}

View File

@ -12,6 +12,7 @@ pub fn version() -> &'static str {
pub mod backend;
pub mod cloze;
pub mod err;
pub mod latex;
pub mod media;
pub mod sched;
pub mod template;

View File

@ -1,7 +1,9 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use crate::cloze::expand_clozes_to_reveal_latex;
use crate::err::{AnkiError, Result};
use crate::latex::extract_latex;
use crate::media::col::{
for_every_note, get_note_types, mark_collection_modified, open_or_create_collection_db,
set_note, Note,
@ -223,20 +225,19 @@ where
if self.checked % 10 == 0 {
self.maybe_fire_progress_cb()?;
}
let nt = note_types
.get(&note.mid)
.ok_or_else(|| AnkiError::DBError {
info: "missing note type".to_string(),
})?;
if fix_and_extract_media_refs(note, &mut referenced_files, renamed)? {
// note was modified, needs saving
set_note(
&trx,
note,
note_types
.get(&note.mid)
.ok_or_else(|| AnkiError::DBError {
info: "missing note type".to_string(),
})?,
)?;
set_note(&trx, note, nt)?;
collection_modified = true;
}
// extract latex
extract_latex_refs(note, &mut referenced_files, nt.latex_uses_svg());
Ok(())
})?;
@ -320,6 +321,20 @@ fn find_unused_and_missing(
(unused, references.into_iter().collect())
}
/// Adds the file name of every LaTeX image referenced by `note` to
/// `seen_files`.
///
/// Fields containing `{{c` are first passed through
/// `expand_clozes_to_reveal_latex`. `svg` is forwarded to `extract_latex`
/// and selects the extension of the generated file names.
fn extract_latex_refs(note: &Note, seen_files: &mut HashSet<String>, svg: bool) {
    for field in note.fields() {
        let searchable: Cow<str> = if field.contains("{{c") {
            expand_clozes_to_reveal_latex(field).into()
        } else {
            field.into()
        };
        // Only the file names matter here; the rewritten text is discarded.
        let (_, snippets) = extract_latex(searchable.as_ref(), svg);
        seen_files.extend(snippets.into_iter().map(|snippet| snippet.fname));
    }
}
#[cfg(test)]
mod test {
use crate::err::Result;

View File

@ -64,6 +64,15 @@ pub(super) struct NoteType {
id: ObjID,
#[serde(rename = "sortf")]
sort_field_idx: u16,
#[serde(rename = "latexsvg", default)]
latex_svg: bool,
}
impl NoteType {
/// Returns the note type's `latex_svg` flag.
pub fn latex_uses_svg(&self) -> bool {
self.latex_svg
}
}
pub(super) fn get_note_types(db: &Connection) -> Result<HashMap<ObjID, NoteType>> {

View File

@ -70,25 +70,26 @@ lazy_static! {
(.*?) # 3 - field text
\[/anki:tts\]
"#).unwrap();
static ref LATEX: Regex = Regex::new(
r#"(?xsi)
\[latex\](.+?)\[/latex\] # 1 - standard latex
|
\[\$\](.+?)\[/\$\] # 2 - inline math
|
\[\$\$\](.+?)\[/\$\$\] # 3 - math environment
"#).unwrap();
}
pub fn strip_html(html: &str) -> Cow<str> {
HTML.replace_all(html, "")
let mut out: Cow<str> = html.into();
if let Cow::Owned(o) = HTML.replace_all(html, "") {
out = o.into();
}
if let Cow::Owned(o) = decode_entities(out.as_ref()) {
out = o.into();
}
out
}
pub fn decode_entities(html: &str) -> Cow<str> {
if html.contains('&') {
match htmlescape::decode_html(html) {
Ok(text) => text,
Ok(text) => text.replace("\u{a0}", " "),
Err(e) => format!("{:?}", e),
}
.into()
@ -211,10 +212,6 @@ pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
without_html.into_owned().into()
}
pub(crate) fn contains_latex(text: &str) -> bool {
LATEX.is_match(text)
}
pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
if !is_nfc(s) {
s.chars().nfc().collect::<String>().into()