From c157ccb3f5fc2a1f40efbb4e6fb98c2ed52701a8 Mon Sep 17 00:00:00 2001
From: Damien Elmes
Date: Fri, 6 Nov 2020 10:21:51 +1000
Subject: [PATCH] handle notes with invalid utf8

---
 rslib/ftl/database-check.ftl  |  5 +++++
 rslib/src/dbcheck.rs          | 37 ++++++++++++++++++++++++++++++++++++-
 rslib/src/err.rs              | 13 ++++++++++++-
 rslib/src/prelude.rs          |  2 +-
 rslib/src/storage/note/mod.rs | 18 ++++++++++++++++++
 5 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/rslib/ftl/database-check.ftl b/rslib/ftl/database-check.ftl
index 4973d1feb..2334f0642 100644
--- a/rslib/ftl/database-check.ftl
+++ b/rslib/ftl/database-check.ftl
@@ -40,6 +40,11 @@ database-check-revlog-properties =
         [one] Fixed { $count } review entry with invalid properties.
        *[other] Fixed { $count } review entries with invalid properties.
     }
+database-check-notes-with-invalid-utf8 =
+    { $count ->
+        [one] Fixed { $count } note with invalid utf8 characters.
+       *[other] Fixed { $count } notes with invalid utf8 characters.
+    }
 
 # "db-check" is always in English
 database-check-notetypes-recovered = One or more notetypes were missing. The notes that used them have been given new notetypes starting with "db-check", but field names and card design have been lost, so you may be better off restoring from an automatic backup.
diff --git a/rslib/src/dbcheck.rs b/rslib/src/dbcheck.rs
index f7c781841..a07fd1bcc 100644
--- a/rslib/src/dbcheck.rs
+++ b/rslib/src/dbcheck.rs
@@ -9,6 +9,7 @@ use crate::{
         all_stock_notetypes, AlreadyGeneratedCardInfo, CardGenContext, NoteType, NoteTypeID,
         NoteTypeKind,
     },
+    prelude::*,
     timestamp::{TimestampMillis, TimestampSecs},
 };
 use itertools::Itertools;
@@ -29,6 +30,7 @@ pub struct CheckDatabaseOutput {
     card_ords_duplicated: usize,
     field_count_mismatch: usize,
     notetypes_recovered: usize,
+    invalid_utf8: usize,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -99,6 +101,12 @@ impl CheckDatabaseOutput {
                 tr_args!["count"=>self.revlog_properties_invalid],
             ));
         }
+        if self.invalid_utf8 > 0 {
+            probs.push(i18n.trn(
+                TR::DatabaseCheckNotesWithInvalidUtf8,
+                tr_args!["count"=>self.invalid_utf8],
+            ));
+        }
 
         probs
     }
@@ -263,7 +271,7 @@ impl Collection {
                 );
                 checked_notes += 1;
 
-                let mut note = self.storage.get_note(nid)?.unwrap();
+                let mut note = self.get_note_fixing_invalid_utf8(nid, out)?;
 
                 let cards = self.storage.existing_cards_for_note(nid)?;
 
@@ -304,6 +312,33 @@ impl Collection {
         Ok(())
     }
 
+    /// Fetch the note `nid`, repairing it first if its text is not valid UTF-8.
+    ///
+    /// When the initial fetch fails with `DBErrorKind::Utf8`, the note is
+    /// rewritten with `fix_invalid_utf8_in_note`, `out.invalid_utf8` is bumped,
+    /// and the fetch is retried. Any other error is returned unchanged.
+    /// The `unwrap` calls assume the note exists.
+    fn get_note_fixing_invalid_utf8(
+        &self,
+        nid: NoteID,
+        out: &mut CheckDatabaseOutput,
+    ) -> Result<Note> {
+        match self.storage.get_note(nid) {
+            Ok(note) => Ok(note.unwrap()),
+            Err(AnkiError::DBError {
+                kind: DBErrorKind::Utf8,
+                ..
+            }) => {
+                // fix note then fetch again
+                self.storage.fix_invalid_utf8_in_note(nid)?;
+                out.invalid_utf8 += 1;
+                Ok(self.storage.get_note(nid)?.unwrap())
+            }
+            // other errors are unhandled
+            Err(err) => Err(err),
+        }
+    }
+
     fn remove_duplicate_card_ordinals(
         &mut self,
         cards: &[AlreadyGeneratedCardInfo],
diff --git a/rslib/src/err.rs b/rslib/src/err.rs
index c07f3923d..87a1f7838 100644
--- a/rslib/src/err.rs
+++ b/rslib/src/err.rs
@@ -4,7 +4,7 @@
 use crate::i18n::{tr_args, tr_strs, I18n, TR};
 pub use failure::{Error, Fail};
 use reqwest::StatusCode;
-use std::io;
+use std::{io, str::Utf8Error};
 
 pub type Result<T> = std::result::Result<T, AnkiError>;
 
@@ -175,6 +175,16 @@ impl From for AnkiError {
 
 impl From<rusqlite::types::FromSqlError> for AnkiError {
     fn from(err: rusqlite::types::FromSqlError) -> Self {
+        // Conversion failures caused by a `Utf8Error` get their own kind so
+        // callers can tell them apart from other database errors.
+        if let rusqlite::types::FromSqlError::Other(ref cause) = err {
+            if cause.is::<Utf8Error>() {
+                return AnkiError::DBError {
+                    info: String::new(),
+                    kind: DBErrorKind::Utf8,
+                };
+            }
+        }
         AnkiError::DBError {
             info: format!("{:?}", err),
             kind: DBErrorKind::Other,
@@ -316,5 +326,6 @@ pub enum DBErrorKind {
     MissingEntity,
     Corrupt,
     Locked,
+    Utf8,
     Other,
 }
diff --git a/rslib/src/prelude.rs b/rslib/src/prelude.rs
index e9ce12ea8..e21b6a4d1 100644
--- a/rslib/src/prelude.rs
+++ b/rslib/src/prelude.rs
@@ -8,7 +8,7 @@ pub use crate::{
     decks::DeckID,
     err::{AnkiError, Result},
     i18n::{tr_args, tr_strs, TR},
-    notes::NoteID,
+    notes::{Note, NoteID},
     notetype::NoteTypeID,
     revlog::RevlogID,
     timestamp::{TimestampMillis, TimestampSecs},
diff --git a/rslib/src/storage/note/mod.rs b/rslib/src/storage/note/mod.rs
index c2df8e5b9..46b875156 100644
--- a/rslib/src/storage/note/mod.rs
+++ b/rslib/src/storage/note/mod.rs
@@ -117,6 +117,24 @@ impl super::SqliteStorage {
         Ok(())
     }
 
+    /// Rewrite the `flds` column of note `nid` so that it holds valid UTF-8.
+    ///
+    /// The column is read as a blob, decoded with `String::from_utf8_lossy`
+    /// (invalid sequences become U+FFFD), and the result is written back.
+    pub(crate) fn fix_invalid_utf8_in_note(&self, nid: NoteID) -> Result<()> {
+        let raw_flds: Vec<u8> = self.db.query_row(
+            "select cast(flds as blob) from notes where id=?",
+            &[nid],
+            |row| row.get(0),
+        )?;
+        let fixed_flds = String::from_utf8_lossy(&raw_flds);
+        self.db.execute(
+            "update notes set flds = ? where id = ?",
+            params![fixed_flds, nid],
+        )?;
+        Ok(())
+    }
+
     /// Returns the first field of other notes with the same checksum.
     /// The field of the provided note ID is not returned.
     pub(crate) fn note_fields_by_checksum(