Refactor MediaIter

Removes the dependency on a local path and allows the Unicode checks
to be skipped where appropriate.
This commit is contained in:
Damien Elmes 2023-02-01 21:09:41 +10:00
parent ce35ba123b
commit d5772ac43a
5 changed files with 183 additions and 110 deletions

View File

@ -8,7 +8,7 @@ use std::path::PathBuf;
use crate::collection::CollectionBuilder;
use crate::import_export::gather::ExchangeData;
use crate::import_export::package::colpkg::export::export_collection;
use crate::import_export::package::colpkg::export::MediaIter;
use crate::import_export::package::media::MediaIter;
use crate::import_export::package::Meta;
use crate::import_export::ExportProgress;
use crate::import_export::IncrementableProgress;

View File

@ -10,8 +10,8 @@ use zip::ZipArchive;
use super::Context;
use crate::error::FileIoSnafu;
use crate::error::FileOp;
use crate::import_export::package::colpkg::export::MediaCopier;
use crate::import_export::package::media::extract_media_entries;
use crate::import_export::package::media::MediaCopier;
use crate::import_export::package::media::SafeMediaEntry;
use crate::import_export::ImportProgress;
use crate::import_export::IncrementableProgress;

View File

@ -1,9 +1,7 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::borrow::Cow;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs::File;
use std::io;
use std::io::Read;
@ -12,8 +10,6 @@ use std::path::Path;
use std::path::PathBuf;
use prost::Message;
use sha1::Digest;
use sha1::Sha1;
use tempfile::NamedTempFile;
use zip::write::FileOptions;
use zip::CompressionMethod;
@ -27,14 +23,14 @@ use super::super::MediaEntry;
use super::super::Meta;
use super::super::Version;
use crate::collection::CollectionBuilder;
use crate::import_export::package::media::MediaCopier;
use crate::import_export::package::media::MediaIter;
use crate::import_export::ExportProgress;
use crate::import_export::IncrementableProgress;
use crate::io::atomic_rename;
use crate::io::new_tempfile;
use crate::io::new_tempfile_in_parent_of;
use crate::io::open_file;
use crate::io::read_dir_files;
use crate::media::files::filename_if_normalized;
use crate::prelude::*;
use crate::storage::SchemaVersion;
@ -82,36 +78,6 @@ impl Collection {
}
}
/// Boxed iterator over the paths of media files, where each item may
/// instead carry the I/O error encountered while producing it.
pub struct MediaIter(Box<dyn Iterator<Item = io::Result<PathBuf>>>);

impl MediaIter {
    /// Iterator over all files in the given path, without traversing
    /// subfolders.
    ///
    /// Fails up front if `read_dir_files` cannot be started for `path`;
    /// errors for individual directory entries are yielded by the iterator.
    pub fn from_folder(path: &Path) -> Result<Self> {
        let dir_entries = read_dir_files(path)?;
        let paths = dir_entries.map(|entry_res| entry_res.map(|dir_entry| dir_entry.path()));
        Ok(Self(Box::new(paths)))
    }

    /// Iterator over all given files in the given folder.
    /// Missing files are silently ignored.
    pub fn from_file_list(
        list: impl IntoIterator<Item = String> + 'static,
        folder: PathBuf,
    ) -> Self {
        let existing = list
            .into_iter()
            .filter_map(move |file| -> Option<io::Result<PathBuf>> {
                // Resolve the name against the folder, and drop it if nothing
                // exists at that location.
                let candidate = folder.join(file);
                candidate.exists().then(|| Ok(candidate))
            });
        Self(Box::new(existing))
    }

    /// Iterator that yields no files at all.
    pub fn empty() -> Self {
        let nothing = std::iter::empty();
        Self(Box::new(nothing))
    }
}
fn export_collection_file(
out_path: impl AsRef<Path>,
col_path: impl AsRef<Path>,
@ -298,88 +264,24 @@ fn write_media_files(
let mut incrementor = progress.incrementor(ExportProgress::Media);
for (index, res) in media.0.enumerate() {
incrementor.increment()?;
let path = res?;
let mut entry = res?;
zip.start_file(index.to_string(), file_options_stored())?;
let mut file = open_file(&path)?;
let file_name = path.file_name().or_invalid("not a file path")?;
let name = normalized_unicode_file_name(file_name)?;
let (size, sha1) = copier.copy(&mut file, zip)?;
media_entries.push(MediaEntry::new(name, size, sha1));
let (size, sha1) = copier.copy(&mut entry.data, zip)?;
media_entries.push(MediaEntry::new(entry.nfc_filename, size, sha1));
}
Ok(())
}
/// Converts an OS file name into an owned `String`.
///
/// Returns an invalid-input error ("non-unicode filename") when the name is
/// not valid UTF-8, and `AnkiError::MediaCheckRequired` when
/// `filename_if_normalized` rejects it.
fn normalized_unicode_file_name(filename: &OsStr) -> Result<String> {
    let utf8_name = filename.to_str().or_invalid("non-unicode filename")?;
    match filename_if_normalized(utf8_name) {
        Some(normalized) => Ok(normalized.into_owned()),
        None => Err(AnkiError::MediaCheckRequired),
    }
}
/// Copies and hashes while optionally encoding.
/// If compressing, the encoder is reused to optimize for repeated calls.
pub(crate) struct MediaCopier {
    /// Whether copied data is passed through an encoder.
    encoding: bool,
    /// Encoder handed back by the previous copy, if any, kept for reuse.
    encoder: Option<RawEncoder<'static>>,
    /// Scratch buffer used to move data from reader to writer.
    buf: [u8; 64 * 1024],
}

impl MediaCopier {
    /// Creates a copier; `encoding` selects whether output is encoded.
    pub(crate) fn new(encoding: bool) -> Self {
        Self {
            encoding,
            encoder: None,
            buf: [0; 64 * 1024],
        }
    }

    /// Takes the cached encoder, or builds a fresh one, when encoding is
    /// enabled; yields `None` otherwise.
    fn encoder(&mut self) -> Option<RawEncoder<'static>> {
        self.encoding.then(|| match self.encoder.take() {
            Some(cached) => cached,
            None => RawEncoder::with_dictionary(0, &[]).unwrap(),
        })
    }

    /// Returns size and sha1 hash of the copied data.
    pub(crate) fn copy(
        &mut self,
        reader: &mut impl Read,
        writer: &mut impl Write,
    ) -> Result<(usize, Sha1Hash)> {
        let mut total = 0;
        let mut digest = Sha1::new();
        self.buf = [0; 64 * 1024];
        let encoder = self.encoder();
        let mut sink = MaybeEncodedWriter::new(writer, encoder);
        loop {
            let read = match reader.read(&mut self.buf) {
                // End of input.
                Ok(0) => break,
                // An interrupted read is simply retried.
                Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
                outcome => outcome?,
            };
            let chunk = &self.buf[..read];
            total += read;
            digest.update(chunk);
            sink.write(chunk)?;
        }
        // Keep whatever encoder the writer hands back for the next call.
        self.encoder = sink.finish()?;
        Ok((total, digest.finalize().into()))
    }
}
enum MaybeEncodedWriter<'a, W: Write> {
pub(crate) enum MaybeEncodedWriter<'a, W: Write> {
Stored(&'a mut W),
Encoded(zio::Writer<&'a mut W, RawEncoder<'static>>),
}
impl<'a, W: Write> MaybeEncodedWriter<'a, W> {
fn new(writer: &'a mut W, encoder: Option<RawEncoder<'static>>) -> Self {
pub fn new(writer: &'a mut W, encoder: Option<RawEncoder<'static>>) -> Self {
if let Some(encoder) = encoder {
Self::Encoded(zio::Writer::new(writer, encoder))
} else {
@ -387,7 +289,7 @@ impl<'a, W: Write> MaybeEncodedWriter<'a, W> {
}
}
fn write(&mut self, buf: &[u8]) -> Result<()> {
pub fn write(&mut self, buf: &[u8]) -> Result<()> {
match self {
Self::Stored(writer) => writer.write_all(buf)?,
Self::Encoded(writer) => writer.write_all(buf)?,
@ -395,7 +297,7 @@ impl<'a, W: Write> MaybeEncodedWriter<'a, W> {
Ok(())
}
fn finish(self) -> Result<Option<RawEncoder<'static>>> {
pub fn finish(self) -> Result<Option<RawEncoder<'static>>> {
Ok(match self {
Self::Stored(_) => None,
Self::Encoded(mut writer) => {

View File

@ -3,25 +3,36 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::ffi::OsString;
use std::fs;
use std::fs::File;
use std::io;
use std::io::Read;
use std::io::Write;
use std::path::Path;
use std::path::PathBuf;
use prost::Message;
use sha1::Digest;
use sha1::Sha1;
use zip::read::ZipFile;
use zip::ZipArchive;
use zstd::stream::copy_decode;
use zstd::stream::raw::Encoder as RawEncoder;
use super::colpkg::export::MediaCopier;
use super::MediaEntries;
use super::MediaEntry;
use super::Meta;
use crate::error::FileIoError;
use crate::error::FileOp;
use crate::error::ImportError;
use crate::error::InvalidInputError;
use crate::import_export::package::colpkg::export::MaybeEncodedWriter;
use crate::io::atomic_rename;
use crate::io::filename_is_safe;
use crate::io::new_tempfile_in;
use crate::io::read_dir_files;
use crate::media::files::filename_if_normalized;
use crate::media::files::normalize_filename;
use crate::prelude::*;
@ -171,6 +182,163 @@ impl MediaEntries {
}
}
/// A single media file produced by a media iterator: the name it should be
/// recorded under, plus a reader over its contents.
pub struct MediaIterEntry {
    /// File name of the entry. When built from a path, this is the value
    /// accepted by `filename_if_normalized`.
    /// NOTE(review): callers constructing entries by hand are presumably
    /// expected to supply an NFC-normalized name — confirm.
    pub nfc_filename: String,
    /// Reader over the entry's contents.
    pub data: Box<dyn Read>,
}
/// Errors that can be yielded while iterating over media entries.
#[derive(Debug)]
pub enum MediaIterError {
    /// The path had no usable file name: it was missing, not valid UTF-8, or
    /// rejected by `filename_if_normalized`. Converted to
    /// `AnkiError::MediaCheckRequired`.
    InvalidFilename {
        // The offending path as received.
        filename: OsString,
    },
    /// An I/O operation failed, such as opening the file or reading a
    /// directory entry. Converted to a `FileIoError` with `FileOp::Read`.
    IoError {
        // Name or path the failing operation was acting on.
        filename: String,
        // The underlying I/O error.
        source: io::Error,
    },
    /// Any other error. Converted to an `InvalidInputError` with an empty
    /// message.
    Other {
        // The wrapped error.
        source: Box<dyn std::error::Error + Send + Sync>,
    },
}
impl TryFrom<&Path> for MediaIterEntry {
type Error = MediaIterError;
fn try_from(value: &Path) -> std::result::Result<Self, Self::Error> {
let nfc_filename: String = value
.file_name()
.and_then(|s| s.to_str())
.and_then(filename_if_normalized)
.ok_or_else(|| MediaIterError::InvalidFilename {
filename: value.as_os_str().to_owned(),
})?
.into();
let file = File::open(value).map_err(|err| MediaIterError::IoError {
filename: nfc_filename.clone(),
source: err,
})?;
Ok(MediaIterEntry {
nfc_filename,
data: Box::new(file) as _,
})
}
}
/// Maps media iteration failures onto the crate-wide error type.
impl From<MediaIterError> for AnkiError {
    fn from(err: MediaIterError) -> Self {
        match err {
            // An unusable file name means a media check is required.
            MediaIterError::InvalidFilename { .. } => AnkiError::MediaCheckRequired,
            // I/O failures are reported as a failed read of the named file.
            MediaIterError::IoError { filename, source } => {
                let io_failure = FileIoError {
                    path: filename.into(),
                    op: FileOp::Read,
                    source,
                };
                io_failure.into()
            }
            // Anything else becomes an invalid-input error that carries the
            // original error as its source.
            MediaIterError::Other { source } => {
                let invalid_input = InvalidInputError {
                    message: "".to_string(),
                    source: Some(source),
                    backtrace: None,
                };
                invalid_input.into()
            }
        }
    }
}
/// Boxed iterator over media entries, where each item is either an entry
/// (name plus reader) or the error encountered while producing it.
pub struct MediaIter(pub Box<dyn Iterator<Item = Result<MediaIterEntry, MediaIterError>>>);

impl MediaIter {
    /// Wraps an arbitrary iterator of entries.
    pub fn new<I>(iter: I) -> Self
    where
        I: Iterator<Item = Result<MediaIterEntry, MediaIterError>> + 'static,
    {
        let boxed = Box::new(iter);
        Self(boxed)
    }

    /// Iterator over all files in the given path, without traversing
    /// subfolders.
    ///
    /// Fails up front if `read_dir_files` cannot be started for `path`. A
    /// failure on an individual directory entry is yielded as an `IoError`
    /// that names the folder.
    pub fn from_folder(path: &Path) -> Result<Self> {
        let folder = path.to_owned();
        let dir_entries = read_dir_files(path)?;
        let entries = dir_entries.map(move |entry_res| {
            entry_res
                .map_err(|source| MediaIterError::IoError {
                    filename: folder.to_string_lossy().into(),
                    source,
                })
                .and_then(|dir_entry| MediaIterEntry::try_from(dir_entry.path().as_path()))
        });
        Ok(Self::new(entries))
    }

    /// Iterator over all given files in the given folder.
    /// Missing files are silently ignored.
    pub fn from_file_list(
        list: impl IntoIterator<Item = String> + 'static,
        folder: PathBuf,
    ) -> Self {
        let entries = list.into_iter().filter_map(move |file| {
            // Resolve the name against the folder; skip it if nothing exists
            // there, otherwise try to turn the path into an entry.
            let candidate = folder.join(file);
            candidate
                .exists()
                .then(|| MediaIterEntry::try_from(candidate.as_path()))
        });
        Self::new(entries)
    }

    /// Iterator that yields no entries at all.
    pub fn empty() -> Self {
        Self::new(std::iter::empty())
    }
}
/// Copies and hashes while optionally encoding.
/// If compressing, the encoder is reused to optimize for repeated calls.
pub(crate) struct MediaCopier {
    /// Whether copied data is passed through an encoder.
    encoding: bool,
    /// Encoder handed back by the previous copy, if any, kept for reuse.
    encoder: Option<RawEncoder<'static>>,
    /// Scratch buffer used to move data from reader to writer.
    buf: [u8; 64 * 1024],
}

impl MediaCopier {
    /// Creates a copier; `encoding` selects whether output is encoded.
    pub(crate) fn new(encoding: bool) -> Self {
        Self {
            encoding,
            encoder: None,
            buf: [0; 64 * 1024],
        }
    }

    /// Takes the cached encoder, or builds a fresh one, when encoding is
    /// enabled; yields `None` otherwise.
    fn encoder(&mut self) -> Option<RawEncoder<'static>> {
        self.encoding.then(|| match self.encoder.take() {
            Some(cached) => cached,
            None => RawEncoder::with_dictionary(0, &[]).unwrap(),
        })
    }

    /// Returns size and sha1 hash of the copied data.
    pub(crate) fn copy(
        &mut self,
        reader: &mut impl Read,
        writer: &mut impl Write,
    ) -> Result<(usize, Sha1Hash)> {
        let mut total = 0;
        let mut digest = Sha1::new();
        self.buf = [0; 64 * 1024];
        let encoder = self.encoder();
        let mut sink = MaybeEncodedWriter::new(writer, encoder);
        loop {
            let read = match reader.read(&mut self.buf) {
                // End of input.
                Ok(0) => break,
                // An interrupted read is simply retried.
                Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
                outcome => outcome?,
            };
            let chunk = &self.buf[..read];
            total += read;
            digest.update(chunk);
            sink.write(chunk)?;
        }
        // Keep whatever encoder the writer hands back for the next call.
        self.encoder = sink.finish()?;
        Ok((total, digest.finalize().into()))
    }
}
#[cfg(test)]
mod test {
use super::*;

View File

@ -9,6 +9,9 @@ mod meta;
pub(crate) use apkg::NoteMeta;
pub(crate) use colpkg::export::export_colpkg_from_data;
pub use colpkg::import::import_colpkg;
pub use media::MediaIter;
pub use media::MediaIterEntry;
pub use media::MediaIterError;
pub(self) use meta::Meta;
pub(self) use meta::Version;