diff --git a/Cargo.toml b/Cargo.toml
index 1f1deae87..0a2de1c80 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -133,3 +133,7 @@ harness = false
 [[bench]]
 name = "merge_archive"
 harness = false
+
+[[bench]]
+name = "zip_file_list"
+harness = false
diff --git a/benches/read_entry.rs b/benches/read_entry.rs
index 88be9c8ab..d2ef9f9d5 100644
--- a/benches/read_entry.rs
+++ b/benches/read_entry.rs
@@ -25,7 +25,7 @@ fn read_entry(bench: &mut Bencher) {
     let size = 1024 * 1024;
     let bytes = generate_random_archive(size)
         .expect("Failed to create a random archive for the bench read_entry()");
-    let mut archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
+    let mut archive = ZipArchive::new(Cursor::new(&bytes)).unwrap();
 
     bench.iter(|| {
         let mut file = archive.by_name("random.dat").unwrap();
@@ -41,5 +41,42 @@ fn read_entry(bench: &mut Bencher) {
     bench.bytes = size as u64;
 }
 
-benchmark_group!(benches, read_entry);
+/// Reads `random.dat` after locating it through the `IterableZip` entry iterator.
+fn read_entry_iterable(bench: &mut Bencher) {
+    use zip::read::Config;
+    use zip::unstable::read::IterableZip;
+
+    let size = 1024 * 1024;
+    let bytes = generate_random_archive(size)
+        .expect("Failed to create a random archive for the bench read_entry_iterable()");
+    // Separate cursor over the same bytes, used to turn an entry into `ZipFileData`.
+    let mut reader = Cursor::new(&bytes);
+    let mut archive = IterableZip::try_new(reader.clone(), Config::default()).unwrap();
+
+    bench.iter(|| {
+        let file = archive
+            .files()
+            .unwrap()
+            .find(|f| {
+                let file = f.as_ref().unwrap();
+                let filename = file.file_name().unwrap();
+                filename == "random.dat"
+            })
+            .unwrap()
+            .unwrap();
+        let mut buf = [0u8; 1024];
+        let zip_data = file.into_zip_file_data(&mut reader).unwrap();
+        let mut file_reader = archive.by_file_data(&zip_data, Default::default()).unwrap();
+        loop {
+            let n = file_reader.read(&mut buf).unwrap();
+            if n == 0 {
+                break;
+            }
+        }
+    });
+
+    bench.bytes = size as u64;
+}
+
+benchmark_group!(benches, read_entry, read_entry_iterable);
 benchmark_main!(benches);
diff --git a/benches/zip_file_list.rs b/benches/zip_file_list.rs
new file mode 100644
index 000000000..3c2ab971e
--- /dev/null
+++ b/benches/zip_file_list.rs
@@ -0,0 +1,130 @@
+//! Benchmarks that list every entry name of an archive, comparing `ZipArchive`
+//! with the unstable `IterableZip` reader.
+
+use bencher::{benchmark_group, benchmark_main};
+
+use std::io::{Cursor, Write};
+
+use bencher::Bencher;
+use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
+
+/// Number of entries written to the generated archive.
+const NB_FILES: usize = 100;
+/// Scratch archive used by the file-backed benchmarks.
+const FILENAME: &str = "bench_file_listing.zip";
+
+/// Builds an in-memory archive of `NB_FILES` stored entries of `size` random bytes each.
+fn generate_random_archive(size: usize) -> Result<Vec<u8>, std::io::Error> {
+    let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
+    let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
+    // One payload buffer, refilled with fresh random bytes for every entry.
+    let mut bytes = vec![0u8; size];
+    for count in 0..NB_FILES {
+        writer.start_file(format!("random_{}.dat", count), options)?;
+        getrandom::fill(&mut bytes)
+            .map_err(|e| std::io::Error::other(format!("getrandom error: {}", e)))?;
+        writer.write_all(&bytes)?;
+    }
+    let w = writer.finish()?;
+
+    Ok(w.into_inner())
+}
+
+/// Writes the archive produced by `generate_random_archive` to `FILENAME`.
+fn generate_random_archive_to_file(size: usize) -> Result<(), std::io::Error> {
+    let bytes = generate_random_archive(size)?;
+    std::fs::write(FILENAME, bytes)
+}
+
+/// Lists all entry names of an in-memory archive through `ZipArchive`.
+fn file_listing_memory(bench: &mut Bencher) {
+    let size = 1024 * 1024;
+    let bytes = generate_random_archive(size)
+        .expect("Failed to create a random archive for the bench file_listing_memory()");
+
+    bench.iter(|| {
+        let mut archive = ZipArchive::new(Cursor::new(&bytes)).unwrap();
+        let mut names = Vec::with_capacity(archive.len());
+        for idx in 0..archive.len() {
+            let file = archive.by_index(idx).unwrap();
+            names.push(file.name().to_string());
+        }
+        // Returned so the bencher black-boxes it and the listing is not optimized away.
+        names
+    });
+}
+
+/// Lists all entry names of an on-disk archive through `ZipArchive`.
+fn file_listing_file(bench: &mut Bencher) {
+    use std::fs::File;
+
+    let size = 1024 * 1024;
+    generate_random_archive_to_file(size)
+        .expect("Failed to create a random archive for the bench file_listing_file()");
+
+    bench.iter(|| {
+        let file = File::open(FILENAME).unwrap();
+        let mut archive = ZipArchive::new(file).unwrap();
+        let mut names = Vec::with_capacity(archive.len());
+        for idx in 0..archive.len() {
+            let file = archive.by_index(idx).unwrap();
+            names.push(file.name().to_string());
+        }
+        names
+    });
+
+    std::fs::remove_file(FILENAME).unwrap();
+}
+
+/// Lists all entry names of an in-memory archive through `IterableZip`.
+fn file_listing_iterable_memory(bench: &mut Bencher) {
+    use zip::read::Config;
+    use zip::unstable::read::IterableZip;
+
+    let size = 1024 * 1024;
+    let bytes = generate_random_archive(size)
+        .expect("Failed to create a random archive for the bench file_listing_iterable_memory()");
+
+    bench.iter(|| {
+        let mut archive = IterableZip::try_new(Cursor::new(&bytes), Config::default()).unwrap();
+        let mut names = vec![];
+        for file in archive.files().unwrap() {
+            let file = file.unwrap();
+            names.push(file.file_name().unwrap().to_string());
+        }
+        names
+    });
+}
+
+/// Lists all entry names of an on-disk archive through `IterableZip`.
+fn file_listing_iterable_file(bench: &mut Bencher) {
+    use std::fs::File;
+    use zip::read::Config;
+    use zip::unstable::read::IterableZip;
+
+    let size = 1024 * 1024;
+    generate_random_archive_to_file(size)
+        .expect("Failed to create a random archive for the bench file_listing_iterable_file()");
+
+    bench.iter(|| {
+        let file = File::open(FILENAME).unwrap();
+        let mut archive = IterableZip::try_new(file, Config::default()).unwrap();
+        let mut names = vec![];
+        for file in archive.files().unwrap() {
+            let file = file.unwrap();
+            names.push(file.file_name().unwrap().to_string());
+        }
+        names
+    });
+
+    std::fs::remove_file(FILENAME).unwrap();
+}
+
+benchmark_group!(
+    benches,
+    file_listing_memory,
+    file_listing_iterable_memory,
+    file_listing_file,
+    file_listing_iterable_file
+);
+benchmark_main!(benches);
diff --git a/src/read.rs b/src/read.rs
index 306c07fdf..a5ee204d0 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -33,6 +33,8 @@ pub(crate) mod stream;
 pub use stream::read_zipfile_from_stream;
 pub use stream::read_zipfile_from_stream_with_compressed_size;
 
+pub(crate) mod iterable_zip;
+
 pub(crate) mod magic_finder;
 
 pub use zip_archive::ZipArchive;
diff --git a/src/read/iterable_zip.rs b/src/read/iterable_zip.rs
new file mode 100644
index 000000000..e691dfafa
--- /dev/null
+++ b/src/read/iterable_zip.rs
@@ -0,0 +1,332 @@
+//! Iterable zip reader
+
+use crate::cp437::FromCp437;
+use crate::read::central_header_to_zip_file_inner;
+use crate::spec::{FixedSizeBlock, ZipCentralEntryBlock, ZipFlags};
+use crate::{
+    ZipReadOptions,
+    read::{
+        CentralDirectoryInfo, Config, ZipFile, find_content, make_crypto_reader, make_reader,
+        read_variable_length_byte_field,
+    },
+    result::{ZipError, ZipResult},
+    spec,
+    types::ZipFileData,
+};
+use std::{
+    borrow::Cow,
+    io::{Read, Seek, SeekFrom},
+};
+
+/// Iterable version of ZipArchive
+///
+/// Central directory entries are parsed one at a time while iterating
+/// [`IterableZip::files`].
+pub struct IterableZip<R> {
+    // Stored but not read anywhere in this module yet, hence the `allow`.
+    #[allow(unused)]
+    pub(crate) config: Config,
+    pub(crate) iterable_files: IterableZipFiles<R>,
+}
+
+impl<R: Read + Seek> IterableZip<R> {
+    /// Try to create a new zip archive
+    ///
+    /// # Errors
+    ///
+    /// Fails if no usable central directory is found, if the declared number
+    /// of files is greater than the central directory offset, or if the
+    /// archive spans several disks.
+    pub fn try_new(reader: R, config: Config) -> ZipResult<IterableZip<R>> {
+        Self::with_config(config, reader)
+    }
+
+    /// Locates the central directory and builds the entry iterator over `reader`.
+    fn with_config(config: Config, mut reader: R) -> ZipResult<IterableZip<R>> {
+        let file_len = reader.seek(SeekFrom::End(0))?;
+        let mut end_exclusive = file_len;
+        let mut last_err = None;
+
+        // Keep searching until a candidate converts into `CentralDirectoryInfo`;
+        // each retry is bounded by the position of the rejected candidate.
+        let central_directory = loop {
+            let cde = match spec::find_central_directory(
+                &mut reader,
+                config.archive_offset,
+                end_exclusive,
+                file_len,
+            ) {
+                Ok(cde) => cde,
+                // Search failed: prefer the error of the last rejected candidate.
+                Err(e) => return Err(last_err.unwrap_or(e)),
+            };
+
+            match CentralDirectoryInfo::try_from(&cde) {
+                Ok(info) => break info,
+                Err(e) => {
+                    last_err = Some(e);
+                    end_exclusive = cde.eocd.position;
+                }
+            }
+        };
+
+        // If the parsed number of files is greater than the offset then
+        // something fishy is going on and we shouldn't trust number_of_files.
+        // Compared as `u64` so the offset is not truncated on 32-bit targets.
+        if central_directory.number_of_files as u64 > central_directory.directory_start {
+            return Err(ZipError::UnsupportedArchive(
+                "Number of files is greater than the central directory offset",
+            ));
+        }
+
+        if central_directory.disk_number != central_directory.disk_with_central_directory {
+            return Err(ZipError::UnsupportedArchive(
+                "Support for multi-disk files is not implemented",
+            ));
+        }
+
+        let iterable_files = IterableZipFiles::try_new(reader, central_directory)?;
+
+        Ok(IterableZip {
+            config,
+            iterable_files,
+        })
+    }
+
+    /// Get the files as an iterator
+    ///
+    /// Every call restarts from the first central directory entry.
+    ///
+    /// # Errors
+    ///
+    /// Fails if seeking to the start of the central directory fails.
+    pub fn files(&mut self) -> ZipResult<&mut IterableZipFiles<R>> {
+        self.iterable_files.reset()?;
+        Ok(&mut self.iterable_files)
+    }
+
+    /// Get a reader for the file described by `data`, with options.
+    ///
+    /// `data` can be obtained from [`ZipEntry::into_zip_file_data`].
+    ///
+    /// # Errors
+    ///
+    /// Fails with `ZipError::PASSWORD_REQUIRED` if the file is encrypted and no
+    /// password is supplied, unless the encryption flag is ignored.
+    pub fn by_file_data<'data>(
+        &'data mut self,
+        data: &'data ZipFileData,
+        mut options: ZipReadOptions<'_>,
+    ) -> ZipResult<ZipFile<'data, R>> {
+        if options.ignore_encryption_flag {
+            // Always use no password when we're ignoring the encryption flag.
+            options.password = None;
+        } else {
+            // Require and use the password only if the file is encrypted.
+            match (options.password, data.encrypted) {
+                (None, true) => {
+                    return Err(ZipError::UnsupportedArchive(ZipError::PASSWORD_REQUIRED));
+                }
+                // Password supplied, but none needed! Discard.
+                (Some(_), false) => options.password = None,
+                _ => {}
+            }
+        }
+        let limit_reader = find_content(data, &mut self.iterable_files.reader)?;
+
+        let crypto_reader =
+            make_crypto_reader(data, limit_reader, options.password, data.aes_mode)?;
+
+        Ok(ZipFile {
+            data: Cow::Borrowed(data),
+            reader: make_reader(
+                data.compression_method,
+                data.uncompressed_size,
+                Some(data.crc32),
+                crypto_reader,
+                #[cfg(feature = "legacy-zip")]
+                data.flags,
+            )?,
+        })
+    }
+}
+
+/// Iterable Files
+///
+/// Yields the central directory entries in stored order, one `ZipResult` per
+/// entry. Iteration ends after the first error.
+#[derive(Debug)]
+pub struct IterableZipFiles<R> {
+    reader: R,
+    central_directory: CentralDirectoryInfo,
+    // Entries handed out so far; set to the total once iteration must stop.
+    current_file: usize,
+}
+
+impl<R: Read + Seek> IterableZipFiles<R> {
+    /// Try to create an iterable of files
+    pub(crate) fn try_new(
+        mut reader: R,
+        central_directory: CentralDirectoryInfo,
+    ) -> ZipResult<Self> {
+        reader.seek(SeekFrom::Start(central_directory.directory_start))?;
+        Ok(Self {
+            reader,
+            central_directory,
+            current_file: 0,
+        })
+    }
+
+    /// Rewinds to the first central directory entry.
+    pub(crate) fn reset(&mut self) -> ZipResult<()> {
+        self.current_file = 0;
+        self.reader
+            .seek(SeekFrom::Start(self.central_directory.directory_start))?;
+        Ok(())
+    }
+
+    /// Parses the central directory entry at the current reader position.
+    ///
+    /// On success the reader is left right after the entry's comment field.
+    pub(crate) fn parse_entry(&mut self) -> ZipResult<ZipEntry> {
+        let central_header_start = self.reader.stream_position()?;
+
+        // Fixed-size header first, then the name, extra fields and comment.
+        let block = ZipCentralEntryBlock::parse(&mut self.reader)?;
+        let variable_data =
+            ZipCentralEntryVariableDataRaw::try_from_reader(&mut self.reader, &block)?;
+
+        Ok(ZipEntry::new(
+            self.central_directory.archive_offset,
+            block,
+            variable_data,
+            central_header_start,
+        ))
+    }
+}
+
+impl<R: Read + Seek> Iterator for IterableZipFiles<R> {
+    type Item = ZipResult<ZipEntry>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.current_file >= self.central_directory.number_of_files {
+            return None;
+        }
+        self.current_file += 1;
+        let entry = self.parse_entry();
+        if entry.is_err() {
+            // The reader position is unreliable after a failed parse: stop
+            // here instead of decoding whatever bytes come next as entries.
+            self.current_file = self.central_directory.number_of_files;
+        }
+        Some(entry)
+    }
+}
+
+/// Raw variable-length fields read after a central directory entry header.
+#[derive(Debug, Clone)]
+pub(crate) struct ZipCentralEntryVariableDataRaw {
+    file_name: Box<[u8]>,
+    extra_fields: Box<[u8]>,
+    file_comment: Box<[u8]>,
+}
+
+impl ZipCentralEntryVariableDataRaw {
+    /// Reads the file name, extra fields and comment using the lengths in `block`.
+    fn try_from_reader<R: Read>(reader: &mut R, block: &ZipCentralEntryBlock) -> ZipResult<Self> {
+        let file_name_raw =
+            read_variable_length_byte_field(reader, block.file_name_length as usize)?;
+        let extra_field =
+            read_variable_length_byte_field(reader, block.extra_field_length as usize)?;
+        let file_comment_raw =
+            read_variable_length_byte_field(reader, block.file_comment_length as usize)?;
+        Ok(Self {
+            file_name: file_name_raw,
+            extra_fields: extra_field,
+            file_comment: file_comment_raw,
+        })
+    }
+}
+
+/// A Zip entry
+///
+/// Raw central directory record of a single file.
+#[non_exhaustive]
+#[derive(Debug, Clone)]
+pub struct ZipEntry {
+    archive_offset: u64,
+    central_block: ZipCentralEntryBlock,
+    variable_data: ZipCentralEntryVariableDataRaw,
+    central_block_start: u64,
+}
+
+impl ZipEntry {
+    pub(crate) fn new(
+        archive_offset: u64,
+        central_block: ZipCentralEntryBlock,
+        variable_data: ZipCentralEntryVariableDataRaw,
+        start_offset: u64,
+    ) -> Self {
+        Self {
+            archive_offset,
+            central_block,
+            variable_data,
+            central_block_start: start_offset,
+        }
+    }
+
+    /// Check if the entry has the UTF-8 encoding flag set
+    #[must_use]
+    pub fn is_utf8(&self) -> bool {
+        ZipFlags::matching(self.central_block.flags, ZipFlags::LanguageEncoding)
+    }
+
+    /// Get file name
+    ///
+    /// Names flagged as UTF-8 are decoded lossily; other names are decoded as
+    /// CP437.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the CP437 conversion fails.
+    pub fn file_name(&self) -> ZipResult<Cow<'_, str>> {
+        let file_name_raw = self.file_name_raw();
+        // TODO
+        let file_name = if self.is_utf8() {
+            String::from_utf8_lossy(file_name_raw)
+        } else {
+            file_name_raw.from_cp437()?
+        };
+        Ok(file_name)
+    }
+
+    /// Get raw file name
+    #[must_use]
+    pub fn file_name_raw(&self) -> &[u8] {
+        &self.variable_data.file_name
+    }
+
+    /// Get raw extra fields
+    #[must_use]
+    pub fn extra_fields(&self) -> &[u8] {
+        &self.variable_data.extra_fields
+    }
+
+    /// Get raw comment
+    #[must_use]
+    pub fn comment(&self) -> &[u8] {
+        &self.variable_data.file_comment
+    }
+
+    /// Convert this entry into `ZipFileData`, using `reader` to access the archive.
+    // NOTE(review): the generic bounds on `R` are assumed to be `Read + Seek`;
+    // confirm against `central_header_to_zip_file_inner`.
+    pub fn into_zip_file_data<R: Read + Seek>(self, reader: &mut R) -> ZipResult<ZipFileData> {
+        central_header_to_zip_file_inner(
+            reader,
+            self.archive_offset,
+            self.central_block_start,
+            self.central_block,
+        )
+    }
+}
diff --git a/src/unstable.rs b/src/unstable.rs
index 33cc4235f..97cc05a92 100644
--- a/src/unstable.rs
+++ b/src/unstable.rs
@@ -9,6 +9,12 @@ use std::path::{Component, MAIN_SEPARATOR, Path};
 pub mod stream {
     pub use crate::read::stream::{ZipStreamFileMetadata, ZipStreamReader, ZipStreamVisitor};
 }
+
+/// Iterable zip
+pub mod read {
+    pub use crate::read::iterable_zip::{IterableZip, IterableZipFiles, ZipEntry};
+}
+
 /// Types for creating ZIP archives.
 pub mod write {
     use crate::result::{ZipError, ZipResult};