diff --git a/Cargo.lock b/Cargo.lock index 8fecbf7..886efae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -380,7 +380,7 @@ dependencies = [ [[package]] name = "ms-pdb" -version = "0.1.18" +version = "0.1.19" dependencies = [ "anyhow", "bitfield", @@ -424,7 +424,7 @@ dependencies = [ [[package]] name = "ms-pdb-msfz" -version = "0.1.9" +version = "0.1.10" dependencies = [ "anyhow", "bstr", @@ -492,7 +492,7 @@ dependencies = [ [[package]] name = "pdbtool" -version = "0.1.18" +version = "0.1.19" dependencies = [ "anyhow", "bitvec", diff --git a/codeview/src/types/iter.rs b/codeview/src/types/iter.rs index fb7109b..cd15c8f 100644 --- a/codeview/src/types/iter.rs +++ b/codeview/src/types/iter.rs @@ -16,6 +16,11 @@ impl<'a> TypesIter<'a> { pub fn new(buffer: &'a [u8]) -> Self { Self { buffer } } + + /// Returns the "rest" of the data that has not been parsed. + pub fn rest(&self) -> &'a [u8] { + self.buffer + } } impl<'a> HasRestLen for TypesIter<'a> { @@ -27,6 +32,11 @@ impl<'a> HasRestLen for TypesIter<'a> { impl<'a> Iterator for TypesIter<'a> { type Item = TypeRecord<'a>; + /// Finds the next type record + /// + /// This implementation makes an important guarantee: If it cannot decode the next record, + /// it _will not_ change `self.buffer`. This is important because it allows an application + /// to detect the exact length and contents of an unparseable record. fn next(&mut self) -> Option> { if self.buffer.is_empty() { return None; diff --git a/msf/src/lib.rs b/msf/src/lib.rs index b8f73df..3b9d2ef 100644 --- a/msf/src/lib.rs +++ b/msf/src/lib.rs @@ -353,6 +353,15 @@ impl Msf { page_to_offset(self.pages.num_pages, self.pages.page_size) } + /// Return the total number of pages allocated to the file, including all pages (allocated, + /// unallocated, etc.). + /// + /// This count includes pages allocated to streams, Page 0, FPM pages, pages that are free + /// (not allocated), and pages allocated to the Stream Directory. 
+ pub fn num_total_pages(&self) -> u32 { + self.pages.num_pages + } + /// Returns the number of free pages. /// /// This number counts the pages that are _less than_ `num_pages`. There may be pages assigned diff --git a/msfz/Cargo.toml b/msfz/Cargo.toml index e584b73..041fa3a 100644 --- a/msfz/Cargo.toml +++ b/msfz/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ms-pdb-msfz" -version = "0.1.9" +version = "0.1.10" edition = "2024" description = "Reads Compressed Multi-Stream Files, which is part of the Microsoft PDB file format" authors = ["Arlie Davis "] diff --git a/msfz/src/lib.rs b/msfz/src/lib.rs index 0923b42..5141380 100644 --- a/msfz/src/lib.rs +++ b/msfz/src/lib.rs @@ -23,7 +23,7 @@ mod stream_data; mod tests; mod writer; -pub use reader::*; +pub use reader::{Fragment, FragmentLocation, Msfz, StreamReader}; pub use stream_data::StreamData; pub use writer::*; @@ -100,6 +100,11 @@ pub const COMPRESSION_ZSTD: u32 = 1; /// This uses the "raw" Deflate stream. It _does not_ use the GZIP encapsulation header. pub const COMPRESSION_DEFLATE: u32 = 2; +/// This is the maximum file offset where an uncompressed fragment be be stored. +/// +/// The MSFZ specification provides 48 bits for storing the file offset of an uncompressed fragment. +pub const MAX_UNCOMPRESSED_FILE_OFFSET: u64 = (1u64 << 48) - 1; + /// Specifies the compression algorithms that are supported by this library. #[derive(Copy, Clone, Eq, PartialEq, Debug)] #[non_exhaustive] diff --git a/msfz/src/reader.rs b/msfz/src/reader.rs index 9a5c17d..c808eb1 100644 --- a/msfz/src/reader.rs +++ b/msfz/src/reader.rs @@ -32,11 +32,13 @@ pub struct Msfz { chunk_cache: Vec>>, } -// Describes a region within a stream. +/// Describes a region within a stream. 
#[derive(Clone)] -struct Fragment { - size: u32, - location: FragmentLocation, +pub struct Fragment { + /// The size in bytes of the fragment + pub size: u32, + /// The location of the fragment + pub location: FragmentLocation, } impl std::fmt::Debug for Fragment { @@ -52,15 +54,15 @@ impl std::fmt::Debug for FragmentLocation { } else if self.is_compressed() { write!( f, - "uncompressed at 0x{:06x}", - self.uncompressed_file_offset() + "chunk {} : 0x{:04x}", + self.compressed_first_chunk(), + self.compressed_offset_within_chunk() ) } else { write!( f, - "chunk {} : 0x{:04x}", - self.compressed_first_chunk(), - self.compressed_offset_within_chunk() + "uncompressed at 0x{:06x}", + self.uncompressed_file_offset() ) } } @@ -68,9 +70,13 @@ impl std::fmt::Debug for FragmentLocation { const FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK: u32 = 1u32 << 31; +/// A bit mask for the bits within a packed `FragmentLocation` which encode +/// the file offset of an uncompressed fragment. +const UNCOMPRESSED_FRAGMENT_FILE_OFFSET_MASK: u64 = (1u64 << 48) - 1; + /// Represents the location of a fragment, either compressed or uncompressed. #[derive(Copy, Clone)] -struct FragmentLocation { +pub struct FragmentLocation { /// bits 0-31 lo: u32, /// bits 32-63 @@ -89,11 +95,15 @@ impl FragmentLocation { self.lo == u32::MAX && self.hi == u32::MAX } - fn is_compressed(&self) -> bool { + /// Returns `true` if this is a compressed fragment + pub fn is_compressed(&self) -> bool { (self.hi & FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK) != 0 } - fn compressed_first_chunk(&self) -> u32 { + /// Returns the chunk index for this compressed fragment. + /// + /// You must check `is_compressed()` before calling this function. 
+ pub fn compressed_first_chunk(&self) -> u32 { debug_assert!(!self.is_nil()); debug_assert!(self.is_compressed()); self.hi & !FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK @@ -108,7 +118,7 @@ impl FragmentLocation { fn uncompressed_file_offset(&self) -> u64 { debug_assert!(!self.is_nil()); debug_assert!(!self.is_compressed()); - ((self.hi as u64) << 32) | (self.lo as u64) + (((self.hi as u64) << 32) | (self.lo as u64)) & UNCOMPRESSED_FRAGMENT_FILE_OFFSET_MASK } } @@ -231,7 +241,7 @@ impl Msfz { /// Gets the fragments for a given stream. /// /// If `stream` is out of range, returns `None`. - fn stream_fragments(&self, stream: u32) -> Option<&[Fragment]> { + pub fn stream_fragments(&self, stream: u32) -> Option<&[Fragment]> { let i = stream as usize; if i < self.stream_fragments.len() - 1 { let start = self.stream_fragments[i] as usize; @@ -481,10 +491,20 @@ impl Msfz { self.fragments.len() } + /// Raw access to the Fragments table + pub fn fragments(&self) -> &[Fragment] { + &self.fragments + } + /// The total number of compressed chunks. pub fn num_chunks(&self) -> usize { self.chunk_table.len() } + + /// Raw access to the Chunks table + pub fn chunks(&self) -> &[ChunkEntry] { + &self.chunk_table + } } /// Allows reading a stream using the [`Read`], [`Seek`], and [`ReadAt`] traits. @@ -544,7 +564,9 @@ impl<'a, F: ReadAt> ReadAt for StreamReader<'a, F> { )?; buf_xfer.copy_from_slice(chunk_slice); } else { - // Read the stream data directly from disk. + // Read the stream data directly from disk. We don't validate the file offset or + // the length of the transfer. Instead, we just allow the underlying file system + // to report errors. 
let file_offset = fragment.location.uncompressed_file_offset(); self.msfz .file @@ -555,11 +577,9 @@ impl<'a, F: ReadAt> ReadAt for StreamReader<'a, F> { break; } - if current_offset >= num_bytes_xfer as u64 { - current_offset -= num_bytes_xfer as u64; - } else { - current_offset = 0; - } + // Since we have finished reading from one chunk and there is more data to read in the + // next chunk, the "offset within chunk" becomes zero. + current_offset = 0; } Ok(original_buf_len - buf.len()) diff --git a/msfz/src/writer.rs b/msfz/src/writer.rs index 49432b5..6552ba9 100644 --- a/msfz/src/writer.rs +++ b/msfz/src/writer.rs @@ -4,11 +4,11 @@ use pow2::Pow2; use std::fs::File; use std::io::{Seek, SeekFrom, Write}; use std::path::Path; -use tracing::{debug, debug_span, trace, trace_span}; +use tracing::{debug, debug_span, error, trace, trace_span}; use zerocopy::IntoBytes; /// The default threshold for compressing a chunk of data. -pub const DEFAULT_CHUNK_THRESHOLD: u32 = 0x40_0000; // 4 MiB +pub const DEFAULT_CHUNK_THRESHOLD: u32 = 0x10_0000; // 1 MiB /// The minimum value for the uncompressed chunk size threshold. pub const MIN_CHUNK_SIZE: u32 = 0x1000; @@ -359,6 +359,11 @@ impl MsfzWriterFile { }) } + fn bytes_available_in_chunk_buffer(&self) -> usize { + (self.uncompressed_chunk_size_threshold as usize) + .saturating_sub(self.uncompressed_chunk_data.len()) + } + #[inline(never)] fn finish_current_chunk(&mut self) -> std::io::Result<()> { let _span = debug_span!("finish_current_chunk").entered(); @@ -459,6 +464,12 @@ impl<'a, F: Write + Seek> StreamWriter<'a, F> { self.file.finish_current_chunk() } + /// The number of bytes that can be written to the current chunk buffer, without exceeding + /// the configured maximum. + pub fn bytes_available_in_chunk_buffer(&self) -> usize { + self.file.bytes_available_in_chunk_buffer() + } + /// Specifies whether to use chunked compression or not. The default value for this setting is /// `true` (chunked compression is enabled). 
/// @@ -525,6 +536,13 @@ impl<'a, F: Write + Seek> Write for StreamWriter<'a, F> { } else { self.file.out.stream_position()? }; + + // The MSFZ spec allocates 48 bytes for the file offset of uncompressed fragments. + if fragment_file_offset > MAX_UNCOMPRESSED_FILE_OFFSET { + error!("The uncompressed file fragment "); + return Err(std::io::ErrorKind::FileTooLarge.into()); + }; + self.file.out.write_all(buf)?; add_fragment_uncompressed(&mut self.stream.fragments, buf_len, fragment_file_offset); diff --git a/pdb/Cargo.toml b/pdb/Cargo.toml index 1bbb39d..4661344 100644 --- a/pdb/Cargo.toml +++ b/pdb/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ms-pdb" -version = "0.1.18" +version = "0.1.19" edition = "2024" description = "Reads Microsoft Program Database (PDB) files" authors = ["Arlie Davis "] @@ -39,7 +39,7 @@ version = "0.1.6" path = "../msf" [dependencies.ms-pdb-msfz] -version = "0.1.9" +version = "0.1.10" path = "../msfz" [dev-dependencies] diff --git a/pdbtool/Cargo.toml b/pdbtool/Cargo.toml index 690b47e..266b2d7 100644 --- a/pdbtool/Cargo.toml +++ b/pdbtool/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pdbtool" -version = "0.1.18" +version = "0.1.19" edition = "2024" description = "A tool for reading Program Database (PDB) files and displaying information about them." authors = ["Arlie Davis "] @@ -34,5 +34,5 @@ version = "0.1.0" path = "../coff" [dependencies.ms-pdb] -version = "0.1.18" +version = "0.1.19" path = "../pdb" diff --git a/pdbtool/src/compare.rs b/pdbtool/src/compare.rs new file mode 100644 index 0000000..550d178 --- /dev/null +++ b/pdbtool/src/compare.rs @@ -0,0 +1,256 @@ +//! Compares two PDB (or PDZ) files to check that their stream directories and +//! stream contents are identical. + +use anyhow::{Context, Result, bail}; +use ms_pdb::{Pdb, RandomAccessFile}; +use std::path::Path; +use tracing::{error, info}; + +/// Compares two PDB (or PDZ) files and verifies that their stream contents +/// are identical. 
+#[derive(clap::Parser)] +pub(crate) struct CompareOptions { + /// The first PDB file or PDZ file to compare. + pub(crate) first_file: String, + + /// The second PDB file or PDZ file to compare. + pub(crate) second_file: String, + + /// Continue comparison even after finding differences (don't stop at first difference). + #[arg(long)] + pub continue_on_differences: bool, +} + +pub(crate) fn command(options: CompareOptions) -> Result<()> { + let first_path = Path::new(&options.first_file); + let second_path = Path::new(&options.second_file); + + info!("Comparing files:"); + info!(" First: {}", first_path.display()); + info!(" Second: {}", second_path.display()); + + // Open both files + let first_pdb = Pdb::open(first_path) + .with_context(|| format!("Failed to open first file: {}", options.first_file))?; + let second_pdb = Pdb::open(second_path) + .with_context(|| format!("Failed to open second file: {}", options.second_file))?; + + let mut comparison_stats = ComparisonStats::default(); + let success = compare_pdbs(&first_pdb, &second_pdb, &options, &mut comparison_stats)?; + + // Print summary + info!("Comparison completed:"); + info!( + " Total streams compared: {}", + comparison_stats.streams_compared + ); + info!( + " Streams with differences: {}", + comparison_stats.streams_different + ); + info!( + " Nil streams matched: {}", + comparison_stats.nil_streams_matched + ); + + if !success { + info!("Result: Files are DIFFERENT"); + bail!("Files are different"); + } else { + info!("Result: Files are IDENTICAL"); + } + + Ok(()) +} + +#[derive(Default)] +struct ComparisonStats { + streams_compared: u32, + streams_different: u32, + nil_streams_matched: u32, +} + +fn compare_pdbs( + first_pdb: &Pdb, + second_pdb: &Pdb, + options: &CompareOptions, + stats: &mut ComparisonStats, +) -> Result { + let first_container = first_pdb.container(); + let second_container = second_pdb.container(); + + // Compare number of streams + let first_num_streams = 
first_container.num_streams(); + let second_num_streams = second_container.num_streams(); + + if first_num_streams != second_num_streams { + error!( + "Files have different number of streams. First: {}, Second: {}", + first_num_streams, second_num_streams + ); + if !options.continue_on_differences { + return Ok(false); + } + } + + let min_num_streams = first_num_streams.min(second_num_streams); + let mut has_differences = first_num_streams != second_num_streams; + + let mut first_stream_data: Vec = Vec::new(); + let mut second_stream_data: Vec = Vec::new(); + + // Compare each stream (starting from stream 1, as stream 0 is special) + for stream in 1..min_num_streams { + let first_valid = stream < first_num_streams && first_container.is_stream_valid(stream); + let second_valid = stream < second_num_streams && second_container.is_stream_valid(stream); + + // Check if both streams have same nil-ness + if first_valid != second_valid { + error!( + "Stream {} has different validity. First: {}, Second: {}", + stream, first_valid, second_valid + ); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + continue; + } + + // If both streams are nil, they match + if !first_valid && !second_valid { + stats.nil_streams_matched += 1; + continue; + } + + // Both streams are valid, compare their contents + stats.streams_compared += 1; + + // Check stream sizes first + let first_size = first_container.stream_len(stream); + let second_size = second_container.stream_len(stream); + + if first_size != second_size { + error!( + "Stream {} has different sizes. 
First: {} bytes, Second: {} bytes", + stream, first_size, second_size + ); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + continue; + } + + // Read and compare stream contents + match compare_stream_contents( + first_container, + second_container, + stream, + &mut first_stream_data, + &mut second_stream_data, + ) { + Ok(true) => { + // Streams are identical + } + Ok(false) => { + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + } + Err(e) => { + error!("Failed to compare stream {}: {}", stream, e); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + } + } + } + + Ok(!has_differences) +} + +fn compare_stream_contents( + first_container: &ms_pdb::Container, + second_container: &ms_pdb::Container, + stream: u32, + first_stream_data: &mut Vec, + second_stream_data: &mut Vec, +) -> Result { + // Read first stream + first_stream_data.clear(); + first_container.read_stream_to_vec_mut(stream, first_stream_data)?; + + // Read second stream + second_stream_data.clear(); + second_container.read_stream_to_vec_mut(stream, second_stream_data)?; + + // Find the first different byte + if let Some(byte_offset) = + find_index_of_first_different_byte(first_stream_data, second_stream_data) + { + error!( + "Stream {} differs at byte offset {} ({:#x})", + stream, byte_offset, byte_offset + ); + return Ok(false); + } + + Ok(true) +} + +/// Compares two byte slices for equality. If they are different, then returns the offset of the +/// first byte that is wrong. +/// +/// * Returns `Some(0)` if the byte slices have different lengths. +/// * Returns `Some(i)` if the byte slices have different contents, and `i` is the offset of the +/// first byte that is different. +/// * Returns `None` if the contents are identical. 
+#[inline(never)] +pub(crate) fn find_index_of_first_different_byte(mut a: &[u8], mut b: &[u8]) -> Option { + if a.len() != b.len() { + return Some(0); + } + + const BLOCK_SIZE: usize = 4096; + + // This first loop uses memcmp to find the first block which has different contents. + // memcmp is generally going to be faster than a manually-indexed loop. + let mut skipped_len: usize = 0; + loop { + assert_eq!(a.len(), b.len()); + if a.len() < BLOCK_SIZE { + break; + } + let block_a = &a[..BLOCK_SIZE]; + let block_b = &b[..BLOCK_SIZE]; + + if block_a != block_b { + break; + } + + a = &a[BLOCK_SIZE..]; + b = &b[BLOCK_SIZE..]; + skipped_len += BLOCK_SIZE; + } + + // Finish checking the ragged last block -or- repeat the scan of the block which + // contained different contents. + for i in 0..a.len() { + if a[i] != b[i] { + return Some(skipped_len + i); + } + } + + None +} diff --git a/pdbtool/src/container.rs b/pdbtool/src/container.rs index f8223ea..9e391d9 100644 --- a/pdbtool/src/container.rs +++ b/pdbtool/src/container.rs @@ -5,6 +5,14 @@ use std::path::Path; pub struct ContainerOptions { /// The PDB file or PDZ file to read. 
pdb: String, + + /// Show the chunks table (applicable only to PDZ) + #[arg(long)] + chunks: bool, + + /// Show the streams table + #[arg(long)] + streams: bool, } pub fn container_command(options: &ContainerOptions) -> Result<()> { @@ -19,7 +27,8 @@ pub fn container_command(options: &ContainerOptions) -> Result<()> { " Page size: {:8} bytes per page", u32::from(msf.page_size()) ); - println!(" Number of pages: {:8}", msf.nominal_size()); + println!(" Number of pages: {:8}", msf.num_total_pages()); + println!(" Number of pages * page size: {:8}", msf.nominal_size()); println!(" Number of free pages: {:8}", msf.num_free_pages()); } @@ -28,6 +37,97 @@ pub fn container_command(options: &ContainerOptions) -> Result<()> { println!(" Number of streams: {:8}", pdb.num_streams()); println!(" Number of compressed chunks: {:8}", msfz.num_chunks()); println!(" Number of stream fragments: {:8}", msfz.num_fragments()); + + if options.chunks { + // Build a mapping of streams to chunks, so we can display it. + // We need to know, for each chunk, which streams it contains. + // Contains (chunk, stream) pairs. 
+ let mut chunks_to_streams: Vec<(u32, u32)> = Vec::new(); + + for stream in 1..msfz.num_streams() { + if let Some(fragments) = msfz.stream_fragments(stream) { + for f in fragments { + if f.location.is_compressed() { + let chunk = f.location.compressed_first_chunk(); + chunks_to_streams.push((chunk, stream)); + } + } + } + } + + chunks_to_streams.sort_unstable(); + + println!(); + println!("Chunks table:"); + println!(); + + println!("Chunk | File | Compressed | Uncompressed | Streams"); + println!(" | offset | size | size |"); + println!("---------------------------------------------------------------"); + + let mut j: usize = 0; // index into chunks_to_streams + + let mut streams_list = String::new(); + + for (chunk_index, chunk) in msfz.chunks().iter().enumerate() { + use std::fmt::Write; + + streams_list.clear(); + + while j < chunks_to_streams.len() && chunks_to_streams[j].0 < chunk_index as u32 + { + j += 1; + } + + // (start, last) + let mut current_range: Option<(u32, u32)> = None; + + while j < chunks_to_streams.len() + && chunks_to_streams[j].0 == chunk_index as u32 + { + let stream = chunks_to_streams[j].1; + j += 1; + + match current_range { + // Extend the range, if possible + Some((range_start, range_last)) if range_last + 1 == stream => { + current_range = Some((range_start, stream)); + continue; + } + + Some((range_start, range_last)) => { + // Previous range could not be extended + if range_start != range_last { + _ = write!(streams_list, "{range_start}-{range_last} "); + } else { + _ = write!(streams_list, "{range_start} "); + } + } + + None => { + current_range = Some((stream, stream)); + } + } + } + + if let Some((range_start, range_last)) = current_range { + if range_start != range_last { + _ = write!(streams_list, "{range_start}-{range_last} "); + } else { + _ = write!(streams_list, "{range_start} "); + } + } + + println!( + " {:6} | {:10} | {:10} | {:12} | {}", + chunk_index, + chunk.file_offset, + chunk.compressed_size, + 
chunk.uncompressed_size, + streams_list + ); + } + } } } diff --git a/pdbtool/src/copy.rs b/pdbtool/src/copy.rs index a406743..78cd853 100644 --- a/pdbtool/src/copy.rs +++ b/pdbtool/src/copy.rs @@ -1,4 +1,5 @@ use ms_pdb::Pdb; +use ms_pdb::msf::{CreateOptions, PageSize}; use std::io::Write; use std::path::PathBuf; @@ -9,11 +10,22 @@ pub struct Options { /// The PDB to write. dest_pdb: PathBuf, + + /// The size in bytes of the pages to use. Must be a power of two. + #[arg(long)] + page_size: Option, } pub fn copy_command(options: &Options) -> anyhow::Result<()> { let src = Pdb::open(&options.source_pdb)?; - let mut dst = ms_pdb::msf::Msf::create(&options.dest_pdb, Default::default())?; + let mut create_options = CreateOptions::default(); + + if let Some(page_size) = options.page_size { + create_options.page_size = PageSize::try_from(page_size) + .map_err(|_| anyhow::anyhow!("The page size must be a power of 2."))?; + } + + let mut dst = ms_pdb::msf::Msf::create(&options.dest_pdb, create_options)?; for stream_index in 1..src.num_streams() { if src.is_stream_valid(stream_index) { diff --git a/pdbtool/src/main.rs b/pdbtool/src/main.rs index 840d4f1..b997569 100644 --- a/pdbtool/src/main.rs +++ b/pdbtool/src/main.rs @@ -10,6 +10,7 @@ use clap::Parser; mod addsrc; mod check; +mod compare; mod container; mod copy; mod counts; @@ -57,6 +58,9 @@ enum Command { /// PDB (MSF) or PDBZ (MSFZ) container format. This also shows some container-specific /// information. Container(container::ContainerOptions), + /// Compares two PDB (or PDZ) files and verifies that their stream contents are identical. + /// Reports differences but continues checking all streams unless stopped. 
+ Compare(compare::CompareOptions), Test, Dump(dump::DumpOptions), Save(save::SaveStreamOptions), @@ -86,6 +90,7 @@ fn main() -> anyhow::Result<()> { Command::Hexdump(args) => hexdump::command(args)?, Command::PdzEncode(args) => pdz::encode::pdz_encode(args)?, Command::Container(args) => container::container_command(&args)?, + Command::Compare(args) => compare::command(args)?, Command::Check(args) => check::command(args)?, } diff --git a/pdbtool/src/pdz.rs b/pdbtool/src/pdz.rs index 87cfe56..0ff4757 100644 --- a/pdbtool/src/pdz.rs +++ b/pdbtool/src/pdz.rs @@ -1,2 +1 @@ pub mod encode; -pub mod util; diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index f18374b..44aa883 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -1,14 +1,19 @@ -use crate::pdz::util::*; +//! Compresses PDB files into "PDZ" (compressed PDB) files. + use anyhow::{Context, Result, bail}; -use ms_pdb::dbi::{DBI_STREAM_HEADER_LEN, DbiStreamHeader}; +use ms_pdb::codeview::HasRestLen; +use ms_pdb::dbi::DbiStreamHeader; use ms_pdb::msf::Msf; use ms_pdb::msfz::{self, MIN_FILE_SIZE_16K, MsfzFinishOptions, MsfzWriter, StreamWriter}; +use ms_pdb::syms::SymIter; +use ms_pdb::tpi; +use ms_pdb::types::TypesIter; use ms_pdb::{RandomAccessFile, Stream}; use std::fs::File; -use std::io::{Seek, SeekFrom, Write}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::path::Path; -use tracing::{trace, trace_span, warn}; -use zerocopy::FromBytes; +use tracing::{debug, error, info, trace, trace_span, warn}; +use zerocopy::{FromBytes, IntoBytes}; #[derive(clap::Parser, Debug)] pub(crate) struct PdzEncodeOptions { @@ -43,18 +48,16 @@ pub(crate) struct PdzEncodeOptions { /// Place each chunk into its own compression chunk (or chunks, if the stream is large). #[arg(long)] pub one_stream_per_chunk: bool, + + /// After writing the PDZ file, close it, re-open it, and verify that its stream contents + /// match the original PDB, byte for byte. 
+ #[arg(long)] + pub verify: bool, } pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { let _span = trace_span!("pdz_encode").entered(); - let pdb_metadata = std::fs::metadata(&options.input_pdb).with_context(|| { - format!( - "Failed to get metadata for input PDB: {}", - options.input_pdb - ) - })?; - let input_file = RandomAccessFile::open(Path::new(&options.input_pdb)) .with_context(|| format!("Failed to open input PDB: {}", options.input_pdb))?; @@ -87,7 +90,7 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { .with_context(|| format!("Failed to open output PDZ: {}", options.output_pdz))?; writer.set_uncompressed_chunk_size_threshold(max_chunk_size); - println!( + info!( "Using maximum chunk size: {n} ({n:#x})", n = writer.uncompressed_chunk_size_threshold() ); @@ -111,33 +114,100 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { writer.end_chunk()?; } - // Next, write the DBI stream. + // Write the DBI stream (3). + let dbi_header: Option; { let _span = trace_span!("transfer DBI stream"); pdb.read_stream_to_vec_mut(Stream::DBI.into(), &mut stream_data)?; let mut sw = writer.stream_writer(Stream::DBI.into())?; - write_dbi(&mut sw, &stream_data)?; - } - - if options.one_stream_per_chunk { + dbi_header = write_dbi(&mut sw, &stream_data)?; writer.end_chunk()?; } + // Write the TPI (2). + let tpi_header_opt: Option = if pdb.is_stream_valid(Stream::TPI.into()) { + let _span = trace_span!("transfer TPI stream"); + pdb.read_stream_to_vec_mut(Stream::TPI.into(), &mut stream_data)?; + let mut sw = writer.stream_writer(Stream::TPI.into())?; + write_tpi_or_ipi(&mut sw, &stream_data, max_chunk_size as usize)? 
+ } else { + None + }; + + // Write the IPI (4) + let ipi_header_opt: Option = if pdb.is_stream_valid(Stream::IPI.into()) { + let _span = trace_span!("transfer IPI stream"); + pdb.read_stream_to_vec_mut(Stream::IPI.into(), &mut stream_data)?; + let mut sw = writer.stream_writer(Stream::IPI.into())?; + write_tpi_or_ipi(&mut sw, &stream_data, max_chunk_size as usize)? + } else { + None + }; + // Loop through the rest of the streams and do normal chunked compression. for stream_index in 1..num_streams { + let _span = trace_span!("stream").entered(); + trace!(stream_index); + if !pdb.is_stream_valid(stream_index) { // This is a nil stream. We don't need to do anything because the writer has already // reserved this stream slot. continue; } - if stream_index == Stream::PDB.into() || stream_index == Stream::DBI.into() { + if stream_index == Stream::PDB.into() + || stream_index == Stream::DBI.into() + || stream_index == Stream::IPI.into() + || stream_index == Stream::TPI.into() + { // We have already processed these streams, above. 
continue; } - let _span = trace_span!("stream").entered(); - trace!(stream_index); + // Custom encoding for the TPI Hash Stream + if let Some(tpi_header) = &tpi_header_opt { + if tpi_header.hash_stream_index.get() == Some(stream_index) { + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + let mut sw = writer.stream_writer(stream_index)?; + write_tpi_or_ipi_hash_stream( + &mut sw, + &stream_data, + max_chunk_size as usize, + tpi_header, + )?; + writer.end_chunk()?; + continue; + } + } + + // Custom encoding for the IPI Hash Stream + if let Some(ipi_header) = &ipi_header_opt { + if ipi_header.hash_stream_index.get() == Some(stream_index) { + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + let mut sw = writer.stream_writer(stream_index)?; + write_tpi_or_ipi_hash_stream( + &mut sw, + &stream_data, + max_chunk_size as usize, + ipi_header, + )?; + writer.end_chunk()?; + continue; + } + } + + // Custom encoding for the Global Symbol Stream. + // The stream number for the Global Symbol Stream is found in the DBI Stream Header. + match &dbi_header { + Some(dbi_header) if dbi_header.global_symbol_stream.get() == Some(stream_index) => { + let mut sw = writer.stream_writer(stream_index)?; + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + write_global_symbols_stream(&mut sw, &stream_data, max_chunk_size as usize)?; + writer.end_chunk()?; + continue; + } + _ => {} + } { let _span = trace_span!("read stream").entered(); @@ -156,7 +226,8 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { } } - // Get final size of the file. Don't append more data to the file after this line. + // Finish encoding the MSFZ file. This closes the session; don't append more data to the file + // after this line. This writes the MSFZ Stream Directory, Chunk Table, and MSFZ File Header. 
let (summary, mut file) = { let _span = trace_span!("finish writing").entered(); writer.finish_with_options(MsfzFinishOptions { @@ -170,38 +241,172 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { }; let out_file_size = file.seek(SeekFrom::End(0))?; - show_comp_rate("PDB -> PDZ", pdb_metadata.len(), out_file_size); - println!("{summary}"); + match std::fs::metadata(&options.input_pdb) { + Ok(pdb_metadata) => { + let before = pdb_metadata.len(); + let after = out_file_size; + + // We don't divide by zero around here. + if before != 0 { + let percent = (before as f64 - after as f64) / (before as f64) * 100.0; + info!( + " PDB -> PDZ compression : {:8} -> {:8} {percent:2.1} %", + friendly::bytes(before), + friendly::bytes(after) + ); + } + } + Err(e) => { + warn!("Failed to get metadata for input PDB: {e:?}"); + } + } + + info!("Number of streams: {:8}", summary.num_streams); + info!("Number of chunks: {:8}", summary.num_chunks); - // Explicitly close our file handles so that the replace_file() call can succeed. - drop(pdb); + // Explicitly drop our output file handle so that we can re-open it for verification. drop(file); + if options.verify { + info!("Verifying PDZ encoding"); + let is_same = verify_pdz(&pdb, &options.output_pdz)?; + if !is_same { + bail!("PDZ encoding failed verification."); + } + } + + drop(pdb); + Ok(()) } +/// Reads all of the data from two files and compares their contents. +/// +/// The first file is a PDB (MSF) file and is already open. +/// The second file is a PDZ (MSFZ) file and its filename is given. +/// +/// Returns `true` if their contents are identical. +fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { + let output = msfz::Msfz::open(output_pdz)?; + + let input_num_streams = input_pdb.num_streams(); + let output_num_streams = output.num_streams(); + if input_num_streams != output_num_streams { + error!( + "The output file (PDZ) has the wrong number of streams. \ + Expected value: {input_num_streams}. 
\ + Actual value: {output_num_streams}" + ); + bail!("Wrong number of streams"); + } + + let mut has_errors = false; + + let mut input_stream_data: Vec = Vec::new(); + let mut output_stream_data: Vec = Vec::new(); + + for stream in 1..input_num_streams { + let input_stream_is_valid = input_pdb.is_stream_valid(stream); + let output_stream_is_valid = output.is_stream_valid(stream); + + if input_stream_is_valid != output_stream_is_valid { + error!( + "Stream {stream} has wrong validity. \ + Expected value: {input_stream_is_valid:?}. \ + Actual value: {output_stream_is_valid:?}." + ); + has_errors = true; + } + + if !input_stream_is_valid { + continue; + } + + let input_stream_size = input_pdb.stream_size(stream) as usize; + let output_stream_size = output.stream_size(stream)? as usize; + + if input_stream_size != output_stream_size { + error!( + "Stream {stream} has wrong length in stream directory. + Expected value: {input_stream_size}. \ + Actual value: {output_stream_size}." + ); + has_errors = true; + continue; + } + + // Read stream data in the input file. + { + input_stream_data.clear(); + let mut sr = input_pdb.get_stream_reader(stream)?; + sr.read_to_end(&mut input_stream_data)?; + } + + // Read stream data in the output file. + { + output_stream_data.clear(); + let mut sr = output.get_stream_reader(stream)?; + sr.read_to_end(&mut output_stream_data)?; + } + + if input_stream_data.len() != output_stream_data.len() { + error!( + "Stream {stream} has wrong length. \ + Expected value: {}. 
\ + Actual value: {}.", + input_stream_data.len(), + output_stream_data.len() + ); + has_errors = true; + continue; + } + + if let Some(byte_offset) = crate::compare::find_index_of_first_different_byte( + &input_stream_data, + &output_stream_data, + ) { + error!( + "Stream {stream} has wrong (different) contents, at index {byte_offset} ({byte_offset:#x})" + ); + has_errors = true; + continue; + } + } + + if has_errors { + return Ok(false); + } + + info!("Verification succeeded."); + + Ok(true) +} + /// Write the DBI stream. Be smart about compression and compression chunk boundaries. -fn write_dbi(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> { +fn write_dbi( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], +) -> Result> { // Avoid compressing data from the DBI stream with other chunks. sw.end_chunk()?; - if stream_data.len() < DBI_STREAM_HEADER_LEN { + // Read the DBI stream header. If we can't read it (because it's too small), then fall back + // to copying the stream. + let Ok((dbi_header, mut rest_of_stream)) = DbiStreamHeader::read_from_prefix(stream_data) + else { // Something is seriously wrong with this PDB. Pass the contents through without any // modification or compression. sw.set_compression_enabled(false); sw.write_all(stream_data)?; - return Ok(()); - } - - let (header_bytes, mut rest_of_stream) = stream_data.split_at(DBI_STREAM_HEADER_LEN); - - // This unwrap() cannot fail because we just tested the size, above. - let dbi_header = DbiStreamHeader::ref_from_bytes(header_bytes).unwrap(); + sw.end_chunk()?; + return Ok(None); + }; // Write the DBI Stream Header uncompressed. This allows symbol.exe to read it. sw.set_compression_enabled(false); - sw.write_all(header_bytes)?; + sw.write_all(dbi_header.as_bytes())?; + sw.end_chunk()?; // The DBI consists of a header, followed by a set of substreams. The substreams contain // data with different sizes, compression characteristic, and different access patterns. 
@@ -268,6 +473,319 @@ fn write_dbi(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()>
     sw.write_all(rest_of_stream)?;
     sw.end_chunk()?;
+    Ok(Some(dbi_header))
+}
+
+/// This is the fallback path for writing complex streams.
+///
+/// If we find any problem in writing a complex stream, we fall back to compressing all of it.
+#[inline(never)]
+fn write_stream_fallback(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> {
+    sw.set_compression_enabled(true);
+    sw.write_all(stream_data)?;
+    sw.end_chunk()?;
+    Ok(())
+}
+
+fn write_global_symbols_stream(
+    sw: &mut StreamWriter<'_, File>,
+    stream_data: &[u8],
+    mut max_chunk_len: usize,
+) -> Result<()> {
+    debug!("Writing Global Symbol Stream");
+
+    sw.end_chunk()?;
+
+    // The code below assumes that you can always put at least one symbol record into a chunk.
+    // To prevent a forward-progress failure, we require that max_chunk_len is larger than the
+    // largest symbol record.
+    const MIN_CHUNK_LEN: usize = 0x20000;
+    if max_chunk_len < MIN_CHUNK_LEN {
+        warn!(
+            "max_chunk_len ({}) is way too small; promoting it",
+            max_chunk_len
+        );
+        max_chunk_len = MIN_CHUNK_LEN;
+    }
+
+    let mut current_chunk_bytes: &[u8] = stream_data;
+    let mut total_bytes_written: usize = 0;
+
+    'top: while !current_chunk_bytes.is_empty() {
+        let mut iter = SymIter::new(current_chunk_bytes);
+        loop {
+            let rest_len_before = iter.rest_len();
+            if iter.next().is_none() {
+                break 'top;
+            }
+            let rest_len_after = iter.rest_len();
+
+            let chunk_len_with_this_record = current_chunk_bytes.len() - rest_len_after;
+            if chunk_len_with_this_record > max_chunk_len {
+                let chunk_len_without_this_record = current_chunk_bytes.len() - rest_len_before;
+                let (committed_chunk_bytes, next_chunk_bytes) =
+                    current_chunk_bytes.split_at(chunk_len_without_this_record);
+
+                // TODO: This could be optimized to a single, non-buffered chunk write. 
+ sw.write_all(committed_chunk_bytes)?; + sw.end_chunk()?; + total_bytes_written += committed_chunk_bytes.len(); + + // This will cause us to re-parse a record at the start of the next chunk. + // That's ok, that's cheap. But we do need to handle the case where a record + // is larger than max_chunk_len. We "handle" that by requiring that max_chunk_len + // is at least 0x10004, since that is the maximum size for any record. That's a + // very silly lower bound for a chunk size, so we actually require it to be higher. + current_chunk_bytes = next_chunk_bytes; + continue 'top; + } + } + } + + if !current_chunk_bytes.is_empty() { + sw.write_all(current_chunk_bytes)?; + sw.end_chunk()?; + total_bytes_written += current_chunk_bytes.len(); + } + + assert_eq!( + total_bytes_written, + stream_data.len(), + "expected to write same number of record bytes" + ); + + Ok(()) +} + +/// Write the TPI or IPI stream. Be smart about compression and compression chunk boundaries. +/// +/// This function returns `Some(header)` if the TPI header was correctly parsed. This header can +/// be used to optimize the encoding of the associated Type Hash Stream. +fn write_tpi_or_ipi( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], + mut max_chunk_len: usize, +) -> Result> { + debug!("write_tpi_or_ipi"); + sw.end_chunk()?; + + // The code below assumes that you can always put at least one type record into a chunk. + // To prevent a forward-progress failure, we require that max_chunk_len is larger than the + // largest type record. + const MIN_CHUNK_LEN: usize = 0x20000; + if max_chunk_len < MIN_CHUNK_LEN { + warn!("max_chunk_len is way too small; promoting it"); + max_chunk_len = MIN_CHUNK_LEN; + } + + // If the stream does not even contain a full header, then fall back to writing full contents. 
+ let Ok((tpi_header, after_header)) = tpi::TypeStreamHeader::read_from_prefix(stream_data) + else { + warn!("TPI or IPI stream was too short to contain a valid header"); + write_stream_fallback(sw, stream_data)?; + return Ok(None); + }; + + // Find the slice of the type data. There can be data following the type data and we must + // handle it correctly. + let type_record_bytes_len = tpi_header.type_record_bytes.get() as usize; + if after_header.len() < type_record_bytes_len { + warn!( + "TPI or IPI stream contained invalid header value (type_record_bytes exceeded bounds)" + ); + write_stream_fallback(sw, stream_data)?; + return Ok(None); + } + + debug!(type_record_bytes_len, "encoding TPI/IPI."); + + // type_record_bytes contains the encoded type records + // after_records contains unknown data (if any) after the type records + let (type_record_bytes, after_records) = after_header.split_at(type_record_bytes_len); + + // Write the header, without compression. + // TODO: Place this in the initial read section. + sw.set_compression_enabled(false); + sw.write_all(tpi_header.as_bytes())?; + + // Next, we are going to scan through the type records in the TPI. Our goal is to create chunk + // boundaries that align with record boundaries, so that no type record is split across chunks. + sw.set_compression_enabled(true); + + // current_chunk_bytes contains the type records that will be written into the next chunk. + let mut current_chunk_bytes: &[u8] = type_record_bytes; + + let mut record_bytes_written: usize = 0; + + // This loop runs once per "chunk". Each iteration builds a single MSFZ chunk from a + // sequence of contiguous type records. Type records never cross chunk boundaries. 
+    'top: while !current_chunk_bytes.is_empty() {
+        let mut iter = TypesIter::new(current_chunk_bytes);
+        loop {
+            let rest_len_before = iter.rest_len();
+            if iter.next().is_none() {
+                break 'top;
+            }
+
+            let rest_len_after = iter.rest_len();
+            let record_len = rest_len_before - rest_len_after;
+            assert!(record_len <= current_chunk_bytes.len());
+
+            // Would adding this record to the current chunk exceed our threshold?
+            let chunk_len_with_this_record = current_chunk_bytes.len() - rest_len_after;
+            if chunk_len_with_this_record > max_chunk_len {
+                let chunk_len_without_this_record = current_chunk_bytes.len() - rest_len_before;
+                let (committed_chunk_bytes, next_chunk_bytes) =
+                    current_chunk_bytes.split_at(chunk_len_without_this_record);
+
+                // TODO: This could be optimized to a single, non-buffered chunk write.
+                sw.write_all(committed_chunk_bytes)?;
+                sw.end_chunk()?;
+                record_bytes_written += committed_chunk_bytes.len();
+
+                // This will cause us to re-parse a record at the start of the next chunk.
+                // That's ok, that's cheap. But we do need to handle the case where a record
+                // is larger than max_chunk_len. We "handle" that by requiring that max_chunk_len
+                // is at least 0x10004, since that is the maximum size for any record. That's a
+                // very silly lower bound for a chunk size, so we actually require it to be higher.
+                current_chunk_bytes = next_chunk_bytes;
+                continue 'top;
+            }
+
+            // Keep processing records.
+        }
+    }
+
+    // If we got here, then the iterator stopped reporting records. That can happen for two
+    // reasons: 1) the normal case where we reach the end of the types, or 2) we failed to
+    // decode a type record. In both cases, writing current_chunk_bytes will write the
+    // prefix of records that have been parsed (but have not triggered our threshold).
+    if !current_chunk_bytes.is_empty() {
+        // TODO: optimize to a single, non-buffered chunk write. 
+        sw.write_all(current_chunk_bytes)?;
+        record_bytes_written += current_chunk_bytes.len();
+    }
+    sw.end_chunk()?;
+    assert_eq!(
+        record_bytes_written,
+        type_record_bytes.len(),
+        "expected to write same number of record bytes"
+    );
+
+    if !after_records.is_empty() {
+        debug!(
+            after_records_len = after_records.len(),
+            "TPI/IPI contains data after type stream"
+        );
+        sw.write_all(after_records)?;
+    }
+
+    sw.end_chunk()?;
+
+    Ok(Some(tpi_header))
+}
+
+/// Write the "Type Hash Stream" associated with the TPI or IPI stream.
+/// Be smart about compression and compression chunk boundaries.
+///
+/// This function requires the Type Stream Header from the original TPI or IPI stream.
+/// That header describes the regions within the Type Hash Stream.
+///
+/// The Type Hash Stream consists of three regions: 1) Hash Value Buffer, 2) Index Offset Buffer,
+/// and 3) Hash Adjustment Buffer. The size and location of each of these regions is specified
+/// in fields in the Type Stream Header.
+///
+/// We use a simple algorithm. We build a list of the start and end locations of each of these
+/// buffers (if they are non-zero length). Then we sort the list and de-dup it. Then we traverse
+/// the list, writing chunks for each region.
+///
+/// This simple algorithm allows us to ignore all sorts of strange situations, such as regions
+/// overlapping, regions occurring in an unusual order, or there being bytes within the stream
+/// that are not covered by any region. All we care about is the compression boundaries. 
+fn write_tpi_or_ipi_hash_stream( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], + max_chunk_len: usize, + tpi_header: &tpi::TypeStreamHeader, +) -> Result<()> { + sw.end_chunk()?; + + let stream_len_u32 = stream_data.len() as u32; + + let mut boundaries: Vec = Vec::with_capacity(8); + + boundaries.push(stream_data.len()); + + let regions: [(i32, u32); 3] = [ + ( + tpi_header.hash_value_buffer_offset.get(), + tpi_header.hash_value_buffer_length.get(), + ), + ( + tpi_header.index_offset_buffer_offset.get(), + tpi_header.index_offset_buffer_length.get(), + ), + ( + tpi_header.hash_adj_buffer_offset.get(), + tpi_header.hash_adj_buffer_length.get(), + ), + ]; + + for &(start, length) in regions.iter() { + if length == 0 { + continue; + } + if start < 0 { + warn!("Type Hash Stream has a negative offset for one of its regions"); + continue; + } + + let start_u: u32 = start as u32; + if start_u > stream_len_u32 { + warn!("Type Hash Stream has a region whose start offset is out of bounds"); + continue; + } + + let avail = stream_len_u32 - start_u; + if length > avail { + warn!("Type Hash Stream has a region whose end offset is out of bounds"); + continue; + } + + let end: u32 = start_u + length; + + boundaries.push(start_u as usize); + boundaries.push(end as usize); + } + + boundaries.sort_unstable(); + boundaries.dedup(); + + debug!("Type Hash Stream offset boundaries: {:?}", boundaries); + + let mut previous_boundary: usize = 0; + let mut total_bytes_written: usize = 0; + + for &next_boundary in boundaries.iter() { + assert!(next_boundary >= previous_boundary); + let region_data = &stream_data[previous_boundary..next_boundary]; + + // We are going to _further_ chunk things, based on max_chunk_len. 
+ for chunk_data in region_data.chunks(max_chunk_len) { + sw.write_all(chunk_data)?; + sw.end_chunk()?; + total_bytes_written += chunk_data.len(); + } + + previous_boundary = next_boundary; + } + + assert_eq!( + total_bytes_written, + stream_data.len(), + "expected to write the correct number of bytes" + ); + Ok(()) } diff --git a/pdbtool/src/pdz/util.rs b/pdbtool/src/pdz/util.rs deleted file mode 100644 index 69b2515..0000000 --- a/pdbtool/src/pdz/util.rs +++ /dev/null @@ -1,20 +0,0 @@ -pub(crate) fn show_comp_rate(description: &str, before: u64, after: u64) { - if before == 0 { - // We don't divide by zero around here. - println!( - " {:-30} : {:8} -> {:8}", - description, - friendly::bytes(before), - friendly::bytes(after) - ); - } else { - let percent = (before as f64 - after as f64) / (before as f64) * 100.0; - println!( - " {:-30} : {:8} -> {:8} {:2.1} %", - description, - friendly::bytes(before), - friendly::bytes(after), - percent - ); - } -}