From 2d2c31b02f740d9b8723394079d9441890d27bb4 Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Wed, 4 Feb 2026 14:04:03 -0800 Subject: [PATCH 1/6] Improvements to PDZ encoding The pdz-encode command was not actually breaking up large streams into fragments. This commit breaks up several of the largest streams. The TPI and IPI streams (and their related Type Hash Streams) are now chunked at record boundaries. --- codeview/src/types/iter.rs | 10 ++ msf/src/lib.rs | 9 ++ msfz/src/reader.rs | 32 +++- msfz/src/writer.rs | 11 ++ pdbtool/src/container.rs | 102 +++++++++++- pdbtool/src/copy.rs | 14 +- pdbtool/src/pdz/encode.rs | 321 +++++++++++++++++++++++++++++++++++-- 7 files changed, 480 insertions(+), 19 deletions(-) diff --git a/codeview/src/types/iter.rs b/codeview/src/types/iter.rs index fb7109b..cd15c8f 100644 --- a/codeview/src/types/iter.rs +++ b/codeview/src/types/iter.rs @@ -16,6 +16,11 @@ impl<'a> TypesIter<'a> { pub fn new(buffer: &'a [u8]) -> Self { Self { buffer } } + + /// Returns the "rest" of the data that has not been parsed. + pub fn rest(&self) -> &'a [u8] { + self.buffer + } } impl<'a> HasRestLen for TypesIter<'a> { @@ -27,6 +32,11 @@ impl<'a> HasRestLen for TypesIter<'a> { impl<'a> Iterator for TypesIter<'a> { type Item = TypeRecord<'a>; + /// Finds the next type record + /// + /// This implementation makes an important guarantee: If it cannot decode the next record, + /// it _will not_ change `self.buffer`. This is important because it allows an application + /// to detect the exact length and contents of an unparseable record. fn next(&mut self) -> Option> { if self.buffer.is_empty() { return None; diff --git a/msf/src/lib.rs b/msf/src/lib.rs index b8f73df..81f5fec 100644 --- a/msf/src/lib.rs +++ b/msf/src/lib.rs @@ -353,6 +353,15 @@ impl Msf { page_to_offset(self.pages.num_pages, self.pages.page_size) } + /// Return the total number of pages allocated to the file, including all pages (allocated, + /// unallocated, etc.). 
+ /// + /// This count includes pages allocated to streams, Page 0, FPM pages, pages that are free + /// (not allocated), and pages allocated to the Stream Directory. + pub fn num_total_pages(&self) -> u32 { + self.pages.num_pages + } + /// Returns the number of free pages. /// /// This number counts the pages that are _less than_ `num_pages`. There may be pages assigned diff --git a/msfz/src/reader.rs b/msfz/src/reader.rs index 9a5c17d..b7bc920 100644 --- a/msfz/src/reader.rs +++ b/msfz/src/reader.rs @@ -32,11 +32,13 @@ pub struct Msfz { chunk_cache: Vec>>, } -// Describes a region within a stream. +/// Describes a region within a stream. #[derive(Clone)] -struct Fragment { - size: u32, - location: FragmentLocation, +pub struct Fragment { + /// The size in bytes of the fragment + pub size: u32, + /// The location of the fragment + pub location: FragmentLocation, } impl std::fmt::Debug for Fragment { @@ -70,7 +72,7 @@ const FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK: u32 = 1u32 << 31; /// Represents the location of a fragment, either compressed or uncompressed. #[derive(Copy, Clone)] -struct FragmentLocation { +pub struct FragmentLocation { /// bits 0-31 lo: u32, /// bits 32-63 @@ -89,11 +91,15 @@ impl FragmentLocation { self.lo == u32::MAX && self.hi == u32::MAX } - fn is_compressed(&self) -> bool { + /// Returns `true` if this is a compressed fragment + pub fn is_compressed(&self) -> bool { (self.hi & FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK) != 0 } - fn compressed_first_chunk(&self) -> u32 { + /// Returns the chunk index for this compressed fragment. + /// + /// You must check `is_compressed()` before calling this function. + pub fn compressed_first_chunk(&self) -> u32 { debug_assert!(!self.is_nil()); debug_assert!(self.is_compressed()); self.hi & !FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK @@ -231,7 +237,7 @@ impl Msfz { /// Gets the fragments for a given stream. /// /// If `stream` is out of range, returns `None`. 
- fn stream_fragments(&self, stream: u32) -> Option<&[Fragment]> { + pub fn stream_fragments(&self, stream: u32) -> Option<&[Fragment]> { let i = stream as usize; if i < self.stream_fragments.len() - 1 { let start = self.stream_fragments[i] as usize; @@ -481,10 +487,20 @@ impl Msfz { self.fragments.len() } + /// Raw access to the Fragments table + pub fn fragments(&self) -> &[Fragment] { + &self.fragments + } + /// The total number of compressed chunks. pub fn num_chunks(&self) -> usize { self.chunk_table.len() } + + /// Raw access to the Chunks table + pub fn chunks(&self) -> &[ChunkEntry] { + &self.chunk_table + } } /// Allows reading a stream using the [`Read`], [`Seek`], and [`ReadAt`] traits. diff --git a/msfz/src/writer.rs b/msfz/src/writer.rs index 49432b5..e2c210c 100644 --- a/msfz/src/writer.rs +++ b/msfz/src/writer.rs @@ -359,6 +359,11 @@ impl MsfzWriterFile { }) } + fn bytes_available_in_chunk_buffer(&self) -> usize { + (self.uncompressed_chunk_size_threshold as usize) + .saturating_sub(self.uncompressed_chunk_data.len()) + } + #[inline(never)] fn finish_current_chunk(&mut self) -> std::io::Result<()> { let _span = debug_span!("finish_current_chunk").entered(); @@ -459,6 +464,12 @@ impl<'a, F: Write + Seek> StreamWriter<'a, F> { self.file.finish_current_chunk() } + /// The number of bytes that can be written to the current chunk buffer, without exceeding + /// the configured maximum. + pub fn bytes_available_in_chunk_buffer(&self) -> usize { + self.file.bytes_available_in_chunk_buffer() + } + /// Specifies whether to use chunked compression or not. The default value for this setting is /// `true` (chunked compression is enabled). /// diff --git a/pdbtool/src/container.rs b/pdbtool/src/container.rs index f8223ea..9e391d9 100644 --- a/pdbtool/src/container.rs +++ b/pdbtool/src/container.rs @@ -5,6 +5,14 @@ use std::path::Path; pub struct ContainerOptions { /// The PDB file or PDZ file to read. 
pdb: String, + + /// Show the chunks table (applicable only to PDZ) + #[arg(long)] + chunks: bool, + + /// Show the streams table + #[arg(long)] + streams: bool, } pub fn container_command(options: &ContainerOptions) -> Result<()> { @@ -19,7 +27,8 @@ pub fn container_command(options: &ContainerOptions) -> Result<()> { " Page size: {:8} bytes per page", u32::from(msf.page_size()) ); - println!(" Number of pages: {:8}", msf.nominal_size()); + println!(" Number of pages: {:8}", msf.num_total_pages()); + println!(" Number of pages * page size: {:8}", msf.nominal_size()); println!(" Number of free pages: {:8}", msf.num_free_pages()); } @@ -28,6 +37,97 @@ pub fn container_command(options: &ContainerOptions) -> Result<()> { println!(" Number of streams: {:8}", pdb.num_streams()); println!(" Number of compressed chunks: {:8}", msfz.num_chunks()); println!(" Number of stream fragments: {:8}", msfz.num_fragments()); + + if options.chunks { + // Build a mapping of streams to chunks, so we can display it. + // We need to know, for each chunk, which streams it contains. + // Contains (chunk, stream) pairs. 
+ let mut chunks_to_streams: Vec<(u32, u32)> = Vec::new(); + + for stream in 1..msfz.num_streams() { + if let Some(fragments) = msfz.stream_fragments(stream) { + for f in fragments { + if f.location.is_compressed() { + let chunk = f.location.compressed_first_chunk(); + chunks_to_streams.push((chunk, stream)); + } + } + } + } + + chunks_to_streams.sort_unstable(); + + println!(); + println!("Chunks table:"); + println!(); + + println!("Chunk | File | Compressed | Uncompressed | Streams"); + println!(" | offset | size | size |"); + println!("---------------------------------------------------------------"); + + let mut j: usize = 0; // index into chunks_to_streams + + let mut streams_list = String::new(); + + for (chunk_index, chunk) in msfz.chunks().iter().enumerate() { + use std::fmt::Write; + + streams_list.clear(); + + while j < chunks_to_streams.len() && chunks_to_streams[j].0 < chunk_index as u32 + { + j += 1; + } + + // (start, last) + let mut current_range: Option<(u32, u32)> = None; + + while j < chunks_to_streams.len() + && chunks_to_streams[j].0 == chunk_index as u32 + { + let stream = chunks_to_streams[j].1; + j += 1; + + match current_range { + // Extend the range, if possible + Some((range_start, range_last)) if range_last + 1 == stream => { + current_range = Some((range_start, stream)); + continue; + } + + Some((range_start, range_last)) => { + // Previous range could not be extended + if range_start != range_last { + _ = write!(streams_list, "{range_start}-{range_last} "); + } else { + _ = write!(streams_list, "{range_start} "); + } + } + + None => { + current_range = Some((stream, stream)); + } + } + } + + if let Some((range_start, range_last)) = current_range { + if range_start != range_last { + _ = write!(streams_list, "{range_start}-{range_last} "); + } else { + _ = write!(streams_list, "{range_start} "); + } + } + + println!( + " {:6} | {:10} | {:10} | {:12} | {}", + chunk_index, + chunk.file_offset, + chunk.compressed_size, + 
chunk.uncompressed_size, + streams_list + ); + } + } } } diff --git a/pdbtool/src/copy.rs b/pdbtool/src/copy.rs index a406743..78cd853 100644 --- a/pdbtool/src/copy.rs +++ b/pdbtool/src/copy.rs @@ -1,4 +1,5 @@ use ms_pdb::Pdb; +use ms_pdb::msf::{CreateOptions, PageSize}; use std::io::Write; use std::path::PathBuf; @@ -9,11 +10,22 @@ pub struct Options { /// The PDB to write. dest_pdb: PathBuf, + + /// The size in bytes of the pages to use. Must be a power of two. + #[arg(long)] + page_size: Option, } pub fn copy_command(options: &Options) -> anyhow::Result<()> { let src = Pdb::open(&options.source_pdb)?; - let mut dst = ms_pdb::msf::Msf::create(&options.dest_pdb, Default::default())?; + let mut create_options = CreateOptions::default(); + + if let Some(page_size) = options.page_size { + create_options.page_size = PageSize::try_from(page_size) + .map_err(|_| anyhow::anyhow!("The page size must be a power of 2."))?; + } + + let mut dst = ms_pdb::msf::Msf::create(&options.dest_pdb, create_options)?; for stream_index in 1..src.num_streams() { if src.is_stream_valid(stream_index) { diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index f18374b..b649724 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -1,14 +1,17 @@ use crate::pdz::util::*; use anyhow::{Context, Result, bail}; +use ms_pdb::codeview::HasRestLen; use ms_pdb::dbi::{DBI_STREAM_HEADER_LEN, DbiStreamHeader}; use ms_pdb::msf::Msf; use ms_pdb::msfz::{self, MIN_FILE_SIZE_16K, MsfzFinishOptions, MsfzWriter, StreamWriter}; +use ms_pdb::tpi; +use ms_pdb::types::TypesIter; use ms_pdb::{RandomAccessFile, Stream}; use std::fs::File; use std::io::{Seek, SeekFrom, Write}; use std::path::Path; -use tracing::{trace, trace_span, warn}; -use zerocopy::FromBytes; +use tracing::{debug, info, trace, trace_span, warn}; +use zerocopy::{FromBytes, IntoBytes}; #[derive(clap::Parser, Debug)] pub(crate) struct PdzEncodeOptions { @@ -111,33 +114,86 @@ pub fn pdz_encode(options: 
PdzEncodeOptions) -> Result<()> { writer.end_chunk()?; } - // Next, write the DBI stream. + // Write the DBI stream (3). { let _span = trace_span!("transfer DBI stream"); pdb.read_stream_to_vec_mut(Stream::DBI.into(), &mut stream_data)?; let mut sw = writer.stream_writer(Stream::DBI.into())?; write_dbi(&mut sw, &stream_data)?; - } - - if options.one_stream_per_chunk { writer.end_chunk()?; } + // Write the TPI (2). + let tpi_header_opt: Option = if pdb.is_stream_valid(Stream::TPI.into()) { + let _span = trace_span!("transfer TPI stream"); + pdb.read_stream_to_vec_mut(Stream::TPI.into(), &mut stream_data)?; + let mut sw = writer.stream_writer(Stream::TPI.into())?; + write_tpi_or_ipi(&mut sw, &stream_data, max_chunk_size as usize)? + } else { + None + }; + + // Write the IPI (4) + let ipi_header_opt: Option = if pdb.is_stream_valid(Stream::IPI.into()) { + let _span = trace_span!("transfer IPI stream"); + pdb.read_stream_to_vec_mut(Stream::IPI.into(), &mut stream_data)?; + let mut sw = writer.stream_writer(Stream::IPI.into())?; + write_tpi_or_ipi(&mut sw, &stream_data, max_chunk_size as usize)? + } else { + None + }; + // Loop through the rest of the streams and do normal chunked compression. for stream_index in 1..num_streams { + let _span = trace_span!("stream").entered(); + trace!(stream_index); + if !pdb.is_stream_valid(stream_index) { // This is a nil stream. We don't need to do anything because the writer has already // reserved this stream slot. continue; } - if stream_index == Stream::PDB.into() || stream_index == Stream::DBI.into() { + if stream_index == Stream::PDB.into() + || stream_index == Stream::DBI.into() + || stream_index == Stream::IPI.into() + || stream_index == Stream::TPI.into() + { // We have already processed these streams, above. 
continue; } - let _span = trace_span!("stream").entered(); - trace!(stream_index); + // Custom encoding for the TPI Hash Stream + if let Some(tpi_header) = &tpi_header_opt { + if tpi_header.hash_stream_index.get() == Some(stream_index) { + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + let mut sw = writer.stream_writer(stream_index)?; + write_tpi_or_ipi_hash_stream( + &mut sw, + &stream_data, + max_chunk_size as usize, + &tpi_header, + )?; + writer.end_chunk()?; + continue; + } + } + + // Custom encoding for the IPI Hash Stream + if let Some(ipi_header) = &ipi_header_opt { + if ipi_header.hash_stream_index.get() == Some(stream_index) { + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + let mut sw = writer.stream_writer(stream_index)?; + write_tpi_or_ipi_hash_stream( + &mut sw, + &stream_data, + max_chunk_size as usize, + &ipi_header, + )?; + writer.end_chunk()?; + continue; + } + } { let _span = trace_span!("read stream").entered(); @@ -271,6 +327,253 @@ fn write_dbi(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> Ok(()) } +/// This is the fallback path for writing complex streams. +/// +/// If we find any problem in writing a complex stream, we fall back to compressing all of it. +#[inline(never)] +fn write_stream_fallback(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> { + sw.set_compression_enabled(true); + sw.write_all(stream_data)?; + sw.end_chunk()?; + return Ok(()); +} + +/// Write the TPI or IPI stream. Be smart about compression and compression chunk boundaries. +/// +/// This function returns `Some(header)` if the TPI header was correctly parsed. This header can +/// be used to optimize the encoding of the associated Type Hash Stream. 
+fn write_tpi_or_ipi( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], + mut max_chunk_len: usize, +) -> Result> { + debug!("write_tpi_or_ipi"); + sw.end_chunk()?; + + // The code below assumes that you can always put at least one type record into a chunk. + // To prevent a forward-progress failure, we require that max_chunk_len is larger than the + // largest type record. + const MIN_CHUNK_LEN: usize = 0x20000; + if max_chunk_len < MIN_CHUNK_LEN { + warn!("max_chunk_len is way too small; promoting it"); + max_chunk_len = MIN_CHUNK_LEN; + } + + // If the stream does not even contain a full header, then fall back to writing full contents. + let Ok((tpi_header, after_header)) = tpi::TypeStreamHeader::read_from_prefix(stream_data) + else { + warn!("TPI or IPI stream was too short to contain a valid header"); + write_stream_fallback(sw, stream_data)?; + return Ok(None); + }; + + // Find the slice of the type data. There can be data following the type data and we must + // handle it correctly. + let type_record_bytes_len = tpi_header.type_record_bytes.get() as usize; + if after_header.len() < type_record_bytes_len { + warn!( + "TPI or IPI stream contained invalid header value (type_record_bytes exceeded bounds)" + ); + write_stream_fallback(sw, stream_data)?; + return Ok(None); + } + + debug!(type_record_bytes_len, "encoding TPI/IPI."); + + // type_record_bytes contains the encoded type records + // after_records contains unknown data (if any) after the type records + let (type_record_bytes, after_records) = after_header.split_at(type_record_bytes_len); + + // Write the header, without compression. + // TODO: Place this in the initial read section. + sw.set_compression_enabled(false); + sw.write_all(tpi_header.as_bytes())?; + + // Next, we are going to scan through the type records in the TPI. Our goal is to create chunk + // boundaries that align with record boundaries, so that no type record is split across chunks. 
+ sw.set_compression_enabled(true); + + // current_chunk_bytes contains the type records that will be written into the next chunk. + let mut current_chunk_bytes: &[u8] = type_record_bytes; + + let mut record_bytes_written: usize = 0; + + // This loop runs once per "chunk". Each iteration builds a single MSFZ chunk from a + // sequence of contiguous type records. Type records never cross chunk boundaries. + 'top: while !current_chunk_bytes.is_empty() { + let mut iter = TypesIter::new(current_chunk_bytes); + loop { + let rest_len_before = iter.rest_len(); + if iter.next().is_none() { + break 'top; + } + + let rest_len_after = iter.rest_len(); + let record_len = rest_len_before - rest_len_after; + assert!(record_len <= current_chunk_bytes.len()); + + // Would adding this record to the current chunk exceed our threshold? + let chunk_len_with_this_record = current_chunk_bytes.len() - rest_len_after; + if chunk_len_with_this_record > max_chunk_len { + let chunk_len_without_this_record = current_chunk_bytes.len() - rest_len_before; + let (committed_chunk_bytes, next_chunk_bytes) = + current_chunk_bytes.split_at(chunk_len_without_this_record); + + // TODO: This could be optimized to a single, non-buffered chunk write. + sw.write_all(committed_chunk_bytes)?; + sw.end_chunk()?; + record_bytes_written += committed_chunk_bytes.len(); + + // This will cause us to re-parse a record at the start of the next chunk. + // That's ok, that's cheap. But we do need to handle the case where a record + // is larger than max_chunk_len. We "handle" that by requiring that max_chunk_len + // is at least 0x10004, since that is the maximum size for any record. That's a + // very silly lower bound for a chunk size, so we actually require it to be higher. + current_chunk_bytes = next_chunk_bytes; + continue 'top; + } + + // Keep processing records. + } + } + + // If we got here, then the iterator stopped reporting records. 
That can happen for two + // reasons: 1) the normal case where we reach the end of the types, or 2) we failed to + // decode a type record. In both cases, writing current_chunk_contents will write the + // prefix of records that have been parsed (but have not triggered our threshold + if !current_chunk_bytes.is_empty() { + // TODO: optimize to a single, non-buffered chunk write. + sw.write_all(current_chunk_bytes)?; + record_bytes_written += current_chunk_bytes.len(); + } + sw.end_chunk()?; + assert_eq!( + record_bytes_written, + type_record_bytes.len(), + "expected to write same number of record bytes" + ); + + if !after_records.is_empty() { + debug!( + after_records_len = after_records.len(), + "TPI/IPI contains data after type stream" + ); + sw.write_all(after_records)?; + } + + sw.end_chunk()?; + + Ok(Some(tpi_header)) +} + +/// Write the "Type Hash Stream" associated with the TPI or IPI stream. +/// Be smart about compression and compression chunk boundaries. +/// +/// This function requires the Type Stream Header from the original TPI or IPI stream. +/// That header describes the regions within the Type Stream Header. +/// +/// The Type Hash Stream consists of three regions: 1) Hash Value Buffer, 2) Index Offset Buffer, +/// and 3) Hash Adjustment Buffer. The size and location of each of these regions is specified +/// in fields in the Type Stream Header. +/// +/// We use a simple algorithm. We build a list of the start and end locations of each of these +/// buffers (if they are non-zero length). Then we sort the list and de-dup it. Then we traverse +/// the list, writing chunks for each region. 
+fn write_tpi_or_ipi_hash_stream( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], + mut max_chunk_len: usize, + tpi_header: &tpi::TypeStreamHeader, +) -> Result<()> { + println!("TPI/IPI header:\n{:?}", tpi_header); + + sw.end_chunk()?; + + let stream_len_u32 = stream_data.len() as u32; + + let mut boundaries: Vec = Vec::with_capacity(8); + + boundaries.push(stream_data.len()); + + let regions: [(i32, u32); 3] = [ + ( + tpi_header.hash_value_buffer_offset.get(), + tpi_header.hash_value_buffer_length.get(), + ), + ( + tpi_header.index_offset_buffer_offset.get(), + tpi_header.index_offset_buffer_length.get(), + ), + ( + tpi_header.hash_adj_buffer_offset.get(), + tpi_header.hash_adj_buffer_length.get(), + ), + ]; + + for &(start, length) in regions.iter() { + if length == 0 { + continue; + } + if start < 0 { + warn!("Type Hash Stream has a negative offset for one of its regions"); + continue; + } + + let start_u: u32 = start as u32; + if start_u > stream_len_u32 { + warn!("Type Hash Stream has a region whose start offset is out of bounds"); + continue; + } + + let avail = stream_len_u32 - start_u; + if length > avail { + warn!("Type Hash Stream has a region whose end offset is out of bounds"); + continue; + } + + let end: u32 = start_u + length; + + boundaries.push(start_u as usize); + boundaries.push(end as usize); + } + + boundaries.sort_unstable(); + boundaries.dedup(); + + info!("Type Hash Stream offset boundaries: {:?}", boundaries); + + let mut pos: usize = 0; + let mut total_bytes_written: usize = 0; + + for &boundary_offset in boundaries.iter() { + assert!(boundary_offset >= pos); + if boundary_offset == pos { + continue; + } + + let start = pos; + let end = boundary_offset; + pos = end; + + let data_between_boundaries = &stream_data[start..end]; + + // We are going to _further_ chunk things, based on max_chunk_len. 
+ for chunk_data in data_between_boundaries.chunks(max_chunk_len) { + sw.write_all(chunk_data)?; + sw.end_chunk()?; + total_bytes_written += chunk_data.len(); + } + } + + assert_eq!( + total_bytes_written, + stream_data.len(), + "expected to write the correct number of bytes" + ); + + Ok(()) +} + fn parse_bytes(mut bytes_str: &str) -> anyhow::Result { let mut units: u32 = 1; From 04b7cec30c455a4a7ea58031cf874d4946d51d55 Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Mon, 9 Feb 2026 13:28:40 -0800 Subject: [PATCH 2/6] work in progress on smarter encoding --- msfz/src/lib.rs | 2 +- pdbtool/src/compare.rs | 234 +++++++++++++++++++++++++++++++ pdbtool/src/main.rs | 5 + pdbtool/src/pdz/encode.rs | 282 ++++++++++++++++++++++++++++++++++---- 4 files changed, 493 insertions(+), 30 deletions(-) create mode 100644 pdbtool/src/compare.rs diff --git a/msfz/src/lib.rs b/msfz/src/lib.rs index 0923b42..4836711 100644 --- a/msfz/src/lib.rs +++ b/msfz/src/lib.rs @@ -23,7 +23,7 @@ mod stream_data; mod tests; mod writer; -pub use reader::*; +pub use reader::{Fragment, FragmentLocation, Msfz, StreamReader}; pub use stream_data::StreamData; pub use writer::*; diff --git a/pdbtool/src/compare.rs b/pdbtool/src/compare.rs new file mode 100644 index 0000000..4e31434 --- /dev/null +++ b/pdbtool/src/compare.rs @@ -0,0 +1,234 @@ +//! Compares two PDB (or PDZ) files to check that their stream directories and +//! stream contents are identical. + +use anyhow::{Context, Result, bail}; +use ms_pdb::{Pdb, RandomAccessFile}; +use std::path::Path; +use tracing::{error, info}; + +/// Compares two PDB (or PDZ) files and verifies that their stream contents +/// are identical. +#[derive(clap::Parser)] +pub(crate) struct CompareOptions { + /// The first PDB file or PDZ file to compare. + pub(crate) first_file: String, + + /// The second PDB file or PDZ file to compare. + pub(crate) second_file: String, + + /// Continue comparison even after finding differences (don't stop at first difference). 
+ #[arg(long)] + pub continue_on_differences: bool, +} + +pub(crate) fn command(options: CompareOptions) -> Result<()> { + let first_path = Path::new(&options.first_file); + let second_path = Path::new(&options.second_file); + + info!("Comparing files:"); + info!(" First: {}", first_path.display()); + info!(" Second: {}", second_path.display()); + + // Open both files + let first_pdb = Pdb::open(first_path) + .with_context(|| format!("Failed to open first file: {}", options.first_file))?; + let second_pdb = Pdb::open(second_path) + .with_context(|| format!("Failed to open second file: {}", options.second_file))?; + + let mut comparison_stats = ComparisonStats::default(); + let success = compare_pdbs(&first_pdb, &second_pdb, &options, &mut comparison_stats)?; + + // Print summary + info!("Comparison completed:"); + info!(" Total streams compared: {}", comparison_stats.streams_compared); + info!(" Streams with differences: {}", comparison_stats.streams_different); + info!(" Nil streams matched: {}", comparison_stats.nil_streams_matched); + + if !success { + info!("Result: Files are DIFFERENT"); + bail!("Files are different"); + } else { + info!("Result: Files are IDENTICAL"); + } + + Ok(()) +} + +#[derive(Default)] +struct ComparisonStats { + streams_compared: u32, + streams_different: u32, + nil_streams_matched: u32, +} + +fn compare_pdbs( + first_pdb: &Pdb, + second_pdb: &Pdb, + options: &CompareOptions, + stats: &mut ComparisonStats, +) -> Result { + let first_container = first_pdb.container(); + let second_container = second_pdb.container(); + + // Compare number of streams + let first_num_streams = first_container.num_streams(); + let second_num_streams = second_container.num_streams(); + + if first_num_streams != second_num_streams { + error!( + "Files have different number of streams. 
First: {}, Second: {}", + first_num_streams, second_num_streams + ); + if !options.continue_on_differences { + return Ok(false); + } + } + + let max_streams = std::cmp::max(first_num_streams, second_num_streams); + let mut has_differences = first_num_streams != second_num_streams; + + let mut first_stream_data: Vec = Vec::new(); + let mut second_stream_data: Vec = Vec::new(); + + // Compare each stream (starting from stream 1, as stream 0 is special) + for stream in 1..=max_streams { + let first_valid = stream < first_num_streams && first_container.is_stream_valid(stream); + let second_valid = stream < second_num_streams && second_container.is_stream_valid(stream); + + // Check if both streams have same nil-ness + if first_valid != second_valid { + error!( + "Stream {} has different validity. First: {}, Second: {}", + stream, first_valid, second_valid + ); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + continue; + } + + // If both streams are nil, they match + if !first_valid && !second_valid { + stats.nil_streams_matched += 1; + continue; + } + + // Both streams are valid, compare their contents + stats.streams_compared += 1; + + // Check stream sizes first + let first_size = first_container.stream_len(stream); + let second_size = second_container.stream_len(stream); + + if first_size != second_size { + error!( + "Stream {} has different sizes. 
First: {} bytes, Second: {} bytes", + stream, first_size, second_size + ); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + continue; + } + + // Read and compare stream contents + match compare_stream_contents( + &first_container, + &second_container, + stream, + &mut first_stream_data, + &mut second_stream_data, + ) { + Ok(true) => { + // Streams are identical + } + Ok(false) => { + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + } + Err(e) => { + error!("Failed to compare stream {}: {}", stream, e); + has_differences = true; + stats.streams_different += 1; + + if !options.continue_on_differences { + return Ok(false); + } + } + } + } + + Ok(!has_differences) +} + +fn compare_stream_contents( + first_container: &ms_pdb::Container, + second_container: &ms_pdb::Container, + stream: u32, + first_stream_data: &mut Vec, + second_stream_data: &mut Vec, +) -> Result { + // Read first stream + first_stream_data.clear(); + first_container.read_stream_to_vec_mut(stream, first_stream_data)?; + + // Read second stream + second_stream_data.clear(); + second_container.read_stream_to_vec_mut(stream, second_stream_data)?; + + // Find the first different byte + if let Some(byte_offset) = find_index_of_first_different_byte(first_stream_data, second_stream_data) { + error!( + "Stream {} differs at byte offset {} ({:#x})", + stream, byte_offset, byte_offset + ); + } else { + error!("Stream {} has different content but no specific difference found", stream); + } + + Ok(false) +} + +fn find_index_of_first_different_byte(a: &[u8], b: &[u8]) -> Option { + if a.len() != b.len() { + return Some(0); + } + + const BLOCK_SIZE: usize = 256; + let mut offset = 0; + + // Compare in blocks for efficiency + while offset + BLOCK_SIZE <= a.len() { + let block_a = &a[offset..offset + BLOCK_SIZE]; + let block_b = &b[offset..offset + BLOCK_SIZE]; + + if 
block_a != block_b { + // Found difference in this block, find exact byte + for i in 0..BLOCK_SIZE { + if block_a[i] != block_b[i] { + return Some(offset + i); + } + } + } + offset += BLOCK_SIZE; + } + + // Compare remaining bytes + for i in offset..a.len() { + if a[i] != b[i] { + return Some(i); + } + } + + None +} \ No newline at end of file diff --git a/pdbtool/src/main.rs b/pdbtool/src/main.rs index 840d4f1..eaaab30 100644 --- a/pdbtool/src/main.rs +++ b/pdbtool/src/main.rs @@ -21,6 +21,7 @@ mod hexdump; mod pdz; mod save; mod util; +mod compare; #[derive(clap::Parser)] struct CommandWithFlags { @@ -57,6 +58,9 @@ enum Command { /// PDB (MSF) or PDBZ (MSFZ) container format. This also shows some container-specific /// information. Container(container::ContainerOptions), + /// Compares two PDB (or PDZ) files and verifies that their stream contents are identical. + /// Reports differences but continues checking all streams unless stopped. + Compare(compare::CompareOptions), Test, Dump(dump::DumpOptions), Save(save::SaveStreamOptions), @@ -86,6 +90,7 @@ fn main() -> anyhow::Result<()> { Command::Hexdump(args) => hexdump::command(args)?, Command::PdzEncode(args) => pdz::encode::pdz_encode(args)?, Command::Container(args) => container::container_command(&args)?, + Command::Compare(args) => compare::command(args)?, Command::Check(args) => check::command(args)?, } diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index b649724..b2a929d 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -1,16 +1,19 @@ +//! Compresses PDB files into "PDZ" (compressed PDB) files. 
+ use crate::pdz::util::*; use anyhow::{Context, Result, bail}; use ms_pdb::codeview::HasRestLen; -use ms_pdb::dbi::{DBI_STREAM_HEADER_LEN, DbiStreamHeader}; +use ms_pdb::dbi::DbiStreamHeader; use ms_pdb::msf::Msf; use ms_pdb::msfz::{self, MIN_FILE_SIZE_16K, MsfzFinishOptions, MsfzWriter, StreamWriter}; +use ms_pdb::syms::SymIter; use ms_pdb::tpi; use ms_pdb::types::TypesIter; use ms_pdb::{RandomAccessFile, Stream}; use std::fs::File; -use std::io::{Seek, SeekFrom, Write}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::path::Path; -use tracing::{debug, info, trace, trace_span, warn}; +use tracing::{debug, error, info, trace, trace_span, warn}; use zerocopy::{FromBytes, IntoBytes}; #[derive(clap::Parser, Debug)] @@ -46,18 +49,16 @@ pub(crate) struct PdzEncodeOptions { /// Place each chunk into its own compression chunk (or chunks, if the stream is large). #[arg(long)] pub one_stream_per_chunk: bool, + + /// After writing the PDZ file, close it, re-open it, and verify that its stream contents + /// match the original PDB, byte for byte. + #[arg(long)] + pub verify: bool, } pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { let _span = trace_span!("pdz_encode").entered(); - let pdb_metadata = std::fs::metadata(&options.input_pdb).with_context(|| { - format!( - "Failed to get metadata for input PDB: {}", - options.input_pdb - ) - })?; - let input_file = RandomAccessFile::open(Path::new(&options.input_pdb)) .with_context(|| format!("Failed to open input PDB: {}", options.input_pdb))?; @@ -115,11 +116,12 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { } // Write the DBI stream (3). 
+ let dbi_header: Option; { let _span = trace_span!("transfer DBI stream"); pdb.read_stream_to_vec_mut(Stream::DBI.into(), &mut stream_data)?; let mut sw = writer.stream_writer(Stream::DBI.into())?; - write_dbi(&mut sw, &stream_data)?; + dbi_header = write_dbi(&mut sw, &stream_data)?; writer.end_chunk()?; } @@ -195,6 +197,19 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { } } + // Custom encoding for the Global Symbol Stream. + // The stream number for the Global Symbol Stream is found in the DBI Stream Header. + match &dbi_header { + Some(dbi_header) if dbi_header.global_symbol_stream.get() == Some(stream_index) => { + let mut sw = writer.stream_writer(stream_index)?; + pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; + write_global_symbols_stream(&mut sw, &stream_data, max_chunk_size as usize)?; + writer.end_chunk()?; + continue; + } + _ => {} + } + { let _span = trace_span!("read stream").entered(); pdb.read_stream_to_vec_mut(stream_index, &mut stream_data)?; @@ -212,7 +227,8 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { } } - // Get final size of the file. Don't append more data to the file after this line. + // Finish encoding the MSFZ file. This closes the session; don't append more data to the file + // after this line. This writes the MSFZ Stream Directory, Chunk Table, and MSFZ File Header. 
let (summary, mut file) = { let _span = trace_span!("finish writing").entered(); writer.finish_with_options(MsfzFinishOptions { @@ -226,38 +242,178 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { }; let out_file_size = file.seek(SeekFrom::End(0))?; - show_comp_rate("PDB -> PDZ", pdb_metadata.len(), out_file_size); + + match std::fs::metadata(&options.input_pdb) { + Ok(pdb_metadata) => { + show_comp_rate("PDB -> PDZ", pdb_metadata.len(), out_file_size); + } + Err(e) => { + warn!("Failed to get metadata for input PDB: {e:?}"); + } + } println!("{summary}"); - // Explicitly close our file handles so that the replace_file() call can succeed. - drop(pdb); + // Explicitly drop our output file handle so that we can re-open it for verification. drop(file); + if options.verify { + info!("Verifying PDZ encoding"); + let is_same = verify_pdz(&pdb, &options.output_pdz)?; + if !is_same { + bail!("PDZ encoding failed verification."); + } + } + + drop(pdb); + Ok(()) } +/// Reads all of the data from two files and compares their contents. +/// +/// The first file is a PDB (MSF) file and is already open. +/// The second file is a PDZ (MSFZ) file and its filename is given. +/// +/// Returns `true` if their contents are identical. +fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { + let output = msfz::Msfz::open(output_pdz)?; + + let input_num_streams = input_pdb.num_streams(); + let output_num_streams = output.num_streams(); + if input_num_streams != output_num_streams { + error!( + "The output file (PDZ) has the wrong number of streams. \ + Expected value: {input_num_streams}. 
\ + Actual value: {output_num_streams}" + ); + bail!("Wrong number of streams"); + } + + let mut has_errors = false; + + let mut input_stream_data: Vec = Vec::new(); + let mut output_stream_data: Vec = Vec::new(); + + for stream in 1..=input_num_streams { + let input_stream_is_valid = input_pdb.is_stream_valid(stream); + let output_stream_is_valid = output.is_stream_valid(stream); + + if input_stream_is_valid != output_stream_is_valid { + error!( + "Stream {stream} has wrong validity. \ + Expected value: {input_stream_is_valid:?}. \ + Actual value: {output_stream_is_valid:?}." + ); + has_errors = true; + } + + if !input_stream_is_valid { + continue; + } + + // Read stream data in the input file. + { + input_stream_data.clear(); + let mut sr = input_pdb.get_stream_reader(stream)?; + sr.read_to_end(&mut input_stream_data)?; + } + + // Read stream data in the output file. + { + output_stream_data.clear(); + let mut sr = output.get_stream_reader(stream)?; + sr.read_to_end(&mut output_stream_data)?; + } + + if input_stream_data.len() != output_stream_data.len() { + error!( + "Stream {stream} has wrong length. \ + Expected value: {}. 
\ + Actual value: {}.", + input_stream_data.len(), + output_stream_data.len() + ); + has_errors = true; + continue; + } + + if let Some(byte_offset) = + find_index_of_first_different_byte(&input_stream_data, &output_stream_data) + { + error!( + "Stream {stream} has wrong (different) contents, at index {byte_offset} ({byte_offset:#x})" + ); + has_errors = true; + continue; + } + } + + if has_errors { + return Ok(false); + } + + info!("Verification succeeded."); + + Ok(true) +} + +fn find_index_of_first_different_byte(mut a: &[u8], mut b: &[u8]) -> Option { + if a.len() != b.len() { + return Some(0); + } + + const BLOCK_SIZE: usize = 256; + + let mut skipped_len: usize = 0; + loop { + assert_eq!(a.len(), b.len()); + if a.len() < BLOCK_SIZE { + break; + } + let block_a = &a[..BLOCK_SIZE]; + let block_b = &b[..BLOCK_SIZE]; + + if block_a != block_b { + break; + } + + a = &a[BLOCK_SIZE..]; + b = &b[BLOCK_SIZE..]; + skipped_len += BLOCK_SIZE; + } + + for i in 0..a.len() { + if a[i] != b[i] { + return Some(skipped_len + i); + } + } + + None +} + /// Write the DBI stream. Be smart about compression and compression chunk boundaries. -fn write_dbi(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> { +fn write_dbi( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], +) -> Result> { // Avoid compressing data from the DBI stream with other chunks. sw.end_chunk()?; - if stream_data.len() < DBI_STREAM_HEADER_LEN { + let Ok((dbi_header, mut rest_of_stream)) = DbiStreamHeader::read_from_prefix(stream_data) + else { // Something is seriously wrong with this PDB. Pass the contents through without any // modification or compression. sw.set_compression_enabled(false); sw.write_all(stream_data)?; - return Ok(()); - } - - let (header_bytes, mut rest_of_stream) = stream_data.split_at(DBI_STREAM_HEADER_LEN); - - // This unwrap() cannot fail because we just tested the size, above. 
- let dbi_header = DbiStreamHeader::ref_from_bytes(header_bytes).unwrap(); + sw.end_chunk()?; + return Ok(None); + }; // Write the DBI Stream Header uncompressed. This allows symbol.exe to read it. sw.set_compression_enabled(false); - sw.write_all(header_bytes)?; + sw.write_all(dbi_header.as_bytes())?; + sw.end_chunk()?; // The DBI consists of a header, followed by a set of substreams. The substreams contain // data with different sizes, compression characteristic, and different access patterns. @@ -324,7 +480,7 @@ fn write_dbi(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> Result<()> sw.write_all(rest_of_stream)?; sw.end_chunk()?; - Ok(()) + Ok(Some(dbi_header)) } /// This is the fallback path for writing complex streams. @@ -338,6 +494,76 @@ fn write_stream_fallback(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> return Ok(()); } +fn write_global_symbols_stream( + sw: &mut StreamWriter<'_, File>, + stream_data: &[u8], + mut max_chunk_len: usize, +) -> Result<()> { + info!("Writing Global Symbol Stream"); + + sw.end_chunk()?; + + // The code below assumes that you can always put at least one type record into a chunk. + // To prevent a forward-progress failure, we require that max_chunk_len is larger than the + // largest type record. 
+ const MIN_CHUNK_LEN: usize = 0x20000; + if max_chunk_len < MIN_CHUNK_LEN { + warn!( + "max_chunk_len ({}) is way too small; promoting it", + max_chunk_len + ); + max_chunk_len = MIN_CHUNK_LEN; + } + + let mut current_chunk_bytes: &[u8] = stream_data; + let mut total_bytes_written: usize = 0; + + 'top: while !current_chunk_bytes.is_empty() { + let mut iter = SymIter::new(current_chunk_bytes); + loop { + let rest_len_before = iter.rest_len(); + if iter.next().is_none() { + break 'top; + } + let rest_len_after = iter.rest_len(); + + let chunk_len_with_this_record = current_chunk_bytes.len() - rest_len_after; + if chunk_len_with_this_record > max_chunk_len { + let chunk_len_without_this_record = current_chunk_bytes.len() - rest_len_before; + let (committed_chunk_bytes, next_chunk_bytes) = + current_chunk_bytes.split_at(chunk_len_without_this_record); + + // TODO: This could be optimized to a single, non-buffered chunk write. + sw.write_all(committed_chunk_bytes)?; + sw.end_chunk()?; + total_bytes_written += committed_chunk_bytes.len(); + + // This will cause us to re-parse a record at the start of the next chunk. + // That's ok, that's cheap. But we do need to handle the case where a record + // is larger than max_chunk_len. We "handle" that by requiring that max_chunk_len + // is at least 0x10004, since that is the maximum size for any record. That's a + // very silly lower bound for a chunk size, so we actually require it to be higher. + current_chunk_bytes = next_chunk_bytes; + continue 'top; + } + } + } + + if !current_chunk_bytes.is_empty() { + sw.write_all(current_chunk_bytes)?; + sw.end_chunk()?; + total_bytes_written += current_chunk_bytes.len(); + } + + assert_eq!( + total_bytes_written, + stream_data.len(), + "expected to write same number of record bytes" + ); + + Ok(()) +} + /// Write the TPI or IPI stream. Be smart about compression and compression chunk boundaries. /// /// This function returns `Some(header)` if the TPI header was correctly parsed. 
This header can @@ -482,11 +708,9 @@ fn write_tpi_or_ipi( fn write_tpi_or_ipi_hash_stream( sw: &mut StreamWriter<'_, File>, stream_data: &[u8], - mut max_chunk_len: usize, + max_chunk_len: usize, tpi_header: &tpi::TypeStreamHeader, ) -> Result<()> { - println!("TPI/IPI header:\n{:?}", tpi_header); - sw.end_chunk()?; let stream_len_u32 = stream_data.len() as u32; From 645e679f67fac28c9ab9528892e6c760119dfe04 Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Tue, 10 Feb 2026 09:48:05 -0800 Subject: [PATCH 3/6] Fix a PDZ decoder bug, plus cleanup PDZ decoder was reading wrong data. Offset calculation was wrong. --- msf/src/lib.rs | 2 +- msfz/src/reader.rs | 30 +++++++++------- pdbtool/src/compare.rs | 74 +++++++++++++++++++++++++-------------- pdbtool/src/main.rs | 2 +- pdbtool/src/pdz/encode.rs | 74 +++++++++++++++++++-------------------- 5 files changed, 104 insertions(+), 78 deletions(-) diff --git a/msf/src/lib.rs b/msf/src/lib.rs index 81f5fec..3b9d2ef 100644 --- a/msf/src/lib.rs +++ b/msf/src/lib.rs @@ -355,7 +355,7 @@ impl Msf { /// Return the total number of pages allocated to the file, including all pages (allocated, /// unallocated, etc.). - /// + /// /// This count includes pages allocated to streams, Page 0, FPM pages, pages that are free /// (not allocated), and pages allocated to the Stream Directory. 
pub fn num_total_pages(&self) -> u32 { diff --git a/msfz/src/reader.rs b/msfz/src/reader.rs index b7bc920..c808eb1 100644 --- a/msfz/src/reader.rs +++ b/msfz/src/reader.rs @@ -54,15 +54,15 @@ impl std::fmt::Debug for FragmentLocation { } else if self.is_compressed() { write!( f, - "uncompressed at 0x{:06x}", - self.uncompressed_file_offset() + "chunk {} : 0x{:04x}", + self.compressed_first_chunk(), + self.compressed_offset_within_chunk() ) } else { write!( f, - "chunk {} : 0x{:04x}", - self.compressed_first_chunk(), - self.compressed_offset_within_chunk() + "uncompressed at 0x{:06x}", + self.uncompressed_file_offset() ) } } @@ -70,6 +70,10 @@ impl std::fmt::Debug for FragmentLocation { const FRAGMENT_LOCATION_32BIT_IS_COMPRESSED_MASK: u32 = 1u32 << 31; +/// A bit mask for the bits within a packed `FragmentLocation` which encode +/// the file offset of an uncompressed fragment. +const UNCOMPRESSED_FRAGMENT_FILE_OFFSET_MASK: u64 = (1u64 << 48) - 1; + /// Represents the location of a fragment, either compressed or uncompressed. #[derive(Copy, Clone)] pub struct FragmentLocation { @@ -97,7 +101,7 @@ impl FragmentLocation { } /// Returns the chunk index for this compressed fragment. - /// + /// /// You must check `is_compressed()` before calling this function. pub fn compressed_first_chunk(&self) -> u32 { debug_assert!(!self.is_nil()); @@ -114,7 +118,7 @@ impl FragmentLocation { fn uncompressed_file_offset(&self) -> u64 { debug_assert!(!self.is_nil()); debug_assert!(!self.is_compressed()); - ((self.hi as u64) << 32) | (self.lo as u64) + (((self.hi as u64) << 32) | (self.lo as u64)) & UNCOMPRESSED_FRAGMENT_FILE_OFFSET_MASK } } @@ -560,7 +564,9 @@ impl<'a, F: ReadAt> ReadAt for StreamReader<'a, F> { )?; buf_xfer.copy_from_slice(chunk_slice); } else { - // Read the stream data directly from disk. + // Read the stream data directly from disk. We don't validate the file offset or + // the length of the transfer. 
Instead, we just allow the underlying file system + // to report errors. let file_offset = fragment.location.uncompressed_file_offset(); self.msfz .file @@ -571,11 +577,9 @@ impl<'a, F: ReadAt> ReadAt for StreamReader<'a, F> { break; } - if current_offset >= num_bytes_xfer as u64 { - current_offset -= num_bytes_xfer as u64; - } else { - current_offset = 0; - } + // Since we have finished reading from one chunk and there is more data to read in the + // next chunk, the "offset within chunk" becomes zero. + current_offset = 0; } Ok(original_buf_len - buf.len()) diff --git a/pdbtool/src/compare.rs b/pdbtool/src/compare.rs index 4e31434..3c2d998 100644 --- a/pdbtool/src/compare.rs +++ b/pdbtool/src/compare.rs @@ -40,10 +40,19 @@ pub(crate) fn command(options: CompareOptions) -> Result<()> { // Print summary info!("Comparison completed:"); - info!(" Total streams compared: {}", comparison_stats.streams_compared); - info!(" Streams with differences: {}", comparison_stats.streams_different); - info!(" Nil streams matched: {}", comparison_stats.nil_streams_matched); - + info!( + " Total streams compared: {}", + comparison_stats.streams_compared + ); + info!( + " Streams with differences: {}", + comparison_stats.streams_different + ); + info!( + " Nil streams matched: {}", + comparison_stats.nil_streams_matched + ); + if !success { info!("Result: Files are DIFFERENT"); bail!("Files are different"); @@ -187,48 +196,61 @@ fn compare_stream_contents( second_container.read_stream_to_vec_mut(stream, second_stream_data)?; // Find the first different byte - if let Some(byte_offset) = find_index_of_first_different_byte(first_stream_data, second_stream_data) { + if let Some(byte_offset) = + find_index_of_first_different_byte(first_stream_data, second_stream_data) + { error!( "Stream {} differs at byte offset {} ({:#x})", stream, byte_offset, byte_offset ); - } else { - error!("Stream {} has different content but no specific difference found", stream); + return Ok(false); } - 
Ok(false) + Ok(true) } -fn find_index_of_first_different_byte(a: &[u8], b: &[u8]) -> Option { +/// Compares two byte slices for equality. If they are different, then returns the offset of the +/// first byte that is wrong. +/// +/// * Returns `Some(0)` if the byte slices have different lengths. +/// * Returns `Some(i)` if the byte slices have different contents, and `i` is the offset of the +/// first byte that is different. +/// * Returns `None` if the contents are identical. +#[inline(never)] +pub(crate) fn find_index_of_first_different_byte(mut a: &[u8], mut b: &[u8]) -> Option { if a.len() != b.len() { return Some(0); } - const BLOCK_SIZE: usize = 256; - let mut offset = 0; + const BLOCK_SIZE: usize = 4096; - // Compare in blocks for efficiency - while offset + BLOCK_SIZE <= a.len() { - let block_a = &a[offset..offset + BLOCK_SIZE]; - let block_b = &b[offset..offset + BLOCK_SIZE]; + // This first loop uses memcmp to find the first block which has different contents. + // memcmp is generally going to be faster than a manually-indexed loop. + let mut skipped_len: usize = 0; + loop { + assert_eq!(a.len(), b.len()); + if a.len() < BLOCK_SIZE { + break; + } + let block_a = &a[..BLOCK_SIZE]; + let block_b = &b[..BLOCK_SIZE]; if block_a != block_b { - // Found difference in this block, find exact byte - for i in 0..BLOCK_SIZE { - if block_a[i] != block_b[i] { - return Some(offset + i); - } - } + break; } - offset += BLOCK_SIZE; + + a = &a[BLOCK_SIZE..]; + b = &b[BLOCK_SIZE..]; + skipped_len += BLOCK_SIZE; } - // Compare remaining bytes - for i in offset..a.len() { + // Finish checking the ragged last block -or- repeat the scan of the block which + // contained different contents. 
+ for i in 0..a.len() { if a[i] != b[i] { - return Some(i); + return Some(skipped_len + i); } } None -} \ No newline at end of file +} diff --git a/pdbtool/src/main.rs b/pdbtool/src/main.rs index eaaab30..b997569 100644 --- a/pdbtool/src/main.rs +++ b/pdbtool/src/main.rs @@ -10,6 +10,7 @@ use clap::Parser; mod addsrc; mod check; +mod compare; mod container; mod copy; mod counts; @@ -21,7 +22,6 @@ mod hexdump; mod pdz; mod save; mod util; -mod compare; #[derive(clap::Parser)] struct CommandWithFlags { diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index b2a929d..e4563c2 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -312,6 +312,19 @@ fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { continue; } + let input_stream_size = input_pdb.stream_size(stream) as usize; + let output_stream_size = output.stream_size(stream)? as usize; + + if input_stream_size != output_stream_size { + error!( + "Stream {stream} has wrong length in stream directory. + Expected value: {input_stream_size}. \ + Actual value: {output_stream_size}." + ); + has_errors = true; + continue; + } + // Read stream data in the input file. 
{ input_stream_data.clear(); @@ -338,12 +351,31 @@ fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { continue; } - if let Some(byte_offset) = - find_index_of_first_different_byte(&input_stream_data, &output_stream_data) - { + if let Some(byte_offset) = crate::compare::find_index_of_first_different_byte( + &input_stream_data, + &output_stream_data, + ) { error!( "Stream {stream} has wrong (different) contents, at index {byte_offset} ({byte_offset:#x})" ); + + let out_dir = Path::new(r"d:\temp"); + + let input_file_path = out_dir.join(format!("pdb-s{stream}.bin")); + let output_file_path = out_dir.join(format!("pdz-s{stream}.bin")); + + fn write_or_complain(path: &Path, data: &[u8]) { + match std::fs::write(path, data) { + Ok(()) => {} + Err(e) => { + warn!("failed to write file: {} : {e:?}", path.display()); + } + } + } + + write_or_complain(&input_file_path, &input_stream_data); + write_or_complain(&output_file_path, &output_stream_data); + has_errors = true; continue; } @@ -358,40 +390,6 @@ fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { Ok(true) } -fn find_index_of_first_different_byte(mut a: &[u8], mut b: &[u8]) -> Option { - if a.len() != b.len() { - return Some(0); - } - - const BLOCK_SIZE: usize = 256; - - let mut skipped_len: usize = 0; - loop { - assert_eq!(a.len(), b.len()); - if a.len() < BLOCK_SIZE { - break; - } - let block_a = &a[..BLOCK_SIZE]; - let block_b = &b[..BLOCK_SIZE]; - - if block_a != block_b { - break; - } - - a = &a[BLOCK_SIZE..]; - b = &b[BLOCK_SIZE..]; - skipped_len += BLOCK_SIZE; - } - - for i in 0..a.len() { - if a[i] != b[i] { - return Some(skipped_len + i); - } - } - - None -} - /// Write the DBI stream. Be smart about compression and compression chunk boundaries. fn write_dbi( sw: &mut StreamWriter<'_, File>, @@ -400,6 +398,8 @@ fn write_dbi( // Avoid compressing data from the DBI stream with other chunks. sw.end_chunk()?; + // Read the DBI stream header. 
If we can't read it (because it's too small), then fall back + // to copying the stream. let Ok((dbi_header, mut rest_of_stream)) = DbiStreamHeader::read_from_prefix(stream_data) else { // Something is seriously wrong with this PDB. Pass the contents through without any From e04da5d83db299b3436137afb2cacecda1c9a2b7 Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Tue, 10 Feb 2026 09:52:05 -0800 Subject: [PATCH 4/6] clippy + bump versions --- Cargo.lock | 6 +++--- msfz/Cargo.toml | 2 +- pdb/Cargo.toml | 4 ++-- pdbtool/Cargo.toml | 4 ++-- pdbtool/src/compare.rs | 4 ++-- pdbtool/src/pdz/encode.rs | 6 +++--- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8fecbf7..886efae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -380,7 +380,7 @@ dependencies = [ [[package]] name = "ms-pdb" -version = "0.1.18" +version = "0.1.19" dependencies = [ "anyhow", "bitfield", @@ -424,7 +424,7 @@ dependencies = [ [[package]] name = "ms-pdb-msfz" -version = "0.1.9" +version = "0.1.10" dependencies = [ "anyhow", "bstr", @@ -492,7 +492,7 @@ dependencies = [ [[package]] name = "pdbtool" -version = "0.1.18" +version = "0.1.19" dependencies = [ "anyhow", "bitvec", diff --git a/msfz/Cargo.toml b/msfz/Cargo.toml index e584b73..041fa3a 100644 --- a/msfz/Cargo.toml +++ b/msfz/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ms-pdb-msfz" -version = "0.1.9" +version = "0.1.10" edition = "2024" description = "Reads Compressed Multi-Stream Files, which is part of the Microsoft PDB file format" authors = ["Arlie Davis "] diff --git a/pdb/Cargo.toml b/pdb/Cargo.toml index 1bbb39d..4661344 100644 --- a/pdb/Cargo.toml +++ b/pdb/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ms-pdb" -version = "0.1.18" +version = "0.1.19" edition = "2024" description = "Reads Microsoft Program Database (PDB) files" authors = ["Arlie Davis "] @@ -39,7 +39,7 @@ version = "0.1.6" path = "../msf" [dependencies.ms-pdb-msfz] -version = "0.1.9" +version = "0.1.10" path = "../msfz" 
[dev-dependencies] diff --git a/pdbtool/Cargo.toml b/pdbtool/Cargo.toml index 690b47e..266b2d7 100644 --- a/pdbtool/Cargo.toml +++ b/pdbtool/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pdbtool" -version = "0.1.18" +version = "0.1.19" edition = "2024" description = "A tool for reading Program Database (PDB) files and displaying information about them." authors = ["Arlie Davis "] @@ -34,5 +34,5 @@ version = "0.1.0" path = "../coff" [dependencies.ms-pdb] -version = "0.1.18" +version = "0.1.19" path = "../pdb" diff --git a/pdbtool/src/compare.rs b/pdbtool/src/compare.rs index 3c2d998..7ac017f 100644 --- a/pdbtool/src/compare.rs +++ b/pdbtool/src/compare.rs @@ -148,8 +148,8 @@ fn compare_pdbs( // Read and compare stream contents match compare_stream_contents( - &first_container, - &second_container, + first_container, + second_container, stream, &mut first_stream_data, &mut second_stream_data, diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index e4563c2..a89071a 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -174,7 +174,7 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { &mut sw, &stream_data, max_chunk_size as usize, - &tpi_header, + tpi_header, )?; writer.end_chunk()?; continue; @@ -190,7 +190,7 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { &mut sw, &stream_data, max_chunk_size as usize, - &ipi_header, + ipi_header, )?; writer.end_chunk()?; continue; @@ -491,7 +491,7 @@ fn write_stream_fallback(sw: &mut StreamWriter<'_, File>, stream_data: &[u8]) -> sw.set_compression_enabled(true); sw.write_all(stream_data)?; sw.end_chunk()?; - return Ok(()); + Ok(()) } fn write_global_symbols_stream( From b87a105732831db56853c894a8c98ea53a6472ce Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Tue, 10 Feb 2026 10:10:47 -0800 Subject: [PATCH 5/6] remove some debugging code + cosmetic cleanup --- pdbtool/src/compare.rs | 4 ++-- pdbtool/src/pdz.rs | 1 - pdbtool/src/pdz/encode.rs | 43 
++++++++++++++++----------------------- pdbtool/src/pdz/util.rs | 20 ------------------ 4 files changed, 20 insertions(+), 48 deletions(-) delete mode 100644 pdbtool/src/pdz/util.rs diff --git a/pdbtool/src/compare.rs b/pdbtool/src/compare.rs index 7ac017f..550d178 100644 --- a/pdbtool/src/compare.rs +++ b/pdbtool/src/compare.rs @@ -93,14 +93,14 @@ fn compare_pdbs( } } - let max_streams = std::cmp::max(first_num_streams, second_num_streams); + let min_num_streams = first_num_streams.min(second_num_streams); let mut has_differences = first_num_streams != second_num_streams; let mut first_stream_data: Vec = Vec::new(); let mut second_stream_data: Vec = Vec::new(); // Compare each stream (starting from stream 1, as stream 0 is special) - for stream in 1..=max_streams { + for stream in 1..min_num_streams { let first_valid = stream < first_num_streams && first_container.is_stream_valid(stream); let second_valid = stream < second_num_streams && second_container.is_stream_valid(stream); diff --git a/pdbtool/src/pdz.rs b/pdbtool/src/pdz.rs index 87cfe56..0ff4757 100644 --- a/pdbtool/src/pdz.rs +++ b/pdbtool/src/pdz.rs @@ -1,2 +1 @@ pub mod encode; -pub mod util; diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs index a89071a..293b026 100644 --- a/pdbtool/src/pdz/encode.rs +++ b/pdbtool/src/pdz/encode.rs @@ -1,6 +1,5 @@ //! Compresses PDB files into "PDZ" (compressed PDB) files. 
-use crate::pdz::util::*; use anyhow::{Context, Result, bail}; use ms_pdb::codeview::HasRestLen; use ms_pdb::dbi::DbiStreamHeader; @@ -91,7 +90,7 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { .with_context(|| format!("Failed to open output PDZ: {}", options.output_pdz))?; writer.set_uncompressed_chunk_size_threshold(max_chunk_size); - println!( + info!( "Using maximum chunk size: {n} ({n:#x})", n = writer.uncompressed_chunk_size_threshold() ); @@ -245,14 +244,26 @@ pub fn pdz_encode(options: PdzEncodeOptions) -> Result<()> { match std::fs::metadata(&options.input_pdb) { Ok(pdb_metadata) => { - show_comp_rate("PDB -> PDZ", pdb_metadata.len(), out_file_size); + let before = pdb_metadata.len(); + let after = out_file_size; + + // We don't divide by zero around here. + if before != 0 { + let percent = (before as f64 - after as f64) / (before as f64) * 100.0; + info!( + " PDB -> PDZ compression : {:8} -> {:8} {percent:2.1} %", + friendly::bytes(before), + friendly::bytes(after) + ); + } } Err(e) => { warn!("Failed to get metadata for input PDB: {e:?}"); } } - println!("{summary}"); + info!("Number of streams: {:8}", summary.num_streams); + info!("Number of chunks: {:8}", summary.num_chunks); // Explicitly drop our output file handle so that we can re-open it for verification. 
drop(file); @@ -295,7 +306,7 @@ fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { let mut input_stream_data: Vec = Vec::new(); let mut output_stream_data: Vec = Vec::new(); - for stream in 1..=input_num_streams { + for stream in 1..input_num_streams { let input_stream_is_valid = input_pdb.is_stream_valid(stream); let output_stream_is_valid = output.is_stream_valid(stream); @@ -358,24 +369,6 @@ fn verify_pdz(input_pdb: &Msf, output_pdz: &str) -> anyhow::Result { error!( "Stream {stream} has wrong (different) contents, at index {byte_offset} ({byte_offset:#x})" ); - - let out_dir = Path::new(r"d:\temp"); - - let input_file_path = out_dir.join(format!("pdb-s{stream}.bin")); - let output_file_path = out_dir.join(format!("pdz-s{stream}.bin")); - - fn write_or_complain(path: &Path, data: &[u8]) { - match std::fs::write(path, data) { - Ok(()) => {} - Err(e) => { - warn!("failed to write file: {} : {e:?}", path.display()); - } - } - } - - write_or_complain(&input_file_path, &input_stream_data); - write_or_complain(&output_file_path, &output_stream_data); - has_errors = true; continue; } @@ -499,7 +492,7 @@ fn write_global_symbols_stream( stream_data: &[u8], mut max_chunk_len: usize, ) -> Result<()> { - info!("Writing Global Symbol Stream"); + debug!("Writing Global Symbol Stream"); sw.end_chunk()?; @@ -764,7 +757,7 @@ fn write_tpi_or_ipi_hash_stream( boundaries.sort_unstable(); boundaries.dedup(); - info!("Type Hash Stream offset boundaries: {:?}", boundaries); + debug!("Type Hash Stream offset boundaries: {:?}", boundaries); let mut pos: usize = 0; let mut total_bytes_written: usize = 0; diff --git a/pdbtool/src/pdz/util.rs b/pdbtool/src/pdz/util.rs deleted file mode 100644 index 69b2515..0000000 --- a/pdbtool/src/pdz/util.rs +++ /dev/null @@ -1,20 +0,0 @@ -pub(crate) fn show_comp_rate(description: &str, before: u64, after: u64) { - if before == 0 { - // We don't divide by zero around here. 
- println!( - " {:-30} : {:8} -> {:8}", - description, - friendly::bytes(before), - friendly::bytes(after) - ); - } else { - let percent = (before as f64 - after as f64) / (before as f64) * 100.0; - println!( - " {:-30} : {:8} -> {:8} {:2.1} %", - description, - friendly::bytes(before), - friendly::bytes(after), - percent - ); - } -} From 49394bd10023fd8d6c56b6ea9f20c11afef68d34 Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Tue, 10 Feb 2026 13:33:05 -0800 Subject: [PATCH 6/6] PR feedback --- msfz/src/lib.rs | 5 +++++ msfz/src/writer.rs | 11 +++++++++-- pdbtool/src/pdz/encode.rs | 24 +++++++++++------------- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/msfz/src/lib.rs b/msfz/src/lib.rs index 4836711..5141380 100644 --- a/msfz/src/lib.rs +++ b/msfz/src/lib.rs @@ -100,6 +100,11 @@ pub const COMPRESSION_ZSTD: u32 = 1; /// This uses the "raw" Deflate stream. It _does not_ use the GZIP encapsulation header. pub const COMPRESSION_DEFLATE: u32 = 2; +/// This is the maximum file offset where an uncompressed fragment be be stored. +/// +/// The MSFZ specification provides 48 bits for storing the file offset of an uncompressed fragment. +pub const MAX_UNCOMPRESSED_FILE_OFFSET: u64 = (1u64 << 48) - 1; + /// Specifies the compression algorithms that are supported by this library. #[derive(Copy, Clone, Eq, PartialEq, Debug)] #[non_exhaustive] diff --git a/msfz/src/writer.rs b/msfz/src/writer.rs index e2c210c..6552ba9 100644 --- a/msfz/src/writer.rs +++ b/msfz/src/writer.rs @@ -4,11 +4,11 @@ use pow2::Pow2; use std::fs::File; use std::io::{Seek, SeekFrom, Write}; use std::path::Path; -use tracing::{debug, debug_span, trace, trace_span}; +use tracing::{debug, debug_span, error, trace, trace_span}; use zerocopy::IntoBytes; /// The default threshold for compressing a chunk of data. 
-pub const DEFAULT_CHUNK_THRESHOLD: u32 = 0x40_0000; // 4 MiB
+pub const DEFAULT_CHUNK_THRESHOLD: u32 = 0x10_0000; // 1 MiB
 
 /// The minimum value for the uncompressed chunk size threshold.
 pub const MIN_CHUNK_SIZE: u32 = 0x1000;
@@ -536,6 +536,13 @@ impl<'a, F: Write + Seek> Write for StreamWriter<'a, F> {
         } else {
             self.file.out.stream_position()?
         };
+
+        // The MSFZ spec allocates 48 bits for the file offset of uncompressed fragments.
+        if fragment_file_offset > MAX_UNCOMPRESSED_FILE_OFFSET {
+            error!("The file offset of an uncompressed fragment exceeds the 48-bit limit of the MSFZ format.");
+            return Err(std::io::ErrorKind::FileTooLarge.into());
+        };
+
         self.file.out.write_all(buf)?;
 
         add_fragment_uncompressed(&mut self.stream.fragments, buf_len, fragment_file_offset);
diff --git a/pdbtool/src/pdz/encode.rs b/pdbtool/src/pdz/encode.rs
index 293b026..44aa883 100644
--- a/pdbtool/src/pdz/encode.rs
+++ b/pdbtool/src/pdz/encode.rs
@@ -698,6 +698,10 @@ fn write_tpi_or_ipi(
 /// We use a simple algorithm. We build a list of the start and end locations of each of these
 /// buffers (if they are non-zero length). Then we sort the list and de-dup it. Then we traverse
 /// the list, writing chunks for each region.
+///
+/// This simple algorithm allows us to ignore all sorts of strange situations, such as regions
+/// overlapping, regions occurring in an unusual order, or there being bytes within the stream
+/// that are not covered by any region. All we care about is the compression boundaries. 
fn write_tpi_or_ipi_hash_stream( sw: &mut StreamWriter<'_, File>, stream_data: &[u8], @@ -759,27 +763,21 @@ fn write_tpi_or_ipi_hash_stream( debug!("Type Hash Stream offset boundaries: {:?}", boundaries); - let mut pos: usize = 0; + let mut previous_boundary: usize = 0; let mut total_bytes_written: usize = 0; - for &boundary_offset in boundaries.iter() { - assert!(boundary_offset >= pos); - if boundary_offset == pos { - continue; - } - - let start = pos; - let end = boundary_offset; - pos = end; - - let data_between_boundaries = &stream_data[start..end]; + for &next_boundary in boundaries.iter() { + assert!(next_boundary >= previous_boundary); + let region_data = &stream_data[previous_boundary..next_boundary]; // We are going to _further_ chunk things, based on max_chunk_len. - for chunk_data in data_between_boundaries.chunks(max_chunk_len) { + for chunk_data in region_data.chunks(max_chunk_len) { sw.write_all(chunk_data)?; sw.end_chunk()?; total_bytes_written += chunk_data.len(); } + + previous_boundary = next_boundary; } assert_eq!(