From 6210c625de3f617b0553ae8176f2483133517161 Mon Sep 17 00:00:00 2001 From: Colin Finck Date: Sun, 1 Aug 2021 23:18:14 +0200 Subject: Implement finding files by name in a file name index. This introduces parsing the $UpCase file to perform case-insensitive searches. --- src/error.rs | 2 + src/index.rs | 95 +++++++++++++++++++++++++++++++++++++++++++ src/indexes/file_name.rs | 29 +++++++++++++ src/lib.rs | 1 + src/ntfs.rs | 29 +++++++++++++ src/string.rs | 77 +++++++++++++++++++++++++++++------ src/upcase_table.rs | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 323 insertions(+), 13 deletions(-) create mode 100644 src/upcase_table.rs diff --git a/src/error.rs b/src/error.rs index 89e8d7b..85b6d3e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -118,6 +118,8 @@ pub enum NtfsError { expected: &'static [u8], actual: [u8; 2], }, + /// The Upcase Table should have a size of {expected} bytes, but it has {actual} bytes + InvalidUpcaseTableSize { expected: u64, actual: u64 }, /// The VCN {vcn} read from the NTFS data run header at byte position {position:#010x} cannot be added to the LCN {previous_lcn} calculated from previous data runs InvalidVcnInDataRunHeader { position: u64, diff --git a/src/index.rs b/src/index.rs index e6abd7a..f9769d9 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,6 +7,7 @@ use crate::indexes::NtfsIndexEntryType; use crate::structured_values::{NtfsIndexAllocation, NtfsIndexRoot}; use alloc::vec::Vec; use binread::io::{Read, Seek}; +use core::cmp::Ordering; use core::marker::PhantomData; pub struct NtfsIndex<'n, 'f, E> @@ -41,6 +42,12 @@ where }) } + /// Returns an [`NtfsIndexFinder`] structure to efficiently find an entry in this index. + pub fn finder<'i>(&'i self) -> NtfsIndexFinder<'n, 'f, 'i, E> { + NtfsIndexFinder::new(self) + } + + /// Returns an [`NtfsIndexEntries`] iterator to perform an in-order traversal of this index. pub fn iter<'i>(&'i self) -> NtfsIndexEntries<'n, 'f, 'i, E> { NtfsIndexEntries::new(self) } @@ -152,3 +159,91 @@ where Some(Ok(entry)) } } + +/// Helper structure to efficiently find an entry in an index, created by [`NtfsIndex::finder`]. +/// +/// This helper is required, because the returned entry borrows from the iterator it was created from. +/// The idea is that you copy the field(s) you need from the returned entry and then drop the entry and the finder. +pub struct NtfsIndexFinder<'n, 'f, 'i, E> +where + E: NtfsIndexEntryType, +{ + index: &'i NtfsIndex<'n, 'f, E>, + inner_iterator: IndexNodeEntryRanges, +} + +impl<'n, 'f, 'i, E> NtfsIndexFinder<'n, 'f, 'i, E> +where + E: NtfsIndexEntryType, +{ + fn new(index: &'i NtfsIndex<'n, 'f, E>) -> Self { + // This is superfluous and done again in `find`, but doesn't justify using an `Option` here. + let inner_iterator = index.index_root.entry_ranges(); + + Self { + index, + inner_iterator, + } + } + + /// Finds an entry in this index using the given comparison function and returns an [`NtfsIndexEntry`] + /// (if there is one). + pub fn find<'a, T, F>(&'a mut self, fs: &mut T, cmp: F) -> Option>> + where + T: Read + Seek, + F: Fn(&E::KeyType) -> Ordering, + { + // Always (re)start by iterating through the Index Root entry ranges. + self.inner_iterator = self.index.index_root.entry_ranges(); + + loop { + // Get the next entry. + // + // A textbook B-tree search algorithm would get the middle entry and perform binary search. + // But we can't do that here, as we are dealing with variable-length entries. + let entry_range = self.inner_iterator.next()?; + let entry = entry_range.to_entry(self.inner_iterator.data()); + + // Check if this entry has a key. + if let Some(key) = entry.key() { + // The entry has a key, so compare it using the given function. + let key = iter_try!(key); + + match cmp(&key) { + Ordering::Equal => { + // We found what we were looking for! + // Recreate `entry` from the last `self.inner_iterator` to please the borrow checker. + let entry = entry_range.to_entry(self.inner_iterator.data()); + return Some(Ok(entry)); + } + Ordering::Less => { + // What we are looking for comes BEFORE this entry. + // Hence, it must be in a subnode of this entry and we continue below. + } + Ordering::Greater => { + // What we are looking for comes AFTER this entry. + // Keep searching on the same subnode level. + continue; + } + } + } + + // Either this entry has no key (= is the last one on this subnode level) or + // it comes lexicographically AFTER what we're looking for. + // In both cases, we have to continue iterating in the subnode of this entry (if there is any). + let subnode_vcn = entry.subnode_vcn()?; + let index_allocation = + iter_try!(self.index.index_allocation.as_ref().ok_or_else(|| { + NtfsError::MissingIndexAllocation { + position: self.index.index_root.position(), + } + })); + let subnode = iter_try!(index_allocation.record_from_vcn( + fs, + &self.index.index_root, + subnode_vcn + )); + self.inner_iterator = subnode.into_entry_ranges(); + } + } +} diff --git a/src/indexes/file_name.rs b/src/indexes/file_name.rs index 069836a..5f77bc7 100644 --- a/src/indexes/file_name.rs +++ b/src/indexes/file_name.rs @@ -1,12 +1,41 @@ // Copyright 2021 Colin Finck // SPDX-License-Identifier: GPL-2.0-or-later +use crate::error::Result; +use crate::index::NtfsIndexFinder; +use crate::index_entry::NtfsIndexEntry; use crate::indexes::{NtfsIndexEntryHasFileReference, NtfsIndexEntryType}; +use crate::ntfs::Ntfs; +use crate::string::UpcaseOrd; use crate::structured_values::NtfsFileName; +use binread::io::{Read, Seek}; +/// Defines the [`NtfsIndexEntryType`] for filename indexes (more commonly known as "directories"). #[derive(Debug)] pub struct NtfsFileNameIndex {} +impl NtfsFileNameIndex { + /// Finds a file in a filename index by name and returns the [`NtfsIndexEntry`] (if any). + /// The name is compared case-insensitively based on the filesystem's $UpCase table. + /// + /// # Panics + /// + /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called on the passed [`Ntfs`] object. + pub fn find<'a, 'n, T>( + index_finder: &'a mut NtfsIndexFinder, + ntfs: &'n Ntfs, + fs: &mut T, + name: &str, + ) -> Option>> + where + T: Read + Seek, + { + // TODO: This always performs a case-insensitive comparison. + // There are some corner cases where NTFS uses case-sensitive filenames. These need to be considered! + index_finder.find(fs, |file_name| name.upcase_cmp(ntfs, &file_name.name())) + } +} + impl NtfsIndexEntryType for NtfsFileNameIndex { type KeyType = NtfsFileName; } diff --git a/src/lib.rs b/src/lib.rs index 6215182..70723fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ pub mod structured_values; mod time; mod traits; mod types; +mod upcase_table; pub use crate::attribute::*; pub use crate::attribute_value::*; diff --git a/src/ntfs.rs b/src/ntfs.rs index aeb6dd8..ee2f7fc 100644 --- a/src/ntfs.rs +++ b/src/ntfs.rs @@ -6,6 +6,7 @@ use crate::boot_sector::BootSector; use crate::error::{NtfsError, Result}; use crate::ntfs_file::{KnownNtfsFile, NtfsFile}; use crate::structured_values::{NtfsVolumeInformation, NtfsVolumeName}; +use crate::upcase_table::UpcaseTable; use binread::io::{Read, Seek, SeekFrom}; use binread::BinReaderExt; @@ -23,6 +24,8 @@ pub struct Ntfs { file_record_size: u32, /// Serial number of the NTFS volume. serial_number: u64, + /// Table of Unicode uppercase characters (only required for case-insensitive comparisons). + upcase_table: Option, } impl Ntfs { @@ -42,6 +45,7 @@ impl Ntfs { let mft_position = 0; let file_record_size = bpb.file_record_size()?; let serial_number = bpb.serial_number(); + let upcase_table = None; let mut ntfs = Self { cluster_size, @@ -50,6 +54,7 @@ impl Ntfs { mft_position, file_record_size, serial_number, + upcase_table, }; ntfs.mft_position = bpb.mft_lcn().position(&ntfs)?; @@ -88,6 +93,19 @@ impl Ntfs { NtfsFile::new(&self, fs, position) } + /// Reads the $UpCase file from the filesystem and stores it in this [`Ntfs`] object. + /// + /// This function only needs to be called if case-insensitive comparisons are later performed + /// (i.e. finding files). + pub fn read_upcase_table(&mut self, fs: &mut T) -> Result<()> + where + T: Read + Seek, + { + let upcase_table = UpcaseTable::read(self, fs)?; + self.upcase_table = Some(upcase_table); + Ok(()) + } + /// Returns the root [`Dir`] of this NTFS volume. pub fn root_dir(&self) -> ! { panic!("TODO") @@ -108,6 +126,17 @@ impl Ntfs { self.size } + /// Returns the stored [`UpcaseTable`]. + /// + /// # Panics + /// + /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called. + pub(crate) fn upcase_table(&self) -> &UpcaseTable { + self.upcase_table + .as_ref() + .expect("You need to call read_upcase_table first") + } + /// Returns an [`NtfsVolumeInformation`] containing general information about /// the volume, like the NTFS version. pub fn volume_info(&self, fs: &mut T) -> Result diff --git a/src/string.rs b/src/string.rs index ce0109b..2d6d4a9 100644 --- a/src/string.rs +++ b/src/string.rs @@ -1,10 +1,11 @@ // Copyright 2021 Colin Finck // SPDX-License-Identifier: GPL-2.0-or-later +use crate::ntfs::Ntfs; use alloc::string::String; use core::char; use core::cmp::Ordering; -use core::convert::TryInto; +use core::convert::{identity, TryInto}; use core::fmt; /// Zero-copy representation of a string stored in an NTFS filesystem structure. @@ -12,17 +13,21 @@ use core::fmt; pub struct NtfsString<'a>(pub &'a [u8]); impl<'a> NtfsString<'a> { - fn cmp_iter(mut this_iter: TI, mut other_iter: OI) -> Ordering + fn cmp_iter(mut this_iter: TI, mut other_iter: OI, code_unit_fn: F) -> Ordering where TI: Iterator, OI: Iterator, + F: Fn(u16) -> u16, { loop { match (this_iter.next(), other_iter.next()) { (Some(this_code_unit), Some(other_code_unit)) => { // We have two UTF-16 code units to compare. - if this_code_unit != other_code_unit { - return this_code_unit.cmp(&other_code_unit); + let this_upper = code_unit_fn(this_code_unit); + let other_upper = code_unit_fn(other_code_unit); + + if this_upper != other_upper { + return this_upper.cmp(&other_upper); } } (Some(_), None) => { @@ -91,7 +96,7 @@ impl<'a> fmt::Display for NtfsString<'a> { impl<'a> Ord for NtfsString<'a> { fn cmp(&self, other: &Self) -> Ordering { - NtfsString::cmp_iter(self.u16_iter(), other.u16_iter()) + NtfsString::cmp_iter(self.u16_iter(), other.u16_iter(), identity) } } @@ -104,25 +109,25 @@ impl<'a> PartialEq for NtfsString<'a> { impl<'a> PartialEq for NtfsString<'a> { fn eq(&self, other: &str) -> bool { - NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()) == Ordering::Equal + NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), identity) == Ordering::Equal } } impl<'a> PartialEq> for str { fn eq(&self, other: &NtfsString<'a>) -> bool { - NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()) == Ordering::Equal + NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), identity) == Ordering::Equal } } impl<'a> PartialEq<&str> for NtfsString<'a> { fn eq(&self, other: &&str) -> bool { - NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()) == Ordering::Equal + NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), identity) == Ordering::Equal } } impl<'a> PartialEq> for &str { fn eq(&self, other: &NtfsString<'a>) -> bool { - NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()) == Ordering::Equal + NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), identity) == Ordering::Equal } } @@ -134,24 +139,70 @@ impl<'a> PartialOrd for NtfsString<'a> { impl<'a> PartialOrd for NtfsString<'a> { fn partial_cmp(&self, other: &str) -> Option { - Some(NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16())) + Some(NtfsString::cmp_iter( + self.u16_iter(), + other.encode_utf16(), + identity, + )) } } impl<'a> PartialOrd> for str { fn partial_cmp(&self, other: &NtfsString<'a>) -> Option { - Some(NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter())) + Some(NtfsString::cmp_iter( + self.encode_utf16(), + other.u16_iter(), + identity, + )) } } impl<'a> PartialOrd<&str> for NtfsString<'a> { fn partial_cmp(&self, other: &&str) -> Option { - Some(NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16())) + Some(NtfsString::cmp_iter( + self.u16_iter(), + other.encode_utf16(), + identity, + )) } } impl<'a> PartialOrd> for &str { fn partial_cmp(&self, other: &NtfsString<'a>) -> Option { - Some(NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter())) + Some(NtfsString::cmp_iter( + self.encode_utf16(), + other.u16_iter(), + identity, + )) + } +} + +pub trait UpcaseOrd { + /// Performs a case-insensitive ordering based on the upcase table read from the filesystem. + /// + /// # Panics + /// + /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called on the passed [`Ntfs`] object. + fn upcase_cmp(&self, ntfs: &Ntfs, other: &Rhs) -> Ordering; +} + +impl<'a> UpcaseOrd> for NtfsString<'a> { + fn upcase_cmp(&self, ntfs: &Ntfs, other: &NtfsString<'a>) -> Ordering { + let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x); + NtfsString::cmp_iter(self.u16_iter(), other.u16_iter(), upcase_fn) + } +} + +impl<'a> UpcaseOrd<&str> for NtfsString<'a> { + fn upcase_cmp(&self, ntfs: &Ntfs, other: &&str) -> Ordering { + let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x); + NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), upcase_fn) + } +} + +impl<'a> UpcaseOrd> for &str { + fn upcase_cmp(&self, ntfs: &Ntfs, other: &NtfsString<'a>) -> Ordering { + let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x); + NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), upcase_fn) } } diff --git a/src/upcase_table.rs b/src/upcase_table.rs new file mode 100644 index 0000000..b612fa6 --- /dev/null +++ b/src/upcase_table.rs @@ -0,0 +1,103 @@ +// Copyright 2021 Colin Finck +// SPDX-License-Identifier: GPL-2.0-or-later + +use crate::attribute::NtfsAttributeType; +use crate::error::{NtfsError, Result}; +use crate::ntfs::Ntfs; +use crate::ntfs_file::KnownNtfsFile; +use crate::traits::NtfsReadSeek; +use binread::io::{Read, Seek}; +use core::convert::TryInto; +use core::mem; + +/// The Upcase Table contains an uppercase character for each Unicode character of the Basic Multilingual Plane. +const UPCASE_CHARACTER_COUNT: usize = 65536; + +/// Hence, the table has a size of 128 KiB. +const UPCASE_TABLE_SIZE: u64 = (UPCASE_CHARACTER_COUNT * mem::size_of::()) as u64; + +/// Manages a table for converting characters to uppercase. +/// This table is used for case-insensitive file name comparisons. +/// +/// NTFS stores such a table in the special $UpCase file on every filesystem. +/// As this table is slightly different depending on the Windows version used for creating the filesystem, +/// it is very important to always read the table from the filesystem itself. +/// Hence, this table is not hardcoded into the crate. +#[derive(Clone, Debug)] +pub(crate) struct UpcaseTable { + uppercase_characters: Vec, +} + +impl UpcaseTable { + /// Reads the $UpCase file from the given filesystem into a new [`UpcaseTable`] object. + pub(crate) fn read(ntfs: &Ntfs, fs: &mut T) -> Result + where + T: Read + Seek, + { + // Lookup the $UpCase file and its $DATA attribute. + let upcase_file = ntfs.ntfs_file(fs, KnownNtfsFile::UpCase as u64)?; + let data_attribute = upcase_file + .attributes() + .find(|attribute| { + // TODO: Replace by attribute.ty().contains() once https://github.com/rust-lang/rust/issues/62358 has landed. + attribute + .ty() + .map(|ty| ty == NtfsAttributeType::Data) + .unwrap_or(false) + }) + .ok_or(NtfsError::AttributeNotFound { + position: upcase_file.position(), + ty: NtfsAttributeType::VolumeName, + })?; + if data_attribute.value_length() != UPCASE_TABLE_SIZE { + return Err(NtfsError::InvalidUpcaseTableSize { + expected: UPCASE_TABLE_SIZE, + actual: data_attribute.value_length(), + }); + } + + // Read the entire raw data from the $DATA attribute. + let mut data_value = data_attribute.value()?; + let mut data = vec![0u8; UPCASE_TABLE_SIZE as usize]; + data_value.read_exact(fs, &mut data)?; + + // Store it in an array of `u16` uppercase characters. + // Any endianness conversion is done here once, which makes `u16_to_uppercase` fast. + let uppercase_characters = data + .chunks_exact(2) + .map(|two_bytes| u16::from_le_bytes(two_bytes.try_into().unwrap())) + .collect(); + + Ok(Self { + uppercase_characters, + }) + } + + /// Returns the uppercase variant of the given UCS-2 character (i.e. a Unicode character + /// from the Basic Multilingual Plane) based on the stored conversion table. + /// A character without an uppercase equivalent is returned as-is. + pub(crate) fn u16_to_uppercase(&self, character: u16) -> u16 { + self.uppercase_characters[character as usize] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_upcase_table() { + let mut testfs1 = crate::helpers::tests::testfs1(); + let ntfs = Ntfs::new(&mut testfs1).unwrap(); + let upcase_table = UpcaseTable::read(&ntfs, &mut testfs1).unwrap(); + + // Prove that at least the lowercase English characters are mapped to their uppercase equivalents. + // It makes no sense to check everything here. + for (lowercase, uppercase) in (b'a'..=b'z').zip(b'A'..=b'Z') { + assert_eq!( + upcase_table.u16_to_uppercase(lowercase as u16), + uppercase as u16 + ); + } + } +} -- cgit v1.2.3