Implement finding files by name in a file name index.

This introduces parsing the $UpCase file to perform case-insensitive searches.
author: Colin Finck <colin@reactos.org> 2021-08-02 00:18:14 +0300
committer: Colin Finck <colin@reactos.org> 2021-08-02 00:18:14 +0300
commit: 6210c625de3f617b0553ae8176f2483133517161 (patch)
tree: 09ed0a59abd45e1c5691c652011baf2733184e8e
parent: 943309f22ed6e208b7dcc1b24ae8879eb88f8a39 (diff)
7 files changed, 323 insertions, 13 deletions
diff --git a/src/error.rs b/src/error.rs
index 89e8d7b..85b6d3e 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -118,6 +118,8 @@ pub enum NtfsError {
         expected: &'static [u8],
         actual: [u8; 2],
     },
+    /// The Upcase Table should have a size of {expected} bytes, but it has {actual} bytes
+    InvalidUpcaseTableSize { expected: u64, actual: u64 },
     /// The VCN {vcn} read from the NTFS data run header at byte position {position:#010x} cannot be added to the LCN {previous_lcn} calculated from previous data runs
     InvalidVcnInDataRunHeader {
         position: u64,
diff --git a/src/index.rs b/src/index.rs
index e6abd7a..f9769d9 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -7,6 +7,7 @@ use crate::indexes::NtfsIndexEntryType;
 use crate::structured_values::{NtfsIndexAllocation, NtfsIndexRoot};
 use alloc::vec::Vec;
 use binread::io::{Read, Seek};
+use core::cmp::Ordering;
 use core::marker::PhantomData;
 
 pub struct NtfsIndex<'n, 'f, E>
@@ -41,6 +42,12 @@ where
         })
     }
 
+    /// Returns an [`NtfsIndexFinder`] structure to efficiently find an entry in this index.
+    pub fn finder<'i>(&'i self) -> NtfsIndexFinder<'n, 'f, 'i, E> {
+        NtfsIndexFinder::new(self)
+    }
+
+    /// Returns an [`NtfsIndexEntries`] iterator to perform an in-order traversal of this index.
     pub fn iter<'i>(&'i self) -> NtfsIndexEntries<'n, 'f, 'i, E> {
         NtfsIndexEntries::new(self)
     }
@@ -152,3 +159,91 @@ where
         Some(Ok(entry))
     }
 }
+
+/// Helper structure to efficiently find an entry in an index, created by [`NtfsIndex::finder`].
+///
+/// This helper is required, because the returned entry borrows from the iterator it was created from.
+/// The idea is that you copy the field(s) you need from the returned entry and then drop the entry and the finder.
+pub struct NtfsIndexFinder<'n, 'f, 'i, E>
+where
+    E: NtfsIndexEntryType,
+{
+    index: &'i NtfsIndex<'n, 'f, E>,
+    inner_iterator: IndexNodeEntryRanges<E>,
+}
+
+impl<'n, 'f, 'i, E> NtfsIndexFinder<'n, 'f, 'i, E>
+where
+    E: NtfsIndexEntryType,
+{
+    fn new(index: &'i NtfsIndex<'n, 'f, E>) -> Self {
+        // This is superfluous and done again in `find`, but doesn't justify using an `Option` here.
+        let inner_iterator = index.index_root.entry_ranges();
+
+        Self {
+            index,
+            inner_iterator,
+        }
+    }
+
+    /// Finds an entry in this index using the given comparison function and returns an [`NtfsIndexEntry`]
+    /// (if there is one).
+    pub fn find<'a, T, F>(&'a mut self, fs: &mut T, cmp: F) -> Option<Result<NtfsIndexEntry<'a, E>>>
+    where
+        T: Read + Seek,
+        F: Fn(&E::KeyType) -> Ordering,
+    {
+        // Always (re)start by iterating through the Index Root entry ranges.
+        self.inner_iterator = self.index.index_root.entry_ranges();
+
+        loop {
+            // Get the next entry.
+            //
+            // A textbook B-tree search algorithm would get the middle entry and perform binary search.
+            // But we can't do that here, as we are dealing with variable-length entries.
+            let entry_range = self.inner_iterator.next()?;
+            let entry = entry_range.to_entry(self.inner_iterator.data());
+
+            // Check if this entry has a key.
+            if let Some(key) = entry.key() {
+                // The entry has a key, so compare it using the given function.
+                let key = iter_try!(key);
+
+                match cmp(&key) {
+                    Ordering::Equal => {
+                        // We found what we were looking for!
+                        // Recreate `entry` from the last `self.inner_iterator` to please the borrow checker.
+                        let entry = entry_range.to_entry(self.inner_iterator.data());
+                        return Some(Ok(entry));
+                    }
+                    Ordering::Less => {
+                        // What we are looking for comes BEFORE this entry.
+                        // Hence, it must be in a subnode of this entry and we continue below.
+                    }
+                    Ordering::Greater => {
+                        // What we are looking for comes AFTER this entry.
+                        // Keep searching on the same subnode level.
+                        continue;
+                    }
+                }
+            }
+
+            // Either this entry has no key (= is the last one on this subnode level) or
+            // it comes lexicographically AFTER what we're looking for.
+            // In both cases, we have to continue iterating in the subnode of this entry (if there is any).
+            let subnode_vcn = entry.subnode_vcn()?;
+            let index_allocation =
+                iter_try!(self.index.index_allocation.as_ref().ok_or_else(|| {
+                    NtfsError::MissingIndexAllocation {
+                        position: self.index.index_root.position(),
+                    }
+                }));
+            let subnode = iter_try!(index_allocation.record_from_vcn(
+                fs,
+                &self.index.index_root,
+                subnode_vcn
+            ));
+            self.inner_iterator = subnode.into_entry_ranges();
+        }
+    }
+}
diff --git a/src/indexes/file_name.rs b/src/indexes/file_name.rs
index 069836a..5f77bc7 100644
--- a/src/indexes/file_name.rs
+++ b/src/indexes/file_name.rs
@@ -1,12 +1,41 @@
 // Copyright 2021 Colin Finck <colin@reactos.org>
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+use crate::error::Result;
+use crate::index::NtfsIndexFinder;
+use crate::index_entry::NtfsIndexEntry;
 use crate::indexes::{NtfsIndexEntryHasFileReference, NtfsIndexEntryType};
+use crate::ntfs::Ntfs;
+use crate::string::UpcaseOrd;
 use crate::structured_values::NtfsFileName;
+use binread::io::{Read, Seek};
 
+/// Defines the [`NtfsIndexEntryType`] for filename indexes (more commonly known as "directories").
 #[derive(Debug)]
 pub struct NtfsFileNameIndex {}
 
+impl NtfsFileNameIndex {
+    /// Finds a file in a filename index by name and returns the [`NtfsIndexEntry`] (if any).
+    /// The name is compared case-insensitively based on the filesystem's $UpCase table.
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called on the passed [`Ntfs`] object.
+    pub fn find<'a, 'n, T>(
+        index_finder: &'a mut NtfsIndexFinder<Self>,
+        ntfs: &'n Ntfs,
+        fs: &mut T,
+        name: &str,
+    ) -> Option<Result<NtfsIndexEntry<'a, Self>>>
+    where
+        T: Read + Seek,
+    {
+        // TODO: This always performs a case-insensitive comparison.
+        // There are some corner cases where NTFS uses case-sensitive filenames. These need to be considered!
+        index_finder.find(fs, |file_name| name.upcase_cmp(ntfs, &file_name.name()))
+    }
+}
+
 impl NtfsIndexEntryType for NtfsFileNameIndex {
     type KeyType = NtfsFileName;
 }
diff --git a/src/lib.rs b/src/lib.rs
index 6215182..70723fb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,6 +27,7 @@ pub mod structured_values;
 mod time;
 mod traits;
 mod types;
+mod upcase_table;
 
 pub use crate::attribute::*;
 pub use crate::attribute_value::*;
diff --git a/src/ntfs.rs b/src/ntfs.rs
index aeb6dd8..ee2f7fc 100644
--- a/src/ntfs.rs
+++ b/src/ntfs.rs
@@ -6,6 +6,7 @@ use crate::boot_sector::BootSector;
 use crate::error::{NtfsError, Result};
 use crate::ntfs_file::{KnownNtfsFile, NtfsFile};
 use crate::structured_values::{NtfsVolumeInformation, NtfsVolumeName};
+use crate::upcase_table::UpcaseTable;
 use binread::io::{Read, Seek, SeekFrom};
 use binread::BinReaderExt;
 
@@ -23,6 +24,8 @@ pub struct Ntfs {
     file_record_size: u32,
     /// Serial number of the NTFS volume.
     serial_number: u64,
+    /// Table of Unicode uppercase characters (only required for case-insensitive comparisons).
+    upcase_table: Option<UpcaseTable>,
 }
 
 impl Ntfs {
@@ -42,6 +45,7 @@ impl Ntfs {
         let mft_position = 0;
         let file_record_size = bpb.file_record_size()?;
         let serial_number = bpb.serial_number();
+        let upcase_table = None;
 
         let mut ntfs = Self {
             cluster_size,
@@ -50,6 +54,7 @@ impl Ntfs {
             mft_position,
             file_record_size,
             serial_number,
+            upcase_table,
         };
         ntfs.mft_position = bpb.mft_lcn().position(&ntfs)?;
 
@@ -88,6 +93,19 @@ impl Ntfs {
         NtfsFile::new(&self, fs, position)
     }
 
+    /// Reads the $UpCase file from the filesystem and stores it in this [`Ntfs`] object.
+    ///
+    /// This function only needs to be called if case-insensitive comparisons are later performed
+    /// (i.e. finding files).
+    pub fn read_upcase_table<T>(&mut self, fs: &mut T) -> Result<()>
+    where
+        T: Read + Seek,
+    {
+        let upcase_table = UpcaseTable::read(self, fs)?;
+        self.upcase_table = Some(upcase_table);
+        Ok(())
+    }
+
     /// Returns the root [`Dir`] of this NTFS volume.
     pub fn root_dir(&self) -> ! {
         panic!("TODO")
@@ -108,6 +126,17 @@ impl Ntfs {
         self.size
     }
 
+    /// Returns the stored [`UpcaseTable`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called.
+    pub(crate) fn upcase_table(&self) -> &UpcaseTable {
+        self.upcase_table
+            .as_ref()
+            .expect("You need to call read_upcase_table first")
+    }
+
     /// Returns an [`NtfsVolumeInformation`] containing general information about
     /// the volume, like the NTFS version.
     pub fn volume_info<T>(&self, fs: &mut T) -> Result<NtfsVolumeInformation>
diff --git a/src/string.rs b/src/string.rs
index ce0109b..2d6d4a9 100644
--- a/src/string.rs
+++ b/src/string.rs
@@ -1,10 +1,11 @@
 // Copyright 2021 Colin Finck <colin@reactos.org>
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+use crate::ntfs::Ntfs;
 use alloc::string::String;
 use core::char;
 use core::cmp::Ordering;
-use core::convert::TryInto;
+use core::convert::{identity, TryInto};
 use core::fmt;
 
 /// Zero-copy representation of a string stored in an NTFS filesystem structure.
@@ -12,17 +13,21 @@ use core::fmt;
 pub struct NtfsString<'a>(pub &'a [u8]);
 
 impl<'a> NtfsString<'a> {
-    fn cmp_iter<TI, OI>(mut this_iter: TI, mut other_iter: OI) -> Ordering
+    fn cmp_iter<TI, OI, F>(mut this_iter: TI, mut other_iter: OI, code_unit_fn: F) -> Ordering
     where
         TI: Iterator<Item = u16>,
         OI: Iterator<Item = u16>,
+        F: Fn(u16) -> u16,
     {
         loop {
             match (this_iter.next(), other_iter.next()) {
                 (Some(this_code_unit), Some(other_code_unit)) => {
                     // We have two UTF-16 code units to compare.
-                    if this_code_unit != other_code_unit {
-                        return this_code_unit.cmp(&other_code_unit);
+                    let this_upper = code_unit_fn(this_code_unit);
+                    let other_upper = code_unit_fn(other_code_unit);
+
+                    if this_upper != other_upper {
+                        return this_upper.cmp(&other_upper);
                     }
                 }
                 (Some(_), None) => {
@@ -91,7 +96,7 @@ impl<'a> fmt::Display for NtfsString<'a> {
 
 impl<'a> Ord for NtfsString<'a> {
     fn cmp(&self, other: &Self) -> Ordering {
-        NtfsString::cmp_iter(self.u16_iter(), other.u16_iter())
+        NtfsString::cmp_iter(self.u16_iter(), other.u16_iter(), identity)
     }
 }
 
@@ -104,25 +109,25 @@ impl<'a> PartialEq for NtfsString<'a> {
 
 impl<'a> PartialEq<str> for NtfsString<'a> {
     fn eq(&self, other: &str) -> bool {
-        NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()) == Ordering::Equal
+        NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), identity) == Ordering::Equal
     }
 }
 
 impl<'a> PartialEq<NtfsString<'a>> for str {
     fn eq(&self, other: &NtfsString<'a>) -> bool {
-        NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()) == Ordering::Equal
+        NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), identity) == Ordering::Equal
     }
 }
 
 impl<'a> PartialEq<&str> for NtfsString<'a> {
     fn eq(&self, other: &&str) -> bool {
-        NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()) == Ordering::Equal
+        NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), identity) == Ordering::Equal
     }
 }
 
 impl<'a> PartialEq<NtfsString<'a>> for &str {
     fn eq(&self, other: &NtfsString<'a>) -> bool {
-        NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()) == Ordering::Equal
+        NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), identity) == Ordering::Equal
     }
 }
 
@@ -134,24 +139,70 @@ impl<'a> PartialOrd for NtfsString<'a> {
 
 impl<'a> PartialOrd<str> for NtfsString<'a> {
     fn partial_cmp(&self, other: &str) -> Option<Ordering> {
-        Some(NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()))
+        Some(NtfsString::cmp_iter(
+            self.u16_iter(),
+            other.encode_utf16(),
+            identity,
+        ))
     }
 }
 
 impl<'a> PartialOrd<NtfsString<'a>> for str {
     fn partial_cmp(&self, other: &NtfsString<'a>) -> Option<Ordering> {
-        Some(NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()))
+        Some(NtfsString::cmp_iter(
+            self.encode_utf16(),
+            other.u16_iter(),
+            identity,
+        ))
     }
 }
 
 impl<'a> PartialOrd<&str> for NtfsString<'a> {
     fn partial_cmp(&self, other: &&str) -> Option<Ordering> {
-        Some(NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16()))
+        Some(NtfsString::cmp_iter(
+            self.u16_iter(),
+            other.encode_utf16(),
+            identity,
+        ))
     }
 }
 
 impl<'a> PartialOrd<NtfsString<'a>> for &str {
     fn partial_cmp(&self, other: &NtfsString<'a>) -> Option<Ordering> {
-        Some(NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter()))
+        Some(NtfsString::cmp_iter(
+            self.encode_utf16(),
+            other.u16_iter(),
+            identity,
+        ))
+    }
+}
+
+pub trait UpcaseOrd<Rhs> {
+    /// Performs a case-insensitive ordering based on the upcase table read from the filesystem.
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`read_upcase_table`][Ntfs::read_upcase_table] had not been called on the passed [`Ntfs`] object.
+    fn upcase_cmp(&self, ntfs: &Ntfs, other: &Rhs) -> Ordering;
+}
+
+impl<'a> UpcaseOrd<NtfsString<'a>> for NtfsString<'a> {
+    fn upcase_cmp(&self, ntfs: &Ntfs, other: &NtfsString<'a>) -> Ordering {
+        let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x);
+        NtfsString::cmp_iter(self.u16_iter(), other.u16_iter(), upcase_fn)
+    }
+}
+
+impl<'a> UpcaseOrd<&str> for NtfsString<'a> {
+    fn upcase_cmp(&self, ntfs: &Ntfs, other: &&str) -> Ordering {
+        let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x);
+        NtfsString::cmp_iter(self.u16_iter(), other.encode_utf16(), upcase_fn)
+    }
+}
+
+impl<'a> UpcaseOrd<NtfsString<'a>> for &str {
+    fn upcase_cmp(&self, ntfs: &Ntfs, other: &NtfsString<'a>) -> Ordering {
+        let upcase_fn = |x| ntfs.upcase_table().u16_to_uppercase(x);
+        NtfsString::cmp_iter(self.encode_utf16(), other.u16_iter(), upcase_fn)
     }
 }
diff --git a/src/upcase_table.rs b/src/upcase_table.rs
new file mode 100644
index 0000000..b612fa6
--- /dev/null
+++ b/src/upcase_table.rs
@@ -0,0 +1,103 @@
+// Copyright 2021 Colin Finck <colin@reactos.org>
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+use crate::attribute::NtfsAttributeType;
+use crate::error::{NtfsError, Result};
+use crate::ntfs::Ntfs;
+use crate::ntfs_file::KnownNtfsFile;
+use crate::traits::NtfsReadSeek;
+use binread::io::{Read, Seek};
+use core::convert::TryInto;
+use core::mem;
+
+/// The Upcase Table contains an uppercase character for each Unicode character of the Basic Multilingual Plane.
+const UPCASE_CHARACTER_COUNT: usize = 65536;
+
+/// Hence, the table has a size of 128 KiB.
+const UPCASE_TABLE_SIZE: u64 = (UPCASE_CHARACTER_COUNT * mem::size_of::<u16>()) as u64;
+
+/// Manages a table for converting characters to uppercase.
+/// This table is used for case-insensitive file name comparisons.
+///
+/// NTFS stores such a table in the special $UpCase file on every filesystem.
+/// As this table is slightly different depending on the Windows version used for creating the filesystem,
+/// it is very important to always read the table from the filesystem itself.
+/// Hence, this table is not hardcoded into the crate.
+#[derive(Clone, Debug)]
+pub(crate) struct UpcaseTable {
+    uppercase_characters: Vec<u16>,
+}
+
+impl UpcaseTable {
+    /// Reads the $UpCase file from the given filesystem into a new [`UpcaseTable`] object.
+    pub(crate) fn read<T>(ntfs: &Ntfs, fs: &mut T) -> Result<Self>
+    where
+        T: Read + Seek,
+    {
+        // Lookup the $UpCase file and its $DATA attribute.
+        let upcase_file = ntfs.ntfs_file(fs, KnownNtfsFile::UpCase as u64)?;
+        let data_attribute = upcase_file
+            .attributes()
+            .find(|attribute| {
+                // TODO: Replace by attribute.ty().contains() once https://github.com/rust-lang/rust/issues/62358 has landed.
+                attribute
+                    .ty()
+                    .map(|ty| ty == NtfsAttributeType::Data)
+                    .unwrap_or(false)
+            })
+            .ok_or(NtfsError::AttributeNotFound {
+                position: upcase_file.position(),
+                ty: NtfsAttributeType::VolumeName,
+            })?;
+        if data_attribute.value_length() != UPCASE_TABLE_SIZE {
+            return Err(NtfsError::InvalidUpcaseTableSize {
+                expected: UPCASE_TABLE_SIZE,
+                actual: data_attribute.value_length(),
+            });
+        }
+
+        // Read the entire raw data from the $DATA attribute.
+        let mut data_value = data_attribute.value()?;
+        let mut data = vec![0u8; UPCASE_TABLE_SIZE as usize];
+        data_value.read_exact(fs, &mut data)?;
+
+        // Store it in an array of `u16` uppercase characters.
+        // Any endianness conversion is done here once, which makes `u16_to_uppercase` fast.
+        let uppercase_characters = data
+            .chunks_exact(2)
+            .map(|two_bytes| u16::from_le_bytes(two_bytes.try_into().unwrap()))
+            .collect();
+
+        Ok(Self {
+            uppercase_characters,
+        })
+    }
+
+    /// Returns the uppercase variant of the given UCS-2 character (i.e. a Unicode character
+    /// from the Basic Multilingual Plane) based on the stored conversion table.
+    /// A character without an uppercase equivalent is returned as-is.
+    pub(crate) fn u16_to_uppercase(&self, character: u16) -> u16 {
+        self.uppercase_characters[character as usize]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_upcase_table() {
+        let mut testfs1 = crate::helpers::tests::testfs1();
+        let ntfs = Ntfs::new(&mut testfs1).unwrap();
+        let upcase_table = UpcaseTable::read(&ntfs, &mut testfs1).unwrap();
+
+        // Prove that at least the lowercase English characters are mapped to their uppercase equivalents.
+        // It makes no sense to check everything here.
+        for (lowercase, uppercase) in (b'a'..=b'z').zip(b'A'..=b'Z') {
+            assert_eq!(
+                upcase_table.u16_to_uppercase(lowercase as u16),
+                uppercase as u16
+            );
+        }
+    }
+}
author	Colin Finck <colin@reactos.org>	2021-08-02 00:18:14 +0300
committer	Colin Finck <colin@reactos.org>	2021-08-02 00:18:14 +0300
commit	6210c625de3f617b0553ae8176f2483133517161 (patch)
tree	09ed0a59abd45e1c5691c652011baf2733184e8e
parent	943309f22ed6e208b7dcc1b24ae8879eb88f8a39 (diff)