diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
index bcd9e092a310f..261f77851aa14 100644
--- a/library/alloc/src/lib.rs
+++ b/library/alloc/src/lib.rs
@@ -147,6 +147,7 @@
 #![feature(slice_ptr_get)]
 #![feature(slice_range)]
 #![feature(std_internals)]
+#![feature(str_internals)]
 #![feature(temporary_niche_types)]
 #![feature(titlecase)]
 #![feature(transmutability)]
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
index 8a3326c7d76a7..ac3f054900e83 100644
--- a/library/alloc/src/str.rs
+++ b/library/alloc/src/str.rs
@@ -415,13 +415,6 @@ impl str {
                 && !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars());
             if is_word_final { 'ς' } else { 'σ' }
         }
-
-        fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
-            match iter.skip_while(|&c| c.is_case_ignorable()).next() {
-                Some(c) => c.is_cased(),
-                None => false,
-            }
-        }
     }
 
     /// Returns the uppercase equivalent of this string slice, as a new [`String`].
@@ -481,7 +474,17 @@ impl str {
         }
         s
     }
+}
 
+#[cfg(not(no_global_oom_handling))]
+pub(crate) fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
+    match iter.skip_while(|&c| c.is_case_ignorable()).next() {
+        Some(c) => c.is_cased(),
+        None => false,
+    }
+}
+
+impl str {
     /// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
     ///
     /// # Examples
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
index 30e52f3e1be46..6d44dc22fda58 100644
--- a/library/alloc/src/string.rs
+++ b/library/alloc/src/string.rs
@@ -50,9 +50,9 @@ use core::iter::from_fn;
 use core::num::Saturating;
 #[cfg(not(no_global_oom_handling))]
 use core::ops::Add;
-#[cfg(not(no_global_oom_handling))]
-use core::ops::AddAssign;
 use core::ops::{self, Range, RangeBounds};
+#[cfg(not(no_global_oom_handling))]
+use core::ops::{AddAssign, ControlFlow};
 use core::str::pattern::{Pattern, Utf8Pattern};
 use core::{fmt, hash, ptr, slice};
 
@@ -62,6 +62,8 @@ use crate::alloc::Allocator;
 use crate::borrow::{Cow, ToOwned};
 use crate::boxed::Box;
 use crate::collections::TryReserveError;
+#[cfg(not(no_global_oom_handling))]
+use crate::collections::VecDeque;
 use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut};
 #[cfg(not(no_global_oom_handling))]
 use crate::str::{FromStr, from_boxed_utf8_unchecked};
@@ -3604,3 +3606,248 @@ impl From<char> for String {
         c.to_string()
     }
 }
+
+// In place case changes
+
+#[cfg(not(no_global_oom_handling))]
+impl String {
+    fn case_change_while_ascii<const MAKE_UPPER: bool>(&mut self) -> ControlFlow<usize> {
+        // SAFETY: each ASCII byte is only replaced by another ASCII byte and non-ASCII bytes are
+        // left untouched, so the buffer remains valid UTF-8.
+        let bytes = unsafe { self.as_bytes_mut() };
+        bytes.iter_mut().enumerate().try_for_each(|(i, b)| {
+            if b.is_ascii() {
+                if MAKE_UPPER {
+                    b.make_ascii_uppercase();
+                } else {
+                    b.make_ascii_lowercase();
+                }
+                ControlFlow::Continue(())
+            } else {
+                ControlFlow::Break(i)
+            }
+        })
+    }
+
+    /// Converts this string to its uppercase equivalent in-place.
+    ///
+    /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
+    /// `Uppercase`.
+    ///
+    /// Since some characters can expand into multiple characters when changing
+    /// the case, this method may change the length of the string. If the string
+    /// shrinks, the excess capacity is not reclaimed.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("hello");
+    /// s.make_uppercase();
+    ///
+    /// assert_eq!("HELLO", s);
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut new_year = String::from("农历新年");
+    /// new_year.make_uppercase();
+    ///
+    /// assert_eq!("农历新年", new_year);
+    /// ```
+    ///
+    /// One character can become multiple:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("tschüß");
+    /// s.make_uppercase();
+    ///
+    /// assert_eq!("TSCHÜSS", s);
+    /// ```
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_uppercase(&mut self) {
+        let ControlFlow::Break(non_ascii_offset) = self.case_change_while_ascii::<true>() else {
+            return;
+        };
+        let mut wc = WriteChars::new(self, non_ascii_offset);
+        while let Some(l_c) = wc.pop() {
+            l_c.to_uppercase().for_each(|u_c| wc.write(u_c));
+        }
+        // SAFETY: At this point, none of the methods of wc panicked
+        unsafe {
+            wc.finalize();
+        }
+    }
+
+    /// Converts this string to its lowercase equivalent in-place.
+    ///
+    /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
+    /// `Lowercase`.
+    ///
+    /// Since some characters can expand into multiple characters when changing
+    /// the case, this method may change the length of the string. If the string
+    /// shrinks, the excess capacity is not reclaimed.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("HELLO");
+    /// s.make_lowercase();
+    ///
+    /// assert_eq!("hello", s);
+    /// ```
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ");
+    /// odysseus.make_lowercase();
+    ///
+    /// assert_eq!("ὀδυσσεύς", odysseus);
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut new_year = String::from("农历新年");
+    /// new_year.make_lowercase();
+    ///
+    /// assert_eq!("农历新年", new_year);
+    /// ```
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_lowercase(&mut self) {
+        // Streaming equivalent of `case_ignorable_then_cased` over the reversed prefix, as used
+        // by `str::to_lowercase`: case-ignorable characters are skipped even when they are also
+        // cased (e.g. U+02B0), so only the nearest non-ignorable character decides.
+        fn update_word_final(word_final_so_far: bool, u_c: char) -> bool {
+            if u_c.is_case_ignorable() { word_final_so_far } else { u_c.is_cased() }
+        }
+
+        let ControlFlow::Break(non_ascii_offset) = self.case_change_while_ascii::<false>() else {
+            return;
+        };
+        // We will only update the streaming word_final detection if the str contains sigma
+        // because it requires table lookups that we consider expensive.
+        let has_sigma = self[non_ascii_offset..].contains('Σ');
+        let mut word_final_so_far =
+            has_sigma && self[..non_ascii_offset].chars().fold(false, update_word_final);
+        let mut wc = WriteChars::new(self, non_ascii_offset);
+        while let Some(u_c) = wc.pop() {
+            if u_c == 'Σ' {
+                if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) {
+                    // actually word final
+                    wc.write('ς');
+                } else {
+                    wc.write('σ');
+                }
+            } else {
+                u_c.to_lowercase().for_each(|l_c| wc.write(l_c));
+            }
+            if has_sigma {
+                word_final_so_far = update_word_final(word_final_so_far, u_c);
+            }
+        }
+        // SAFETY: At this point, none of the methods of wc panicked
+        unsafe {
+            wc.finalize();
+        }
+    }
+}
+
+/// A helper for in place modification of strings, where we gradually "pop" characters,
+/// thereby making room to write back to the string buffer
+#[cfg(not(no_global_oom_handling))]
+#[unstable(issue = "none", feature = "std_internals")]
+struct WriteChars<'a> {
+    // This is the internal buffer of the string temporarily changed to Vec<u8> because
+    // it will contain non utf8 bytes.
+    // invariant: self.v.len() == original string until finalize is run
+    v: Vec<u8>,
+    // A reference kept to restore the string at the end
+    // (ie finalize time)
+    s: &'a mut String,
+    // invariant: write_offset <= read_offset
+    write_offset: usize,
+    // invariant: self.read_offset <= self.v.len()
+    // before finalize
+    read_offset: usize,
+    buffer: VecDeque<u8>,
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[unstable(issue = "none", feature = "std_internals")]
+impl<'a> WriteChars<'a> {
+    fn new(s: &'a mut String, offset: usize) -> Self {
+        let v = core::mem::take(s).into_bytes();
+        WriteChars { s, v, write_offset: offset, read_offset: offset, buffer: VecDeque::new() }
+    }
+
+    fn rest(&self) -> &str {
+        // SAFETY: read_offset is always ok to read from
+        unsafe { str::from_utf8_unchecked(&self.v[self.read_offset..]) }
+    }
+
+    fn pop(&mut self) -> Option<char> {
+        // SAFETY: The bytes from read_offset are valid UTF8
+        let (code_point, width) = unsafe {
+            core::str::next_code_point_with_width(&mut self.v[self.read_offset..].iter())?
+        };
+        self.read_offset += width;
+        // Dump what is buffered in the newly freed space
+        while self.write_offset < self.read_offset
+            && let Some(b) = self.buffer.pop_front()
+        {
+            self.v[self.write_offset] = b;
+            self.write_offset += 1;
+        }
+        // SAFETY: The code point is valid
+        let c = unsafe { char::from_u32_unchecked(code_point) };
+        Some(c)
+    }
+
+    fn write(&mut self, c: char) {
+        let writable_slice = &mut self.v[self.write_offset..self.read_offset];
+        let mut buffer = [0u8; 4];
+        let len = c.encode_utf8(&mut buffer).len();
+        let direct_copy_length = core::cmp::min(len, writable_slice.len());
+        writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]);
+        self.write_offset += direct_copy_length;
+        self.buffer.extend(&buffer[direct_copy_length..len]);
+    }
+
+    // Set the proper length of the string's storage
+    // or grow it to add what is still in the buffer.
+    /// Finalize should be run for the modifications to be actually written back to the string
+    /// # Safety
+    /// Must not be called if one of the previous method calls of self panicked because the buffer
+    /// may contain invalid utf8
+    unsafe fn finalize(mut self) {
+        if self.buffer.is_empty() {
+            // SAFETY: if the queue is empty, then at most as many bytes as in the
+            // original were written (write_offset <= len), so we can simply shrink
+            unsafe {
+                self.v.set_len(self.write_offset);
+            }
+        } else {
+            let (q1, q2) = self.buffer.as_slices();
+            self.v.extend_from_slice(q1);
+            self.v.extend_from_slice(q2);
+        };
+        // SAFETY: this is valid utf8
+        *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) }
+    }
+}
diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs
index 699a5010282b0..1202df63f88c2 100644
--- a/library/alloctests/tests/lib.rs
+++ b/library/alloctests/tests/lib.rs
@@ -44,6 +44,7 @@
 #![allow(internal_features)]
 #![deny(fuzzy_provenance_casts)]
 #![deny(unsafe_op_in_unsafe_fn)]
+#![feature(string_make_uplowercase)]
 
 extern crate alloc;
 
diff --git a/library/alloctests/tests/string.rs b/library/alloctests/tests/string.rs
index 08eb1855a4824..540e447665b25 100644
--- a/library/alloctests/tests/string.rs
+++ b/library/alloctests/tests/string.rs
@@ -956,3 +956,86 @@ fn test_str_concat() {
     let s: String = format!("{a}{b}");
     assert_eq!(s.as_bytes()[9], 'd' as u8);
 }
+
+#[test]
+fn make_uppercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_uppercase();
+        let mut tested = s.to_owned();
+        tested.make_uppercase();
+        assert!(
+            tested == ground_truth,
+            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("abcde");
+    // 4 to 9 bytes
+    test("ǰΐ");
+    // 10*3 to 10*2 bytes
+    test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
+    test("aéDžßfiᾀ");
+}
+
+#[test]
+fn make_lowercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_lowercase();
+        let mut tested = s.to_owned();
+        tested.make_lowercase();
+        assert!(
+            tested == ground_truth,
+            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("AÉDžaé ");
+
+    // https://github.com/rust-lang/rust/issues/26035
+    test("ΑΣ");
+    test("Α'Σ");
+    test("Α''Σ");
+
+    test("ΑΣ Α");
+    test("Α'Σ Α");
+    test("Α''Σ Α");
+
+    test("ΑΣ' Α");
+    test("ΑΣ'' Α");
+
+    test("Α'Σ' Α");
+    test("Α''Σ'' Α");
+
+    test("Α Σ");
+    test("Α 'Σ");
+    test("Α ''Σ");
+
+    test("Σ");
+    test("'Σ");
+    test("''Σ");
+
+    test("ΑΣΑ");
+    test("ΑΣ'Α");
+    test("ΑΣ''Α");
+
+    // ʰ (U+02B0) is both cased and case-ignorable; like `to_lowercase`, it must be skipped
+    test("ʰΣ");
+
+    // https://github.com/rust-lang/rust/issues/124714
+    // input lengths around the boundary of the chunk size used by the ascii prefix optimization
+    test("abcdefghijklmnoΣ");
+    test("abcdefghijklmnopΣ");
+    test("abcdefghijklmnopqΣ");
+
+    // a really long string that has its lowercase form
+    // even longer. this tests that implementations don't assume
+    // an incorrect upper bound on allocations
+    let upper = str::repeat("İ", 512);
+    test(&upper);
+
+    // a really long ascii-only string.
+    // This tests that the ascii hot-path
+    // functions correctly
+    let upper = str::repeat("A", 511);
+    test(&upper);
+}
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
index 0d52bfb8c9aa4..d63d944a2a6de 100644
--- a/library/core/src/str/mod.rs
+++ b/library/core/src/str/mod.rs
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use traits::FromStr;
 #[unstable(feature = "str_internals", issue = "none")]
-pub use validations::{next_code_point, utf8_char_width};
+pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
 
 #[inline(never)]
 #[cold]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
index b54d6478e584d..972640687c8d8 100644
--- a/library/core/src/str/validations.rs
+++ b/library/core/src/str/validations.rs
@@ -25,18 +25,20 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
 }
 
 /// Reads the next code point out of a byte iterator (assuming a
-/// UTF-8-like encoding).
+/// UTF-8-like encoding) and returns it along with its width.
 ///
 /// # Safety
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>(
+    bytes: &mut I,
+) -> Option<(u32, usize)> {
     // Decode UTF-8
     let x = *bytes.next()?;
     if x < 128 {
-        return Some(x as u32);
+        return Some((x as u32, 1));
     }
 
     // Multibyte case follows
@@ -46,6 +48,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
     // SAFETY: `bytes` produces an UTF-8-like string,
     // so the iterator must produce a value here.
     let y = unsafe { *bytes.next().unwrap_unchecked() };
+    let mut width = 2;
     let mut ch = utf8_acc_cont_byte(init, y);
     if x >= 0xE0 {
         // [[x y z] w] case
@@ -53,6 +56,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
         // SAFETY: `bytes` produces an UTF-8-like string,
         // so the iterator must produce a value here.
         let z = unsafe { *bytes.next().unwrap_unchecked() };
+        width = 3;
         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
         ch = init << 12 | y_z;
         if x >= 0xF0 {
@@ -61,11 +65,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
             // SAFETY: `bytes` produces an UTF-8-like string,
             // so the iterator must produce a value here.
             let w = unsafe { *bytes.next().unwrap_unchecked() };
+            width = 4;
             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
         }
     }
 
-    Some(ch)
+    Some((ch, width))
+}
+
+/// Reads the next code point out of a byte iterator (assuming a
+/// UTF-8-like encoding).
+///
+/// # Safety
+///
+/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
+#[unstable(feature = "str_internals", issue = "none")]
+#[inline]
+pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+    // SAFETY: same call condition
+    Some(unsafe { next_code_point_with_width(bytes) }?.0)
 }
 
 /// Reads the last code point out of a byte iterator (assuming a