From e75e4778c7b2d7b483e25a40318f2037e83a9226 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 01:11:13 +0100 Subject: [PATCH 1/6] Add String::make_(lower|upper)case APIs --- library/alloc/src/lib.rs | 1 + library/alloc/src/str.rs | 16 ++- library/alloc/src/string.rs | 202 +++++++++++++++++++++++++++- library/alloctests/tests/lib.rs | 1 + library/alloctests/tests/string.rs | 80 +++++++++++ library/core/src/str/mod.rs | 2 +- library/core/src/str/validations.rs | 26 +++- 7 files changed, 315 insertions(+), 13 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index bcd9e092a310f..261f77851aa14 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -147,6 +147,7 @@ #![feature(slice_ptr_get)] #![feature(slice_range)] #![feature(std_internals)] +#![feature(str_internals)] #![feature(temporary_niche_types)] #![feature(titlecase)] #![feature(transmutability)] diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 8a3326c7d76a7..a7690df779bae 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -415,13 +415,6 @@ impl str { && !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars()); if is_word_final { 'ς' } else { 'σ' } } - - fn case_ignorable_then_cased>(iter: I) -> bool { - match iter.skip_while(|&c| c.is_case_ignorable()).next() { - Some(c) => c.is_cased(), - None => false, - } - } } /// Returns the uppercase equivalent of this string slice, as a new [`String`]. @@ -481,7 +474,16 @@ impl str { } s } +} + +pub(crate) fn case_ignorable_then_cased>(iter: I) -> bool { + match iter.skip_while(|&c| c.is_case_ignorable()).next() { + Some(c) => c.is_cased(), + None => false, + } +} +impl str { /// Converts a [`Box`] into a [`String`] without copying or allocating. 
/// /// # Examples diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 30e52f3e1be46..5ebb193d99979 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -61,7 +61,7 @@ use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; -use crate::collections::TryReserveError; +use crate::collections::{TryReserveError, VecDeque}; use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut}; #[cfg(not(no_global_oom_handling))] use crate::str::{FromStr, from_boxed_utf8_unchecked}; @@ -3604,3 +3604,203 @@ impl From for String { c.to_string() } } + +// In place case changes + +impl String { + /// Converts this string to its uppercase equivalent in-place. + /// + /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property + /// `Uppercase`. + /// + /// Since some characters can expand into multiple characters when changing + /// the case, this method may change the length of the string. If the string + /// shrinks, the excess capacity is not reclaimed. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut s = String::from("hello"); + /// s.make_uppercase(); + /// + /// assert_eq!("HELLO", s); + /// ``` + /// + /// Scripts without case are not changed: + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut new_year = String::from("农历新年"); + /// new_year.make_uppercase(); + /// + /// assert_eq!("农历新年", new_year); + /// ``` + /// + /// One character can become multiple: + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut s = String::from("tschüß"); + /// s.make_uppercase(); + /// + /// assert_eq!("TSCHÜSS", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "string_make_uplowercase", issue = "135885")] + pub fn make_uppercase(&mut self) { + let mut wc = WriteChars::new(self); + while let Some(l_c) = wc.pop() { + l_c.to_uppercase().for_each(|u_c| wc.write(u_c)); + } + } + + /// Converts this string to its lowercase equivalent in-place. + /// + /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property + /// `Lowercase`. + /// + /// Since some characters can expand into multiple characters when changing + /// the case, this method may change the length of the string. If the string + /// shrinks, the excess capacity is not reclaimed. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut s = String::from("HELLO"); + /// s.make_lowercase(); + /// + /// assert_eq!("hello", s); + /// ``` + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ"); + /// odysseus.make_lowercase(); + /// + /// assert_eq!("ὀδυσσεύς", odysseus); + /// ``` + /// + /// Languages without case are not changed: + /// + /// ``` + /// #![feature(string_make_uplowercase)] + /// + /// let mut new_year = String::from("农历新年"); + /// new_year.make_lowercase(); + /// + /// assert_eq!("农历新年", new_year); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "string_make_uplowercase", issue = "135885")] + pub fn make_lowercase(&mut self) { + let mut wc = WriteChars::new(self); + // This is unfortunately paid whether or not you have sigmas in the str + // but it is kind of mandatory because as we are overwriting the source bytes + // we have to compute this information as we go. + let mut word_final_so_far = false; + while let Some(u_c) = wc.pop() { + if u_c == 'Σ' { + if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) { + // actually word final + wc.write('ς'); + } else { + wc.write('σ'); + } + } else { + u_c.to_lowercase().for_each(|l_c| wc.write(l_c)); + } + word_final_so_far = u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()); + } + } +} + +/// A helper for in place modification of strings, where we gradually "pop" characters, +/// hereby making room to write back to the string buffer +#[unstable(issue = "none", feature = "std_internals")] +struct WriteChars<'a> { + // This is the internal buffer of the string temporarily changed to Vec because + // it will contain non utf8 bytes. 
+ // invariant: self.v.len() == original string until drop is run + v: Vec, + // A reference kept to restore the string at the end + // (ie drop time) + s: &'a mut String, + // invariant: write_offset <= read_offset + write_offset: usize, + // invariant: self.read_offset <= self.v.len() + // before the Drop + read_offset: usize, + buffer: VecDeque, +} + +impl<'a> Drop for WriteChars<'a> { + // Set the proper length of the string's storage + // or grow it to add what is still in the buffer. + fn drop(&mut self) { + if self.buffer.is_empty() { + // SAFETY: if the queue is empty, then + // there were less bytes than in the original so we can simply shrink + unsafe { + self.v.set_len(self.write_offset); + } + } else { + let (q1, q2) = self.buffer.as_slices(); + self.v.extend_from_slice(q1); + self.v.extend_from_slice(q2); + }; + // SAFETY: this is valid utf8 + *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) } + } +} + +#[unstable(issue = "none", feature = "std_internals")] +impl<'a> WriteChars<'a> { + fn new(s: &'a mut String) -> Self { + let v = core::mem::take(s).into_bytes(); + WriteChars { s, v, write_offset: 0, read_offset: 0, buffer: VecDeque::new() } + } + + fn rest(&self) -> &str { + // SAFETY: read_offset is always ok to read from + unsafe { str::from_utf8_unchecked(&self.v[self.read_offset..]) } + } + + fn pop(&mut self) -> Option { + // SAFETY: The bytes from read_offset are valid UTF8 + let (code_point, width) = unsafe { + core::str::next_code_point_with_width(&mut self.v[self.read_offset..].iter())? 
+ }; + self.read_offset += width; + // Dump what is buffered in the newly freed space + while self.write_offset < self.read_offset + && let Some(b) = self.buffer.pop_front() + { + self.v[self.write_offset] = b; + self.write_offset += 1; + } + // SAFETY: The code point is valid + let c = unsafe { char::from_u32_unchecked(code_point) }; + Some(c) + } + + fn write(&mut self, c: char) { + let writable_slice = &mut self.v[self.write_offset..self.read_offset]; + let mut buffer = [0u8; 4]; + let len = c.encode_utf8(&mut buffer).len(); + let direct_copy_length = core::cmp::min(len, writable_slice.len()); + writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); + self.write_offset += direct_copy_length; + self.buffer.extend(&buffer[direct_copy_length..len]); + } +} diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs index 699a5010282b0..1202df63f88c2 100644 --- a/library/alloctests/tests/lib.rs +++ b/library/alloctests/tests/lib.rs @@ -44,6 +44,7 @@ #![allow(internal_features)] #![deny(fuzzy_provenance_casts)] #![deny(unsafe_op_in_unsafe_fn)] +#![feature(string_make_uplowercase)] extern crate alloc; diff --git a/library/alloctests/tests/string.rs b/library/alloctests/tests/string.rs index 08eb1855a4824..540e447665b25 100644 --- a/library/alloctests/tests/string.rs +++ b/library/alloctests/tests/string.rs @@ -956,3 +956,83 @@ fn test_str_concat() { let s: String = format!("{a}{b}"); assert_eq!(s.as_bytes()[9], 'd' as u8); } + +#[test] +fn make_uppercase() { + fn test(s: &str) { + let ground_truth = s.to_uppercase(); + let mut tested = s.to_owned(); + tested.make_uppercase(); + assert!( + tested == ground_truth, + r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("abcde"); + // 4 to 9 bytes + test("ǰΐ"); + // 10*3 to 10*2 bytes + test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ"); + test("aéDžßfiᾀ"); +} + +#[test] +fn make_lowercase() { + fn test(s: &str) { + let ground_truth = s.to_lowercase(); 
+ let mut tested = s.to_owned(); + tested.make_lowercase(); + assert!( + tested == ground_truth, + r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("AÉDžaé "); + + // https://github.com/rust-lang/rust/issues/26035 + test("ΑΣ"); + test("Α'Σ"); + test("Α''Σ"); + + test("ΑΣ Α"); + test("Α'Σ Α"); + test("Α''Σ Α"); + + test("ΑΣ' Α"); + test("ΑΣ'' Α"); + + test("Α'Σ' Α"); + test("Α''Σ'' Α"); + + test("Α Σ"); + test("Α 'Σ"); + test("Α ''Σ"); + + test("Σ"); + test("'Σ"); + test("''Σ"); + + test("ΑΣΑ"); + test("ΑΣ'Α"); + test("ΑΣ''Α"); + + // https://github.com/rust-lang/rust/issues/124714 + // input lengths around the boundary of the chunk size used by the ascii prefix optimization + test("abcdefghijklmnoΣ"); + test("abcdefghijklmnopΣ"); + test("abcdefghijklmnopqΣ"); + + // a really long string that has its lowercase form + // even longer. this tests that implementations don't assume + // an incorrect upper bound on allocations + let upper = str::repeat("İ", 512); + test(&upper); + + // a really long ascii-only string. 
+ // This tests that the ascii hot-path + // functions correctly + let upper = str::repeat("A", 511); + test(&upper); +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 0d52bfb8c9aa4..d63d944a2a6de 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] pub use traits::FromStr; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width}; #[inline(never)] #[cold] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index b54d6478e584d..972640687c8d8 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -25,18 +25,20 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { } /// Reads the next code point out of a byte iterator (assuming a -/// UTF-8-like encoding). +/// UTF-8-like encoding) and returns it along with its width. /// /// # Safety /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[unstable(feature = "str_internals", issue = "none")] #[inline] -pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { +pub unsafe fn next_code_point_with_width<'a, I: Iterator>( + bytes: &mut I, +) -> Option<(u32, usize)> { // Decode UTF-8 let x = *bytes.next()?; if x < 128 { - return Some(x as u32); + return Some((x as u32, 1)); } // Multibyte case follows @@ -46,6 +48,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. 
let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut width = 2; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case @@ -53,6 +56,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let z = unsafe { *bytes.next().unwrap_unchecked() }; + width = 3; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { @@ -61,11 +65,25 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let w = unsafe { *bytes.next().unwrap_unchecked() }; + width = 4; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } } - Some(ch) + Some((ch, width)) +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // SAFETY: same call condition + Some(unsafe { next_code_point_with_width(bytes) }?.0) } /// Reads the last code point out of a byte iterator (assuming a From 27b6a1f107fc0be3e5ddd25be06461813bd43235 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 03:35:49 +0100 Subject: [PATCH 2/6] Only run the streaming sigma detection when needed --- library/alloc/src/string.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 5ebb193d99979..cb65f2fcc2763 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -3704,11 +3704,11 @@ impl String { #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] pub fn make_lowercase(&mut self) { - let mut wc = 
WriteChars::new(self); - // This is unfortunately paid whether or not you have sigmas in the str - // but it is kind of mandatory because as we are overwriting the source bytes - // we have to compute this information as we go. + // We will only update the streaming word_final detection if the str contains sigma + // because it requires table lookups that we consider expensive. + let has_sigma = self.contains('Σ'); let mut word_final_so_far = false; + let mut wc = WriteChars::new(self); while let Some(u_c) = wc.pop() { if u_c == 'Σ' { if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) { @@ -3720,7 +3720,10 @@ impl String { } else { u_c.to_lowercase().for_each(|l_c| wc.write(l_c)); } - word_final_so_far = u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()); + if has_sigma { + word_final_so_far = + u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()); + } } } } From f50c99ef905d1a300c28c930e0190ce61e6e4819 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 04:09:22 +0100 Subject: [PATCH 3/6] Ensure safety even in the face of panics --- library/alloc/src/string.rs | 56 ++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index cb65f2fcc2763..ecb3e25229ff1 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -3658,6 +3658,10 @@ impl String { while let Some(l_c) = wc.pop() { l_c.to_uppercase().for_each(|u_c| wc.write(u_c)); } + // SAFETY: At this point, none of the methods of wc panicked + unsafe { + wc.finalize(); + } } /// Converts this string to its lowercase equivalent in-place. 
@@ -3725,6 +3729,10 @@ impl String { u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()); } } + // SAFETY: At this point, none of the methods of wc panicked + unsafe { + wc.finalize(); + } } } @@ -3734,39 +3742,19 @@ impl String { struct WriteChars<'a> { // This is the internal buffer of the string temporarily changed to Vec because // it will contain non utf8 bytes. - // invariant: self.v.len() == original string until drop is run + // invariant: self.v.len() == original string until finalize is run v: Vec, // A reference kept to restore the string at the end - // (ie drop time) + // (ie finalize time) s: &'a mut String, // invariant: write_offset <= read_offset write_offset: usize, // invariant: self.read_offset <= self.v.len() - // before the Drop + // before finalize read_offset: usize, buffer: VecDeque, } -impl<'a> Drop for WriteChars<'a> { - // Set the proper length of the string's storage - // or grow it to add what is still in the buffer. - fn drop(&mut self) { - if self.buffer.is_empty() { - // SAFETY: if the queue is empty, then - // there were less bytes than in the original so we can simply shrink - unsafe { - self.v.set_len(self.write_offset); - } - } else { - let (q1, q2) = self.buffer.as_slices(); - self.v.extend_from_slice(q1); - self.v.extend_from_slice(q2); - }; - // SAFETY: this is valid utf8 - *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) } - } -} - #[unstable(issue = "none", feature = "std_internals")] impl<'a> WriteChars<'a> { fn new(s: &'a mut String) -> Self { @@ -3806,4 +3794,26 @@ impl<'a> WriteChars<'a> { self.write_offset += direct_copy_length; self.buffer.extend(&buffer[direct_copy_length..len]); } + + // Set the proper length of the string's storage + // or grow it to add what is still in the buffer. 
+ /// Finalize should be run for the modifications to be actually written back to the string + /// # Safety + /// Must not be called if one of the previous method calls of self panicked because the buffer + /// may contain invalid utf8 + unsafe fn finalize(mut self) { + if self.buffer.is_empty() { + // SAFETY: if the queue is empty, then + // there were fewer bytes than in the original so we can simply shrink + unsafe { + self.v.set_len(self.write_offset); + } + } else { + let (q1, q2) = self.buffer.as_slices(); + self.v.extend_from_slice(q1); + self.v.extend_from_slice(q2); + }; + // SAFETY: this is valid utf8 + *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) } + } } From c9f19b4b46b92c15aba0b014e51b943f601f541e Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 10:58:47 +0100 Subject: [PATCH 4/6] Add cfg oom handling to vecdeque import --- library/alloc/src/string.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index ecb3e25229ff1..809971036d755 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -61,7 +61,9 @@ use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; -use crate::collections::{TryReserveError, VecDeque}; +use crate::collections::{TryReserveError}; +#[cfg(not(no_global_oom_handling))] +use crate::collections::VecDeque; use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut}; #[cfg(not(no_global_oom_handling))] use crate::str::{FromStr, from_boxed_utf8_unchecked}; From e0c3c99495dc94c2ee9b38b617973f309f35f6bc Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 11:34:26 +0100 Subject: [PATCH 5/6] Add ascii happy path --- library/alloc/src/string.rs | 51 +++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/library/alloc/src/string.rs 
b/library/alloc/src/string.rs index 809971036d755..2657fb9986606 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -50,9 +50,9 @@ use core::iter::from_fn; use core::num::Saturating; #[cfg(not(no_global_oom_handling))] use core::ops::Add; -#[cfg(not(no_global_oom_handling))] -use core::ops::AddAssign; use core::ops::{self, Range, RangeBounds}; +#[cfg(not(no_global_oom_handling))] +use core::ops::{AddAssign, ControlFlow}; use core::str::pattern::{Pattern, Utf8Pattern}; use core::{fmt, hash, ptr, slice}; @@ -61,7 +61,7 @@ use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; -use crate::collections::{TryReserveError}; +use crate::collections::TryReserveError; #[cfg(not(no_global_oom_handling))] use crate::collections::VecDeque; use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut}; @@ -3610,6 +3610,25 @@ impl From for String { // In place case changes impl String { + #[cfg(not(no_global_oom_handling))] + fn case_change_while_ascii(&mut self) -> ControlFlow { + // SAFETY the as_bytes_mut is unsafe but we will only do ascii case change in place with it + unsafe { + self.as_bytes_mut().into_iter().enumerate().try_for_each(|(i, b)| { + if b.is_ascii() { + if MAKE_UPPER { + b.make_ascii_uppercase(); + } else { + b.make_ascii_lowercase(); + } + ControlFlow::Continue(()) + } else { + ControlFlow::Break(i) + } + }) + } + } + /// Converts this string to its uppercase equivalent in-place. 
/// /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property @@ -3656,7 +3675,10 @@ impl String { #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] pub fn make_uppercase(&mut self) { - let mut wc = WriteChars::new(self); + let ControlFlow::Break(non_utf8_offset) = self.case_change_while_ascii::() else { + return; + }; + let mut wc = WriteChars::new(self, non_utf8_offset); while let Some(l_c) = wc.pop() { l_c.to_uppercase().for_each(|u_c| wc.write(u_c)); } @@ -3710,11 +3732,19 @@ impl String { #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] pub fn make_lowercase(&mut self) { + fn update_word_final(word_final_so_far: bool, u_c: char) -> bool { + u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()) + } + + let ControlFlow::Break(non_utf8_offset) = self.case_change_while_ascii::() else { + return; + }; // We will only update the streaming word_final detection if the str contains sigma // because it requires table lookups that we consider expensive. 
- let has_sigma = self.contains('Σ'); - let mut word_final_so_far = false; - let mut wc = WriteChars::new(self); + let has_sigma = self[non_utf8_offset..].contains('Σ'); + let mut word_final_so_far = + has_sigma && self[..non_utf8_offset].chars().fold(false, update_word_final); + let mut wc = WriteChars::new(self, non_utf8_offset); while let Some(u_c) = wc.pop() { if u_c == 'Σ' { if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) { @@ -3727,8 +3757,7 @@ impl String { u_c.to_lowercase().for_each(|l_c| wc.write(l_c)); } if has_sigma { - word_final_so_far = - u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()); + word_final_so_far = update_word_final(word_final_so_far, u_c); } } // SAFETY: At this point, none of the methods of wc panicked @@ -3759,9 +3788,9 @@ struct WriteChars<'a> { #[unstable(issue = "none", feature = "std_internals")] impl<'a> WriteChars<'a> { - fn new(s: &'a mut String) -> Self { + fn new(s: &'a mut String, offset: usize) -> Self { let v = core::mem::take(s).into_bytes(); - WriteChars { s, v, write_offset: 0, read_offset: 0, buffer: VecDeque::new() } + WriteChars { s, v, write_offset: offset, read_offset: offset, buffer: VecDeque::new() } } fn rest(&self) -> &str { From 62e9d60e54ac3bef71da4fc70d55631501c8a02d Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Sat, 28 Mar 2026 14:20:13 +0100 Subject: [PATCH 6/6] Fix no_oom_handling --- library/alloc/src/str.rs | 1 + library/alloc/src/string.rs | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index a7690df779bae..ac3f054900e83 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -476,6 +476,7 @@ impl str { } } +#[cfg(not(no_global_oom_handling))] pub(crate) fn case_ignorable_then_cased>(iter: I) -> bool { match iter.skip_while(|&c| c.is_case_ignorable()).next() { Some(c) => c.is_cased(), diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs 
index 2657fb9986606..6d44dc22fda58 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -3609,8 +3609,8 @@ impl From for String { // In place case changes +#[cfg(not(no_global_oom_handling))] impl String { - #[cfg(not(no_global_oom_handling))] fn case_change_while_ascii(&mut self) -> ControlFlow { // SAFETY the as_bytes_mut is unsafe but we will only do ascii case change in place with it unsafe { @@ -3672,7 +3672,6 @@ impl String { /// /// assert_eq!("TSCHÜSS", s); /// ``` - #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] pub fn make_uppercase(&mut self) { let ControlFlow::Break(non_utf8_offset) = self.case_change_while_ascii::() else { @@ -3729,7 +3728,6 @@ impl String { /// /// assert_eq!("农历新年", new_year); /// ``` - #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] pub fn make_lowercase(&mut self) { fn update_word_final(word_final_so_far: bool, u_c: char) -> bool { @@ -3769,6 +3767,7 @@ impl String { /// A helper for in place modification of strings, where we gradually "pop" characters, /// hereby making room to write back to the string buffer +#[cfg(not(no_global_oom_handling))] #[unstable(issue = "none", feature = "std_internals")] struct WriteChars<'a> { // This is the internal buffer of the string temporarily changed to Vec because @@ -3786,6 +3785,7 @@ struct WriteChars<'a> { buffer: VecDeque, } +#[cfg(not(no_global_oom_handling))] #[unstable(issue = "none", feature = "std_internals")] impl<'a> WriteChars<'a> { fn new(s: &'a mut String, offset: usize) -> Self {