Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
#![feature(slice_ptr_get)]
#![feature(slice_range)]
#![feature(std_internals)]
#![feature(str_internals)]
#![feature(temporary_niche_types)]
#![feature(titlecase)]
#![feature(transmutability)]
Expand Down
17 changes: 10 additions & 7 deletions library/alloc/src/str.rs
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this file, I am simply making case_ignorable_then_cased accessible from the string module

Original file line number Diff line number Diff line change
Expand Up @@ -415,13 +415,6 @@ impl str {
&& !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars());
if is_word_final { 'ς' } else { 'σ' }
}

// Reports whether `iter` yields a (possibly empty) run of case-ignorable
// characters followed by a cased character.
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
// Only the first character that is not case-ignorable decides the result.
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
Some(c) => c.is_cased(),
// The iterator ended before any non-ignorable character was seen.
None => false,
}
}
}

/// Returns the uppercase equivalent of this string slice, as a new [`String`].
Expand Down Expand Up @@ -481,7 +474,17 @@ impl str {
}
s
}
}

#[cfg(not(no_global_oom_handling))]
pub(crate) fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
Some(c) => c.is_cased(),
None => false,
}
}

impl str {
/// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
///
/// # Examples
Expand Down
248 changes: 246 additions & 2 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ use core::iter::from_fn;
use core::num::Saturating;
#[cfg(not(no_global_oom_handling))]
use core::ops::Add;
#[cfg(not(no_global_oom_handling))]
use core::ops::AddAssign;
use core::ops::{self, Range, RangeBounds};
#[cfg(not(no_global_oom_handling))]
use core::ops::{AddAssign, ControlFlow};
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this should be done in the line above without no_global_oom_handling? Let me know.

use core::str::pattern::{Pattern, Utf8Pattern};
use core::{fmt, hash, ptr, slice};

Expand All @@ -62,6 +62,8 @@ use crate::alloc::Allocator;
use crate::borrow::{Cow, ToOwned};
use crate::boxed::Box;
use crate::collections::TryReserveError;
#[cfg(not(no_global_oom_handling))]
use crate::collections::VecDeque;
use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut};
#[cfg(not(no_global_oom_handling))]
use crate::str::{FromStr, from_boxed_utf8_unchecked};
Expand Down Expand Up @@ -3604,3 +3606,245 @@ impl From<char> for String {
c.to_string()
}
}

// In place case changes

#[cfg(not(no_global_oom_handling))]
impl String {
// ASCII fast path for the in-place case conversions.
//
// Walks the bytes of the string from the start, changing the case of each ASCII byte
// in place (to uppercase when `MAKE_UPPER` is true, to lowercase otherwise).
// Returns `ControlFlow::Break(i)` with the index of the first non-ASCII byte, or
// `ControlFlow::Continue(())` when every byte was ASCII.
fn case_change_while_ascii<const MAKE_UPPER: bool>(&mut self) -> ControlFlow<usize> {
// SAFETY: `as_bytes_mut` requires that the buffer is valid UTF-8 when the borrow ends.
// Only bytes for which `is_ascii()` holds are modified, and each is replaced by another
// ASCII byte, so the contents remain valid UTF-8.
unsafe {
self.as_bytes_mut().into_iter().enumerate().try_for_each(|(i, b)| {
if b.is_ascii() {
if MAKE_UPPER {
b.make_ascii_uppercase();
} else {
b.make_ascii_lowercase();
}
ControlFlow::Continue(())
} else {
// Stop at the first non-ASCII byte and report where it is.
ControlFlow::Break(i)
}
})
}
Comment on lines +3614 to +3629
Copy link
Copy Markdown
Contributor Author

@krtab krtab Mar 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has room for improvement to end up on par with the str::to_*case implementations, but doing it properly requires factoring out the code they have in common, so I'm deferring it until later

}

/// Converts this string to its uppercase equivalent in-place.
///
/// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
/// `Uppercase`.
///
/// A single character may map to several characters when its case changes, so
/// this method may change the length of the string. If the string shrinks,
/// the excess capacity is not reclaimed.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut s = String::from("hello");
/// s.make_uppercase();
///
/// assert_eq!("HELLO", s);
/// ```
///
/// Scripts without case are not changed:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut new_year = String::from("农历新年");
/// new_year.make_uppercase();
///
/// assert_eq!("农历新年", new_year);
/// ```
///
/// One character can become multiple:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut s = String::from("tschüß");
/// s.make_uppercase();
///
/// assert_eq!("TSCHÜSS", s);
/// ```
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
pub fn make_uppercase(&mut self) {
    // The leading ASCII bytes are uppercased directly in the buffer. If that
    // pass reaches the end of the string there is nothing left to do.
    let first_non_ascii = match self.case_change_while_ascii::<true>() {
        ControlFlow::Break(offset) => offset,
        ControlFlow::Continue(()) => return,
    };
    // From the first non-ASCII byte on, read one character at a time and
    // write its uppercase mapping back into the space already consumed.
    let mut writer = WriteChars::new(self, first_non_ascii);
    while let Some(ch) = writer.pop() {
        for upper in ch.to_uppercase() {
            writer.write(upper);
        }
    }
    // SAFETY: At this point, none of the methods of `writer` panicked
    unsafe {
        writer.finalize();
    }
}

/// Converts this string to its lowercase equivalent in-place.
///
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
/// `Lowercase`.
///
/// Since some characters can expand into multiple characters when changing
/// the case, this method may change the length of the string. If the string
/// shrinks, the excess capacity is not reclaimed.
///
/// `Σ` is lowercased to `ς` when it is word-final and to `σ` otherwise, using
/// the same rule as `str::to_lowercase`.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut s = String::from("HELLO");
/// s.make_lowercase();
///
/// assert_eq!("hello", s);
/// ```
///
/// A tricky example, with sigma:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ");
/// odysseus.make_lowercase();
///
/// assert_eq!("ὀδυσσεύς", odysseus);
/// ```
///
/// Scripts without case are not changed:
///
/// ```
/// #![feature(string_make_uplowercase)]
///
/// let mut new_year = String::from("农历新年");
/// new_year.make_lowercase();
///
/// assert_eq!("农历新年", new_year);
/// ```
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
pub fn make_lowercase(&mut self) {
    // Streaming equivalent of `case_ignorable_then_cased(prefix.chars().rev())`,
    // updated with each character appended to `prefix`.
    //
    // A case-ignorable character is transparent: it keeps the current state
    // whether or not it is also cased. Any other character resets the state to
    // whether it is cased. Testing `is_cased()` first would diverge from
    // `str::to_lowercase` for characters that are both cased and case-ignorable
    // (e.g. U+02B0 or U+0345): they would start a "word" on their own.
    fn update_word_final(word_final_so_far: bool, c: char) -> bool {
        if c.is_case_ignorable() { word_final_so_far } else { c.is_cased() }
    }

    // The leading ASCII bytes are lowercased directly in the buffer. If that
    // pass reaches the end of the string there is nothing left to do.
    let ControlFlow::Break(non_ascii_offset) = self.case_change_while_ascii::<false>() else {
        return;
    };
    // We will only update the streaming word_final detection if the str contains sigma
    // because it requires table lookups that we consider expensive.
    let has_sigma = self[non_ascii_offset..].contains('Σ');
    // Seed the state with the ASCII prefix that has already been processed.
    let mut word_final_so_far =
        has_sigma && self[..non_ascii_offset].chars().fold(false, update_word_final);
    let mut wc = WriteChars::new(self, non_ascii_offset);
    while let Some(u_c) = wc.pop() {
        if u_c == 'Σ' {
            // Word-final means: preceded by a cased character (ignoring
            // case-ignorable ones) and not followed by one.
            let is_word_final = word_final_so_far
                && !crate::str::case_ignorable_then_cased(wc.rest().chars());
            wc.write(if is_word_final { 'ς' } else { 'σ' });
        } else {
            u_c.to_lowercase().for_each(|l_c| wc.write(l_c));
        }
        if has_sigma {
            word_final_so_far = update_word_final(word_final_so_far, u_c);
        }
    }
    // SAFETY: At this point, none of the methods of wc panicked
    unsafe {
        wc.finalize();
    }
}
}

/// A helper for in-place modification of strings, where we gradually "pop" characters,
/// thereby making room to write back to the string buffer.
///
/// Bytes that do not fit in the room freed so far are kept in `buffer` and are
/// flushed as later calls to `pop` free more space.
#[cfg(not(no_global_oom_handling))]
#[unstable(issue = "none", feature = "std_internals")]
struct WriteChars<'a> {
// The string's internal buffer, temporarily held as a `Vec<u8>` because
// it may contain non-UTF-8 byte sequences while the rewrite is in progress.
// invariant: self.v.len() equals the byte length of the original string
// until finalize is run
v: Vec<u8>,
// The string the buffer was taken from, kept so that the result can be
// stored back into it at the end (i.e. at finalize time)
s: &'a mut String,
// Index in `v` of the next byte to be written.
// invariant: write_offset <= read_offset
write_offset: usize,
// Index in `v` of the next byte to be read.
// invariant: self.read_offset <= self.v.len()
// before finalize
read_offset: usize,
// Output bytes that did not fit between `write_offset` and `read_offset`
// when they were written, in the order in which they must appear.
buffer: VecDeque<u8>,
}

#[cfg(not(no_global_oom_handling))]
#[unstable(issue = "none", feature = "std_internals")]
impl<'a> WriteChars<'a> {
    /// Takes the bytes out of `s` (leaving it empty until `finalize`) and starts
    /// both reading and writing at byte `offset`.
    fn new(s: &'a mut String, offset: usize) -> Self {
        let v = core::mem::take(s).into_bytes();
        Self { v, s, write_offset: offset, read_offset: offset, buffer: VecDeque::new() }
    }

    /// Returns the part of the original string that has not been popped yet.
    fn rest(&self) -> &str {
        let unread = &self.v[self.read_offset..];
        // SAFETY: read_offset is always ok to read from
        unsafe { str::from_utf8_unchecked(unread) }
    }

    /// Reads the next character, or returns `None` once everything has been read.
    ///
    /// The bytes occupied by the popped character become writable, and pending
    /// bytes from `buffer` are moved into that space right away.
    fn pop(&mut self) -> Option<char> {
        let mut unread = self.v[self.read_offset..].iter();
        // SAFETY: The bytes from read_offset are valid UTF8
        let (code_point, width) = unsafe { core::str::next_code_point_with_width(&mut unread)? };
        self.read_offset += width;
        // Dump what is buffered in the newly freed space
        while self.write_offset < self.read_offset {
            let Some(byte) = self.buffer.pop_front() else { break };
            self.v[self.write_offset] = byte;
            self.write_offset += 1;
        }
        // SAFETY: The code point is valid
        Some(unsafe { char::from_u32_unchecked(code_point) })
    }

    /// Appends `c` to the output: as many of its bytes as fit go into the freed
    /// space between `write_offset` and `read_offset`, the remainder is queued
    /// in `buffer`.
    fn write(&mut self, c: char) {
        let gap = &mut self.v[self.write_offset..self.read_offset];
        let mut scratch = [0u8; 4];
        let encoded = c.encode_utf8(&mut scratch).as_bytes();
        let (in_place, spilled) = encoded.split_at(encoded.len().min(gap.len()));
        gap[..in_place.len()].copy_from_slice(in_place);
        self.write_offset += in_place.len();
        self.buffer.extend(spilled);
    }

    /// Finalize should be run for the modifications to be actually written back to the string.
    ///
    /// Sets the proper length of the string's storage, or grows it to add what
    /// is still in the buffer.
    ///
    /// # Safety
    /// Must not be called if one of the previous method calls of self panicked because the buffer
    /// may contain invalid utf8
    unsafe fn finalize(mut self) {
        if self.buffer.is_empty() {
            // SAFETY: if the queue is empty, then the output is no longer than
            // the original, so we can simply shrink
            unsafe {
                self.v.set_len(self.write_offset);
            }
        } else {
            // The queue may wrap around its ring storage, hence the two slices.
            let (front, back) = self.buffer.as_slices();
            self.v.extend_from_slice(front);
            self.v.extend_from_slice(back);
        }
        let bytes = core::mem::take(&mut self.v);
        // SAFETY: this is valid utf8
        *self.s = unsafe { String::from_utf8_unchecked(bytes) };
    }
}
1 change: 1 addition & 0 deletions library/alloctests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#![allow(internal_features)]
#![deny(fuzzy_provenance_casts)]
#![deny(unsafe_op_in_unsafe_fn)]
#![feature(string_make_uplowercase)]

extern crate alloc;

Expand Down
80 changes: 80 additions & 0 deletions library/alloctests/tests/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -956,3 +956,83 @@ fn test_str_concat() {
let s: String = format!("{a}{b}");
assert_eq!(s.as_bytes()[9], 'd' as u8);
}

#[test]
fn make_uppercase() {
    // The allocating `str::to_uppercase` serves as the reference for the
    // in-place conversion.
    fn check(input: &str) {
        let expected = input.to_uppercase();
        let mut actual = String::from(input);
        actual.make_uppercase();
        assert!(
            actual == expected,
            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#,
            s = input,
            tested = actual,
            ground_truth = expected
        );
    }

    let cases = [
        "",
        "abcde",
        // 4 to 9 bytes
        "ǰΐ",
        // 10*3 to 10*2 bytes
        "ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ",
        "aéDžßfiᾀ",
    ];
    for case in cases {
        check(case);
    }
}

#[test]
fn make_lowercase() {
    // The allocating `str::to_lowercase` serves as the reference for the
    // in-place conversion.
    fn check(input: &str) {
        let expected = input.to_lowercase();
        let mut actual = String::from(input);
        actual.make_lowercase();
        assert!(
            actual == expected,
            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#,
            s = input,
            tested = actual,
            ground_truth = expected
        );
    }

    let cases = [
        "",
        "AÉDžaé ",
        // https://github.com/rust-lang/rust/issues/26035
        "ΑΣ",
        "Α'Σ",
        "Α''Σ",
        "ΑΣ Α",
        "Α'Σ Α",
        "Α''Σ Α",
        "ΑΣ' Α",
        "ΑΣ'' Α",
        "Α'Σ' Α",
        "Α''Σ'' Α",
        "Α Σ",
        "Α 'Σ",
        "Α ''Σ",
        "Σ",
        "'Σ",
        "''Σ",
        "ΑΣΑ",
        "ΑΣ'Α",
        "ΑΣ''Α",
        // https://github.com/rust-lang/rust/issues/124714
        // input lengths around the boundary of the chunk size used by the ascii prefix optimization
        "abcdefghijklmnoΣ",
        "abcdefghijklmnopΣ",
        "abcdefghijklmnopqΣ",
    ];
    for case in cases {
        check(case);
    }

    // A really long string whose lowercase form is even longer. This tests
    // that implementations don't assume an incorrect upper bound on
    // allocations.
    let long_expanding = "İ".repeat(512);
    check(&long_expanding);

    // A really long ASCII-only string. This tests that the ASCII hot path
    // functions correctly.
    let long_ascii = "A".repeat(511);
    check(&long_ascii);
}
Loading
Loading