-
-
Notifications
You must be signed in to change notification settings - Fork 14.7k
Implement String::make_(upp|low)ercase #135888
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e75e477
27b6a1f
f50c99e
c9f19b4
e0c3c99
62e9d60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,9 +50,9 @@ use core::iter::from_fn; | |
| use core::num::Saturating; | ||
| #[cfg(not(no_global_oom_handling))] | ||
| use core::ops::Add; | ||
| #[cfg(not(no_global_oom_handling))] | ||
| use core::ops::AddAssign; | ||
| use core::ops::{self, Range, RangeBounds}; | ||
| #[cfg(not(no_global_oom_handling))] | ||
| use core::ops::{AddAssign, ControlFlow}; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe `ControlFlow` should instead be imported on the line above, which is not gated on `no_global_oom_handling`? Let me know. |
||
| use core::str::pattern::{Pattern, Utf8Pattern}; | ||
| use core::{fmt, hash, ptr, slice}; | ||
|
|
||
|
|
@@ -62,6 +62,8 @@ use crate::alloc::Allocator; | |
| use crate::borrow::{Cow, ToOwned}; | ||
| use crate::boxed::Box; | ||
| use crate::collections::TryReserveError; | ||
| #[cfg(not(no_global_oom_handling))] | ||
| use crate::collections::VecDeque; | ||
| use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut}; | ||
| #[cfg(not(no_global_oom_handling))] | ||
| use crate::str::{FromStr, from_boxed_utf8_unchecked}; | ||
|
|
@@ -3604,3 +3606,245 @@ impl From<char> for String { | |
| c.to_string() | ||
| } | ||
| } | ||
|
|
||
| // In place case changes | ||
|
|
||
| #[cfg(not(no_global_oom_handling))] | ||
| impl String { | ||
| fn case_change_while_ascii<const MAKE_UPPER: bool>(&mut self) -> ControlFlow<usize> { | ||
| // SAFETY the as_bytes_mut is unsafe but we will only do ascii case change in place with it | ||
| unsafe { | ||
| self.as_bytes_mut().into_iter().enumerate().try_for_each(|(i, b)| { | ||
| if b.is_ascii() { | ||
| if MAKE_UPPER { | ||
| b.make_ascii_uppercase(); | ||
| } else { | ||
| b.make_ascii_lowercase(); | ||
| } | ||
| ControlFlow::Continue(()) | ||
| } else { | ||
| ControlFlow::Break(i) | ||
| } | ||
| }) | ||
| } | ||
|
Comment on lines
+3614
to
+3629
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has room for improvement to end up on par with the str::to_*case implementations, but doing it properly requires factorizing code in common so I'm delaying it until later |
||
| } | ||
|
|
||
| /// Converts this string to its uppercase equivalent in-place. | ||
| /// | ||
| /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property | ||
| /// `Uppercase`. | ||
| /// | ||
| /// Since some characters can expand into multiple characters when changing | ||
| /// the case, this method may change the length of the string. If the string | ||
| /// shrinks, the excess capacity is not reclaimed. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// Basic usage: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut s = String::from("hello"); | ||
| /// s.make_uppercase(); | ||
| /// | ||
| /// assert_eq!("HELLO", s); | ||
| /// ``` | ||
| /// | ||
| /// Scripts without case are not changed: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut new_year = String::from("农历新年"); | ||
| /// new_year.make_uppercase(); | ||
| /// | ||
| /// assert_eq!("农历新年", new_year); | ||
| /// ``` | ||
| /// | ||
| /// One character can become multiple: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut s = String::from("tschüß"); | ||
| /// s.make_uppercase(); | ||
| /// | ||
| /// assert_eq!("TSCHÜSS", s); | ||
| /// ``` | ||
| #[unstable(feature = "string_make_uplowercase", issue = "135885")] | ||
| pub fn make_uppercase(&mut self) { | ||
| let ControlFlow::Break(non_utf8_offset) = self.case_change_while_ascii::<true>() else { | ||
| return; | ||
| }; | ||
| let mut wc = WriteChars::new(self, non_utf8_offset); | ||
| while let Some(l_c) = wc.pop() { | ||
| l_c.to_uppercase().for_each(|u_c| wc.write(u_c)); | ||
| } | ||
| // SAFETY: At this point, none of the methods of wc panicked | ||
| unsafe { | ||
| wc.finalize(); | ||
| } | ||
| } | ||
|
|
||
| /// Converts this string to its lowercase equivalent in-place. | ||
| /// | ||
| /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property | ||
| /// `Lowercase`. | ||
| /// | ||
| /// Since some characters can expand into multiple characters when changing | ||
| /// the case, this method may change the length of the string. If the string | ||
| /// shrinks, the excess capacity is not reclaimed. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// Basic usage: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut s = String::from("HELLO"); | ||
| /// s.make_lowercase(); | ||
| /// | ||
| /// assert_eq!("hello", s); | ||
| /// ``` | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ"); | ||
| /// odysseus.make_lowercase(); | ||
| /// | ||
| /// assert_eq!("ὀδυσσεύς", odysseus); | ||
| /// ``` | ||
| /// | ||
| /// Languages without case are not changed: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(string_make_uplowercase)] | ||
| /// | ||
| /// let mut new_year = String::from("农历新年"); | ||
| /// new_year.make_lowercase(); | ||
| /// | ||
| /// assert_eq!("农历新年", new_year); | ||
| /// ``` | ||
| #[unstable(feature = "string_make_uplowercase", issue = "135885")] | ||
| pub fn make_lowercase(&mut self) { | ||
| fn update_word_final(word_final_so_far: bool, u_c: char) -> bool { | ||
| u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable()) | ||
| } | ||
|
|
||
| let ControlFlow::Break(non_utf8_offset) = self.case_change_while_ascii::<false>() else { | ||
| return; | ||
| }; | ||
| // We will only update the streaming word_final detection if the str contains sigma | ||
| // because it requires table lookups that we consider expensive. | ||
| let has_sigma = self[non_utf8_offset..].contains('Σ'); | ||
| let mut word_final_so_far = | ||
| has_sigma && self[..non_utf8_offset].chars().fold(false, update_word_final); | ||
| let mut wc = WriteChars::new(self, non_utf8_offset); | ||
| while let Some(u_c) = wc.pop() { | ||
| if u_c == 'Σ' { | ||
| if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) { | ||
| // actually word final | ||
| wc.write('ς'); | ||
| } else { | ||
| wc.write('σ'); | ||
| } | ||
| } else { | ||
| u_c.to_lowercase().for_each(|l_c| wc.write(l_c)); | ||
| } | ||
| if has_sigma { | ||
| word_final_so_far = update_word_final(word_final_so_far, u_c); | ||
| } | ||
| } | ||
| // SAFETY: At this point, none of the methods of wc panicked | ||
| unsafe { | ||
| wc.finalize(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// A helper for in place modification of strings, where we gradually "pop" characters, | ||
| /// hereby making room to write back to the string buffer | ||
| #[cfg(not(no_global_oom_handling))] | ||
| #[unstable(issue = "none", feature = "std_internals")] | ||
| struct WriteChars<'a> { | ||
| // This is the internal buffer of the string temporarily changed to Vec<u8> because | ||
| // it will contain non utf8 bytes. | ||
| // invariant: self.v.len() == original string until finalize is run | ||
| v: Vec<u8>, | ||
| // A reference kept to restore the string at the end | ||
| // (ie finalize time) | ||
| s: &'a mut String, | ||
| // invariant: write_offset <= read_offset | ||
| write_offset: usize, | ||
| // invariant: self.read_offset <= self.v.len() | ||
| // before finalize | ||
| read_offset: usize, | ||
| buffer: VecDeque<u8>, | ||
| } | ||
|
|
||
| #[cfg(not(no_global_oom_handling))] | ||
| #[unstable(issue = "none", feature = "std_internals")] | ||
| impl<'a> WriteChars<'a> { | ||
| fn new(s: &'a mut String, offset: usize) -> Self { | ||
| let v = core::mem::take(s).into_bytes(); | ||
| WriteChars { s, v, write_offset: offset, read_offset: offset, buffer: VecDeque::new() } | ||
| } | ||
|
|
||
| fn rest(&self) -> &str { | ||
| // SAFETY: read_offset is always ok to read from | ||
| unsafe { str::from_utf8_unchecked(&self.v[self.read_offset..]) } | ||
| } | ||
|
|
||
| fn pop(&mut self) -> Option<char> { | ||
| // SAFETY: The bytes from read_offset are valid UTF8 | ||
| let (code_point, width) = unsafe { | ||
| core::str::next_code_point_with_width(&mut self.v[self.read_offset..].iter())? | ||
| }; | ||
| self.read_offset += width; | ||
| // Dump what is buffered in the newly freed space | ||
| while self.write_offset < self.read_offset | ||
| && let Some(b) = self.buffer.pop_front() | ||
| { | ||
| self.v[self.write_offset] = b; | ||
| self.write_offset += 1; | ||
| } | ||
| // SAFETY: The code point is valid | ||
| let c = unsafe { char::from_u32_unchecked(code_point) }; | ||
| Some(c) | ||
| } | ||
|
|
||
| fn write(&mut self, c: char) { | ||
| let writable_slice = &mut self.v[self.write_offset..self.read_offset]; | ||
| let mut buffer = [0u8; 4]; | ||
| let len = c.encode_utf8(&mut buffer).len(); | ||
| let direct_copy_length = core::cmp::min(len, writable_slice.len()); | ||
| writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); | ||
| self.write_offset += direct_copy_length; | ||
| self.buffer.extend(&buffer[direct_copy_length..len]); | ||
| } | ||
|
|
||
| // Set the proper length of the string's storage | ||
| // or grow it to add what is still in the buffer. | ||
| /// Finalize should be run for the modifications to be actually written back to the string | ||
| /// # Safety | ||
| /// Must not be called if one of the previous method calls of self panicked because the buffer | ||
| /// may contain invalid utf8 | ||
| unsafe fn finalize(mut self) { | ||
| if self.buffer.is_empty() { | ||
| // SAFETY: if the queue is empty, then | ||
| // there were less bytes than in the original so we can simply shrink | ||
| unsafe { | ||
| self.v.set_len(self.write_offset); | ||
| } | ||
| } else { | ||
| let (q1, q2) = self.buffer.as_slices(); | ||
| self.v.extend_from_slice(q1); | ||
| self.v.extend_from_slice(q2); | ||
| }; | ||
| // SAFETY: this is valid utf8 | ||
| *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this file, I am simply making case_ignorable_then_cased accessible from the string module