@@ -92,6 +92,26 @@ constexpr size_t simpleUtfEncodingLength(uint16_t c) {
9292 return 3 ;
9393}
9494
95+ // Finds the maximum number of input code units (UTF-16 or Latin1) that can be
96+ // encoded into a UTF-8 buffer of the given size.
97+ //
98+ // The challenge is that UTF-8 encoding expands characters by variable amounts:
99+ // - ASCII (< 0x80): 1 byte
100+ // - Code points < 0x800: 2 bytes
101+ // - Other BMP characters: 3 bytes
102+ // - Surrogate pairs (supplementary planes): 4 bytes total
103+ //
104+ // This function uses an adaptive chunking algorithm:
105+ // 1. Process the input in chunks, estimating how many characters will fit
106+ // 2. Calculate the actual UTF-8 length for each chunk using simdutf
107+ // 3. Adjust the expansion factor based on observed encoding ratios
108+ // 4. Fall back to character-by-character processing near the buffer boundary
109+ // 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
110+ //
111+ // The algorithm starts with a conservative expansion estimate (1.15x) and
112+ // dynamically adjusts based on actual character distribution, making it
113+ // efficient for common ASCII-heavy text while remaining correct for
114+ // multi-byte heavy content.
95115template <typename Char>
96116size_t findBestFit (const Char* data, size_t length, size_t bufferSize) {
97117 size_t pos = 0 ;
@@ -197,24 +217,23 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
197217
198218 char * write_result = static_cast <char *>(buf->Data ()) + dest->ByteOffset ();
199219 size_t dest_length = dest->ByteLength ();
220+ size_t read = 0 ;
221+ size_t written = 0 ;
200222
201223 // For small strings (length <= 32), use the old V8 path for better
202224 // performance
203- if (source->Length () <= 32 ) {
204- size_t nchars;
205- size_t written =
206- source->WriteUtf8V2 (isolate,
207- write_result,
208- dest_length,
209- String::WriteFlags::kReplaceInvalidUtf8 ,
210- &nchars);
211- binding_data->encode_into_results_buffer_ [0 ] = nchars;
212- binding_data->encode_into_results_buffer_ [1 ] = written;
225+ static constexpr int kSmallStringThreshold = 32 ;
226+ if (source->Length () <= kSmallStringThreshold ) {
227+ written = source->WriteUtf8V2 (isolate,
228+ write_result,
229+ dest_length,
230+ String::WriteFlags::kReplaceInvalidUtf8 ,
231+ &read);
232+ binding_data->encode_into_results_buffer_ [0 ] = static_cast <double >(read);
233+ binding_data->encode_into_results_buffer_ [1 ] = static_cast <double >(written);
213234 return ;
214235 }
215236
216- size_t read = 0 ;
217- size_t written = 0 ;
218237 v8::String::ValueView view (isolate, source);
219238 size_t length_that_fits =
220239 std::min (static_cast <size_t >(view.length ()), dest_length);
@@ -230,8 +249,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
230249 length_that_fits -= read;
231250 dest_length -= read;
232251 if (length_that_fits != 0 && dest_length != 0 ) {
233- size_t rest = findBestFit (data, length_that_fits, dest_length);
234- if (rest != 0 ) {
252+ if (size_t rest = findBestFit (data, length_that_fits, dest_length)) {
235253 DCHECK_LE (simdutf::utf8_length_from_latin1 (data, rest), dest_length);
236254 written += simdutf::convert_latin1_to_utf8 (data, rest, write_result);
237255 read += rest;
0 commit comments