Skip to content

Commit 730da98

Browse files
committed
address pr reviews
1 parent 4cec6dc commit 730da98

File tree

1 file changed

+32
-14
lines changed

1 file changed

+32
-14
lines changed

src/encoding_binding.cc

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,26 @@ constexpr size_t simpleUtfEncodingLength(uint16_t c) {
9292
return 3;
9393
}
9494

95+
// Finds the maximum number of input characters (UTF-16 or Latin1) that can be
96+
// encoded into a UTF-8 buffer of the given size.
97+
//
98+
// The challenge is that UTF-8 encoding expands characters by variable amounts:
99+
// - ASCII (< 0x80): 1 byte
100+
// - Code points 0x80..0x7FF: 2 bytes
101+
// - Other BMP characters: 3 bytes
102+
// - Surrogate pairs (supplementary planes): 4 bytes total
103+
//
104+
// This function uses an adaptive chunking algorithm:
105+
// 1. Process the input in chunks, estimating how many characters will fit
106+
// 2. Calculate the actual UTF-8 length for each chunk using simdutf
107+
// 3. Adjust the expansion factor based on observed encoding ratios
108+
// 4. Fall back to character-by-character processing near the buffer boundary
109+
// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
110+
//
111+
// The algorithm starts with a conservative expansion estimate (1.15x) and
112+
// dynamically adjusts based on actual character distribution, making it
113+
// efficient for common ASCII-heavy text while remaining correct for
114+
// multi-byte-heavy content.
95115
template <typename Char>
96116
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
97117
size_t pos = 0;
@@ -197,24 +217,23 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
197217

198218
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
199219
size_t dest_length = dest->ByteLength();
220+
size_t read = 0;
221+
size_t written = 0;
200222

201223
// For small strings (length <= 32), use the old V8 path for better
202224
// performance
203-
if (source->Length() <= 32) {
204-
size_t nchars;
205-
size_t written =
206-
source->WriteUtf8V2(isolate,
207-
write_result,
208-
dest_length,
209-
String::WriteFlags::kReplaceInvalidUtf8,
210-
&nchars);
211-
binding_data->encode_into_results_buffer_[0] = nchars;
212-
binding_data->encode_into_results_buffer_[1] = written;
225+
static constexpr int kSmallStringThreshold = 32;
226+
if (source->Length() <= kSmallStringThreshold) {
227+
written = source->WriteUtf8V2(isolate,
228+
write_result,
229+
dest_length,
230+
String::WriteFlags::kReplaceInvalidUtf8,
231+
&read);
232+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
233+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
213234
return;
214235
}
215236

216-
size_t read = 0;
217-
size_t written = 0;
218237
v8::String::ValueView view(isolate, source);
219238
size_t length_that_fits =
220239
std::min(static_cast<size_t>(view.length()), dest_length);
@@ -230,8 +249,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
230249
length_that_fits -= read;
231250
dest_length -= read;
232251
if (length_that_fits != 0 && dest_length != 0) {
233-
size_t rest = findBestFit(data, length_that_fits, dest_length);
234-
if (rest != 0) {
252+
if (size_t rest = findBestFit(data, length_that_fits, dest_length)) {
235253
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
236254
written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
237255
read += rest;

0 commit comments

Comments
 (0)