util: improve textencoder encodeInto performance

anonrig · erikcorry · lemire · anonrig · commit 518b87762fc6 · 2025-11-24T14:50:47.000-05:00
Co-authored-by: Erik Corry <ecorry@cloudflare.com>
Co-authored-by: Daniel Lemire <daniel@lemire.me>
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
@@ -71,6 +71,83 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
   return info;
 }
 
+namespace {
+constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;  // NOTE: unused in this change?
+
+constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
+  return (lead >> 10) == (0xd800 >> 10) && (trail >> 10) == (0xdc00 >> 10);
+}  // True iff lead is in D800-DBFF and trail is in DC00-DFFF.
+
+constexpr size_t simpleUtfEncodingLength(uint16_t c) {  // UTF-8 bytes for c.
+  if (c < 0x80) return 1;   // U+0000..U+007F.
+  if (c < 0x800) return 2;  // U+0080..U+07FF; 0x800 is the first 3-byte value.
+  return 3;  // Rest of the BMP; a lone surrogate half also counts as 3.
+}
+
+template <typename Char>
+size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
+  // Returns how many leading units of data (Latin-1 if Char is 1 byte, UTF-16
+  // if 2) encode to at most bufferSize UTF-8 bytes; never splits a pair.
+  size_t pos = 0;              // Code units accepted so far.
+  size_t utf8Accumulated = 0;  // UTF-8 bytes counted for data[0, pos).
+  constexpr size_t CHUNK = 257;  // Cap on the speculative chunk size.
+  constexpr bool UTF16 = sizeof(Char) == 2;
+  constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;  // Most UTF-8 bytes per unit.
+  constexpr int UNIT_MASK = UTF16 ? 0xffff : 0xff;  // Strips sign extension.
+  double expansion = 1.15;  // Running estimate of UTF-8 bytes per code unit.
+
+  // Phase 1: measure chunks whose size follows the expansion estimate.
+  while (pos < length && utf8Accumulated < bufferSize) {
+    size_t remainingInput = length - pos;
+    size_t spaceRemaining = bufferSize - utf8Accumulated;
+    DCHECK_GE(expansion, 1.15);
+    size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
+    if (guaranteedToFit >= remainingInput) return length;  // Worst case fits.
+    size_t likelyToFit =
+        std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
+    size_t fitEstimate = std::max({size_t{1}, guaranteedToFit, likelyToFit});
+    size_t chunkSize = std::min(remainingInput, fitEstimate);
+    if (chunkSize == 1) break;  // Too tight to guess; finish unit by unit.
+    DCHECK_GE(chunkSize, 1);
+    size_t chunkUtf8Len;
+    if constexpr (UTF16) {
+      // Keep surrogate pairs inside one chunk: a lone lead is measured as 2
+      // bytes, so the final fix-up could admit a 4-byte pair that overflows.
+      const Char* cut = data + pos + chunkSize;  // First unit after the chunk.
+      if (cut < data + length && isSurrogatePair(cut[-1], cut[0])) chunkSize--;
+      // TODO(anonrig): Switch to utf8_length_from_utf16_with_replacement.
+      chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
+    } else {
+      chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
+    }
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      DCHECK_GT(chunkSize, guaranteedToFit);
+      expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
+    } else {
+      expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
+      pos += chunkSize;
+      utf8Accumulated += chunkUtf8Len;
+    }
+  }
+
+  // Phase 2: add single units; each surrogate half is counted as 3 bytes.
+  while (pos < length && utf8Accumulated < bufferSize) {
+    // char may be signed: mask so Latin-1 0x80-0xFF is not read as 0xFFxx.
+    size_t extra = simpleUtfEncodingLength(data[pos] & UNIT_MASK);
+    if (utf8Accumulated + extra > bufferSize) break;
+    pos++;
+    utf8Accumulated += extra;
+  }
+
+  if (UTF16 && pos != 0 && pos != length &&
+      isSurrogatePair(data[pos - 1], data[pos])) {
+    // The lead was counted as 3 bytes, so the 4-byte pair needs one more byte.
+    return utf8Accumulated < bufferSize ? pos + 1 : pos - 1;
+  }
+  return pos;
+}
+}  // namespace
+
 void BindingData::Deserialize(Local<Context> context,
                               Local<Object> holder,
                               int index,
@@ -101,15 +178,64 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
   char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
   size_t dest_length = dest->ByteLength();
 
-  size_t nchars;
-  size_t written = source->WriteUtf8V2(isolate,
-                                       write_result,
-                                       dest_length,
-                                       String::WriteFlags::kReplaceInvalidUtf8,
-                                       &nchars);
+  size_t read = 0;
+  size_t written = 0;
+  v8::String::ValueView view(isolate, source);
+  uint32_t length = view.length();
+
+  if (view.is_one_byte()) {
+    auto data = reinterpret_cast<const char*>(view.data8());
+    simdutf::result result = simdutf::validate_ascii_with_errors(
+        data, std::min(static_cast<size_t>(length), dest_length));
+    written = read = result.count;
+    auto out_addr = write_result;
+    memcpy(out_addr, data, read);
+    out_addr += read;
+    data += read;
+    length -= read;
+    dest_length -= read;
+    if (length != 0 && dest_length != 0) {
+      size_t rest = findBestFit(data, length, dest_length);
+      if (rest != 0) {
+        DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
+        written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
+        read += rest;
+      }
+    }
+  } else {
+    auto data = reinterpret_cast<const char16_t*>(view.data16());
+
+    // Check if input has unpaired surrogates - if so, convert to well-formed first
+    simdutf::result validation_result =
+        simdutf::validate_utf16_with_errors(data, length);
+
+    if (validation_result.error == simdutf::SUCCESS) {
+      // Valid UTF-16 - use the fast path
+      read = findBestFit(data, length, dest_length);
+      if (read != 0) {
+        DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
+        written = simdutf::convert_utf16_to_utf8(data, read, write_result);
+      }
+    } else {
+      // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
+      // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when available
+      std::vector<char16_t> conversion_buffer(length);
+      simdutf::to_well_formed_utf16(data, length, conversion_buffer.data());
+
+      // Now use findBestFit with the well-formed data
+      read = findBestFit(conversion_buffer.data(), length, dest_length);
+      if (read != 0) {
+        DCHECK_LE(simdutf::utf8_length_from_utf16(conversion_buffer.data(), read),
+                  dest_length);
+        written = simdutf::convert_utf16_to_utf8(
+            conversion_buffer.data(), read, write_result);
+      }
+    }
+  }
+  DCHECK_LE(written, dest_length);
 
-  binding_data->encode_into_results_buffer_[0] = nchars;
-  binding_data->encode_into_results_buffer_[1] = written;
+  binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
+  binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
 }
 
 // Encode a single string to a UTF-8 Uint8Array (not Buffer).