Skip to content

Commit 518b877

Browse files
anonrigerikcorrylemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit 518b877

File tree

1 file changed

+134
-8
lines changed

1 file changed

+134
-8
lines changed

src/encoding_binding.cc

Lines changed: 134 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,83 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171
return info;
7272
}
7373

74+
namespace {
75+
// Size threshold, in bytes (4 KiB), below which a stack allocation is
// presumably preferred over the heap.
// NOTE(review): not referenced in the visible part of this file - confirm use.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4 * 1024;
76+
77+
// Tells whether two UTF-16 code units form a surrogate pair: `lead` must lie
// in 0xD800..0xDBFF and `trail` in 0xDC00..0xDFFF. Only the top six bits of
// each unit are inspected.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  constexpr uint16_t kSurrogateMask = 0xfc00;
  constexpr uint16_t kLeadPrefix = 0xd800;
  constexpr uint16_t kTrailPrefix = 0xdc00;
  const bool leadIsHigh = (lead & kSurrogateMask) == kLeadPrefix;
  const bool trailIsLow = (trail & kSurrogateMask) == kTrailPrefix;
  return leadIsHigh && trailIsLow;
}
80+
81+
// Returns the number of UTF-8 bytes needed to encode the single UTF-16 code
// unit `c` on its own:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// A surrogate half is reported as 3 bytes; callers that see a complete
// surrogate pair must account for it separately (a pair encodes to 4 bytes).
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte UTF-8 form carries 11 payload bits, so it reaches U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
86+
87+
template <typename Char>
88+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
89+
size_t pos = 0;
90+
size_t utf8Accumulated = 0;
91+
constexpr size_t CHUNK = 257;
92+
constexpr bool UTF16 = sizeof(Char) == 2;
93+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
94+
95+
double expansion = 1.15;
96+
97+
while (pos < length && utf8Accumulated < bufferSize) {
98+
size_t remainingInput = length - pos;
99+
size_t spaceRemaining = bufferSize - utf8Accumulated;
100+
DCHECK_GE(expansion, 1.15);
101+
102+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
103+
if (guaranteedToFit >= remainingInput) {
104+
return length;
105+
}
106+
size_t likelyToFit =
107+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
108+
size_t fitEstimate = std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
109+
size_t chunkSize = std::min(remainingInput, fitEstimate);
110+
if (chunkSize == 1) break;
111+
DCHECK_GE(chunkSize, 1);
112+
113+
size_t chunkUtf8Len;
114+
if constexpr (UTF16) {
115+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when available
116+
// For now, validate and use utf8_length_from_utf16
117+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
118+
} else {
119+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
120+
}
121+
122+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
123+
DCHECK_GT(chunkSize, guaranteedToFit);
124+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
125+
} else {
126+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
127+
pos += chunkSize;
128+
utf8Accumulated += chunkUtf8Len;
129+
}
130+
}
131+
132+
while (pos < length && utf8Accumulated < bufferSize) {
133+
size_t extra = simpleUtfEncodingLength(data[pos]);
134+
if (utf8Accumulated + extra > bufferSize) break;
135+
pos++;
136+
utf8Accumulated += extra;
137+
}
138+
139+
if (UTF16 && pos != 0 && pos != length &&
140+
isSurrogatePair(data[pos - 1], data[pos])) {
141+
if (utf8Accumulated < bufferSize) {
142+
pos++;
143+
} else {
144+
pos--;
145+
}
146+
}
147+
return pos;
148+
}
149+
} // namespace
150+
74151
void BindingData::Deserialize(Local<Context> context,
75152
Local<Object> holder,
76153
int index,
@@ -101,15 +178,64 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101178
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102179
size_t dest_length = dest->ByteLength();
103180

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
181+
size_t read = 0;
182+
size_t written = 0;
183+
v8::String::ValueView view(isolate, source);
184+
uint32_t length = view.length();
185+
186+
if (view.is_one_byte()) {
187+
auto data = reinterpret_cast<const char*>(view.data8());
188+
simdutf::result result = simdutf::validate_ascii_with_errors(
189+
data, std::min(static_cast<size_t>(length), dest_length));
190+
written = read = result.count;
191+
auto out_addr = write_result;
192+
memcpy(out_addr, data, read);
193+
out_addr += read;
194+
data += read;
195+
length -= read;
196+
dest_length -= read;
197+
if (length != 0 && dest_length != 0) {
198+
size_t rest = findBestFit(data, length, dest_length);
199+
if (rest != 0) {
200+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
201+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
202+
read += rest;
203+
}
204+
}
205+
} else {
206+
auto data = reinterpret_cast<const char16_t*>(view.data16());
207+
208+
// Check if input has unpaired surrogates - if so, convert to well-formed first
209+
simdutf::result validation_result =
210+
simdutf::validate_utf16_with_errors(data, length);
211+
212+
if (validation_result.error == simdutf::SUCCESS) {
213+
// Valid UTF-16 - use the fast path
214+
read = findBestFit(data, length, dest_length);
215+
if (read != 0) {
216+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
217+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
218+
}
219+
} else {
220+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
221+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when available
222+
std::vector<char16_t> conversion_buffer(length);
223+
simdutf::to_well_formed_utf16(data, length, conversion_buffer.data());
224+
225+
// Now use findBestFit with the well-formed data
226+
read = findBestFit(conversion_buffer.data(), length, dest_length);
227+
if (read != 0) {
228+
DCHECK_LE(simdutf::utf8_length_from_utf16(conversion_buffer.data(), read),
229+
dest_length);
230+
written = simdutf::convert_utf16_to_utf8(
231+
conversion_buffer.data(), read, write_result);
232+
}
233+
}
234+
}
235+
DCHECK_LE(written, dest_length);
110236

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
237+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
238+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113239
}
114240

115241
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)