@@ -71,6 +71,83 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171 return info;
7272}
7373
74+ namespace {
// Byte-count constant (4096). It is not referenced by any code visible in this
// section. NOTE(review): the name suggests an upper bound for stack-allocated
// scratch buffers - confirm against the rest of the file.
constexpr int MAX_SIZE_FOR_STACK_ALLOC{4096};
76+
// Returns true when `lead` is a UTF-16 high (lead) surrogate, 0xD800..0xDBFF,
// and `trail` is a UTF-16 low (trail) surrogate, 0xDC00..0xDFFF, i.e. the two
// code units together form one supplementary code point.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const bool trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  return leadIsHigh && trailIsLow;
}
80+
// Returns the number of UTF-8 bytes needed to encode the UTF-16 code unit `c`
// when it is treated as a stand-alone code point:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// Surrogate code units (0xD800..0xDFFF) fall in the last range and therefore
// report 3 bytes each; callers that need the 4-byte length of a surrogate
// pair have to account for that themselves.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte UTF-8 form covers everything below U+0800 (not U+0400).
  if (c < 0x800) return 2;
  return 3;
}
86+
87+ template <typename Char>
88+ size_t findBestFit (const Char* data, size_t length, size_t bufferSize) {
89+ size_t pos = 0 ;
90+ size_t utf8Accumulated = 0 ;
91+ constexpr size_t CHUNK = 257 ;
92+ constexpr bool UTF16 = sizeof (Char) == 2 ;
93+ constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2 ;
94+
95+ double expansion = 1.15 ;
96+
97+ while (pos < length && utf8Accumulated < bufferSize) {
98+ size_t remainingInput = length - pos;
99+ size_t spaceRemaining = bufferSize - utf8Accumulated;
100+ DCHECK_GE (expansion, 1.15 );
101+
102+ size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
103+ if (guaranteedToFit >= remainingInput) {
104+ return length;
105+ }
106+ size_t likelyToFit =
107+ std::min (static_cast <size_t >(spaceRemaining / expansion), CHUNK);
108+ size_t fitEstimate = std::max (size_t {1 }, std::max (guaranteedToFit, likelyToFit));
109+ size_t chunkSize = std::min (remainingInput, fitEstimate);
110+ if (chunkSize == 1 ) break ;
111+ DCHECK_GE (chunkSize, 1 );
112+
113+ size_t chunkUtf8Len;
114+ if constexpr (UTF16) {
115+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when available
116+ // For now, validate and use utf8_length_from_utf16
117+ chunkUtf8Len = simdutf::utf8_length_from_utf16 (data + pos, chunkSize);
118+ } else {
119+ chunkUtf8Len = simdutf::utf8_length_from_latin1 (data + pos, chunkSize);
120+ }
121+
122+ if (utf8Accumulated + chunkUtf8Len > bufferSize) {
123+ DCHECK_GT (chunkSize, guaranteedToFit);
124+ expansion = std::max (expansion * 1.1 , (chunkUtf8Len * 1.1 ) / chunkSize);
125+ } else {
126+ expansion = std::max (1.15 , (chunkUtf8Len * 1.1 ) / chunkSize);
127+ pos += chunkSize;
128+ utf8Accumulated += chunkUtf8Len;
129+ }
130+ }
131+
132+ while (pos < length && utf8Accumulated < bufferSize) {
133+ size_t extra = simpleUtfEncodingLength (data[pos]);
134+ if (utf8Accumulated + extra > bufferSize) break ;
135+ pos++;
136+ utf8Accumulated += extra;
137+ }
138+
139+ if (UTF16 && pos != 0 && pos != length &&
140+ isSurrogatePair (data[pos - 1 ], data[pos])) {
141+ if (utf8Accumulated < bufferSize) {
142+ pos++;
143+ } else {
144+ pos--;
145+ }
146+ }
147+ return pos;
148+ }
149+ } // namespace
150+
74151void BindingData::Deserialize (Local<Context> context,
75152 Local<Object> holder,
76153 int index,
@@ -101,15 +178,64 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101178 char * write_result = static_cast <char *>(buf->Data ()) + dest->ByteOffset ();
102179 size_t dest_length = dest->ByteLength ();
103180
104- size_t nchars;
105- size_t written = source->WriteUtf8V2 (isolate,
106- write_result,
107- dest_length,
108- String::WriteFlags::kReplaceInvalidUtf8 ,
109- &nchars);
181+ size_t read = 0 ;
182+ size_t written = 0 ;
183+ v8::String::ValueView view (isolate, source);
184+ uint32_t length = view.length ();
185+
186+ if (view.is_one_byte ()) {
187+ auto data = reinterpret_cast <const char *>(view.data8 ());
188+ simdutf::result result = simdutf::validate_ascii_with_errors (
189+ data, std::min (static_cast <size_t >(length), dest_length));
190+ written = read = result.count ;
191+ auto out_addr = write_result;
192+ memcpy (out_addr, data, read);
193+ out_addr += read;
194+ data += read;
195+ length -= read;
196+ dest_length -= read;
197+ if (length != 0 && dest_length != 0 ) {
198+ size_t rest = findBestFit (data, length, dest_length);
199+ if (rest != 0 ) {
200+ DCHECK_LE (simdutf::utf8_length_from_latin1 (data, rest), dest_length);
201+ written += simdutf::convert_latin1_to_utf8 (data, rest, out_addr);
202+ read += rest;
203+ }
204+ }
205+ } else {
206+ auto data = reinterpret_cast <const char16_t *>(view.data16 ());
207+
208+ // Check if input has unpaired surrogates - if so, convert to well-formed first
209+ simdutf::result validation_result =
210+ simdutf::validate_utf16_with_errors (data, length);
211+
212+ if (validation_result.error == simdutf::SUCCESS) {
213+ // Valid UTF-16 - use the fast path
214+ read = findBestFit (data, length, dest_length);
215+ if (read != 0 ) {
216+ DCHECK_LE (simdutf::utf8_length_from_utf16 (data, read), dest_length);
217+ written = simdutf::convert_utf16_to_utf8 (data, read, write_result);
218+ }
219+ } else {
220+ // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
221+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when available
222+ std::vector<char16_t > conversion_buffer (length);
223+ simdutf::to_well_formed_utf16 (data, length, conversion_buffer.data ());
224+
225+ // Now use findBestFit with the well-formed data
226+ read = findBestFit (conversion_buffer.data (), length, dest_length);
227+ if (read != 0 ) {
228+ DCHECK_LE (simdutf::utf8_length_from_utf16 (conversion_buffer.data (), read),
229+ dest_length);
230+ written = simdutf::convert_utf16_to_utf8 (
231+ conversion_buffer.data (), read, write_result);
232+ }
233+ }
234+ }
235+ DCHECK_LE (written, dest_length);
110236
111- binding_data->encode_into_results_buffer_ [0 ] = nchars ;
112- binding_data->encode_into_results_buffer_ [1 ] = written;
237+ binding_data->encode_into_results_buffer_ [0 ] = static_cast < double >(read) ;
238+ binding_data->encode_into_results_buffer_ [1 ] = static_cast < double >( written) ;
113239}
114240
115241// Encode a single string to a UTF-8 Uint8Array (not Buffer).
0 commit comments