Skip to content

Commit 3edf7c0

Browse files
committed
Replace some std::set<int> with a sorted std::vector
`Transducer::determinize()` doesn't do very much inserting into sets, but it does iterate over them and use them as map keys, both of which are operations that are substantially sped up by ensuring that the elements of the set are contiguous. Thus this commit adds `sorted_vector`, which is a wrapper around `std::vector` that ensures that inserted elements are sorted and unique. The results are significant improvements in both runtime and memory usage for transducer minimization.

Effect on `lt-comp`:

| | eng | oci |
|----------|---------------|---------------|
| orig | 136 MB 23.9 s | 980 MB 219 s |
| pre-comp | 160 MB 16.8 s | 1120 MB 163 s |
| VecSet | 99 MB 12.7 s | 800 MB 116 s |
| net diff | -27% -47% | -18% -47% |
1 parent 9111665 commit 3edf7c0

File tree

3 files changed

+286
-9
lines changed

3 files changed

+286
-9
lines changed

lttoolbox/Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
44
match_exe.h match_node.h match_state.h my_stdio.h node.h \
55
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \
66
transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
7-
ustring.h
7+
ustring.h sorted_vector.hpp
88
cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
99
expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
1010
match_node.cc match_state.cc node.cc pattern_list.cc \

lttoolbox/sorted_vector.hpp

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
/*
2+
* Copyright (C) 2022 Apertium
3+
*
4+
* This program is free software; you can redistribute it and/or
5+
* modify it under the terms of the GNU General Public License as
6+
* published by the Free Software Foundation; either version 2 of the
7+
* License, or (at your option) any later version.
8+
*
9+
* This program is distributed in the hope that it will be useful, but
10+
* WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12+
* General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with this program; if not, see <https://www.gnu.org/licenses/>.
16+
*/
17+
18+
#pragma once
19+
#ifndef c6d28b7452ec699b_SORTED_VECTOR_HPP
20+
#define c6d28b7452ec699b_SORTED_VECTOR_HPP
21+
#include <set>
22+
#include <vector>
23+
#include <algorithm>
24+
#include <functional>
25+
26+
namespace detail {
/**
 * Reports whether the range [first, last) is ordered according to comp.
 *
 * Thin wrapper over std::is_sorted, kept so existing callers of
 * detail::is_sorted continue to compile.
 *
 * @param first  start of the range to inspect
 * @param last   one past the end of the range to inspect
 * @param comp   strict-weak-ordering predicate; comp(a, b) is true when a
 *               must come before b
 * @return true if no element compares less than its predecessor (an empty
 *         or single-element range is always sorted), false otherwise
 */
template<typename ForwardIt, typename Comp>
bool is_sorted(ForwardIt first, ForwardIt last, Comp comp) {
  return std::is_sorted(first, last, comp);
}
}
41+
42+
/**
 * Set-like container backed by a std::vector.
 *
 * The elements are kept ordered according to Comp and free of duplicates,
 * where two values count as duplicates when neither compares before the
 * other under Comp. Storage is contiguous, so iteration and whole-container
 * comparison operate on a plain std::vector.
 *
 * @tparam T     element type
 * @tparam Comp  strict-weak-ordering predicate used for ordering and for
 *               detecting duplicates
 */
template<typename T, typename Comp = std::less<T>>
class sorted_vector {
public:
  typedef typename std::vector<T> container;
  typedef typename container::iterator iterator;
  typedef typename container::const_iterator const_iterator;
  typedef typename container::const_reverse_iterator const_reverse_iterator;
  typedef typename container::size_type size_type;
  typedef T value_type;
  typedef T key_type;

  /** Creates an empty container. */
  sorted_vector() {}

  /**
   * Builds the container from the contents of a std::set.
   *
   * Deliberately not `explicit`: NOTE(review) callers appear to rely on the
   * implicit std::set -> sorted_vector conversion; confirm before changing.
   */
  sorted_vector(const std::set<T>& o) {
    insert(o.begin(), o.end());
  }

  /**
   * Inserts a single value at its sorted position.
   *
   * @return pair of (iterator to the element equivalent to t, true if the
   *         value was newly inserted / false if it was already present)
   */
  std::pair<iterator, bool> insert(T t) {
    iterator pos = std::lower_bound(elements.begin(), elements.end(), t, comp);
    // lower_bound guarantees !comp(*pos, t), so t is absent exactly when we
    // ran off the end or t sorts strictly before *pos.
    if (pos == elements.end() || comp(t, *pos)) {
      pos = elements.insert(pos, t);
      return std::make_pair(pos, true);
    }
    return std::make_pair(pos, false);
  }

  /**
   * Inserts every value in [b, e), merging it with the current contents.
   *
   * The range is traversed more than once, so It must be a multi-pass
   * iterator. Input that is already ordered under Comp is merged directly;
   * anything else is copied and sorted first.
   */
  template<typename It>
  void insert(It b, It e) {
    size_t d = std::distance(b, e);
    if (d == 0) {
      // Nothing to merge; skip the copy/merge/unique round trip.
      return;
    }
    if (d == 1) {
      insert(*b);
      return;
    }

    // Scratch buffers are reused across calls on the same thread so that
    // repeated bulk inserts do not reallocate every time.
    static thread_local container merged;
    merged.resize(0);
    merged.reserve(elements.size() + d);

    if (std::is_sorted(b, e, comp)) {
      std::merge(elements.begin(), elements.end(), b, e, std::back_inserter(merged), comp);
    }
    else {
      static thread_local container sorted;
      sorted.assign(b, e);
      std::sort(sorted.begin(), sorted.end(), comp);
      std::merge(elements.begin(), elements.end(), sorted.begin(), sorted.end(), std::back_inserter(merged), comp);
    }

    merged.swap(elements);
    // Drop duplicates using the same equivalence as single-element insert()
    // (neither value sorts before the other) rather than operator==.
    iterator dup = std::unique(elements.begin(), elements.end(),
      [this](const T& lhs, const T& rhs) {
        return !comp(lhs, rhs) && !comp(rhs, lhs);
      });
    elements.erase(dup, elements.end());
  }

  /** Alias for insert(t); the value still lands at its sorted position. */
  void push_back(T t) {
    insert(t);
  }

  /**
   * Removes the element equivalent to t, if any.
   *
   * @return true if an element was removed, false if t was not present
   */
  bool erase(T t) {
    // Cheap rejections: empty container, or t outside [front, back].
    if (elements.empty() || comp(elements.back(), t) || comp(t, elements.front())) {
      return false;
    }
    iterator pos = lower_bound(t);
    // lower_bound guarantees !comp(*pos, t); a match also needs !comp(t, *pos).
    if (pos != elements.end() && !comp(t, *pos)) {
      elements.erase(pos);
      return true;
    }
    return false;
  }

  /**
   * Removes the element at position it.
   *
   * @return iterator to the element that followed the removed one
   */
  const_iterator erase(const_iterator it) {
    size_type o = std::distance<const_iterator>(elements.begin(), it);
    return elements.erase(elements.begin() + o);
  }

  /**
   * Removes every value found in [b, e) from this container.
   *
   * Note that [b, e) is a range of *values* to remove, not a range of
   * positions inside this container.
   */
  template<typename It>
  void erase(It b, It e) {
    for (; b != e; ++b) {
      erase(*b);
    }
  }

  /**
   * Looks up t.
   *
   * @return iterator to the element equivalent to t, or end() if absent
   */
  const_iterator find(T t) const {
    // Cheap rejections: empty container, or t outside [front, back].
    if (elements.empty() || comp(elements.back(), t) || comp(t, elements.front())) {
      return elements.end();
    }
    const_iterator pos = lower_bound(t);
    if (pos != elements.end() && comp(t, *pos)) {
      return elements.end();
    }
    return pos;
  }

  /** @return 1 if t is present, 0 otherwise */
  size_t count(T t) const {
    return (find(t) != end());
  }

  iterator begin() {
    return elements.begin();
  }

  iterator end() {
    return elements.end();
  }

  const_iterator begin() const {
    return elements.begin();
  }

  const_iterator end() const {
    return elements.end();
  }

  const_iterator cbegin() const {
    return elements.cbegin();
  }

  const_iterator cend() const {
    return elements.cend();
  }

  const_reverse_iterator rbegin() const {
    return elements.rbegin();
  }

  const_reverse_iterator rend() const {
    return elements.rend();
  }

  /** @return copy of the first element; the container must not be empty */
  T front() const {
    return elements.front();
  }

  /** @return copy of the last element; the container must not be empty */
  T back() const {
    return elements.back();
  }

  /** @return iterator to the first element that does not sort before t */
  iterator lower_bound(T t) {
    return std::lower_bound(elements.begin(), elements.end(), t, comp);
  }

  /** @return iterator to the first element that does not sort before t */
  const_iterator lower_bound(T t) const {
    return std::lower_bound(elements.begin(), elements.end(), t, comp);
  }

  /** @return iterator to the first element that sorts after t */
  const_iterator upper_bound(T t) const {
    return std::upper_bound(elements.begin(), elements.end(), t, comp);
  }

  /**
   * Tests whether this container and other share at least one element.
   *
   * Walks both sorted sequences in lockstep, so the cost is linear in the
   * combined size. Elements are matched with Comp (neither sorts before the
   * other), the same notion of equivalence used by insert() and find().
   */
  bool intersects(const sorted_vector& other) const {
    const_iterator ti = begin();
    const_iterator oi = other.begin();
    const const_iterator te = end();
    const const_iterator oe = other.end();
    while (ti != te && oi != oe) {
      if (comp(*ti, *oi)) {
        ++ti;
      }
      else if (comp(*oi, *ti)) {
        ++oi;
      }
      else {
        return true;
      }
    }
    return false;
  }

  size_type size() const {
    return elements.size();
  }

  size_type capacity() const {
    return elements.capacity();
  }

  bool empty() const {
    return elements.empty();
  }

  /** Replaces the contents with the values in [b, e), sorted and de-duplicated. */
  template<typename It>
  void assign(It b, It e) {
    clear();
    insert(b, e);
  }

  /**
   * Replaces the contents with the values in [b, e).
   *
   * Fast path for ranges that are already strictly ordered under Comp (for
   * example, a slice of another sorted_vector): they are copied verbatim.
   * Any other range goes through the sorting/de-duplicating insert so the
   * container's ordering invariant cannot be broken by the caller.
   */
  void assign(const_iterator b, const_iterator e) {
    // Strictly increasing <=> no adjacent pair (x, y) with !comp(x, y).
    const_iterator bad = std::adjacent_find(b, e,
      [this](const T& lhs, const T& rhs) {
        return !comp(lhs, rhs);
      });
    if (bad == e) {
      elements.assign(b, e);
    }
    else {
      clear();
      insert(b, e);
    }
  }

  /** Exchanges the stored elements with those of other. */
  void swap(sorted_vector& other) {
    elements.swap(other.elements);
  }

  void clear() {
    elements.clear();
  }

  /**
   * Re-sorts the stored elements under Comp.
   *
   * Only needed after the underlying vector was modified through get().
   * Duplicates are not removed.
   */
  void sort() {
    std::sort(elements.begin(), elements.end(), comp);
  }

  /** Removes the last (greatest under Comp) element; the container must not be empty. */
  void pop_back() {
    elements.pop_back();
  }

  /**
   * Grants direct access to the underlying vector.
   *
   * Writes made through this reference bypass the ordering and uniqueness
   * maintenance done by insert(); the caller is responsible for restoring
   * the invariant (see sort()).
   */
  container& get() {
    return elements;
  }

  /**
   * Orders two containers by comparing their underlying vectors with
   * std::vector's operator<, which makes sorted_vector usable as a
   * std::map key.
   */
  bool operator<(const sorted_vector& o) const {
    return elements < o.elements;
  }

private:
  container elements;
  Comp comp;
};
276+
277+
#endif

lttoolbox/transducer.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <lttoolbox/my_stdio.h>
2121
#include <lttoolbox/deserialiser.h>
2222
#include <lttoolbox/serialiser.h>
23+
#include <lttoolbox/sorted_vector.hpp>
2324

2425
#include <cstdlib>
2526
#include <iostream>
@@ -314,16 +315,16 @@ Transducer::isEmptyIntersection(std::set<int> const &s1, std::set<int> const &s2
314315
void
315316
Transducer::determinize(int const epsilon_tag)
316317
{
317-
std::vector<std::set<int> > R(2);
318-
std::vector<std::set<int>> Q_prime;
319-
std::map<std::set<int>, int> Q_prime_inv;
318+
std::vector<sorted_vector<int>> R(2);
319+
std::vector<sorted_vector<int>> Q_prime;
320+
std::map<sorted_vector<int>, int> Q_prime_inv;
320321

321322
std::map<int, std::multimap<int, std::pair<int, double> > > transitions_prime;
322323

323324
// We're almost certainly going to need the closure of (nearly) every
324325
// state, and we're often going to need the closure several times,
325326
// so it's faster to precompute (though it does slow things down a bit).
326-
std::vector<std::set<int>> all_closures;
327+
std::vector<sorted_vector<int>> all_closures;
327328
all_closures.reserve(transitions.size());
328329
for (size_t i = 0; i < transitions.size(); i++) {
329330
all_closures.push_back(closure(i, epsilon_tag));
@@ -345,7 +346,7 @@ Transducer::determinize(int const epsilon_tag)
345346

346347
int t = 0;
347348

348-
std::set<int> finals_state;
349+
sorted_vector<int> finals_state;
349350
for(auto& it : finals) {
350351
finals_state.insert(it.first);
351352
}
@@ -357,8 +358,7 @@ Transducer::determinize(int const epsilon_tag)
357358

358359
for(auto& it : R[t])
359360
{
360-
if(!isEmptyIntersection(Q_prime[it], finals_state))
361-
{
361+
if (Q_prime[it].intersects(finals_state)) {
362362
double w = default_weight;
363363
auto it3 = finals.find(it);
364364
if (it3 != finals.end()) {
@@ -367,7 +367,7 @@ Transducer::determinize(int const epsilon_tag)
367367
finals_prime.insert(std::make_pair(it, w));
368368
}
369369

370-
std::map<std::pair<int, double>, std::set<int> > mymap;
370+
std::map<std::pair<int, double>, sorted_vector<int> > mymap;
371371

372372
for(auto& it2 : Q_prime[it])
373373
{

0 commit comments

Comments
 (0)