Permalink
Browse files

Reduce the size of static data in std_unicode::tables.

`BoolTrie` works well for sets of code points spread out through
most of Unicode’s range, but it uses a lot of space for sets
with few, mostly low, code points.

This switches a few of its instances to a similar but simpler trie
data structure.

 ## Before

`size_of::<BoolTrie>()` is 1552, which is added to
`t.r3.len() * 8 + t.r5.len() + t.r6.len() * 8`:

* `Cc_table`: 1632
* `White_Space_table`: 1656
* `Pattern_White_Space_table`: 1640
* Total: 4928 bytes

 ## After

`size_of::<SmallBoolTrie>()` is 32, which is added to
`t.r1.len() + t.r2.len() * 8`:

* `Cc_table`: 51
* `White_Space_table`: 273
* `Pattern_White_Space_table`: 193
* Total: 517 bytes

 ## Difference

Every Rust program with `std` statically linked should be about 4 KB smaller.
  • Loading branch information...
1 parent 90c7c05 commit 3b208d2dac15237a46eb82c03bcc70a2b1165d20 @SimonSapin SimonSapin committed Jan 2, 2017
Showing with 110 additions and 220 deletions.
  1. +7 −6 .gitignore
  2. +58 −6 src/etc/unicode.py
  3. +45 −208 src/libstd_unicode/tables.rs
View
@@ -73,12 +73,13 @@ __pycache__/
/obj/
/rt/
/rustllvm/
-/src/libunicode/DerivedCoreProperties.txt
-/src/libunicode/EastAsianWidth.txt
-/src/libunicode/HangulSyllableType.txt
-/src/libunicode/PropList.txt
-/src/libunicode/Scripts.txt
-/src/libunicode/UnicodeData.txt
+/src/libstd_unicode/DerivedCoreProperties.txt
+/src/libstd_unicode/DerivedNormalizationProps.txt
+/src/libstd_unicode/PropList.txt
+/src/libstd_unicode/ReadMe.txt
+/src/libstd_unicode/Scripts.txt
+/src/libstd_unicode/SpecialCasing.txt
+/src/libstd_unicode/UnicodeData.txt
/stage[0-9]+/
/target
/test/
View
@@ -23,7 +23,7 @@
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
-import fileinput, re, os, sys, operator
+import fileinput, re, os, sys, operator, math
preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
@@ -359,7 +359,23 @@ def emit_trie_lookup_range_table(f):
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
-}\n
+}
+
+pub struct SmallBoolTrie {
+ r1: &'static [u8], // first level
+ r2: &'static [u64], // leaves
+}
+
+impl SmallBoolTrie {
+ fn lookup(&self, c: char) -> bool {
+ let c = c as usize;
+ match self.r1.get(c >> 6) {
+ Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
+ None => false,
+ }
+ }
+}
+
""")
def compute_trie(rawdata, chunksize):
@@ -429,13 +445,49 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
f.write(" };\n\n")
+def emit_small_bool_trie(f, name, t_data, is_pub=True):
+ last_chunk = max(int(hi / 64) for (lo, hi) in t_data)
+ n_chunks = last_chunk + 1
+ chunks = [0] * n_chunks
+ for (lo, hi) in t_data:
+ for cp in range(lo, hi + 1):
+ if int(cp / 64) >= len(chunks):
+ print(cp, int(cp / 64), len(chunks), lo, hi)
+ chunks[int(cp / 64)] |= 1 << (cp & 63)
+
+ pub_string = ""
+ if is_pub:
+ pub_string = "pub "
+ f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
+ % (pub_string, name))
+
+ (r1, r2) = compute_trie(chunks, 1)
+
+ f.write(" r1: &[\n")
+ data = ','.join(str(node) for node in r1)
+ format_table_content(f, data, 12)
+ f.write("\n ],\n")
+
+ f.write(" r2: &[\n")
+ data = ','.join('0x%016x' % node for node in r2)
+ format_table_content(f, data, 12)
+ f.write("\n ],\n")
+
+ f.write(" };\n\n")
+
def emit_property_module(f, mod, tbl, emit):
f.write("pub mod %s {\n" % mod)
for cat in sorted(emit):
- emit_bool_trie(f, "%s_table" % cat, tbl[cat])
- f.write(" pub fn %s(c: char) -> bool {\n" % cat)
- f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
- f.write(" }\n\n")
+ if cat in ["Cc", "White_Space", "Pattern_White_Space"]:
+ emit_small_bool_trie(f, "%s_table" % cat, tbl[cat])
+ f.write(" pub fn %s(c: char) -> bool {\n" % cat)
+ f.write(" %s_table.lookup(c)\n" % cat)
+ f.write(" }\n\n")
+ else:
+ emit_bool_trie(f, "%s_table" % cat, tbl[cat])
+ f.write(" pub fn %s(c: char) -> bool {\n" % cat)
+ f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
+ f.write(" }\n\n")
f.write("}\n\n")
def emit_conversions_module(f, to_upper, to_lower, to_title):
Oops, something went wrong.

0 comments on commit 3b208d2

Please sign in to comment.