• R/O
  • SSH
  • HTTPS

unf: コミット


コミットメタ情報

リビジョン53 (tree)
日時2011-11-19 18:38:58
作者phjgt

ログメッセージ

TRIE構築系のC++ソース・コマンドを削除

変更サマリ

差分

--- trunk/src/unf/gen-unf-table.cc (revision 52)
+++ trunk/src/unf/gen-unf-table.cc (nonexistent)
@@ -1,146 +0,0 @@
1-#include <iostream>
2-#include <fstream>
3-#include <string>
4-#include <algorithm>
5-#include <cstdlib>
6-#include "trie/builder.hh"
7-
8-/**
9- * macro and function declaration
10- */
11-#define CAN_DEC_FILE (data_dir+"/canonical-decomposition.def").c_str()
12-#define COM_DEC_FILE (data_dir+"/compatibility-decomposition.def").c_str()
13-#define CAN_COM_FILE (data_dir+"/canonical-composition.def").c_str()
14-#define CCC_FILE (data_dir+"/canonical-combining-class.def").c_str()
15-#define NFC_ILLEGAL_FILE (data_dir+"/nfc-illegal-char.def").c_str()
16-#define NFKC_ILLEGAL_FILE (data_dir+"/nfkc-illegal-char.def").c_str()
17-
18-bool read_mapping_definition(const char* filepath, const char* prefix, std::ofstream& out, std::string& buffer);
19-bool read_char_attr_definition(const char* filepath, const char* prefix, std::ofstream& out);
20-
21-/**
22- * main function
23- */
24-int main(int argc, char** argv) {
25- if(argc != 3) {
26- std::cerr << "Usage: gen-unf-table <table-file(C++ header)> <data-dir>" << std::endl;
27- return 1;
28- }
29-
30- const char* table_file = argv[1];
31- const std::string data_dir = argv[2];
32-
33- std::cerr << "= create table file (C++ header file): " << table_file << std::endl;
34- std::ofstream out(table_file);
35- if(!out) {
36- std::cerr << "Can't open file for writing: " << table_file << std::endl;
37- return 1;
38- }
39-
40- // generate C++ header file
41- out << "#ifndef UNF_TABLE_HH" << std::endl
42- << "#define UNF_TABLE_HH" << std::endl
43- << "namespace UNF {" << std::endl
44- << " namespace TABLE {" << std::endl;
45-
46- // read definitions and write Node array
47- std::string buffer;
48- std::cerr << " == canonical decomposition mapping" << std::endl;
49- if(read_mapping_definition(CAN_DEC_FILE, "CANONICAL_DECOM", out, buffer)==false)
50- return 1;
51-
52- std::cerr << " == compatibility decomposition mapping" << std::endl;
53- if(read_mapping_definition(COM_DEC_FILE, "COMPATIBILITY_DECOM", out, buffer)==false)
54- return 1;
55-
56- std::cerr << " == canonical composition mapping" << std::endl;
57- if(read_mapping_definition(CAN_COM_FILE, "CANONICAL_COM", out, buffer)==false)
58- return 1;
59-
60- std::cerr << " == canonical combining class" << std::endl;
61- if(read_char_attr_definition(CCC_FILE, "CANONICAL_CLASS", out)==false)
62- return 1;
63-
64- std::cerr << " == NFC illegal character list" << std::endl;
65- if(read_char_attr_definition(NFC_ILLEGAL_FILE, "NFC_ILLEGAL", out)==false)
66- return 1;
67-
68- std::cerr << " == NFKC illegal character list" << std::endl;
69- if(read_char_attr_definition(NFKC_ILLEGAL_FILE, "NFKC_ILLEGAL", out)==false)
70- return 1;
71-
72- // write value string
73- out << "const char VALUE[]={" << std::endl;
74- for(unsigned i=0; i < buffer.size(); i++) {
75- out << std::setw(4) << std::setfill(' ')
76- << std::right << std::dec << static_cast<int>(buffer[i]);
77-
78- if(i+1 < buffer.size()) out << ',';
79- if((i+1)%20==0) out << std::endl;
80- }
81- out << "};" << std::endl;
82-
83- out << " }" << std::endl
84- << "}" << std::endl
85- << "#endif" << std::endl;
86-
87- std::cerr << "DONE" << std::endl;
88- return 0;
89-}
90-
91-/**
92- * function definition
93- */
94-// File format:
95-// [FROM STRING]\t[TO STRING]\n
96-bool read_mapping_definition(const char* filepath, const char* prefix, std::ofstream& out, std::string& buffer) {
97- using UNF::Trie::MappingKey;
98- using UNF::Trie::MappingKeyList;
99-
100- std::ifstream in(filepath);
101- if(!in) {
102- std::cerr << "Can't open file: " << filepath << std::endl;
103- return false;
104- }
105-
106- MappingKeyList keys;
107- for(std::string line; std::getline(in,line);) {
108- std::size_t p = line.find('\t');
109- std::string key = line.substr(0,p);
110- std::string value = line.substr(p+1);
111-
112- // search sharable value
113- // - using liner search (for simplicity)
114- // - very slow
115- std::size_t pos = buffer.find(value);
116- if(pos == std::string::npos)
117- keys.push_back(MappingKey(key, value, buffer));
118- else
119- keys.push_back(MappingKey(key, value, pos));
120- }
121- std::sort(keys.begin(), keys.end());
122-
123- UNF::Trie::Builder<MappingKeyList>(keys).build().output_nodes_cpp_array(out, prefix);
124- return true;
125-}
126-
127-// File format:
128-// [two-digit hexadecimal] [CHARACTER]
129-bool read_char_attr_definition(const char* filepath, const char* prefix, std::ofstream& out) {
130- using UNF::Trie::AttrKey;
131- using UNF::Trie::AttrKeyList;
132-
133- std::ifstream in(filepath);
134- if(!in) {
135- std::cerr << "Can't open file: " << filepath << std::endl;
136- return false;
137- }
138-
139- AttrKeyList keys;
140- for(std::string line; std::getline(in,line);)
141- keys.push_back(AttrKey(line.substr(3), strtol(line.substr(0,2).c_str(),NULL,16)));
142- std::sort(keys.begin(), keys.end());
143-
144- UNF::Trie::Builder<AttrKeyList>(keys).build().output_nodes_cpp_array(out, prefix);
145- return true;
146-}
--- trunk/src/unf/trie/node_allocator.hh (revision 52)
+++ trunk/src/unf/trie/node_allocator.hh (nonexistent)
@@ -1,105 +0,0 @@
1-#ifndef UNF_TRIE_NODE_ALLOCATOR_HH
2-#define UNF_TRIE_NODE_ALLOCATOR_HH
3-
4-#include <vector>
5-
6-namespace UNF {
7- namespace Trie {
8- class NodeAllocator {
9- typedef unsigned NodeIndex;
10- typedef std::vector<unsigned char> UCharList;
11- typedef std::vector<bool> BitSet;
12-
13- private:
14- struct ForwardLink {
15- ForwardLink(NodeIndex i) : idx(i), next(NULL) {}
16-
17- NodeIndex append(NodeIndex beg_idx, unsigned num) {
18- ForwardLink* cur=this;
19- while(cur->next) cur=cur->next;
20-
21- for(NodeIndex i=beg_idx; i < beg_idx+num; i++)
22- cur = cur->next = new ForwardLink(i);
23- return beg_idx+num-1;
24- }
25-
26- void remove_next() {
27- ForwardLink* tmp=next; next=next->next; delete tmp;
28- }
29-
30- void delete_all_tail() {
31- for(ForwardLink* tmp=next; next; tmp=next) {
32- next=next->next;
33- delete tmp;
34- }
35- }
36-
37- NodeIndex idx;
38- ForwardLink* next;
39- };
40-
41- public:
42- static const unsigned PER_LINK_SIZE=0x10000;
43-
44- public:
45- NodeAllocator(unsigned node_size) : root(0), used_base(node_size) {
46- last_idx = root.append(1,PER_LINK_SIZE);
47- }
48- ~NodeAllocator() {
49- root.delete_all_tail();
50- }
51-
52- int allocate(unsigned char code) {
53- static UCharList codes(1); // XXX: thread unsafe
54- codes[0] = code;
55- return allocate(codes);
56- }
57-
58- int allocate(const UCharList& codes, ForwardLink* prev=NULL) {
59- if(!prev) prev=&root;
60- if(!prev->next)
61- last_idx = prev->append(last_idx+1, PER_LINK_SIZE);
62-
63- ForwardLink *cur = prev->next;
64- unsigned char min_cd = codes.front();
65-
66- for(; cur->idx <= min_cd; prev=cur, cur=cur->next);
67- for(; cur->next; prev=cur,cur=cur->next) {
68- NodeIndex x = cur->idx - min_cd;
69- if(used_base[x]==false && can_allocate(cur, codes, x)) {
70- used_base[x] = true;
71- alloc(prev,codes,x);
72- return x;
73- }
74- }
75- return allocate(codes, cur);
76- }
77-
78- private:
79- bool can_allocate(ForwardLink* cur, const UCharList& codes, NodeIndex x) const {
80- cur=cur->next;
81- for(unsigned i=1;i < codes.size(); i++) {
82- for(; cur && cur->idx < x+codes[i]; cur=cur->next);
83- if(!cur || cur->idx > x+codes[i])
84- return false;
85- }
86- return true;
87- }
88-
89- void alloc(ForwardLink* prev, const UCharList& codes, NodeIndex x) {
90- prev->remove_next();
91- for(unsigned i=1; i < codes.size(); i++) {
92- for(; prev->next->idx < x+codes[i]; prev=prev->next);
93- prev->remove_next();
94- }
95- }
96-
97- private:
98- ForwardLink root;
99- NodeIndex last_idx;
100- BitSet used_base;
101- };
102- }
103-}
104-
105-#endif
--- trunk/src/unf/trie/builder.hh (revision 52)
+++ trunk/src/unf/trie/builder.hh (nonexistent)
@@ -1,170 +0,0 @@
1-#ifndef UNF_TRIE_BUILDER_HH
2-#define UNF_TRIE_BUILDER_HH
3-
4-#include "node_allocator.hh"
5-#include "char_stream.hh"
6-#include "node.hh"
7-#include <fstream>
8-#include <cstring>
9-#include <string>
10-#include <iomanip>
11-
12-namespace UNF {
13- namespace Trie {
14- /**
15- * Trie key class
16- */
17- struct Key {
18- Key(const std::string& key) : buf(key), cs(buf.c_str()) {}
19-
20- bool operator<(const Key& k) const { return strcmp(cs.cur(), k.cs.cur()) < 0; }
21- unsigned char read() { return cs.read(); }
22- unsigned char prev() const { return cs.prev(); }
23- unsigned char peek() const { return cs.peek(); }
24- void reset() { cs.setCur(buf.c_str()); }
25- virtual void set_node_value(Node& node) = 0;
26-
27- std::string buf;
28- CharStream cs;
29- };
30-
31- struct AttrKey : public Key {
32- AttrKey(const std::string& key, unsigned attr) : Key(key), attr(attr) {}
33- virtual void set_node_value(Node& node) { node.set_value(attr); }
34-
35- unsigned attr;
36- };
37- typedef std::vector<AttrKey> AttrKeyList;
38-
39- struct MappingKey : public Key {
40- static const unsigned OFFSET_BITLEN = 18;
41- static const unsigned OFFSET_BITMASK = 0x3FFFF;
42-
43- MappingKey(const std::string& key, const std::string& value, std::string& buffer)
44- : Key(key) {
45- value_info = (value.size()<<OFFSET_BITLEN) | buffer.size();
46- buffer += value;
47- }
48- MappingKey(const std::string& key, const std::string& value, unsigned value_info)
49- : Key(key) {
50- this->value_info = (value.size()<<OFFSET_BITLEN) | (value_info&OFFSET_BITMASK);
51- }
52-
53- virtual void set_node_value(Node& node) { node.set_value(value_info); }
54-
55- unsigned value_info;
56- };
57- typedef std::vector<MappingKey> MappingKeyList;
58-
59- /**
60- * Builder class
61- */
62- template<class KeyList>
63- class Builder {
64- public:
65- Builder(KeyList& keys)
66- : keys(keys), node_size(count_node(keys)*1.5), alloc(node_size) {
67- nodes = new Node[node_size];
68- }
69- ~Builder() {
70- delete [] nodes;
71- }
72-
73- Builder& build() {
74- build_impl(0, keys.size(), 0);
75- return *this;
76- }
77-
78- void output_nodes_cpp_array(std::ofstream& out, const char* prefix) {
79- if(node_size > 0xFF)
80- while(nodes[node_size-0xFF].is_unused())
81- node_size--;
82-
83- out << "const unsigned " << prefix << "_NODES[]={" << std::endl;
84- for(unsigned i=0; i < node_size; i++) {
85- out << "0x" << std::hex << std::setw(8) << std::setfill('0') << std::right << std::hex << nodes[i].to_uint();
86- if(i+1 < node_size) out << ',';
87- if((i+1)%10==0) out << std::endl;
88- }
89- out << "};" << std::endl << std::endl;
90- }
91-
92- private:
93- void build_impl(std::size_t beg, std::size_t end, int root_node) {
94- if(end-beg == 1) {
95- if(nodes[root_node].check_char()!='\0') {
96- for(; keys[beg].peek()!='\0'; keys[beg].read())
97- root_node = set_node(root_node, alloc.allocate(keys[beg].peek()), keys[beg].peek());
98- root_node = set_node(root_node, alloc.allocate('\0'), '\0');
99- }
100- keys[beg].set_node_value(nodes[root_node]);
101- return;
102- }
103-
104- std::vector<unsigned char> children;
105- std::vector<std::size_t> ranges;
106- do {
107- children.push_back(keys[beg].peek());
108- ranges.push_back(beg);
109- beg = end_of_same_node(keys, beg, end);
110- } while (beg != end);
111- ranges.push_back(end);
112-
113- int base_node = alloc.allocate(children);
114- for(std::size_t i=0; i < children.size(); i++)
115- build_impl(ranges[i], ranges[i+1], set_node(root_node, base_node, children[i]));
116- }
117-
118- int set_node(int node, int base_node, unsigned char child) {
119- int next = base_node + child;
120- nodes[node].set_base_index(base_node);
121- nodes[next].set_check_char(child);
122- return next;
123- }
124-
125- unsigned end_of_same_node(KeyList& keys, std::size_t beg, std::size_t end) {
126- unsigned char ch = keys[beg].read();
127- std::size_t cur = beg+1;
128- for(; cur < end && ch == keys[cur].peek(); cur++)
129- keys[cur].read();
130- return cur;
131- }
132-
133- unsigned count_node(KeyList& keys) {
134- unsigned count = count_node_impl(keys,0,keys.size());
135- for(std::size_t i = 0; i < keys.size(); i++)
136- keys[i].reset();
137- return count;
138- }
139-
140- unsigned count_node_impl(KeyList& keys, std::size_t beg, std::size_t end) {
141- if(end-beg == 1) {
142- unsigned count=1;
143- while(keys[beg].read()!='\0')
144- count++;
145- return count;
146- }
147-
148- std::vector<std::size_t> ranges;
149- do {
150- ranges.push_back(beg);
151- beg = end_of_same_node(keys, beg, end);
152- } while (beg != end);
153- ranges.push_back(end);
154-
155- unsigned count=ranges.size()-1;
156- for(std::size_t i=0; i < ranges.size()-1; i++)
157- count += count_node_impl(keys, ranges[i], ranges[i+1]);
158- return count;
159- }
160-
161- private:
162- KeyList& keys;
163- unsigned node_size;
164- Node* nodes;
165- NodeAllocator alloc;
166- };
167- }
168-}
169-
170-#endif
--- trunk/src/unf/trie/node.hh (revision 52)
+++ trunk/src/unf/trie/node.hh (revision 53)
@@ -5,14 +5,6 @@
55 namespace Trie {
66 class Node {
77 public:
8- Node() : data(0xFFFFFFFF) {}
9-
10- void set_base_index(unsigned base_index) { data = (data&0xFF000000)+(base_index&0x00FFFFFF); }
11- void set_value(unsigned value) { set_base_index(value); }
12- void set_check_char(unsigned char ch) { data = (ch << 24) + base(); }
13-
14- bool is_unused() const { return data==0xFFFFFFFF; }
15-
168 unsigned jump(unsigned char ch) const { return base() + ch; }
179 unsigned value() const { return base(); }
1810 unsigned check_char() const { return data>>24; }
@@ -24,9 +16,7 @@
2416 private:
2517 unsigned base() const { return data & 0xFFFFFF; }
2618
27- //XXX: private:
28- public:
29- // TODO: bit-fieldを使ってみる
19+ private:
3020 unsigned data;
3121 };
3222 }
--- trunk/src/unf/trie/searcher.hh (revision 52)
+++ trunk/src/unf/trie/searcher.hh (revision 53)
@@ -13,7 +13,6 @@
1313 : nodes(nodes), root(root), value(value) {}
1414
1515 unsigned find_value(const char* key, int default_value) const {
16- //std::cout << "# " << key << std::endl;
1716 unsigned node_index=root;
1817 for(CharStream in(key);; in.read()) {
1918 node_index = nodes[node_index].jump(in.peek());
@@ -20,7 +19,6 @@
2019 if(nodes[node_index].check_char()==in.peek()) {
2120 unsigned terminal_index = nodes[node_index].jump('\0');
2221 if(nodes[terminal_index].check_char()=='\0') {
23- //std::cout << " => " << nodes[terminal_index].value() << std::endl;
2422 return nodes[terminal_index].value();
2523 }
2624 } else
@@ -96,11 +94,6 @@
9694 };
9795
9896 class NormalizationForm : private Searcher {
99- private:
100- // TODO: => word_append
101- static unsigned v_start(unsigned v) { return v&0x3FFFF; }
102- static unsigned v_end(unsigned v) { return v_start(v)+(v>>18); }
103-
10497 public:
10598 NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
10699 : Searcher(Node::from_uint_array(node_uints), root, value) {}
@@ -116,8 +109,7 @@
116109 if(nodes[node_index].check_char()==in.prev()) {
117110 unsigned terminal_index = nodes[node_index].jump('\0');
118111 if(nodes[terminal_index].check_char()=='\0') {
119- unsigned v = nodes[terminal_index].value();
120- buffer.append(value+v_start(v), value+v_end(v));
112+ word_append(buffer, value, nodes[terminal_index].value());
121113 beg = in.cur();
122114 break;
123115 }
@@ -137,8 +129,7 @@
137129
138130 const char* const beg = in.cur();
139131 const char* current_char_head = in.cur();
140- const char* composed_char = NULL;
141- const char* composed_char_end = NULL;
132+ unsigned composed_char_info = 0;
142133
143134 unsigned node_index = root;
144135 unsigned retry_root_node = root;
@@ -161,9 +152,7 @@
161152 node_index = next_index;
162153 unsigned terminal_index = nodes[node_index].jump('\0');
163154 if(nodes[terminal_index].check_char()=='\0') {
164- unsigned v = nodes[terminal_index].value();
165- composed_char = value+v_start(v);
166- composed_char_end = value+v_end(v);
155+ composed_char_info = nodes[terminal_index].value();
167156
168157 in.mark_as_last_valid_point();
169158 if(in.eos() || retry_root_class > in.get_canonical_class())
@@ -182,9 +171,9 @@
182171 }
183172 }
184173
185- if(composed_char) {
174+ if(composed_char_info != 0) {
186175 // append composed unicode-character and skipped combining-characters
187- buf.append(composed_char, composed_char_end);
176+ word_append(buf, value, composed_char_info);
188177 in.append_skipped_chars_to_str(buf);
189178 in.reset_at_marked_point();
190179 } else {
@@ -193,6 +182,11 @@
193182 in.append_read_char_to_str(buf, beg);
194183 }
195184 }
185+
186+ private:
187+ static void word_append(std::string& buffer, const char* base, unsigned pos_info) {
188+ buffer.append(base+(pos_info&0x3FFFF), pos_info>>18);
189+ }
196190 };
197191 }
198192 }
--- trunk/Makefile (revision 52)
+++ trunk/Makefile (revision 53)
@@ -1,9 +1,8 @@
11 CXX=g++
22 CXX_FLAGS=-O2 -Wall -ansi -pedantic-errors
33
4-COMMANDS=bin/unf bin/unf-test bin/unf-time bin/gen-unf-table
4+COMMANDS=bin/unf bin/unf-test bin/unf-time
55 SRC=src/unf
6-BUILD_TRIES=${SRC}/trie/builder.hh ${SRC}/trie/char_stream.hh ${SRC}/trie/node_allocator.hh ${SRC}/trie/node.hh
76 SEARCH_TRIES=${SRC}/trie/char_stream.hh ${SRC}/trie/node.hh ${SRC}/trie/searcher.hh
87
98 all: bin ${COMMANDS}
@@ -11,9 +10,6 @@
1110 bin:
1211 mkdir bin
1312
14-bin/gen-unf-table: ${SRC}/gen-unf-table.cc ${BUILD_TRIES}
15- ${CXX} ${CXX_FLAGS} -o ${@} ${<}
16-
1713 bin/unf: ${SRC}/unf.cc ${SRC}/normalizer.hh ${SRC}/table.hh ${SRC}/util.hh ${SEARCH_TRIES}
1814 ${CXX} ${CXX_FLAGS} -o ${@} ${<}
1915
@@ -28,6 +24,3 @@
2824
2925 test: bin/unf-test
3026 ${<} < data/normalization-test.txt
31-
32-gen-table: bin/gen-unf-table
33- bin/gen-unf-table ${SRC}/table.hh data
旧リポジトリブラウザで表示