null+****@clear*****
null+****@clear*****
2012年 1月 19日 (木) 12:20:08 JST
Susumu Yata 2012-01-19 12:20:08 +0900 (Thu, 19 Jan 2012) New Revision: 76543ff617ce97ebe0fb1c7280ac422c635fe49f Log: implemented grn_dat_repair(). Modified files: lib/dat.cpp lib/dat.h lib/dat/dat.hpp lib/dat/entry.hpp lib/dat/trie.cpp lib/dat/trie.hpp Modified: lib/dat.cpp (+27 -0) =================================================================== --- lib/dat.cpp 2012-01-17 10:49:32 +0900 (0f99f68) +++ lib/dat.cpp 2012-01-19 12:20:08 +0900 (7ffc0a9) @@ -1015,4 +1015,31 @@ grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat) return GRN_SUCCESS; } +grn_rc +grn_dat_repair(grn_ctx *ctx, grn_dat *dat) +{ + if (!grn_dat_open_trie_if_needed(ctx, dat)) { + return ctx->rc; + } + const grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie); + if (!trie) { + return GRN_INVALID_ARGUMENT; + } + + char trie_path[PATH_MAX]; + grn_dat_generate_trie_path(grn_io_path(dat->io), trie_path, dat->header->file_id + 1); + try { + grn::dat::Trie().repair(*trie, trie_path); + } catch (const grn::dat::Exception &ex) { + const grn_rc error_code = grn_dat_translate_error_code(ex.code()); + ERR(error_code, "grn::dat::Trie::create failed"); + return error_code; + } + ++dat->header->file_id; + if (!grn_dat_open_trie_if_needed(ctx, dat)) { + return ctx->rc; + } + return GRN_SUCCESS; +} + } // extern "C" Modified: lib/dat.h (+10 -0) =================================================================== --- lib/dat.h 2012-01-17 10:49:32 +0900 (df9477d) +++ lib/dat.h 2012-01-19 12:20:08 +0900 (b747b76) @@ -70,6 +70,10 @@ grn_id grn_dat_lcp_search(grn_ctx *ctx, grn_dat *dat, grn_id grn_dat_curr_id(grn_ctx *ctx, grn_dat *dat); +/* + Currently, grn_dat_truncate() is available if the grn_dat object is + associated with a file. + */ grn_rc grn_dat_truncate(grn_ctx *ctx, grn_dat *dat); const char *_grn_dat_key(grn_ctx *ctx, grn_dat *dat, grn_id id, uint32_t *key_size); @@ -78,6 +82,12 @@ grn_id grn_dat_at(grn_ctx *ctx, grn_dat *dat, grn_id id); grn_rc grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat); +/* + Currently, grn_dat_repair() is available if the grn_dat object is associated + with a file. + */ +grn_rc grn_dat_repair(grn_ctx *ctx, grn_dat *dat); + #ifdef __cplusplus } #endif Modified: lib/dat/dat.hpp (+2 -0) =================================================================== --- lib/dat/dat.hpp 2012-01-17 10:49:32 +0900 (1dace8d) +++ lib/dat/dat.hpp 2012-01-19 12:20:08 +0900 (29fcd0e) @@ -142,6 +142,8 @@ const UInt32 INSERTING_FLAG = 1U << 1; const UInt32 UPDATING_FLAG = 1U << 2; const UInt32 CHANGING_MASK = REMOVING_FLAG | INSERTING_FLAG | UPDATING_FLAG; +const UInt32 MKQ_SORT_THRESHOLD = 10; + enum ErrorCode { PARAM_ERROR = -1, IO_ERROR = -2, Modified: lib/dat/entry.hpp (+0 -2) =================================================================== --- lib/dat/entry.hpp 2012-01-17 10:49:32 +0900 (60be482) +++ lib/dat/entry.hpp 2012-01-19 12:20:08 +0900 (80b4155) @@ -43,11 +43,9 @@ class Entry { } void set_key_pos(UInt32 x) { - GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0); value_ = IS_VALID_FLAG | x; } void set_next(UInt32 x) { - GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0); value_ = x; } Modified: lib/dat/trie.cpp (+239 -0) =================================================================== --- lib/dat/trie.cpp 2012-01-17 10:49:32 +0900 (ebb6b00) +++ lib/dat/trie.cpp 2012-01-19 12:20:08 +0900 (3be2845) @@ -20,6 +20,8 @@ #include <algorithm> #include <cstring> +#include "vector.hpp" + namespace grn { namespace dat { namespace { @@ -137,6 +139,14 @@ void Trie::create(const Trie &trie, new_trie.swap(this); } +void Trie::repair(const Trie &trie, const char *file_name) { + Trie new_trie; + new_trie.create_file(file_name, trie.file_size(), trie.max_num_keys(), + trie.max_num_blocks(), trie.key_buf_size()); + new_trie.repair_trie(trie); + new_trie.swap(this); +} + void Trie::open(const char *file_name) { GRN_DAT_THROW_IF(PARAM_ERROR, file_name == NULL); @@ -339,6 +349,235 @@ void Trie::build_from_trie(const Trie &trie, UInt32 src, UInt32 dest) { } } +void Trie::repair_trie(const Trie &trie) { + Vector<UInt32> valid_ids; + header_->set_max_key_id(trie.max_key_id()); + header_->set_next_key_id(trie.max_key_id()); + UInt32 prev_invalid_key_id = INVALID_KEY_ID; + for (UInt32 i = min_key_id(); i <= max_key_id(); ++i) { + const Entry &entry = trie.ith_entry(i); + if (entry.is_valid()) { + valid_ids.push_back(i); + ith_entry(i) = entry; + const Key &key = trie.get_key(entry.key_pos()); + Key::create(key_buf_.ptr() + next_key_pos(), + key.id(), key.ptr(), key.length()); + ith_entry(i).set_key_pos(next_key_pos()); + header_->set_next_key_pos( + next_key_pos() + Key::estimate_size(key.length())); + header_->set_total_key_length( + total_key_length() + key.length()); + header_->set_num_keys(num_keys() + 1); + } else { + if (prev_invalid_key_id == INVALID_KEY_ID) { + header_->set_next_key_id(i); + } else { + ith_entry(prev_invalid_key_id).set_next(i); + } + prev_invalid_key_id = i; + } + } + if (prev_invalid_key_id != INVALID_KEY_ID) { + ith_entry(prev_invalid_key_id).set_next(max_key_id() + 1); + } + mkq_sort(valid_ids.begin(), valid_ids.end(), 0); + build_from_keys(valid_ids.begin(), valid_ids.end(), 0, ROOT_NODE_ID); +} + +void Trie::build_from_keys(const UInt32 *begin, const UInt32 *end, + UInt32 depth, UInt32 node_id) { + if ((end - begin) == 1) { + ith_node(node_id).set_key_pos(ith_entry(*begin).key_pos()); + return; + } + + UInt32 offset; + { + UInt16 labels[MAX_LABEL + 1]; + UInt32 num_labels = 0; + + const UInt32 *it = begin; + if (ith_key(*it).length() == depth) { + labels[num_labels++] = TERMINAL_LABEL; + ++it; + } + + labels[num_labels++] = (UInt8)ith_key(*it)[depth]; + for (++it; it < end; ++it) { + const Key &key = ith_key(*it); + if ((UInt8)key[depth] != labels[num_labels - 1]) { + labels[num_labels++] = (UInt8)key[depth]; + } + } + + offset = find_offset(labels, num_labels); + for (UInt32 i = 0; i < num_labels; ++i) { + const UInt32 next = offset ^ labels[i]; + reserve_node(next); + ith_node(next).set_label(labels[i]); + } + + if (offset >= num_nodes()) { + reserve_block(num_blocks()); + } + ith_node(offset).set_is_offset(true); + ith_node(node_id).set_offset(offset); + } + + if (ith_key(*begin).length() == depth) { + build_from_keys(begin, begin + 1, depth + 1, offset ^ TERMINAL_LABEL); + ++begin; + } + + UInt16 label = ith_key(*begin)[depth]; + for (const UInt32 *it = begin + 1; it < end; ++it) { + const Key &key = ith_key(*it); + if ((UInt8)key[depth] != label) { + build_from_keys(begin, it, depth + 1, offset ^ label); + label = (UInt8)key[depth]; + begin = it; + } + } + build_from_keys(begin, end, depth + 1, offset ^ label); +} + +void Trie::mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth) { + while ((r - l) >= MKQ_SORT_THRESHOLD) { + UInt32 *pl = l; + UInt32 *pr = r; + UInt32 *pivot_l = l; + UInt32 *pivot_r = r; + + const int pivot = get_median(*l, *(l + (r - l) / 2), *(r - 1), depth); + for ( ; ; ) { + while (pl < pr) { + const int label = get_label(*pl, depth); + if (label > pivot) { + break; + } else if (label == pivot) { + swap_ids(pl, pivot_l); + ++pivot_l; + } + ++pl; + } + while (pl < pr) { + const int label = get_label(*--pr, depth); + if (label < pivot) { + break; + } else if (label == pivot) { + swap_ids(pr, --pivot_r); + } + } + if (pl >= pr) { + break; + } + swap_ids(pl, pr); + ++pl; + } + while (pivot_l > l) { + swap_ids(--pivot_l, --pl); + } + while (pivot_r < r) { + swap_ids(pivot_r, pr); + ++pivot_r; + ++pr; + } + + if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) { + if ((pr - pl) > 1) { + mkq_sort(pl, pr, depth + 1); + } + + if ((pl - l) < (r - pr)) { + if ((pl - l) > 1) { + mkq_sort(l, pl, depth); + } + l = pr; + } else { + if ((r - pr) > 1) { + mkq_sort(pr, r, depth); + } + r = pl; + } + } else { + if ((pl - l) > 1) { + mkq_sort(l, pl, depth); + } + + if ((r - pr) > 1) { + mkq_sort(pr, r, depth); + } + + l = pl, r = pr; + if ((pr - pl) > 1) { + ++depth; + } + } + } + + if ((r - l) > 1) { + insertion_sort(l, r, depth); + } +} + +void Trie::insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth) { + for (UInt32 *i = l + 1; i < r; ++i) { + for (UInt32 *j = i; j > l; --j) { + if (less_than(*(j - 1), *j, depth)) { + break; + } + swap_ids(j - 1, j); + } + } +} + +int Trie::get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const { + const int x = get_label(a, depth); + const int y = get_label(b, depth); + const int z = get_label(c, depth); + if (x < y) { + if (y < z) { + return y; + } else if (x < z) { + return z; + } + return x; + } else if (x < z) { + return x; + } else if (y < z) { + return z; + } + return y; +} + +int Trie::get_label(UInt32 key_id, UInt32 depth) const { + const Key &key = ith_key(key_id); + if (depth == key.length()) { + return -1; + } else { + return (UInt8)key[depth]; + } +} + +bool Trie::less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const { + const Key &lhs_key = ith_key(lhs); + const Key &rhs_key = ith_key(rhs); + const UInt32 length = (lhs_key.length() < rhs_key.length()) ? + lhs_key.length() : rhs_key.length(); + for (UInt32 i = depth; i < length; ++i) { + if (lhs_key[i] != rhs_key[i]) { + return (UInt8)lhs_key[i] < (UInt8)rhs_key[i]; + } + } + return lhs_key.length() < rhs_key.length(); +} + +void Trie::swap_ids(UInt32 *lhs, UInt32 *rhs) { + UInt32 temp = *lhs; + *lhs = *rhs; + *rhs = temp; +} + bool Trie::search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const { GRN_DAT_DEBUG_THROW_IF((ptr == NULL) && (length != 0)); Modified: lib/dat/trie.hpp (+15 -1) =================================================================== --- lib/dat/trie.hpp 2012-01-17 10:49:32 +0900 (199dac3) +++ lib/dat/trie.hpp 2012-01-19 12:20:08 +0900 (048846f) @@ -40,13 +40,15 @@ class Trie { double num_nodes_per_key = 0.0, double average_key_length = 0.0); - void create(const Trie &src_trie, + void create(const Trie &trie, const char *file_name = NULL, UInt64 file_size = 0, UInt32 max_num_keys = 0, double num_nodes_per_key = 0.0, double average_key_length = 0.0); + void repair(const Trie &trie, const char *file_name = NULL); + void open(const char *file_name); void close(); @@ -213,6 +215,18 @@ class Trie { void build_from_trie(const Trie &trie); void build_from_trie(const Trie &trie, UInt32 src, UInt32 dest); + void repair_trie(const Trie &trie); + void build_from_keys(const UInt32 *begin, const UInt32 *end, + UInt32 depth, UInt32 node_id); + + void mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth); + void insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth); + + inline int get_label(UInt32 key_id, UInt32 depth) const; + inline int get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const; + inline bool less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const; + inline static void swap_ids(UInt32 *lhs, UInt32 *rhs); + bool search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const; bool search_linker(const UInt8 *ptr, UInt32 length, UInt32 &node_id, UInt32 &query_pos) const;