[Groonga-commit] groonga/groonga [master] implemented grn_dat_repair().

アーカイブの一覧に戻る

null+****@clear***** null+****@clear*****
2012年 1月 19日 (木) 12:20:08 JST


Susumu Yata	2012-01-19 12:20:08 +0900 (Thu, 19 Jan 2012)

  New Revision: 76543ff617ce97ebe0fb1c7280ac422c635fe49f

  Log:
    implemented grn_dat_repair().

  Modified files:
    lib/dat.cpp
    lib/dat.h
    lib/dat/dat.hpp
    lib/dat/entry.hpp
    lib/dat/trie.cpp
    lib/dat/trie.hpp

  Modified: lib/dat.cpp (+27 -0)
===================================================================
--- lib/dat.cpp    2012-01-17 10:49:32 +0900 (0f99f68)
+++ lib/dat.cpp    2012-01-19 12:20:08 +0900 (7ffc0a9)
@@ -1015,4 +1015,31 @@ grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat)
   return GRN_SUCCESS;
 }
 
+grn_rc
+grn_dat_repair(grn_ctx *ctx, grn_dat *dat)
+{
+  if (!grn_dat_open_trie_if_needed(ctx, dat)) {
+    return ctx->rc;
+  }
+  const grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie);
+  if (!trie) {
+    return GRN_INVALID_ARGUMENT;
+  }
+
+  char trie_path[PATH_MAX];
+  grn_dat_generate_trie_path(grn_io_path(dat->io), trie_path, dat->header->file_id + 1);
+  try {
+    grn::dat::Trie().repair(*trie, trie_path);
+  } catch (const grn::dat::Exception &ex) {
+    const grn_rc error_code = grn_dat_translate_error_code(ex.code());
+    ERR(error_code, "grn::dat::Trie::create failed");
+    return error_code;
+  }
+  ++dat->header->file_id;
+  if (!grn_dat_open_trie_if_needed(ctx, dat)) {
+    return ctx->rc;
+  }
+  return GRN_SUCCESS;
+}
+
 }  // extern "C"

  Modified: lib/dat.h (+10 -0)
===================================================================
--- lib/dat.h    2012-01-17 10:49:32 +0900 (df9477d)
+++ lib/dat.h    2012-01-19 12:20:08 +0900 (b747b76)
@@ -70,6 +70,10 @@ grn_id grn_dat_lcp_search(grn_ctx *ctx, grn_dat *dat,
 
 grn_id grn_dat_curr_id(grn_ctx *ctx, grn_dat *dat);
 
+/*
+  Currently, grn_dat_truncate() is available if the grn_dat object is
+  associated with a file.
+ */
 grn_rc grn_dat_truncate(grn_ctx *ctx, grn_dat *dat);
 
 const char *_grn_dat_key(grn_ctx *ctx, grn_dat *dat, grn_id id, uint32_t *key_size);
@@ -78,6 +82,12 @@ grn_id grn_dat_at(grn_ctx *ctx, grn_dat *dat, grn_id id);
 
 grn_rc grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat);
 
+/*
+  Currently, grn_dat_repair() is available if the grn_dat object is associated
+  with a file.
+ */
+grn_rc grn_dat_repair(grn_ctx *ctx, grn_dat *dat);
+
 #ifdef __cplusplus
 }
 #endif

  Modified: lib/dat/dat.hpp (+2 -0)
===================================================================
--- lib/dat/dat.hpp    2012-01-17 10:49:32 +0900 (1dace8d)
+++ lib/dat/dat.hpp    2012-01-19 12:20:08 +0900 (29fcd0e)
@@ -142,6 +142,8 @@ const UInt32 INSERTING_FLAG = 1U << 1;
 const UInt32 UPDATING_FLAG  = 1U << 2;
 const UInt32 CHANGING_MASK  = REMOVING_FLAG | INSERTING_FLAG | UPDATING_FLAG;
 
+const UInt32 MKQ_SORT_THRESHOLD = 10;
+
 enum ErrorCode {
   PARAM_ERROR      = -1,
   IO_ERROR         = -2,

  Modified: lib/dat/entry.hpp (+0 -2)
===================================================================
--- lib/dat/entry.hpp    2012-01-17 10:49:32 +0900 (60be482)
+++ lib/dat/entry.hpp    2012-01-19 12:20:08 +0900 (80b4155)
@@ -43,11 +43,9 @@ class Entry {
   }
 
   void set_key_pos(UInt32 x) {
-    GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0);
     value_ = IS_VALID_FLAG | x;
   }
   void set_next(UInt32 x) {
-    GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0);
     value_ = x;
   }
 

  Modified: lib/dat/trie.cpp (+239 -0)
===================================================================
--- lib/dat/trie.cpp    2012-01-17 10:49:32 +0900 (ebb6b00)
+++ lib/dat/trie.cpp    2012-01-19 12:20:08 +0900 (3be2845)
@@ -20,6 +20,8 @@
 #include <algorithm>
 #include <cstring>
 
+#include "vector.hpp"
+
 namespace grn {
 namespace dat {
 namespace {
@@ -137,6 +139,14 @@ void Trie::create(const Trie &trie,
   new_trie.swap(this);
 }
 
+void Trie::repair(const Trie &trie, const char *file_name) {
+  Trie new_trie;
+  new_trie.create_file(file_name, trie.file_size(), trie.max_num_keys(),
+                       trie.max_num_blocks(), trie.key_buf_size());
+  new_trie.repair_trie(trie);
+  new_trie.swap(this);
+}
+
 void Trie::open(const char *file_name) {
   GRN_DAT_THROW_IF(PARAM_ERROR, file_name == NULL);
 
@@ -339,6 +349,235 @@ void Trie::build_from_trie(const Trie &trie, UInt32 src, UInt32 dest) {
   }
 }
 
+void Trie::repair_trie(const Trie &trie) {
+  Vector<UInt32> valid_ids;
+  header_->set_max_key_id(trie.max_key_id());
+  header_->set_next_key_id(trie.max_key_id());
+  UInt32 prev_invalid_key_id = INVALID_KEY_ID;
+  for (UInt32 i = min_key_id(); i <= max_key_id(); ++i) {
+    const Entry &entry = trie.ith_entry(i);
+    if (entry.is_valid()) {
+      valid_ids.push_back(i);
+      ith_entry(i) = entry;
+      const Key &key = trie.get_key(entry.key_pos());
+      Key::create(key_buf_.ptr() + next_key_pos(),
+          key.id(), key.ptr(), key.length());
+      ith_entry(i).set_key_pos(next_key_pos());
+      header_->set_next_key_pos(
+          next_key_pos() + Key::estimate_size(key.length()));
+      header_->set_total_key_length(
+          total_key_length() + key.length());
+      header_->set_num_keys(num_keys() + 1);
+    } else {
+      if (prev_invalid_key_id == INVALID_KEY_ID) {
+        header_->set_next_key_id(i);
+      } else {
+        ith_entry(prev_invalid_key_id).set_next(i);
+      }
+      prev_invalid_key_id = i;
+    }
+  }
+  if (prev_invalid_key_id != INVALID_KEY_ID) {
+    ith_entry(prev_invalid_key_id).set_next(max_key_id() + 1);
+  }
+  mkq_sort(valid_ids.begin(), valid_ids.end(), 0);
+  build_from_keys(valid_ids.begin(), valid_ids.end(), 0, ROOT_NODE_ID);
+}
+
+void Trie::build_from_keys(const UInt32 *begin, const UInt32 *end,
+                           UInt32 depth, UInt32 node_id) {
+  if ((end - begin) == 1) {
+    ith_node(node_id).set_key_pos(ith_entry(*begin).key_pos());
+    return;
+  }
+
+  UInt32 offset;
+  {
+    UInt16 labels[MAX_LABEL + 1];
+    UInt32 num_labels = 0;
+
+    const UInt32 *it = begin;
+    if (ith_key(*it).length() == depth) {
+      labels[num_labels++] = TERMINAL_LABEL;
+      ++it;
+    }
+
+    labels[num_labels++] = (UInt8)ith_key(*it)[depth];
+    for (++it; it < end; ++it) {
+      const Key &key = ith_key(*it);
+      if ((UInt8)key[depth] != labels[num_labels - 1]) {
+        labels[num_labels++] = (UInt8)key[depth];
+      }
+    }
+
+    offset = find_offset(labels, num_labels);
+    for (UInt32 i = 0; i < num_labels; ++i) {
+      const UInt32 next = offset ^ labels[i];
+      reserve_node(next);
+      ith_node(next).set_label(labels[i]);
+    }
+
+    if (offset >= num_nodes()) {
+      reserve_block(num_blocks());
+    }
+    ith_node(offset).set_is_offset(true);
+    ith_node(node_id).set_offset(offset);
+  }
+
+  if (ith_key(*begin).length() == depth) {
+    build_from_keys(begin, begin + 1, depth + 1, offset ^ TERMINAL_LABEL);
+    ++begin;
+  }
+
+  UInt16 label = ith_key(*begin)[depth];
+  for (const UInt32 *it = begin + 1; it < end; ++it) {
+    const Key &key = ith_key(*it);
+    if ((UInt8)key[depth] != label) {
+      build_from_keys(begin, it, depth + 1, offset ^ label);
+      label = (UInt8)key[depth];
+      begin = it;
+    }
+  }
+  build_from_keys(begin, end, depth + 1, offset ^ label);
+}
+
+void Trie::mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth) {
+  while ((r - l) >= MKQ_SORT_THRESHOLD) {
+    UInt32 *pl = l;
+    UInt32 *pr = r;
+    UInt32 *pivot_l = l;
+    UInt32 *pivot_r = r;
+
+    const int pivot = get_median(*l, *(l + (r - l) / 2), *(r - 1), depth);
+    for ( ; ; ) {
+      while (pl < pr) {
+        const int label = get_label(*pl, depth);
+        if (label > pivot) {
+          break;
+        } else if (label == pivot) {
+          swap_ids(pl, pivot_l);
+          ++pivot_l;
+        }
+        ++pl;
+      }
+      while (pl < pr) {
+        const int label = get_label(*--pr, depth);
+        if (label < pivot) {
+          break;
+        } else if (label == pivot) {
+          swap_ids(pr, --pivot_r);
+        }
+      }
+      if (pl >= pr) {
+        break;
+      }
+      swap_ids(pl, pr);
+      ++pl;
+    }
+    while (pivot_l > l) {
+      swap_ids(--pivot_l, --pl);
+    }
+    while (pivot_r < r) {
+      swap_ids(pivot_r, pr);
+      ++pivot_r;
+      ++pr;
+    }
+
+    if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
+      if ((pr - pl) > 1) {
+        mkq_sort(pl, pr, depth + 1);
+      }
+
+      if ((pl - l) < (r - pr)) {
+        if ((pl - l) > 1) {
+          mkq_sort(l, pl, depth);
+        }
+        l = pr;
+      } else {
+        if ((r - pr) > 1) {
+          mkq_sort(pr, r, depth);
+        }
+        r = pl;
+      }
+    } else {
+      if ((pl - l) > 1) {
+        mkq_sort(l, pl, depth);
+      }
+
+      if ((r - pr) > 1) {
+        mkq_sort(pr, r, depth);
+      }
+
+      l = pl, r = pr;
+      if ((pr - pl) > 1) {
+        ++depth;
+      }
+    }
+  }
+
+  if ((r - l) > 1) {
+    insertion_sort(l, r, depth);
+  }
+}
+
+void Trie::insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth) {
+  for (UInt32 *i = l + 1; i < r; ++i) {
+    for (UInt32 *j = i; j > l; --j) {
+      if (less_than(*(j - 1), *j, depth)) {
+        break;
+      }
+      swap_ids(j - 1, j);
+    }
+  }
+}
+
+int Trie::get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const {
+  const int x = get_label(a, depth);
+  const int y = get_label(b, depth);
+  const int z = get_label(c, depth);
+  if (x < y) {
+    if (y < z) {
+      return y;
+    } else if (x < z) {
+      return z;
+    }
+    return x;
+  } else if (x < z) {
+    return x;
+  } else if (y < z) {
+    return z;
+  }
+  return y;
+}
+
+int Trie::get_label(UInt32 key_id, UInt32 depth) const {
+  const Key &key = ith_key(key_id);
+  if (depth == key.length()) {
+    return -1;
+  } else {
+    return (UInt8)key[depth];
+  }
+}
+
+bool Trie::less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const {
+  const Key &lhs_key = ith_key(lhs);
+  const Key &rhs_key = ith_key(rhs);
+  const UInt32 length = (lhs_key.length() < rhs_key.length()) ?
+      lhs_key.length() : rhs_key.length();
+  for (UInt32 i = depth; i < length; ++i) {
+    if (lhs_key[i] != rhs_key[i]) {
+      return (UInt8)lhs_key[i] < (UInt8)rhs_key[i];
+    }
+  }
+  return lhs_key.length() < rhs_key.length();
+}
+
+void Trie::swap_ids(UInt32 *lhs, UInt32 *rhs) {
+  UInt32 temp = *lhs;
+  *lhs = *rhs;
+  *rhs = temp;
+}
+
 bool Trie::search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const {
   GRN_DAT_DEBUG_THROW_IF((ptr == NULL) && (length != 0));
 

  Modified: lib/dat/trie.hpp (+15 -1)
===================================================================
--- lib/dat/trie.hpp    2012-01-17 10:49:32 +0900 (199dac3)
+++ lib/dat/trie.hpp    2012-01-19 12:20:08 +0900 (048846f)
@@ -40,13 +40,15 @@ class Trie {
               double num_nodes_per_key = 0.0,
               double average_key_length = 0.0);
 
-  void create(const Trie &src_trie,
+  void create(const Trie &trie,
               const char *file_name = NULL,
               UInt64 file_size = 0,
               UInt32 max_num_keys = 0,
               double num_nodes_per_key = 0.0,
               double average_key_length = 0.0);
 
+  void repair(const Trie &trie, const char *file_name = NULL);
+
   void open(const char *file_name);
   void close();
 
@@ -213,6 +215,18 @@ class Trie {
   void build_from_trie(const Trie &trie);
   void build_from_trie(const Trie &trie, UInt32 src, UInt32 dest);
 
+  void repair_trie(const Trie &trie);
+  void build_from_keys(const UInt32 *begin, const UInt32 *end,
+                       UInt32 depth, UInt32 node_id);
+
+  void mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth);
+  void insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth);
+
+  inline int get_label(UInt32 key_id, UInt32 depth) const;
+  inline int get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const;
+  inline bool less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const;
+  inline static void swap_ids(UInt32 *lhs, UInt32 *rhs);
+
   bool search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const;
   bool search_linker(const UInt8 *ptr, UInt32 length,
                      UInt32 &node_id, UInt32 &query_pos) const;




Groonga-commit メーリングリストの案内
アーカイブの一覧に戻る