[Groonga-commit] groonga/groonga [master] tokenizer: add grn_tokenizer_is_delimiter()

アーカイブの一覧に戻る

Kouhei Sutou null+****@clear*****
Fri Nov 9 11:22:37 JST 2012


Kouhei Sutou	2012-11-09 11:22:37 +0900 (Fri, 09 Nov 2012)

  New Revision: 981fb2fabe7367795aa8c4e9d9bdfbb43e0b2869
  https://github.com/groonga/groonga/commit/981fb2fabe7367795aa8c4e9d9bdfbb43e0b2869

  Log:
    tokenizer: add grn_tokenizer_is_delimiter()

  Added files:
    test/unit/core/test-tokenizer.c
  Modified files:
    include/groonga/tokenizer.h
    lib/tokenizer.c
    test/unit/core/Makefile.am

  Modified: include/groonga/tokenizer.h (+10 -0)
===================================================================
--- include/groonga/tokenizer.h    2012-11-09 11:01:42 +0900 (608fe21)
+++ include/groonga/tokenizer.h    2012-11-09 11:22:37 +0900 (0d0a899)
@@ -44,6 +44,16 @@ int grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                           unsigned int str_length, grn_encoding encoding);
 
 /*
+  grn_tokenizer_is_delimiter() returns whether the string specified by
+  `str_ptr' and `str_length' is exactly the special delimiter character
+  (U+FFFE encoded in UTF-8) or not.
+ */
+grn_bool grn_tokenizer_is_delimiter(grn_ctx *ctx,
+                                    const char *str_ptr,
+                                    unsigned int str_length,
+                                    grn_encoding encoding);
+
+/*
   grn_tokenizer_query is a structure for storing a query. See the following
   functions.
  */

  Modified: lib/tokenizer.c (+19 -0)
===================================================================
--- lib/tokenizer.c    2012-11-09 11:01:42 +0900 (31b3e4b)
+++ lib/tokenizer.c    2012-11-09 11:22:37 +0900 (061b7f6)
@@ -81,6 +81,25 @@ grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
   return 0;
 }
 
+grn_bool
+grn_tokenizer_is_delimiter(grn_ctx *ctx, const char *str_ptr,
+                           unsigned int str_length, grn_encoding encoding)
+{ /* Inspect raw bytes as unsigned so that values >= 0x80 compare correctly. */
+  const unsigned char *binary_string = (const unsigned char *)str_ptr;
+
+  if (encoding != GRN_ENC_UTF8) { /* recognized only in UTF-8 text */
+    return GRN_FALSE;
+  }
+
+  if (str_length != 3) { /* must be exactly the 3-byte sequence below */
+    return GRN_FALSE;
+  }
+
+  return binary_string[0] == 0xEF && /* EF BF BE: U+FFFE in UTF-8 */
+    binary_string[1] == 0xBF &&
+    binary_string[2] == 0xBE;
+}
+
 grn_tokenizer_query *
 grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
 {

  Modified: test/unit/core/Makefile.am (+3 -1)
===================================================================
--- test/unit/core/Makefile.am    2012-11-09 11:01:42 +0900 (fb5be8a)
+++ test/unit/core/Makefile.am    2012-11-09 11:22:37 +0900 (19b657f)
@@ -66,7 +66,8 @@ noinst_LTLIBRARIES =				\
 	test-geo-in-rectangle-border.la		\
 	test-accessor.la			\
 	test-object.la				\
-	test-rename.la
+	test-rename.la				\
+	test-tokenizer.la
 endif
 
 INCLUDES =			\
@@ -158,3 +159,4 @@ test_geo_in_rectangle_border_la_SOURCES	= test-geo-in-rectangle-border.c
 test_accessor_la_SOURCES		= test-accessor.c
 test_object_la_SOURCES			= test-object.c
 test_rename_la_SOURCES			= test-rename.c
+test_tokenizer_la_SOURCES		= test-tokenizer.c

  Added: test/unit/core/test-tokenizer.c (+87 -0) 100644
===================================================================
--- /dev/null
+++ test/unit/core/test-tokenizer.c    2012-11-09 11:22:37 +0900 (433ec5c)
@@ -0,0 +1,87 @@
+/* -*- c-basic-offset: 2; coding: utf-8 -*- */
+/*
+  Copyright (C) 2012  Kouhei Sutou <kou at clear-code.com>
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <groonga.h>
+#include <groonga/tokenizer.h>
+
+#include <gcutter.h>
+
+#include "../lib/grn-assertions.h"
+
+void data_is_delimiter(void);
+void test_is_delimiter(gconstpointer data);
+
+static grn_ctx context;
+static grn_obj *db;
+static grn_obj buffer;
+
+void
+setup (void)
+{
+  grn_ctx_init(&context, GRN_CTX_USE_QL);
+  db = grn_db_create(&context, NULL, NULL);
+  GRN_VOID_INIT(&buffer);
+}
+
+void
+teardown (void)
+{
+  GRN_OBJ_FIN(&context, &buffer);
+  grn_obj_unlink(&context, db);
+  grn_ctx_fin(&context);
+}
+
+void
+data_is_delimiter(void)
+{
+#define ADD_DATUM(label, expected, input, encoding)                     \
+  gcut_add_datum(label,                                                 \
+                 "expected", G_TYPE_BOOLEAN, expected,                  \
+                 "input",    G_TYPE_STRING,  input,                     \
+                 "encoding", G_TYPE_INT,     encoding,                  \
+                 NULL)
+
+  ADD_DATUM("U+FFFE (UTF-8)",     GRN_TRUE,  "\xEF\xBF\xBE", GRN_ENC_UTF8);
+  ADD_DATUM("U+FFFE (EUC-JP)",    GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_EUC_JP);
+  ADD_DATUM("U+FFFE (Shift_JIS)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_SJIS);
+  ADD_DATUM("U+FFFE (NONE)",      GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_NONE);
+  ADD_DATUM("U+FFFE (LATIN1)",    GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_LATIN1);
+  ADD_DATUM("U+FFFE (KOI8R)",     GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_KOI8R);
+
+  ADD_DATUM("U+FFFF",             GRN_FALSE, "\xEF\xBF\xBF", GRN_ENC_UTF8);
+
+#undef ADD_DATUM
+}
+
+void
+test_is_delimiter(gconstpointer data)
+{
+  const gchar *input;
+  grn_encoding encoding;
+
+  encoding = gcut_data_get_int(data, "encoding");
+  GRN_CTX_SET_ENCODING(&context, encoding);
+  input = gcut_data_get_string(data, "input");
+  if (gcut_data_get_boolean(data, "expected")) {
+    cut_assert_true(grn_tokenizer_is_delimiter(&context, input, strlen(input),
+                                               encoding));
+  } else {
+    cut_assert_false(grn_tokenizer_is_delimiter(&context, input, strlen(input),
+                                                encoding));
+  }
+}
-------------- next part --------------
HTMLの添付ファイルを保管しました...
ダウンロード 



More information about the Groonga-commit mailing list
アーカイブの一覧に戻る