Kouhei Sutou
null+****@clear*****
Fri Nov 9 11:22:37 JST 2012
Kouhei Sutou 2012-11-09 11:22:37 +0900 (Fri, 09 Nov 2012) New Revision: 981fb2fabe7367795aa8c4e9d9bdfbb43e0b2869 https://github.com/groonga/groonga/commit/981fb2fabe7367795aa8c4e9d9bdfbb43e0b2869 Log: tokenizer: add grn_tokenizer_is_delimiter() Added files: test/unit/core/test-tokenizer.c Modified files: include/groonga/tokenizer.h lib/tokenizer.c test/unit/core/Makefile.am Modified: include/groonga/tokenizer.h (+10 -0) =================================================================== --- include/groonga/tokenizer.h 2012-11-09 11:01:42 +0900 (608fe21) +++ include/groonga/tokenizer.h 2012-11-09 11:22:37 +0900 (0d0a899) @@ -44,6 +44,16 @@ int grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr, unsigned int str_length, grn_encoding encoding); /* + grn_tokenizer_is_delimiter() returns whether is the first character + in the string specified by `str_ptr' and `str_length' the special + delimiter character or not. + */ +grn_bool grn_tokenizer_is_delimiter(grn_ctx *ctx, + const char *str_ptr, + unsigned int str_length, + grn_encoding encoding); + +/* grn_tokenizer_query is a structure for storing a query. See the following functions. 
*/ Modified: lib/tokenizer.c (+19 -0) =================================================================== --- lib/tokenizer.c 2012-11-09 11:01:42 +0900 (31b3e4b) +++ lib/tokenizer.c 2012-11-09 11:22:37 +0900 (061b7f6) @@ -81,6 +81,25 @@ grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr, return 0; } +grn_bool +grn_tokenizer_is_delimiter(grn_ctx *ctx, const char *str_ptr, + unsigned int str_length, grn_encoding encoding) +{ + const unsigned char *binary_string = str_ptr; + + if (encoding != GRN_ENC_UTF8) { + return GRN_FALSE; + } + + if (str_length != 3) { + return GRN_FALSE; + } + + return binary_string[0] == 0xEF && + binary_string[1] == 0xBF && + binary_string[2] == 0xBE; +} + grn_tokenizer_query * grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args) { Modified: test/unit/core/Makefile.am (+3 -1) =================================================================== --- test/unit/core/Makefile.am 2012-11-09 11:01:42 +0900 (fb5be8a) +++ test/unit/core/Makefile.am 2012-11-09 11:22:37 +0900 (19b657f) @@ -66,7 +66,8 @@ noinst_LTLIBRARIES = \ test-geo-in-rectangle-border.la \ test-accessor.la \ test-object.la \ - test-rename.la + test-rename.la \ + test-tokenizer.la endif INCLUDES = \ @@ -158,3 +159,4 @@ test_geo_in_rectangle_border_la_SOURCES = test-geo-in-rectangle-border.c test_accessor_la_SOURCES = test-accessor.c test_object_la_SOURCES = test-object.c test_rename_la_SOURCES = test-rename.c +test_tokenizer_la_SOURCES = test-tokenizer.c Added: test/unit/core/test-tokenizer.c (+87 -0) 100644 =================================================================== --- /dev/null +++ test/unit/core/test-tokenizer.c 2012-11-09 11:22:37 +0900 (433ec5c) @@ -0,0 +1,87 @@ +/* -*- c-basic-offset: 2; coding: utf-8 -*- */ +/* + Copyright (C) 2012 Kouhei Sutou <kou ＠ clear-code.com> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software 
Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <groonga.h> +#include <groonga/tokenizer.h> + +#include <gcutter.h> + +#include "../lib/grn-assertions.h" + +void data_is_delimiter(void); +void test_is_delimiter(gconstpointer data); + +static grn_ctx context; +static grn_obj *db; +static grn_obj buffer; + +void +setup (void) +{ + grn_ctx_init(&context, GRN_CTX_USE_QL); + db = grn_db_create(&context, NULL, NULL); + GRN_VOID_INIT(&buffer); +} + +void +teardown (void) +{ + GRN_OBJ_FIN(&context, &buffer); + grn_obj_unlink(&context, db); + grn_ctx_fin(&context); +} + +void +data_is_delimiter(void) +{ +#define ADD_DATUM(label, expected, input, encoding) \ + gcut_add_datum(label, \ + "expected", G_TYPE_BOOLEAN, expected, \ + "input", G_TYPE_STRING, input, \ + "encoding", G_TYPE_INT, encoding, \ + NULL) + + ADD_DATUM("U+FFFE (UTF-8)", GRN_TRUE, "\xEF\xBF\xBE", GRN_ENC_UTF8); + ADD_DATUM("U+FFFE (EUC-JP)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_EUC_JP); + ADD_DATUM("U+FFFE (Shift_JIS)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_SJIS); + ADD_DATUM("U+FFFE (NONE)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_NONE); + ADD_DATUM("U+FFFE (LATIN1)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_LATIN1); + ADD_DATUM("U+FFFE (KOI8R)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_KOI8R); + + ADD_DATUM("U+FFFF", GRN_FALSE, "\xEF\xBF\xBF", GRN_ENC_UTF8); + +#undef ADD_DATUM +} + +void +test_is_delimiter(gconstpointer data) +{ + const gchar *input; + grn_encoding encoding; + + encoding = gcut_data_get_int(data, "encoding"); + GRN_CTX_SET_ENCODING(&context, encoding); + input = 
gcut_data_get_string(data, "input"); + if (gcut_data_get_boolean(data, "expected")) { + cut_assert_true(grn_tokenizer_is_delimiter(&context, input, strlen(input), + encoding)); + } else { + cut_assert_false(grn_tokenizer_is_delimiter(&context, input, strlen(input), + encoding)); + } +} -------------- next part -------------- HTMLの添付ファイルを保管しました... ダウンロード