Kouhei Sutou
null+****@clear*****
Tue Oct 28 12:47:04 JST 2014
Kouhei Sutou	2014-10-28 12:47:04 +0900 (Tue, 28 Oct 2014)

  New Revision: 11751f0148314142a2ddb360b591e3bdbed043d5
  https://github.com/groonga/groonga/commit/11751f0148314142a2ddb360b591e3bdbed043d5

  Message:
    table_tokenize: don't add tokens before tokenization on "GET" mode

  Modified files:
    lib/proc.c
    test/command/suite/table_tokenize/flags.expected
    test/command/suite/table_tokenize/flags.test
    test/command/suite/table_tokenize/get_mode.expected
    test/command/suite/table_tokenize/get_mode.test
    test/command/suite/table_tokenize/with_normalizer.expected
    test/command/suite/table_tokenize/with_normalizer.test
    test/command/suite/table_tokenize/with_token_filters.expected
    test/command/suite/table_tokenize/with_token_filters.test

  Modified: lib/proc.c (+20 -30)
===================================================================
--- lib/proc.c    2014-10-28 12:35:32 +0900 (dd6de98)
+++ lib/proc.c    2014-10-28 12:47:04 +0900 (3d159ee)
@@ -3536,32 +3536,6 @@ tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_token_mode mode,
   grn_token_cursor_close(ctx, token_cursor);
 }
 
-static void
-tokenize_add(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
-{
-  grn_obj tokens;
-  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-  tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-  output_tokens(ctx, &tokens, lexicon);
-  GRN_OBJ_FIN(ctx, &tokens);
-}
-
-static void
-tokenize_get(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
-{
-  grn_obj tokens;
-
-  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-
-  tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-
-  GRN_BULK_REWIND(&tokens);
-  tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-  output_tokens(ctx, &tokens, lexicon);
-
-  GRN_OBJ_FIN(ctx, &tokens);
-}
-
 static grn_obj *
 proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
@@ -3610,14 +3584,23 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
      memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
 
+  {
+    grn_obj tokens;
+    GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
   if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
-    tokenize_add(ctx, lexicon, string, flags);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
   } else if (MODE_NAME_EQUAL("GET")) {
-    tokenize_get(ctx, lexicon, string, flags);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+    GRN_BULK_REWIND(&tokens);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
   } else {
     ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>",
         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
   }
+    GRN_OBJ_FIN(ctx, &tokens);
+  }
 #undef MODE_NAME_EQUAL
 
   grn_obj_unlink(ctx, lexicon);
@@ -3668,14 +3651,21 @@ proc_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user
     (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
      memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
 
+  {
+    grn_obj tokens;
+    GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
   if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
-    tokenize_get(ctx, lexicon, string, flags);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
   } else if (MODE_NAME_EQUAL("ADD")) {
-    tokenize_add(ctx, lexicon, string, flags);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
   } else {
     ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] invalid mode: <%.*s>",
         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
   }
+    GRN_OBJ_FIN(ctx, &tokens);
+  }
 #undef MODE_NAME_EQUAL
 
   grn_obj_unlink(ctx, lexicon);

  Modified: test/command/suite/table_tokenize/flags.expected (+1 -1)
===================================================================
--- test/command/suite/table_tokenize/flags.expected    2014-10-28 12:35:32 +0900 (20bb1c6)
+++ test/command/suite/table_tokenize/flags.expected    2014-10-28 12:47:04 +0900 (c3bc0b0)
@@ -1,6 +1,6 @@
 table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenDelimit   --normalizer NormalizerAuto
 [[0,0.0,0.0],true]
-table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER
+table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER ADD
 [
   [
     0,

  Modified: test/command/suite/table_tokenize/flags.test (+1 -1)
===================================================================
--- test/command/suite/table_tokenize/flags.test    2014-10-28 12:35:32 +0900 (e5da57a)
+++ test/command/suite/table_tokenize/flags.test    2014-10-28 12:47:04 +0900 (df0c4cc)
@@ -2,4 +2,4 @@ table_create Terms TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenDelimit \
   --normalizer NormalizerAuto
-table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER
+table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER ADD

  Modified: test/command/suite/table_tokenize/get_mode.expected (+9 -19)
===================================================================
--- test/command/suite/table_tokenize/get_mode.expected    2014-10-28 12:35:32 +0900 (774c69d)
+++ test/command/suite/table_tokenize/get_mode.expected    2014-10-28 12:47:04 +0900 (6455fab)
@@ -1,24 +1,14 @@
 table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
 [[0,0.0,0.0],true]
 table_tokenize Terms "あいabアイ" --mode GET
+[[0,0.0,0.0],[]]
+load --table Terms
 [
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "あい",
-      "position": 0
-    },
-    {
-      "value": "ab",
-      "position": 2
-    },
-    {
-      "value": "アイ",
-      "position": 3
-    }
-  ]
+{"_key": "あい"},
+{"_key": "い"},
+{"_key": "ab"},
+{"_key": "イ"}
 ]
+[[0,0.0,0.0],4]
+table_tokenize Terms "あいabアイ" --mode GET
+[[0,0.0,0.0],[{"value":"あい","position":0},{"value":"ab","position":2}]]

  Modified: test/command/suite/table_tokenize/get_mode.test (+10 -0)
===================================================================
--- test/command/suite/table_tokenize/get_mode.test    2014-10-28 12:35:32 +0900 (529c395)
+++ test/command/suite/table_tokenize/get_mode.test    2014-10-28 12:47:04 +0900 (0805f4d)
@@ -3,3 +3,13 @@ table_create Terms TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenBigram \
   --normalizer NormalizerAuto
 table_tokenize Terms "あいabアイ" --mode GET
+
+load --table Terms
+[
+{"_key": "あい"},
+{"_key": "い"},
+{"_key": "ab"},
+{"_key": "イ"}
+]
+
+table_tokenize Terms "あいabアイ" --mode GET

  Modified: test/command/suite/table_tokenize/with_normalizer.expected (+1 -1)
===================================================================
--- test/command/suite/table_tokenize/with_normalizer.expected    2014-10-28 12:35:32 +0900 (ac20b02)
+++ test/command/suite/table_tokenize/with_normalizer.expected    2014-10-28 12:47:04 +0900 (28c3b87)
@@ -1,4 +1,4 @@
 table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
 [[0,0.0,0.0],true]
-table_tokenize Terms "aBcDe 123"
+table_tokenize Terms "aBcDe 123" --mode ADD
 [[0,0.0,0.0],[{"value":"abcde","position":0},{"value":"123","position":1}]]

  Modified: test/command/suite/table_tokenize/with_normalizer.test (+1 -1)
===================================================================
--- test/command/suite/table_tokenize/with_normalizer.test    2014-10-28 12:35:32 +0900 (3cda398)
+++ test/command/suite/table_tokenize/with_normalizer.test    2014-10-28 12:47:04 +0900 (e6d1537)
@@ -2,4 +2,4 @@ table_create Terms TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenBigram \
   --normalizer NormalizerAuto
-table_tokenize Terms "aBcDe 123"
+table_tokenize Terms "aBcDe 123" --mode ADD

  Modified: test/command/suite/table_tokenize/with_token_filters.expected (+31 -1)
===================================================================
--- test/command/suite/table_tokenize/with_token_filters.expected    2014-10-28 12:35:32 +0900 (7cfcda0)
+++ test/command/suite/table_tokenize/with_token_filters.expected    2014-10-28 12:47:04 +0900 (a7c3727)
@@ -4,12 +4,42 @@ table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   -
 [[0,0.0,0.0],true]
 column_create Terms is_stop_word COLUMN_SCALAR Bool
 [[0,0.0,0.0],true]
+table_tokenize Terms "Hello and Good-bye" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "hello",
+      "position": 0
+    },
+    {
+      "value": "and",
+      "position": 1
+    },
+    {
+      "value": "good",
+      "position": 2
+    },
+    {
+      "value": "-",
+      "position": 3
+    },
+    {
+      "value": "bye",
+      "position": 4
+    }
+  ]
+]
 load --table Terms
 [
 {"_key": "and", "is_stop_word": true}
 ]
 [[0,0.0,0.0],1]
-table_tokenize Terms "Hello and Good-bye" --mode GET
+table_tokenize Terms "Hello and Good-bye"
 [
   [
     0,

  Modified: test/command/suite/table_tokenize/with_token_filters.test (+3 -1)
===================================================================
--- test/command/suite/table_tokenize/with_token_filters.test    2014-10-28 12:35:32 +0900 (80c5adb)
+++ test/command/suite/table_tokenize/with_token_filters.test    2014-10-28 12:47:04 +0900 (c02cb61)
@@ -6,9 +6,11 @@ table_create Terms TABLE_PAT_KEY ShortText \
   --token_filters TokenFilterStopWord
 column_create Terms is_stop_word COLUMN_SCALAR Bool
 
+table_tokenize Terms "Hello and Good-bye" --mode ADD
+
 load --table Terms
 [
 {"_key": "and", "is_stop_word": true}
 ]
-table_tokenize Terms "Hello and Good-bye" --mode GET
+table_tokenize Terms "Hello and Good-bye"
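
For reference, the behavior change is easiest to see with the commands from the
updated get_mode test above. The following is only an illustrative sketch built
from that test; the authoritative results are the ones recorded in
get_mode.expected:

  table_create Terms TABLE_PAT_KEY ShortText \
    --default_tokenizer TokenBigram \
    --normalizer NormalizerAuto

  # GET mode no longer registers tokens as a side effect, so against an empty
  # lexicon the reported token list is empty.
  table_tokenize Terms "あいabアイ" --mode GET
  # => [[0,0.0,0.0],[]]

  # After terms are loaded explicitly, GET mode reports only the tokens that
  # already exist in the lexicon.
  load --table Terms
  [
  {"_key": "あい"},
  {"_key": "い"},
  {"_key": "ab"},
  {"_key": "イ"}
  ]
  table_tokenize Terms "あいabアイ" --mode GET
  # => [[0,0.0,0.0],[{"value":"あい","position":0},{"value":"ab","position":2}]]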