Kouhei Sutou
null+****@clear*****
Fri Nov 9 14:13:49 JST 2012
Kouhei Sutou 2012-11-09 14:13:49 +0900 (Fri, 09 Nov 2012) New Revision: 812749828a970e9a8fa4168f638a9e2341015260 https://github.com/groonga/groonga/commit/812749828a970e9a8fa4168f638a9e2341015260 Log: TokenNgram family: ignore tokenizer delimiter (U+FFFE) Added files: test/command/suite/table_create/default_tokenizer/bigram/default.expected test/command/suite/table_create/default_tokenizer/bigram/default.test test/command/suite/table_create/default_tokenizer/bigram/normalize.expected test/command/suite/table_create/default_tokenizer/bigram/normalize.test test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test Modified files: lib/token.c Modified: lib/token.c (+4 -1) =================================================================== --- lib/token.c 2012-11-09 13:56:28 +0900 (9ec0769) +++ lib/token.c 2012-11-09 14:13:49 +0900 (3579c95) @@ -212,7 +212,10 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram { grn_obj *str; grn_obj *normalizer = NULL; - int nflags = GRN_STRING_REMOVE_BLANK|GRN_STRING_WITH_TYPES; + int nflags = + GRN_STRING_REMOVE_BLANK | + GRN_STRING_WITH_TYPES | + GRN_STRING_REMOVE_TOKENIZER_DELIMITER; const char *normalized; unsigned int normalized_length_in_bytes; grn_ngram_tokenizer *token; Added: test/command/suite/table_create/default_tokenizer/bigram/default.expected (+95 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/default.expected 2012-11-09 14:13:49 +0900 (e5ce2c0) @@ -0,0 +1,95 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text 
+[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram +[[0,0.0,0.0],true] +column_create Terms memos_content COLUMN_INDEX Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "This is a pen."}, +{"content": "これはペンです。"} +] +[[0,0.0,0.0],2] +select Terms --output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 20 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + " a" + ], + [ + " i" + ], + [ + " p" + ], + [ + "." + ], + [ + "Th" + ], + [ + "a " + ], + [ + "en" + ], + [ + "hi" + ], + [ + "is" + ], + [ + "n." + ], + [ + "pe" + ], + [ + "s " + ], + [ + "。" + ], + [ + "これ" + ], + [ + "す。" + ], + [ + "です" + ], + [ + "はペ" + ], + [ + "れは" + ], + [ + "ペン" + ], + [ + "ンで" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/bigram/default.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/default.test 2012-11-09 14:13:49 +0900 (b04772a) @@ -0,0 +1,14 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram +column_create Terms memos_content COLUMN_INDEX Memos content + +load --table Memos +[ +{"content": "This is a pen."}, +{"content": "これはペンです。"} +] + +select Terms --output_columns _key --limit -1 Added: test/command/suite/table_create/default_tokenizer/bigram/normalize.expected (+74 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/normalize.expected 2012-11-09 14:13:49 +0900 (6cc6df6) @@ -0,0 +1,74 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText --default_tokenizer TokenBigram +[[0,0.0,0.0],true] +column_create Terms 
memos_content COLUMN_INDEX Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "This is a pen."}, +{"content": "これはペンです。"} +] +[[0,0.0,0.0],2] +select Terms --output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 13 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "." + ], + [ + "a" + ], + [ + "is" + ], + [ + "pen" + ], + [ + "this" + ], + [ + "。" + ], + [ + "これ" + ], + [ + "す" + ], + [ + "です" + ], + [ + "はペ" + ], + [ + "れは" + ], + [ + "ペン" + ], + [ + "ンで" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/bigram/normalize.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/normalize.test 2012-11-09 14:13:49 +0900 (af6adc7) @@ -0,0 +1,14 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText \ + --default_tokenizer TokenBigram +column_create Terms memos_content COLUMN_INDEX Memos content + +load --table Memos +[ +{"content": "This is a pen."}, +{"content": "これはペンです。"} +] + +select Terms --output_columns _key --limit -1 Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected (+95 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected 2012-11-09 14:13:49 +0900 (7f373d3) @@ -0,0 +1,95 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram +[[0,0.0,0.0],true] +column_create Terms memos_content COLUMN_INDEX Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Th\uFFFEis is a p\uFFFEen."}, +{"content": "これは\uFFFEペン\uFFFEです。"} +] +[[0,0.0,0.0],2] +select Terms 
--output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 20 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + " a" + ], + [ + " i" + ], + [ + " p" + ], + [ + "." + ], + [ + "Th" + ], + [ + "a " + ], + [ + "en" + ], + [ + "hi" + ], + [ + "is" + ], + [ + "n." + ], + [ + "pe" + ], + [ + "s " + ], + [ + "。" + ], + [ + "これ" + ], + [ + "す。" + ], + [ + "です" + ], + [ + "はペ" + ], + [ + "れは" + ], + [ + "ペン" + ], + [ + "ンで" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test 2012-11-09 14:13:49 +0900 (98bc30b) @@ -0,0 +1,14 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram +column_create Terms memos_content COLUMN_INDEX Memos content + +load --table Memos +[ +{"content": "Th\uFFFEis is a p\uFFFEen."}, +{"content": "これは\uFFFEペン\uFFFEです。"} +] + +select Terms --output_columns _key --limit -1 Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected (+74 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected 2012-11-09 14:13:49 +0900 (c321869) @@ -0,0 +1,74 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText --default_tokenizer TokenBigram +[[0,0.0,0.0],true] +column_create Terms memos_content COLUMN_INDEX Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Th\uFFFEis is a p\uFFFEen."}, +{"content": "これは\uFFFEペン\uFFFEです。"} +] +[[0,0.0,0.0],2] +select Terms 
--output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 13 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "." + ], + [ + "a" + ], + [ + "is" + ], + [ + "pen" + ], + [ + "this" + ], + [ + "。" + ], + [ + "これ" + ], + [ + "す" + ], + [ + "です" + ], + [ + "はペ" + ], + [ + "れは" + ], + [ + "ペン" + ], + [ + "ンで" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test 2012-11-09 14:13:49 +0900 (e2ae5b4) @@ -0,0 +1,14 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText \ + --default_tokenizer TokenBigram +column_create Terms memos_content COLUMN_INDEX Memos content + +load --table Memos +[ +{"content": "Th\uFFFEis is a p\uFFFEen."}, +{"content": "これは\uFFFEペン\uFFFEです。"} +] + +select Terms --output_columns _key --limit -1 -------------- next part -------------- HTMLの添付ファイルを保管しました... ダウンロード