Kouhei Sutou 2019-02-01 14:46:38 +0900 (Fri, 01 Feb 2019) Revision: 8b87c840b6a8f1c8e1faa5f50c8314628ad0085b https://github.com/groonga/groonga/commit/8b87c840b6a8f1c8e1faa5f50c8314628ad0085b Message: TokenPattern: add support for multiple "pattern"s Added files: test/command/suite/tokenizers/pattern/multiple.expected test/command/suite/tokenizers/pattern/multiple.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+17 -8) =================================================================== --- lib/tokenizers.c 2019-02-01 14:31:23 +0900 (920141519) +++ lib/tokenizers.c 2019-02-01 14:46:38 +0900 (d10118356) @@ -1576,6 +1576,7 @@ pattern_open_options(grn_ctx *ctx, void *user_data) { grn_pattern_options *options; + grn_obj all_patterns; options = GRN_MALLOC(sizeof(grn_pattern_options)); if (!options) { @@ -1586,6 +1587,7 @@ pattern_open_options(grn_ctx *ctx, } pattern_options_init(options); + GRN_TEXT_INIT(&all_patterns, 0); GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) { grn_raw_string name_raw; name_raw.value = name; @@ -1604,20 +1606,27 @@ pattern_open_options(grn_ctx *ctx, NULL, &domain); if (grn_type_id_is_text_family(ctx, domain) && pattern_length > 0) { - if (options->regex) { - onig_free(options->regex); + if (GRN_TEXT_LEN(&all_patterns) > 0) { + GRN_TEXT_PUTS(ctx, &all_patterns, "|"); } - options->regex = grn_onigmo_new(ctx, - pattern, - pattern_length, - GRN_ONIGMO_OPTION_DEFAULT, - GRN_ONIGMO_SYNTAX_DEFAULT, - "[tokenizer][delimit]"); + GRN_TEXT_PUTS(ctx, &all_patterns, "(?:"); + GRN_TEXT_PUT(ctx, &all_patterns, pattern, pattern_length); + GRN_TEXT_PUTS(ctx, &all_patterns, ")"); } #endif /* GRN_SUPPORT_REGEXP */ } } GRN_OPTION_VALUES_EACH_END(); + if (GRN_TEXT_LEN(&all_patterns) > 0) { + options->regex = grn_onigmo_new(ctx, + GRN_TEXT_VALUE(&all_patterns), + GRN_TEXT_LEN(&all_patterns), + GRN_ONIGMO_OPTION_DEFAULT, + GRN_ONIGMO_SYNTAX_DEFAULT, + "[tokenizer][pattern]"); + } + GRN_OBJ_FIN(ctx, &all_patterns); + return options; } Added: test/command/suite/tokenizers/pattern/multiple.expected (+40 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/pattern/multiple.expected 2019-02-01 14:46:38 +0900 (a36f8d4ac) @@ -0,0 +1,40 @@ +tokenize 'TokenPattern("pattern", "\\\\d+円", "pattern", "りんご|みかん")' "私は100円のりんごと50円のみかんを129円で買いました。" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "100円", + "position": 0, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "りんご", + "position": 1, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "50円", + "position": 2, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "みかん", + "position": 3, + "force_prefix": false, + "force_prefix_search": false + }, + { + "value": "129円", + "position": 4, + "force_prefix": false, + "force_prefix_search": false + } + ] +] Added: test/command/suite/tokenizers/pattern/multiple.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/pattern/multiple.test 2019-02-01 14:46:38 +0900 (b2469fb15) @@ -0,0 +1,4 @@ +tokenize \ + 'TokenPattern("pattern", "\\\\d+円", \ + "pattern", "りんご|みかん")' \ + "私は100円のりんごと50円のみかんを129円で買いました。" -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190201/aca18407/attachment-0001.html>