[Groonga-commit] groonga/groonga at d658ad0 [master] TokenDelimit: invert what pattern means

Kouhei Sutou null+****@clear*****
Mon Nov 26 10:37:01 JST 2018


Kouhei Sutou	2018-11-26 10:37:01 +0900 (Mon, 26 Nov 2018)

  Revision: d658ad001c67b43e4565c9731fba6c3557ced933
  https://github.com/groonga/groonga/commit/d658ad001c67b43e4565c9731fba6c3557ced933

  Message:
    TokenDelimit: invert what pattern means
    
    Now, pattern specifies the delimiter pattern, not the token pattern.
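    
    In other words, a token is now the span of input between two
    delimiter matches rather than a match itself: the sentences test
    below switches from a sentence-matching pattern to a
    delimiter-matching one, and input whose pattern never matches
    becomes a single token instead of no tokens.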

  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/delimit/options/pattern/no_match.expected
    test/command/suite/tokenizers/delimit/options/pattern/sentences.expected
    test/command/suite/tokenizers/delimit/options/pattern/sentences.test

  Modified: lib/tokenizers.c (+4 -4)
===================================================================
--- lib/tokenizers.c    2018-11-26 10:36:49 +0900 (cb4478d44)
+++ lib/tokenizers.c    2018-11-26 10:37:01 +0900 (5c5420853)
@@ -337,14 +337,14 @@ delimit_next(grn_ctx *ctx,
     if (position == ONIG_MISMATCH) {
       grn_token_set_data(ctx,
                          token,
-                         NULL,
-                         0);
+                         tokenizer->next,
+                         tokenizer->end - tokenizer->next);
       grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
     } else {
       grn_token_set_data(ctx,
                          token,
-                         tokenizer->start + region.beg[0],
-                         region.end[0] - region.beg[0]);
+                         tokenizer->next,
+                         (tokenizer->start + region.beg[0]) - tokenizer->next);
       grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
       tokenizer->next = tokenizer->start + region.end[0];
       onig_region_free(&region, 0);
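
The patched loop now works like this: search for the delimiter pattern
starting at tokenizer->next; on a match, emit the span between next and
the match start as the token and continue past the match end; on
ONIG_MISMATCH, emit everything from next to the end of input as the
last token. Below is a minimal, self-contained sketch of the same loop
against the Oniguruma API (not Groonga code; split_by_delimiter, the
output format, and the build line are illustrative assumptions):

  /* Build with something like:
   *   cc -o split split.c $(pkg-config --cflags --libs oniguruma)
   */
  #include <stdio.h>
  #include <string.h>
  #include <oniguruma.h>

  /* Emit the spans of input between delimiter-pattern matches,
   * mirroring the new delimit_next() behavior. */
  static void
  split_by_delimiter(const char *pattern, const char *input)
  {
    regex_t *regex;
    OnigErrorInfo error_info;
    const OnigUChar *start = (const OnigUChar *)input;
    const OnigUChar *end = start + strlen(input);
    const OnigUChar *next = start;

    if (onig_new(&regex,
                 (const OnigUChar *)pattern,
                 (const OnigUChar *)(pattern + strlen(pattern)),
                 ONIG_OPTION_DEFAULT,
                 ONIG_ENCODING_UTF8,
                 ONIG_SYNTAX_RUBY,
                 &error_info) != ONIG_NORMAL) {
      fprintf(stderr, "invalid pattern: %s\n", pattern);
      return;
    }

    while (next < end) {
      OnigRegion *region = onig_region_new();
      int position = onig_search(regex, start, end, next, end,
                                 region, ONIG_OPTION_NONE);
      if (position == ONIG_MISMATCH) {
        /* No more delimiters: the rest of the input is the last
         * token (the new tokenizer->next/GRN_TOKEN_LAST branch). */
        printf("token: <%.*s>\n", (int)(end - next), (const char *)next);
        onig_region_free(region, 1);
        break;
      }
      /* The token is everything between the previous delimiter and
       * the start of this match; then skip past the delimiter. */
      printf("token: <%.*s>\n",
             (int)((start + region->beg[0]) - next), (const char *)next);
      next = start + region->end[0];
      onig_region_free(region, 1);
    }
    onig_free(regex);
  }

  int
  main(void)
  {
    OnigEncoding encodings[] = { ONIG_ENCODING_UTF8 };
    onig_initialize(encodings, 1);
    split_by_delimiter("(?:(?<!(?:Mr|bldg))[.]|。)\\s*",
                       "りんごです。ペンです。This is an apple. Mr. X.");
    onig_end();
    return 0;
  }

With the pattern from the updated test, this prints the four tokens
from sentences.expected below: りんごです, ペンです, This is an apple,
and Mr. X.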

  Modified: test/command/suite/tokenizers/delimit/options/pattern/no_match.expected (+15 -1)
===================================================================
--- test/command/suite/tokenizers/delimit/options/pattern/no_match.expected    2018-11-26 10:36:49 +0900 (70e6dbf63)
+++ test/command/suite/tokenizers/delimit/options/pattern/no_match.expected    2018-11-26 10:37:01 +0900 (7b2d3a084)
@@ -1,2 +1,16 @@
 tokenize   'TokenDelimit("pattern", "nonexistent")'   "Hello"
-[[0,0.0,0.0],[]]
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "Hello",
+      "position": 0,
+      "force_prefix": false,
+      "force_prefix_search": false
+    }
+  ]
+]
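
With this change, a pattern that never matches no longer produces an
empty token list: the ONIG_MISMATCH branch returns the whole input as a
single token, which is exactly what the updated expectation above shows
("Hello" comes back as one token). The sketch after the
lib/tokenizers.c hunk behaves the same way with a pattern such as
"nonexistent".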

  Modified: test/command/suite/tokenizers/delimit/options/pattern/sentences.expected (+5 -11)
===================================================================
--- test/command/suite/tokenizers/delimit/options/pattern/sentences.expected    2018-11-26 10:36:49 +0900 (08ac7c822)
+++ test/command/suite/tokenizers/delimit/options/pattern/sentences.expected    2018-11-26 10:37:01 +0900 (f8447b9a4)
@@ -1,4 +1,4 @@
-tokenize   'TokenDelimit("pattern", "[^\\\\s].*?[.。]")'   "りんごです。ペンです。This is an apple. Mr. X."
+tokenize   'TokenDelimit("pattern", "(?:(?<!(?:Mr|bldg))[.]|。)\\\\s*")'   "りんごです。ペンです。This is an apple. Mr. X."
 [
   [
     0,
@@ -7,34 +7,28 @@ tokenize   'TokenDelimit("pattern", "[^\\\\s].*?[.。]")'   "りんごです。
   ],
   [
     {
-      "value": "りんごです。",
+      "value": "りんごです",
       "position": 0,
       "force_prefix": false,
       "force_prefix_search": false
     },
     {
-      "value": "ペンです。",
+      "value": "ペンです",
       "position": 1,
       "force_prefix": false,
       "force_prefix_search": false
     },
     {
-      "value": "This is an apple.",
+      "value": "This is an apple",
       "position": 2,
       "force_prefix": false,
       "force_prefix_search": false
     },
     {
-      "value": "Mr.",
+      "value": "Mr. X",
       "position": 3,
       "force_prefix": false,
       "force_prefix_search": false
-    },
-    {
-      "value": "X.",
-      "position": 4,
-      "force_prefix": false,
-      "force_prefix_search": false
     }
   ]
 ]
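
The negative lookbehind (?<!(?:Mr|bldg)) in the new delimiter pattern
keeps an abbreviation period such as the one in "Mr." (or "bldg.") from
ending a sentence, which is why "Mr. X" now comes out as a single token
instead of the two tokens "Mr." and "X.".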

  Modified: test/command/suite/tokenizers/delimit/options/pattern/sentences.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/delimit/options/pattern/sentences.test    2018-11-26 10:36:49 +0900 (8396e3284)
+++ test/command/suite/tokenizers/delimit/options/pattern/sentences.test    2018-11-26 10:37:01 +0900 (22b497abb)
@@ -1,3 +1,3 @@
 tokenize \
-  'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' \
+  'TokenDelimit("pattern", "(?:(?<!(?:Mr|bldg))[.]|。)\\\\s*")' \
   "りんごです。ペンです。This is an apple. Mr. X."