[Groonga-commit] groonga/groonga-normalizer-mysql at c82646b [master] Fix a bug that full-width space isn't treated as blank character

アーカイブの一覧に戻る

Kouhei Sutou null+****@clear*****
Wed May 13 22:01:30 JST 2015


Kouhei Sutou	2015-05-13 22:01:30 +0900 (Wed, 13 May 2015)

  New Revision: c82646b64b538854e5757e887e5d456c88825b41
  https://github.com/groonga/groonga-normalizer-mysql/commit/c82646b64b538854e5757e887e5d456c88825b41

  Message:
    Fix a bug that full-width space isn't treated as blank character
    
    [groonga-dev,03215]
    
    Reported by Shota Mitsui. Thanks!!!

  Added files:
    test/suite/unicode_ci/remove_blank_full_width.expected
    test/suite/unicode_ci/remove_blank_full_width.test
  Modified files:
    normalizers/mysql.c

  Modified: normalizers/mysql.c (+31 -22)
===================================================================
--- normalizers/mysql.c    2015-05-06 19:35:56 +0900 (c7eb102)
+++ normalizers/mysql.c    2015-05-13 22:01:30 +0900 (e7961ee)
@@ -364,41 +364,50 @@ normalize(grn_ctx *ctx, grn_obj *string,
   rest_length = original_length_in_bytes;
   while (rest_length > 0) {
     int character_length;
+    grn_bool custom_normalized = GRN_FALSE;
+    unsigned int normalized_character_length;
+    unsigned int previous_normalized_length_in_bytes =
+      normalized_length_in_bytes;
+    unsigned int previous_normalized_n_characters =
+      normalized_n_characters;
 
     character_length = grn_plugin_charlen(ctx, rest, rest_length, encoding);
     if (character_length == 0) {
       break;
     }
 
-    if (remove_blank_p && character_length == 1 && rest[0] == ' ') {
+    if (custom_normalizer) {
+      custom_normalized = custom_normalizer(ctx,
+                                            rest,
+                                            &character_length,
+                                            rest_length - character_length,
+                                            normalize_table,
+                                            normalized,
+                                            &normalized_character_length,
+                                            &normalized_length_in_bytes,
+                                            &normalized_n_characters);
+    }
+    if (!custom_normalized) {
+      normalize_character(rest, character_length,
+                          normalize_table, normalize_table_size,
+                          normalized,
+                          &normalized_character_length,
+                          &normalized_length_in_bytes,
+                          &normalized_n_characters);
+    }
+
+    if (remove_blank_p &&
+        normalized_character_length == 1 &&
+        normalized[previous_normalized_length_in_bytes] == ' ') {
       if (current_type > types) {
         current_type[-1] |= GRN_CHAR_BLANK;
       }
       if (current_check) {
         current_check[0]++;
       }
+      normalized_length_in_bytes = previous_normalized_length_in_bytes;
+      normalized_n_characters = previous_normalized_n_characters;
     } else {
-      grn_bool custom_normalized = GRN_FALSE;
-      unsigned int normalized_character_length;
-      if (custom_normalizer) {
-        custom_normalized = custom_normalizer(ctx,
-                                              rest,
-                                              &character_length,
-                                              rest_length - character_length,
-                                              normalize_table,
-                                              normalized,
-                                              &normalized_character_length,
-                                              &normalized_length_in_bytes,
-                                              &normalized_n_characters);
-      }
-      if (!custom_normalized) {
-        normalize_character(rest, character_length,
-                            normalize_table, normalize_table_size,
-                            normalized,
-                            &normalized_character_length,
-                            &normalized_length_in_bytes,
-                            &normalized_n_characters);
-      }
       if (current_type && normalized_character_length > 0) {
         char *current_normalized;
         current_normalized =

  Added: test/suite/unicode_ci/remove_blank_full_width.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci/remove_blank_full_width.expected    2015-05-13 22:01:30 +0900 (d710539)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+[[0,0.0,0.0],true]
+normalize NormalizerMySQLUnicodeCI " a  b   c" REMOVE_BLANK|WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[2,3,4]}]

  Added: test/suite/unicode_ci/remove_blank_full_width.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci/remove_blank_full_width.test    2015-05-13 22:01:30 +0900 (68b7eb8)
@@ -0,0 +1,3 @@
+register normalizers/mysql
+
+normalize NormalizerMySQLUnicodeCI " a  b   c" REMOVE_BLANK|WITH_CHECKS
-------------- next part --------------
HTMLの添付ファイルを保管しました...
ダウンロード 



More information about the Groonga-commit mailing list
アーカイブの一覧に戻る