• R/O
  • SSH
  • HTTPS

qrobosdk: コミット


コミットメタ情報

リビジョン1819 (tree)
日時2010-05-10 07:26:56
作者satofumi

ログメッセージ

TestRubiParse.cpp is implemented.

変更サマリ

差分

--- trunk/libs/input/rubi_parse.h (revision 1818)
+++ trunk/libs/input/rubi_parse.h (revision 1819)
@@ -18,15 +18,15 @@
1818 {
1919 typedef struct
2020 {
21- size_t original_first;
22- size_t original_size;
21+ size_t kanji_first;
22+ size_t kanji_size;
2323 size_t rubi_first;
2424 size_t rubi_size;
2525 } rubi_t;
2626
2727
28- extern void parse_rubi(std::vector<rubi_t>& rubi_positions,
29- const char* original, const char* kana);
28+ extern bool parse_rubi(std::vector<rubi_t>& rubi_positions,
29+ const char* text, const char* kana_only);
3030 }
3131
3232 #endif /* !QRK_RUBI_PARSE_H */
--- trunk/libs/input/rubi_parse.cpp (revision 1818)
+++ trunk/libs/input/rubi_parse.cpp (revision 1819)
@@ -5,18 +5,134 @@
55 \author Satofumi KAMIMURA
66
77 $Id$
8+
9+ \todo 不適切な文字列が渡されたときにも、Segmentation fault にならないようにする
810 */
911
1012 #include "rubi_parse.h"
13+#include "Utf8.h"
1114
15+using namespace qrk;
16+using namespace std;
1217
13-void qrk::parse_rubi(std::vector<rubi_t>& rubi_positions,
14- const char* original, const char* kana)
18+
19+namespace
1520 {
16- (void)original;
17- (void)kana;
21+ bool isKana(unsigned short code)
22+ {
23+ //fprintf(stderr, " [code: %04x], ", code);
24+ // 8181(ぁ) から 83b6(ヶ) までの範囲で true を返す
25+ return ((code >= 0x8181) && (code <= 0x83b6)) ? true : false;
26+ }
1827
19- rubi_positions.clear();
2028
21- // !!!
29+ rubi_t create_rubi(size_t kanji_first, size_t kanji_size,
30+ size_t rubi_first, size_t rubi_size)
31+ {
32+ rubi_t rubi;
33+ rubi.kanji_first = kanji_first;
34+ rubi.kanji_size = kanji_size;
35+ rubi.rubi_first = rubi_first;
36+ rubi.rubi_size = rubi_size;
37+
38+ return rubi;
39+ }
40+
41+
42+
43+ bool parse(vector<rubi_t>& rubi_positions,
44+ const char* kanji_text, size_t kanji_offset,
45+ const char* kana_text, size_t kana_offset)
46+ {
47+ //fprintf(stderr, "\n[%d, %d]: ", kanji_offset, kana_offset);
48+
49+ Utf8 kanji(kanji_text);
50+ Utf8 kana(kana_text);
51+ //fprintf(stderr, " [kanji: %d, kana: %d],", kanji.size(), kana.size());
52+
53+ if (kana.size() < kanji.size()) {
54+ // 平仮名の方が短ければ、戻る
55+ //fprintf(stderr, "\n");
56+ return false;
57+ }
58+
59+ // 先頭から共通して同じ平仮名を取り除く
60+ size_t same_size = 0;
61+ size_t n = min(kanji.size(), kana.size());
62+ for (size_t i = 0; i < n; ++i, ++same_size) {
63+ // !!! 範囲外にアクセスしないかのチェックをすべき
64+ //fprintf(stderr, " <%04x,%04x>,", kanji.ch(i), kana.ch(i));
65+ if (kanji.ch(i) != kana.ch(i)) {
66+ break;
67+ }
68+ }
69+ //fprintf(stderr, " [same_size: %d],", same_size);
70+
71+ // !!! kana.size() も使うべきかを検討する
72+ if (kanji.size() - same_size == 0) {
73+ // 文字列がなくなったら、処理を終了する
74+ //fprintf(stderr, "\n");
75+ return true;
76+ }
77+
78+ // 漢字の次の文字を探す
79+ // !!! 1 byte 文字なども適切に処理できることを保証すべき
80+ size_t kana_index = 0;
81+ for (size_t i = same_size + 1; i < kanji.size(); ++i) {
82+ if (isKana(kanji.ch(i))) {
83+ kana_index = i;
84+ break;
85+ }
86+ }
87+ //fprintf(stderr, " [kana_index: %d],", kana_index);
88+
89+ // 平仮名がなければ、残り全てをルビとみなす
90+ if (kana_index == 0) {
91+ // ルビの登録
92+ size_t rubi_size = kana.size() - same_size;
93+ rubi_positions.
94+ push_back(create_rubi(kanji_offset + same_size,
95+ kanji.size() - kana_index - same_size,
96+ kana_offset + same_size, rubi_size));
97+ //fprintf(stderr, "\n");
98+ return true;
99+ }
100+
101+ unsigned short found_kana = kanji.ch(kana_index);
102+
103+ // !!! 以下を、ループで順に繰り返すようにする
104+ size_t rubi_size = 1;
105+ for (size_t i = same_size + 1;
106+ (i < kana.size()) && (found_kana != kana.ch(i)); ++i) {
107+ //fprintf(stderr, " [i = %d],", i);
108+ ++rubi_size;
109+ }
110+ //fprintf(stderr, " [rubi_size: %d],", rubi_size);
111+
112+ // ルビの登録
113+ rubi_positions.
114+ push_back(create_rubi(kanji_offset + same_size,
115+ kana_index - same_size,
116+ kana_offset + same_size,
117+ rubi_size));
118+
119+ size_t next_kanji_offset = kanji_offset + kana_index + 1;
120+ size_t next_kana_offset = kana_offset + same_size + rubi_size + 1;
121+ const string next_kanji =
122+ kanji.substr(next_kanji_offset,
123+ kanji.size() - next_kanji_offset).toStdString();
124+ const string next_kana =
125+ kana.substr(next_kana_offset,
126+ kana.size() - next_kana_offset).toStdString();
127+ return parse(rubi_positions,
128+ next_kanji.c_str(), next_kanji_offset,
129+ next_kana.c_str(), next_kana_offset);
130+ }
22131 }
132+
133+
134+bool qrk::parse_rubi(std::vector<rubi_t>& rubi_positions,
135+ const char* text, const char* kana_only)
136+{
137+ return parse(rubi_positions, text, 0, kana_only, 0);
138+}
--- trunk/libs/input/TestRubiParse.cpp (revision 1818)
+++ trunk/libs/input/TestRubiParse.cpp (revision 1819)
@@ -21,45 +21,51 @@
2121 {
2222 vector<rubi_t> rubi;
2323
24+ rubi.clear();
2425 parse_rubi(rubi, "む", "む");
2526 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi.size());
2627
28+ rubi.clear();
2729 parse_rubi(rubi, "無", "む");
2830 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi.size());
29- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].original_first);
30- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].original_size);
31+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].kanji_first);
32+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].kanji_size);
3133 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].rubi_first);
3234 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].rubi_size);
3335
36+ rubi.clear();
3437 parse_rubi(rubi, "無し", "なし");
3538 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi.size());
36- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].original_first);
37- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].original_size);
39+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].kanji_first);
40+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].kanji_size);
3841 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].rubi_first);
3942 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].rubi_size);
4043
44+ rubi.clear();
4145 parse_rubi(rubi, "よい水", "よいみず");
4246 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi.size());
43- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].original_first);
44- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].original_size);
47+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].kanji_first);
48+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].kanji_size);
4549 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].rubi_first);
4650 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].rubi_size);
4751
52+ rubi.clear();
4853 parse_rubi(rubi, "よい水だ", "よいみずだ");
4954 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi.size());
50- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].original_first);
51- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].original_size);
55+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].kanji_first);
56+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].kanji_size);
5257 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].rubi_first);
5358 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].rubi_size);
5459
60+ rubi.clear();
5561 parse_rubi(rubi, "秋の田の", "あきのたの");
5662 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi.size());
57- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].original_first);
58- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].original_size);
63+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].kanji_first);
64+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[0].kanji_size);
5965 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), rubi[0].rubi_first);
6066 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[0].rubi_size);
61- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[1].original_first);
62- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[1].original_size);
67+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[1].kanji_first);
68+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[1].kanji_size);
6369 CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), rubi[1].rubi_first);
64- CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), rubi[1].rubi_size);
70+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), rubi[1].rubi_size);
6571 }
--- trunk/libs/input/Makefile (revision 1818)
+++ trunk/libs/input/Makefile (revision 1819)
@@ -51,9 +51,10 @@
5151
5252 TestConvertToJp.o: TestConvertToJp.h convertToJp.h Utf8.h
5353 TestConvertToRoman.o: TestConvertToRoman.h convertToRoman.h
54+TestRubiParse.o: TestRubiParse.h rubi_parse.h
5455 TestUtf8.o: TestUtf8.h Utf8.h
5556 Utf8.o: Utf8.h
5657 convertToJp.o: convertToJp.h roman_table.h kana_table.h Utf8.h
5758 convertToRoman.o: convertToRoman.h roman_table.h Utf8.h
58-rubi_parse.o: rubi_parse.h
59+rubi_parse.o: rubi_parse.h Utf8.h
5960 utf8_string.o: utf8_string.h Utf8.h
旧リポジトリブラウザで表示