Kouhei Sutou 2019-03-02 05:59:03 +0900 (Sat, 02 Mar 2019) Revision: 36cabe712cf4a74b4863d6c6d687943f3dc5631c https://github.com/ranguba/chupa-text/commit/36cabe712cf4a74b4863d6c6d687943f3dc5631c Message: xml: add support for Nokogiri Modified files: lib/chupa-text/decomposers/xml.rb lib/chupa-text/sax-parser.rb test/decomposers/test-xml.rb Modified: lib/chupa-text/decomposers/xml.rb (+19 -8) =================================================================== --- lib/chupa-text/decomposers/xml.rb 2019-03-02 05:32:43 +0900 (498f9d3) +++ lib/chupa-text/decomposers/xml.rb 2019-03-02 05:59:03 +0900 (f5575ff) @@ -14,8 +14,7 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -require "rexml/document" -require "rexml/streamlistener" +require "chupa-text/sax-parser" module ChupaText module Decomposers @@ -34,9 +33,9 @@ module ChupaText listener = Listener.new(text) data.open do |input| begin - parser = REXML::Parsers::StreamParser.new(input, listener) + parser = SAXParser.new(input, listener) parser.parse - rescue REXML::ParseException => xml_error + rescue SAXParser::ParseError => xml_error error do message = "#{log_tag} Failed to parse XML: " message << "#{xml_error.class}: #{xml_error.message}\n" @@ -54,15 +53,27 @@ module ChupaText def log_tag "[decomposer][xml]" end - class Listener - include REXML::StreamListener + class Listener < SAXListener def initialize(output) @output = output + @level = 0 end - def text(text) - @output << text + def start_element(*args) + @level += 1 + end + + def end_element(*args) + @level -= 1 + end + + def characters(text) + @output << text if @level > 0 + end + + def cdata(content) + @output << content if @level > 0 end end end Modified: lib/chupa-text/sax-parser.rb (+17 -2) =================================================================== --- lib/chupa-text/sax-parser.rb 2019-03-02 05:32:43 +0900 (d6b8a3b) +++ lib/chupa-text/sax-parser.rb 2019-03-02 05:59:03 +0900 (8c1f1b2) @@ -25,6 +25,9 @@ end module ChupaText class SAXParser + class ParseError < Error + end + class << self def backend case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"] @@ -94,6 +97,10 @@ module ChupaText @listener.cdata(content) end + def error(detail) + raise ParseError, detail + end + private def build_qname(prefix, local_name) if prefix @@ -105,10 +112,18 @@ module ChupaText end else def parse - source = REXML::Source.new(@input.read) + source = @input + if source.is_a?(Archive::Zip::Codec::Deflate::Decompress) + source = source.read + end parser = REXML::Parsers::SAX2Parser.new(source) parser.listen(Listener.new(@listener)) - parser.parse + begin + parser.parse + rescue REXML::ParseException => error + message = "#{error.class}: #{error.message}" + raise ParseError, message + end end class Listener Modified: test/decomposers/test-xml.rb (+9 -4) =================================================================== --- test/decomposers/test-xml.rb 2019-03-02 05:32:43 +0900 (05697ff) +++ test/decomposers/test-xml.rb 2019-03-02 05:59:03 +0900 (ee846ea) @@ -35,7 +35,6 @@ class TestDecomposersXML < Test::Unit::TestCase Hello & World - TEXT assert_equal([text], decompose(xml).collect(&:body)) @@ -45,15 +44,21 @@ class TestDecomposersXML < Test::Unit::TestCase messages = capture_log do assert_equal([], decompose("<root x=/>")) end + normalized_messages = messages.collect do |level, message| + [ + level, + message.gsub(/(ChupaText::SAXParser::ParseError:) .*/, + "\\1 ...") + ] + end assert_equal([ [ :error, "[decomposer][xml] Failed to parse XML: " + - "REXML::ParseException: " + - "Missing attribute value start quote: <x>", + "ChupaText::SAXParser::ParseError: ...", ], ], - messages) + normalized_messages) end private -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190302/40f83396/attachment-0001.html>