From df3138fd2e83f0ba079b91fe923cdeab2c319281 Mon Sep 17 00:00:00 2001
From: Stephen Paul Weber
Date: Sun, 28 Apr 2019 13:18:21 -0500
Subject: [PATCH] Update text-or-binary heuristic

The `cbor` gem is a bit janky and it turns out catching NoMemoryError is
not safe, so use an encoding-based heuristic to detect binary vs utf8
data.
---
 lib/dhall.rb      | 15 +++++----------
 lib/dhall/util.rb | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/lib/dhall.rb b/lib/dhall.rb
index f28ba66..ae4b91b 100644
--- a/lib/dhall.rb
+++ b/lib/dhall.rb
@@ -22,18 +22,13 @@ module Dhall
   end
 
   def self.load_raw(source)
-    unless source.valid_encoding?
-      raise ArgumentError, "invalid byte sequence in #{source.encoding}"
-    end
+    source = Util.text_or_binary(source)
 
-    begin
-      return from_binary(source) if source.encoding == Encoding::BINARY
-    rescue Exception # rubocop:disable Lint/RescueException
-      # Parsing CBOR failed, so guess this is source text in standard UTF-8
-      return load_raw(source.force_encoding("UTF-8"))
+    if source.encoding == Encoding::BINARY
+      from_binary(source)
+    else
+      Parser.parse(source).value
     end
-
-    Parser.parse(source.encode("UTF-8")).value
   end
 
   def self.dump(o)
diff --git a/lib/dhall/util.rb b/lib/dhall/util.rb
index f2c2b8a..1ce737f 100644
--- a/lib/dhall/util.rb
+++ b/lib/dhall/util.rb
@@ -116,5 +116,24 @@ module Dhall
 
       Hash[hash_or_not.map { |k, v| [(yield k), v] }]
     end
+
+    def self.utf8_if_possible(str)
+      utf8 = str.dup.force_encoding(Encoding::UTF_8)
+      utf8.valid_encoding? ? utf8 : str
+    end
+
+    def self.text_or_binary(str)
+      unless str.valid_encoding?
+        raise ArgumentError, "invalid byte sequence in #{str.encoding}"
+      end
+
+      if str.encoding == Encoding::BINARY
+        return str if str =~ /(?!\s)[[:cntrl:]]/
+
+        utf8_if_possible(str)
+      else
+        str.encode(Encoding::UTF_8)
+      end
+    end
   end
 end
-- 
2.34.2