~singpolyma/biboumi

3960e4d5afa09c299f595b411ee8522db30580fd — Florent Le Coz 10 years ago b29290f
Functions to provide xml-valid strings

By removing invalid chars, see http://www.w3.org/TR/xml/#charsets
3 files changed, 87 insertions(+), 1 deletions(-)

M src/test.cpp
M src/utils/encoding.cpp
M src/utils/encoding.hpp
M src/test.cpp => src/test.cpp +6 -1
@@ 52,6 52,11 @@ int main()
  assert(from_ascii == "couc�ou");
  std::cout << from_ascii << std::endl;

  std::string without_ctrl_char("𤭢€¢$");
  assert(utils::remove_invalid_xml_chars(without_ctrl_char) == without_ctrl_char);
  assert(utils::remove_invalid_xml_chars(in) == in);
  assert(utils::remove_invalid_xml_chars("\acouco\u0008u\uFFFEt\uFFFFe\r\n♥") == "coucoute\r\n♥");

  /**
   * Utils
   */


@@ 156,7 161,7 @@ int main()
  /**
   * Config
   */
  std::cout << color << "Testing JID parsing…" << reset << std::endl;
  std::cout << color << "Testing config…" << reset << std::endl;
  Config::filename = "test.cfg";
  Config::file_must_exist = false;
  Config::set("coucou", "bonjour");

M src/utils/encoding.cpp => src/utils/encoding.cpp +73 -0
@@ 9,6 9,8 @@

#include <config.h>

#include <bitset>

/**
 * The UTF-8-encoded character used as a place holder when a character conversion fails.
 * This is U+FFFD � "replacement character"


@@ 66,6 68,77 @@ namespace utils
    return true;
  }

  /**
   * Return a copy of `original` from which every character that is not
   * allowed in an XML document has been removed (see the Char production
   * in http://www.w3.org/TR/xml/#charsets).
   *
   * The given string MUST be a valid utf-8 string: sequences are located
   * from their leading byte only; continuation bytes and overlong forms are
   * not verified.
   *
   * Throws std::runtime_error if a byte that cannot start a utf-8 sequence
   * is found where a character is expected, or if the last sequence is cut
   * short by the end of the string.
   */
  std::string remove_invalid_xml_chars(const std::string& original)
  {
    const unsigned char* const bytes =
      reinterpret_cast<const unsigned char*>(original.data());
    const std::string::size_type size = original.size();

    // The result can never be longer than the input.
    std::string res;
    res.reserve(size);

    // Iterate on the real length instead of stopping at the first NUL byte:
    // a std::string may contain embedded NULs, and U+0000 is just another
    // character that is invalid in XML.
    std::string::size_type pos = 0;
    while (pos < size)
      {
        const unsigned char lead = bytes[pos];

        // Length of the sequence, deduced from its leading byte:
        //   1 byte:  0xxxxxxx
        //   2 bytes: 110xxxxx 10xxxxxx
        //   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        //   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        std::string::size_type len;
        if ((lead & 0x80) == 0x00)
          len = 1;
        else if ((lead & 0xE0) == 0xC0)
          len = 2;
        else if ((lead & 0xF0) == 0xE0)
          len = 3;
        else if ((lead & 0xF8) == 0xF0)
          len = 4;
        else
          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");

        // Never read (or copy) past the end of the input when the last
        // sequence is truncated.
        if (size - pos < len)
          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");

        // unsigned long is at least 32 bits wide, enough for the 21 bits a
        // 4-byte sequence can encode.
        unsigned long codepoint = 0;
        bool valid = false;
        switch (len)
          {
          case 1:
            codepoint = lead;
            valid = (codepoint == 0x09 ||
                     codepoint == 0x0A ||
                     codepoint == 0x0D ||
                     codepoint >= 0x20);
            break;
          case 2:
            // All 2 bytes chars (U+0080 to U+07FF) are valid, don't even
            // bother calculating the codepoint
            valid = true;
            break;
          case 3:
            codepoint  = (lead           & 0x0FUL) << 12;
            codepoint |= (bytes[pos + 1] & 0x3FUL) << 6;
            codepoint |= (bytes[pos + 2] & 0x3FUL);
            // Excludes the surrogates (U+D800 to U+DFFF), U+FFFE and U+FFFF
            valid = (codepoint <= 0xD7FF ||
                     (codepoint >= 0xE000 && codepoint <= 0xFFFD));
            break;
          default: // 4
            codepoint  = (lead           & 0x07UL) << 18;
            codepoint |= (bytes[pos + 1] & 0x3FUL) << 12;
            codepoint |= (bytes[pos + 2] & 0x3FUL) << 6;
            codepoint |= (bytes[pos + 3] & 0x3FUL);
            valid = (codepoint <= 0x10FFFF);
            break;
          }

        if (valid)
          res.append(original, pos, len);
        pos += len;
      }
    return res;
  }

  std::string convert_to_utf8(const std::string& str, const char* charset)
  {
    std::string res;

M src/utils/encoding.hpp => src/utils/encoding.hpp +8 -0
@@ 12,6 12,14 @@ namespace utils
   */
  bool is_valid_utf8(const char* s);
  /**
   * Remove all invalid codepoints from the given utf-8-encoded string.
   * The value returned is a copy of the string, without the removed chars.
   *
   * The given string must already be valid utf-8. A std::runtime_error is
   * thrown when a byte that cannot start a utf-8 sequence is found where a
   * character is expected.
   *
   * See http://www.w3.org/TR/xml/#charsets for the list of valid characters
   * in XML.
   */
  std::string remove_invalid_xml_chars(const std::string& original);
  /**
   * Convert the given string (encoded in "encoding") into valid utf-8.
   * If some decoding fails, insert a utf-8 placeholder character instead.
   */