~singpolyma/biboumi

ccebe901d7d76dfddc082d994efa54ef2aefee57 — Florent Le Coz 10 years ago a418b6e
Check UTF-8 encoding, and convert strings to UTF-8

Handle conversion errors properly by inserting � instead.  Add a binary
header to provide portable way to write binary literals (I like them) Also
add a test file.  ref #2404
6 files changed, 247 insertions(+), 25 deletions(-)

M CMakeLists.txt
M src/main.cpp
A src/test.cpp
A src/utils/binary.hpp
A src/utils/encoding.cpp
A src/utils/encoding.hpp
M CMakeLists.txt => CMakeLists.txt +21 -3
@@ 20,6 20,13 @@ include_directories("src/")
include_directories(SYSTEM ${CRYPTO++_INCLUDE_DIR})

#
## utils
#
file(GLOB source_utils
  src/utils/*.[hc]pp)
add_library(utils STATIC ${source_utils})

#
## network
#
file(GLOB source_network


@@ 32,7 39,7 @@ add_library(network STATIC ${source_network})
file(GLOB source_irc
  src/irc/*.[hc]pp)
add_library(irc STATIC ${source_irc})
target_link_libraries(irc network)
target_link_libraries(irc network utils)

#
## xmpplib


@@ 40,7 47,7 @@ target_link_libraries(irc network)
file(GLOB source_xmpp
  src/xmpp/*.[hc]pp)
add_library(xmpp STATIC ${source_xmpp})
target_link_libraries(xmpp bridge network ${CRYPTO++_LIBRARIES} expatpp)
target_link_libraries(xmpp bridge network utils ${CRYPTO++_LIBRARIES} expatpp)

#
## bridge


@@ 54,4 61,15 @@ add_executable(${PROJECT_NAME} src/main.cpp)
target_link_libraries(${PROJECT_NAME}
  xmpp
  irc
  bridge)
\ No newline at end of file
  bridge)

#
## Tests
#

add_executable(test src/test.cpp)
target_link_libraries(test
  xmpp
  irc
  bridge
  utils)

M src/main.cpp => src/main.cpp +7 -22
@@ 1,31 1,16 @@
#include <irc/irc_client.hpp>
#include <xmpp/xmpp_component.hpp>
#include <network/poller.hpp>

#include <xmpp/xmpp_parser.hpp>
#include <xmpp/xmpp_stanza.hpp>
#include <xmpp/xmpp_component.hpp>

#include <memory>

#include <xmpp/jid.hpp>
#include <irc/iid.hpp>

#include <iostream>

int main()
{
  Poller p;
  // Now I'm the bridge, creating an ircclient because needed.
  std::shared_ptr<IrcClient> c = std::make_shared<IrcClient>();
  p.add_socket_handler(c);
  std::shared_ptr<IrcClient> d = std::make_shared<IrcClient>();
  p.add_socket_handler(d);
  std::shared_ptr<IrcClient> e = std::make_shared<IrcClient>();
  p.add_socket_handler(e);
  c->connect("localhost", "7877");
  d->connect("localhost", "7878");
  e->connect("localhost", "7879");
  while (true)
    p.poll();
  std::shared_ptr<XmppComponent> xmpp_component =
    std::make_shared<XmppComponent>("irc.localhost", "secret");
  p.add_socket_handler(xmpp_component);
  xmpp_component->start();
  while (p.poll())
    ;
  return 0;
}

A src/test.cpp => src/test.cpp +43 -0
@@ 0,0 1,43 @@
/**
 * Just a very simple test suite, by hand, using assert()
 */

#include <assert.h>

#include <iostream>

#include <utils/encoding.hpp>
#include <string.h>

#include <fstream>

/**
 * Entry point of the hand-written test suite: runs every check in order
 * and returns 0 if none of the assert() calls aborted the program.
 */
int main()
{
  /**
   * Encoding
   */
  // A string made of many combining characters: it must be reported as
  // valid UTF-8.
  const char* valid = "C̡͔͕̩͙̽ͫ̈́ͥ̿̆ͧ̚r̸̩̘͍̻͖̆͆͛͊̉̕͡o͇͈̳̤̱̊̈͢q̻͍̦̮͕ͥͬͬ̽ͭ͌̾ͅǔ͉͕͇͚̙͉̭͉̇̽ȇ͈̮̼͍͔ͣ͊͞͝ͅ ͫ̾ͪ̓ͥ̆̋̔҉̢̦̠͈͔̖̲̯̦ụ̶̯͐̃̋ͮ͆͝n̬̱̭͇̻̱̰̖̤̏͛̏̿̑͟ë́͐҉̸̥̪͕̹̻̙͉̰ ̹̼̱̦̥ͩ͑̈́͑͝ͅt͍̥͈̹̝ͣ̃̔̈̔ͧ̕͝ḙ̸̖̟̙͙ͪ͢ų̯̞̼̲͓̻̞͛̃̀́b̮̰̗̩̰̊̆͗̾̎̆ͯ͌͝.̗̙͎̦ͫ̈́ͥ͌̈̓ͬ";
  assert(utils::is_valid_utf8(valid) == true);
  // A 4-byte lead byte followed by a byte that is not a continuation byte.
  const char* invalid = "\xF0\x0F";
  assert(utils::is_valid_utf8(invalid) == false);
  // Bytes 0xFE and 0xFF do not match any lead-byte pattern.
  const char* invalid2 = "\xFE\xFE\xFF\xFF";
  assert(utils::is_valid_utf8(invalid2) == false);

  // Converting from UTF-8 to UTF-8 must give back the very same string.
  std::string in = "coucou les copains  ♥ ";
  assert(utils::is_valid_utf8(in.c_str()) == true);
  std::string res = utils::convert_to_utf8(in, "UTF-8");
  assert(utils::is_valid_utf8(res.c_str()) == true && res == in);

  // The same text, once as written in this source file and once with the
  // yen sign stored as the single ISO-8859-1 byte 0xA5.
  std::string original_utf8("couc¥ou");
  std::string original_latin1("couc\xa5ou");

  // When converting back to utf-8
  std::string from_latin1 = utils::convert_to_utf8(original_latin1.c_str(), "ISO-8859-1");
  assert(from_latin1 == original_utf8);

  // Check the behaviour when the decoding fails (here because we provide a
  // wrong charset)
  // The byte 0xA5 cannot be decoded as US-ASCII, so the replacement
  // character is expected in its place.
  std::string from_ascii = utils::convert_to_utf8(original_latin1, "US-ASCII");
  assert(from_ascii == "couc�ou");
  return 0;
}

A src/utils/binary.hpp => src/utils/binary.hpp +16 -0
@@ 0,0 1,16 @@
#ifndef BINARY_INCLUDED
# define BINARY_INCLUDED

/**
 * Compile-time evaluation of a sequence of binary digits.
 *
 * binary<'1', '0', '1'>::value is 5: FIRST is the most significant digit
 * and is shifted left by the number of digits that follow it, then the
 * value of the remaining digits is added recursively.
 *
 * The arithmetic is done on unsigned int so that a full-width literal
 * (one digit per bit of unsigned int) never shifts into a sign bit.
 * Anything that is not a '0' or a '1', or a literal with more digits than
 * unsigned int has bits, is rejected at compile time.
 */
template<char FIRST, char... REST> struct binary
{
  static_assert(FIRST == '0' || FIRST == '1', "invalid binary digit" );
  static_assert(sizeof...(REST) < sizeof(unsigned int) * 8,
                "binary literal does not fit in an unsigned int");
  enum : unsigned int
  {
    value = (static_cast<unsigned int>(FIRST - '0') << sizeof...(REST))
            + binary<REST...>::value
  };
};

// End of the recursion: a single digit is its own value.
template<> struct binary<'0'> { enum : unsigned int { value = 0 }; };
template<> struct binary<'1'> { enum : unsigned int { value = 1 }; };

/**
 * User-defined literal giving a portable way to write binary constants:
 * 11110000_b == 0xF0.
 *
 * Written without whitespace between "" and the suffix: the spaced form
 * `operator "" _b` is deprecated and makes `_b` a reserved identifier at
 * global scope.
 */
template<char... LITERAL>
constexpr unsigned int operator""_b() { return binary<LITERAL...>::value; }

#endif // BINARY_INCLUDED

A src/utils/encoding.cpp => src/utils/encoding.cpp +139 -0
@@ 0,0 1,139 @@
#include <utils/encoding.hpp>
#include <utils/binary.hpp>

#include <utils/scopeguard.hpp>

#include <assert.h>
#include <errno.h>
#include <string.h>
#include <iconv.h>

#include <stdexcept>

/**
 * The UTF-8-encoded character used as a place holder when a character conversion fails.
 * This is U+FFFD � "replacement character"
 *
 * Stored as a constant array so that neither the bytes nor the name can be
 * re-pointed, and so that the length below is derived from the literal
 * instead of being maintained by hand.
 */
static const char invalid_char[] = "\xef\xbf\xbd";
// Number of bytes in the placeholder, not counting the terminating NUL.
static const size_t invalid_char_len = sizeof(invalid_char) - 1;

namespace utils
{
  /**
   * Returns true if the given null-terminated string is well-formed UTF-8,
   * false otherwise. A null pointer is reported as invalid.
   *
   * Follows the well-formed byte sequences of RFC 3629 / the Unicode
   * standard, which is stricter than only matching the bit patterns
   * described on http://en.wikipedia.org/wiki/UTF-8#Description :
   *  - overlong encodings are rejected (lead bytes 0xC0 and 0xC1, 0xE0
   *    followed by 0x80..0x9F, 0xF0 followed by 0x80..0x8F);
   *  - UTF-16 surrogates U+D800..U+DFFF are rejected (0xED followed by
   *    0xA0..0xBF);
   *  - code points above U+10FFFF are rejected (0xF4 followed by
   *    0x90..0xBF, and lead bytes 0xF5..0xFF).
   *
   * The byte values are written in hexadecimal because the checks are
   * ranges of raw byte values rather than bit masks.
   */
  bool is_valid_utf8(const char* s)
  {
    if (!s)
      return false;

    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);

    while (*str)
      {
        const unsigned char lead = str[0];

        // 1 byte:  0xxxxxxx
        if (lead < 0x80)
          {
            ++str;
            continue;
          }

        // Number of continuation bytes (10xxxxxx) expected after the lead
        // byte, and the range allowed for the first of them. That range
        // is narrower than 0x80..0xBF for a few lead bytes, which is what
        // excludes overlong forms, surrogates and too-large code points.
        unsigned int trailing = 0;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
          {
            // 2 bytes:  110xxxxx 10xxxxxx (0xC0 and 0xC1 would be overlong)
            trailing = 1;
          }
        else if (lead >= 0xE0 && lead <= 0xEF)
          {
            // 3 bytes:  1110xxxx 10xxxxxx 10xxxxxx
            trailing = 2;
            if (lead == 0xE0)
              second_min = 0xA0; // below that, the sequence is overlong
            else if (lead == 0xED)
              second_max = 0x9F; // above that, it encodes a surrogate
          }
        else if (lead >= 0xF0 && lead <= 0xF4)
          {
            // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            trailing = 3;
            if (lead == 0xF0)
              second_min = 0x90; // below that, the sequence is overlong
            else if (lead == 0xF4)
              second_max = 0x8F; // above that, it is beyond U+10FFFF
          }
        else
          {
            // Stray continuation byte, 0xC0, 0xC1 or 0xF5..0xFF
            return false;
          }

        // A terminating NUL fails these checks as well, so we return
        // before ever reading past the end of the string.
        if (str[1] < second_min || str[1] > second_max)
          return false;
        for (unsigned int i = 2; i <= trailing; ++i)
          if ((str[i] & 0xC0) != 0x80)
            return false;

        str += trailing + 1;
      }
    return true;
  }

  /**
   * Convert str, encoded in the given charset, into UTF-8 using iconv.
   *
   * Every input byte that cannot be decoded is replaced by the placeholder
   * invalid_char and skipped. If the input ends in the middle of a
   * multibyte sequence, one placeholder is appended and the conversion
   * stops there.
   *
   * The result holds exactly the bytes that were produced: it is sized
   * from the amount of output written rather than by looking for a NUL
   * terminator, so an empty input gives an empty string and a NUL byte
   * in the converted text does not truncate it.
   *
   * Throws std::runtime_error if iconv_open() cannot provide a conversion
   * from charset to UTF-8.
   */
  std::string convert_to_utf8(const std::string& str, const char* charset)
  {
    // (iconv_t)-1 is the error value documented for iconv_open().
    const iconv_t cd = iconv_open("UTF-8", charset);
    if (cd == (iconv_t)-1)
      throw std::runtime_error("Cannot convert into UTF-8");

    // Make sure cd is always closed when we leave this function
    ScopeGuard sg([&]{ iconv_close(cd); });

    // iconv will not attempt to modify this buffer, but it still requires
    // a char**.
    size_t inbytesleft = str.size();
    char* inbuf_ptr = const_cast<char*>(str.c_str());

    // The string itself is the output buffer, so there is nothing to
    // delete by hand. Four output bytes per input byte is enough room
    // for any UTF-8 character, and for a 3-byte placeholder.
    std::string res(str.size() * 4, '\0');
    size_t outbytesleft = res.size();
    char* outbuf_ptr = &res[0];

    bool done = false;
    while (!done)
      {
        const size_t ret = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
        if (ret != static_cast<size_t>(-1))
          {
            // The conversion finished without any error, stop converting
            done = true;
            continue;
          }

        switch (errno)
          {
          case EILSEQ:
            // Invalid byte found. Insert a placeholder instead of the
            // converted character, jump one byte and continue. iconv is
            // not aware of what we write ourselves, so outbytesleft has
            // to be kept up to date by hand.
            if (inbytesleft == 0 || outbytesleft < invalid_char_len)
              {
                done = true;
                break;
              }
            memcpy(outbuf_ptr, invalid_char, invalid_char_len);
            outbuf_ptr += invalid_char_len;
            outbytesleft -= invalid_char_len;
            inbuf_ptr++;
            inbytesleft--;
            break;
          case EINVAL:
            // A multibyte sequence is not terminated, but we can't
            // provide any more data, so we just add a placeholder to
            // indicate that the character is not properly converted,
            // and we stop the conversion
            if (outbytesleft >= invalid_char_len)
              {
                memcpy(outbuf_ptr, invalid_char, invalid_char_len);
                outbuf_ptr += invalid_char_len;
                outbytesleft -= invalid_char_len;
              }
            done = true;
            break;
          case E2BIG:
            // The output buffer is full. This should never happen given
            // how it is sized; keep what has been converted so far.
          default:
            // This should happen even neverer
            done = true;
            break;
          }
      }

    // Drop the part of the buffer that was never written to.
    res.resize(res.size() - outbytesleft);
    return res;
  }

}


A src/utils/encoding.hpp => src/utils/encoding.hpp +21 -0
@@ 0,0 1,21 @@
#ifndef ENCODING_INCLUDED
# define ENCODING_INCLUDED

#include <string>

namespace utils
{
  /**
   * Returns true if the given null-terminated string is valid utf-8.
   * A null pointer is reported as invalid (false is returned).
   *
   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
   */
  bool is_valid_utf8(const char* s);
  /**
   * Convert the given string (encoded in "encoding") into valid utf-8.
   * If some decoding fails, insert a utf-8 placeholder character
   * (U+FFFD, the "replacement character") instead.
   *
   * Throws std::runtime_error if no conversion from "encoding" to UTF-8
   * can be opened.
   */
  std::string convert_to_utf8(const std::string& str, const char* encoding);
}

#endif // ENCODING_INCLUDED