Include/EgtStringDecoder.h

//----------------------------------------------------------------------------
//  EgalTech 2014-2014
//----------------------------------------------------------------------------
// File : EgtStringDecoder.h   Data : 01.06.14          Versione : 1.5f1
// Contenuto : Insieme di funzioni per decodificare stringhe UTF-8.
//
//
//
// Modifiche : 01.06.14 DS Creazione modulo.
//
//
//----------------------------------------------------------------------------

#pragma once

#include "/EgtDev/Include/EgtStringBase.h"
#include "/EgtDev/Include/EgtNumCollection.h"

//-----------------------------------------------------------------------------
// Local unsigned integer aliases used by the decoder in this header.
// NOTE(review): uint32_t is mapped onto unsigned int, which presumes a
// 32-bit unsigned int on the target platforms - confirm.
using uint8_t = unsigned char ;
using uint32_t = unsigned int ;

//-----------------------------------------------------------------------------
// Automaton states that callers of decode() test for:
//   UTF8_ACCEPT : a complete code point has been decoded (also the start state)
//   UTF8_REJECT : the byte just consumed makes the sequence invalid
static const uint32_t UTF8_ACCEPT = 0u ;
static const uint32_t UTF8_REJECT = 12u ;

//-----------------------------------------------------------------------------
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

//-----------------------------------------------------------------------------
// Lookup table driving the UTF-8 decoding automaton used by decode().
// Layout:
//   [0..255]  : character class (0..11) of each possible byte value
//   [256.. ]  : state transition table, indexed by 256 + state + class;
//               states are stored pre-multiplied by 12 (the number of
//               classes), so UTF8_ACCEPT is 0 and UTF8_REJECT is 12.
static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and lets decode() build
  // the payload bitmask of a leading byte as (0xff >> class).
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
  // The second part is a transition table that maps the combination
  // of an automaton state and a character class to the next state
  // (one row of 12 entries per state).
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12
} ;

//-----------------------------------------------------------------------------
// Feeds one byte to the UTF-8 decoding automaton.
//   state : in/out, current automaton state (start from UTF8_ACCEPT)
//   codep : in/out, code point being accumulated; it holds a complete
//           code point only when the returned state is UTF8_ACCEPT
//   byte  : next byte of the UTF-8 stream
// Returns the new state: UTF8_ACCEPT when a code point is complete,
// UTF8_REJECT when the byte is not valid at this position, any other
// value when more bytes are needed.
inline uint32_t
decode( uint32_t* state, uint32_t* codep, uint8_t byte)
{
  // character class of the incoming byte
   const uint32_t cls = utf8d[byte] ;

   if ( *state == UTF8_ACCEPT) {
     // first byte of a sequence: keep only the payload bits of the byte
      *codep = ( 0xff >> cls) & ( byte) ;
   }
   else {
     // following byte: append its 6 low bits to the accumulated value
      *codep = ( byte & 0x3fu) | ( *codep << 6) ;
   }

  // move to the next state through the transition table
   *state = utf8d[ 256 + *state + cls] ;
   return *state ;
}

//-----------------------------------------------------------------------------
// Counts the code points of a UTF-8 encoded string.
//   sString : string to scan, byte by byte
//   nCount  : out, number of correctly encoded code points found
//             (invalid bytes are skipped and not counted)
// Returns true if the whole string is valid UTF-8, false if at least one
// invalid byte was met or if the string ends in the middle of a
// multi-byte sequence.
bool inline
CountCodePoints( const std::string& sString, int& nCount)
{
  // reset the number of characters
   nCount = 0 ;

  // loop over the bytes of the UTF-8 string
   bool bOk = true ;
   uint32_t codepoint = 0 ;
   uint32_t state = UTF8_ACCEPT ;
   uint32_t stPrev = UTF8_ACCEPT ;
   for ( std::string::size_type i = 0 ; i < sString.length() ; ++ i) {
      switch ( decode( &state, &codepoint, static_cast<uint8_t>( sString[i]))) {
      case UTF8_ACCEPT :
        // a properly encoded character has been found
         ++ nCount ;
         break ;
      case UTF8_REJECT :
        // invalid byte: flag the error and restart the automaton
         bOk = false ;
         state = UTF8_ACCEPT ;
        // if the byte broke a sequence already started, it may itself be
        // the beginning of a valid one: examine it again (here i >= 1)
         if ( stPrev != UTF8_ACCEPT)
            -- i ;
         break ;
      }
      stPrev = state ;
   }

  // a string that ends inside a multi-byte sequence is malformed
   if ( state != UTF8_ACCEPT)
      bOk = false ;

   return bOk ;
}

//-----------------------------------------------------------------------------
// Decodes a UTF-8 encoded string into its sequence of code points.
//   sString : string to decode, byte by byte
//   vCode   : out, receives the decoded code points (previous content is
//             discarded); every invalid byte, and a multi-byte sequence
//             left incomplete at the end of the string, is replaced by
//             U+FFFD (REPLACEMENT CHARACTER)
// Returns true if the whole string is valid UTF-8, false if at least one
// replacement was necessary.
bool inline
GetCodePoints( const std::string& sString, UINTVECTOR& vCode)
{
  // empty the output vector; a string never yields more code points
  // than it has bytes
   vCode.reserve( sString.length()) ;
   vCode.clear() ;

  // loop over the bytes of the UTF-8 string
   bool bOk = true ;
   uint32_t codepoint = 0 ;
   uint32_t state = UTF8_ACCEPT ;
   uint32_t stPrev = UTF8_ACCEPT ;
   for ( std::string::size_type i = 0 ; i < sString.length() ; ++ i) {

      switch ( decode( &state, &codepoint, static_cast<uint8_t>( sString[i]))) {
      case UTF8_ACCEPT :
        // A properly encoded character has been found.
         vCode.push_back( codepoint) ;
         break ;
      case UTF8_REJECT :
        // The byte is invalid, replace it and restart.
         vCode.push_back( 0xFFFD) ;
         bOk = false ;
         state = UTF8_ACCEPT ;
        // If the byte broke a sequence already started, it may itself be
        // the beginning of a valid one: examine it again (here i >= 1).
         if ( stPrev != UTF8_ACCEPT)
            -- i ;
         break ;
      }
      stPrev = state ;
   }

  // The string ends inside a multi-byte sequence: replace the truncated
  // character and report the error.
   if ( state != UTF8_ACCEPT) {
      vCode.push_back( 0xFFFD) ;
      bOk = false ;
   }

   return bOk ;
}