//---------------------------------------------------------------------------- // EgalTech 2014-2014 //---------------------------------------------------------------------------- // File : EgtStringDecoder.h Data : 01.06.14 Versione : 1.5f1 // Contenuto : Insieme di funzioni per decodificare stringhe UTF-8. // // // // Modifiche : 01.06.14 DS Creazione modulo. // // //---------------------------------------------------------------------------- #pragma once #include "/EgtDev/Include/EgtStringBase.h" #include "/EgtDev/Include/EgtNumCollection.h" //----------------------------------------------------------------------------- typedef unsigned char uint8_t ; typedef unsigned int uint32_t ; //----------------------------------------------------------------------------- const uint32_t UTF8_ACCEPT = 0 ; const uint32_t UTF8_REJECT = 12 ; //----------------------------------------------------------------------------- // Copyright (c) 2008-2010 Bjoern Hoehrmann // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. //----------------------------------------------------------------------------- static const uint8_t utf8d[] = { // The first part of the table maps bytes to character classes that // to reduce the size of the transition table and create bitmasks. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // The second part is a transition table that maps a combination // of a state of the automaton and a character class to a state. 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12 } ; //----------------------------------------------------------------------------- uint32_t inline decode( uint32_t* state, uint32_t* codep, uint8_t byte) { uint32_t type = utf8d[byte] ; *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte) ; *state = utf8d[ 256 + *state + type] ; return *state ; } //----------------------------------------------------------------------------- bool inline CountCodePoints( const std::string& sString, int& nCount) { // inizializzo numero caratteri nCount = 0 ; // ciclo sui byte della stringa UTF-8 bool bOk = true ; uint32_t codepoint ; uint32_t state = UTF8_ACCEPT ; uint32_t stPrev = UTF8_ACCEPT ; for ( unsigned int i = 0 ; i < sString.length() ; ++ i) { switch ( decode( &state, &codepoint, sString[i])) { case UTF8_ACCEPT : ++ nCount ; break ; case UTF8_REJECT : bOk = false ; state = UTF8_ACCEPT ; if ( stPrev != UTF8_ACCEPT) -- i ; break ; } stPrev = state ; } return bOk ; } //----------------------------------------------------------------------------- bool inline GetCodePoints( const std::string& sString, UINTVECTOR& vCode) { // pulisco vettore di codici da restituire vCode.reserve( sString.length()) ; vCode.clear() ; // ciclo sui byte della stringa UTF-8 bool bOk = true ; uint32_t codepoint ; uint32_t state = UTF8_ACCEPT ; uint32_t stPrev = UTF8_ACCEPT ; for ( unsigned int i = 0 ; i < sString.length() ; ++ i) { switch ( decode( &state, &codepoint, sString[i])) { case UTF8_ACCEPT : // A properly encoded character has been found. vCode.push_back( codepoint) ; break ; case UTF8_REJECT : // The byte is invalid, replace it and restart. vCode.push_back( 0xFFFD) ; bOk = false ; state = UTF8_ACCEPT ; if ( stPrev != UTF8_ACCEPT) -- i ; break ; } stPrev = state ; } return bOk ; }