75dac0c620
- cambio nome di alcuni include di base - aggiornamento prototipi.
130 lines
4.5 KiB
C++
130 lines
4.5 KiB
C++
//----------------------------------------------------------------------------
// EgalTech 2014-2014
//----------------------------------------------------------------------------
// File : EgtStringDecoder.h Data : 01.06.14 Versione : 1.5f1
// Contenuto : Insieme di funzioni per decodificare stringhe UTF-8.
//
//
//
// Modifiche : 01.06.14 DS Creazione modulo.
//
//
//----------------------------------------------------------------------------
|
|
|
|
#pragma once
|
|
|
|
#include "/EgtDev/Include/EgtStringBase.h"
|
|
#include "/EgtDev/Include/EgtNumCollection.h"
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Fixed-width integer types are taken from the standard header instead of
// being re-declared by hand: a local typedef of uint32_t clashes on any
// platform where the standard header maps that name to a different type.
#include <stdint.h>

//-----------------------------------------------------------------------------
// States of the UTF-8 decoding automaton that callers compare against.
// UTF8_ACCEPT is both the initial state and the state reached when a complete
// code point has been decoded.
const uint32_t UTF8_ACCEPT = 0 ;
// UTF8_REJECT is the state reached when the bytes seen so far cannot be
// valid UTF-8.
const uint32_t UTF8_REJECT = 12 ;
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
|
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Lookup table driving the UTF-8 decoding automaton.
// Entries [0, 256) give the character class of each possible byte value;
// grouping bytes into classes keeps the transition table small and lets the
// class number double as a shift count for masking the first byte.
// Entries [256, 364) are the transition table: one row of 12 classes per
// state, states being stored pre-multiplied by 12 (0, 12, 24, ... 96).
static const uint8_t utf8d[] = {
    // --- byte -> character class ---
    // 0x00 - 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    // 0x80 - 0x8F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x90 - 0x9F
    9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    // 0xA0 - 0xBF
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    // 0xC0 - 0xDF
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 - 0xEF
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
    // 0xF0 - 0xFF
    11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // --- (state, character class) -> next state ---
    /* state  0 */  0,12,24,36,60,96,84,12,12,12,48,72,
    /* state 12 */ 12,12,12,12,12,12,12,12,12,12,12,12,
    /* state 24 */ 12, 0,12,12,12,12,12, 0,12, 0,12,12,
    /* state 36 */ 12,24,12,12,12,12,12,24,12,24,12,12,
    /* state 48 */ 12,12,12,12,12,12,12,24,12,12,12,12,
    /* state 60 */ 12,24,12,12,12,12,12,12,12,24,12,12,
    /* state 72 */ 12,12,12,12,12,12,12,36,12,36,12,12,
    /* state 84 */ 12,36,12,12,12,12,12,36,12,36,12,12,
    /* state 96 */ 12,36,12,12,12,12,12,12,12,12,12,12
} ;
|
|
|
|
//-----------------------------------------------------------------------------
|
|
uint32_t inline
|
|
decode( uint32_t* state, uint32_t* codep, uint8_t byte)
|
|
{
|
|
uint32_t type = utf8d[byte] ;
|
|
|
|
*codep = (*state != UTF8_ACCEPT) ?
|
|
(byte & 0x3fu) | (*codep << 6) :
|
|
(0xff >> type) & (byte) ;
|
|
|
|
*state = utf8d[ 256 + *state + type] ;
|
|
return *state ;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
bool inline
|
|
CountCodePoints( const std::string& sString, int& nCount)
|
|
{
|
|
// inizializzo numero caratteri
|
|
nCount = 0 ;
|
|
|
|
// ciclo sui byte della stringa UTF-8
|
|
bool bOk = true ;
|
|
uint32_t codepoint ;
|
|
uint32_t state = UTF8_ACCEPT ;
|
|
uint32_t stPrev = UTF8_ACCEPT ;
|
|
for ( unsigned int i = 0 ; i < sString.length() ; ++ i) {
|
|
switch ( decode( &state, &codepoint, sString[i])) {
|
|
case UTF8_ACCEPT :
|
|
++ nCount ;
|
|
break ;
|
|
case UTF8_REJECT :
|
|
bOk = false ;
|
|
state = UTF8_ACCEPT ;
|
|
if ( stPrev != UTF8_ACCEPT)
|
|
-- i ;
|
|
break ;
|
|
}
|
|
stPrev = state ;
|
|
}
|
|
|
|
return bOk ;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
bool inline
|
|
GetCodePoints( const std::string& sString, UINTVECTOR& vCode)
|
|
{
|
|
// pulisco vettore di codici da restituire
|
|
vCode.reserve( sString.length()) ;
|
|
vCode.clear() ;
|
|
|
|
// ciclo sui byte della stringa UTF-8
|
|
bool bOk = true ;
|
|
uint32_t codepoint ;
|
|
uint32_t state = UTF8_ACCEPT ;
|
|
uint32_t stPrev = UTF8_ACCEPT ;
|
|
for ( unsigned int i = 0 ; i < sString.length() ; ++ i) {
|
|
|
|
switch ( decode( &state, &codepoint, sString[i])) {
|
|
case UTF8_ACCEPT :
|
|
// A properly encoded character has been found.
|
|
vCode.push_back( codepoint) ;
|
|
break ;
|
|
case UTF8_REJECT :
|
|
// The byte is invalid, replace it and restart.
|
|
vCode.push_back( 0xFFFD) ;
|
|
bOk = false ;
|
|
state = UTF8_ACCEPT ;
|
|
if ( stPrev != UTF8_ACCEPT)
|
|
-- i ;
|
|
break ;
|
|
}
|
|
stPrev = state ;
|
|
}
|
|
|
|
return bOk ;
|
|
}
|