Files
Include/EgtStringDecoder.h
T
Dario Sassi 75dac0c620 Include :
- cambio nome di alcuni include di base
- aggiornamento prototipi.
2015-05-11 21:11:49 +00:00

130 lines
4.5 KiB
C++

//----------------------------------------------------------------------------
// EgalTech 2014-2014
//----------------------------------------------------------------------------
// File : EgtStringDecoder.h Data : 01.06.14 Versione : 1.5f1
// Contenuto : Insieme di funzioni per decodificare stringhe UTF-8.
//
//
//
// Modifiche : 01.06.14 DS Creazione modulo.
//
//
//----------------------------------------------------------------------------
#pragma once
#include "/EgtDev/Include/EgtStringBase.h"
#include "/EgtDev/Include/EgtNumCollection.h"
//-----------------------------------------------------------------------------
// Fixed-width integer aliases used by the decoder below.
// NOTE(review): these names duplicate the ones normally supplied by <stdint.h>;
// confirm that no translation unit sees a conflicting definition.
typedef unsigned char uint8_t ;
typedef unsigned int uint32_t ;
//-----------------------------------------------------------------------------
// Automaton states that callers of decode() test for.
// UTF8_ACCEPT : initial state, also reached each time a code point is complete.
const uint32_t UTF8_ACCEPT = 0 ;
// UTF8_REJECT : state reached when the byte just fed cannot continue the sequence.
const uint32_t UTF8_REJECT = 12 ;
//-----------------------------------------------------------------------------
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//-----------------------------------------------------------------------------
// Lookup table driving the UTF-8 decoding automaton used by decode().
// Entries 0..255 are indexed by the input byte; entries from 256 onward are
// indexed by (state + character class). States are stored as multiples of 12
// (the number of character classes), so the sum indexes a row directly.
static const uint8_t utf8d[] = {
// The first part of the table maps each byte to a character class, which
// reduces the size of the transition table and is also used to build the
// bitmask applied to the first byte of a sequence.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
} ;
//-----------------------------------------------------------------------------
// Feeds one byte to the UTF-8 decoding automaton.
//   state : in/out, current automaton state; must be UTF8_ACCEPT before the
//           first byte of a sequence.
//   codep : in/out, code point being assembled; its value is meaningful only
//           when the returned state is UTF8_ACCEPT.
//   byte  : next byte of the UTF-8 stream.
// Returns the new state, which is also stored in *state.
uint32_t inline
decode( uint32_t* state, uint32_t* codep, uint8_t byte)
{
  // character class of the incoming byte (first part of the table)
  const uint32_t nClass = utf8d[byte] ;

  if ( *state == UTF8_ACCEPT) {
    // start of a new sequence: the class selects how many high bits to drop
    *codep = ( 0xff >> nClass) & byte ;
  }
  else {
    // inside a sequence: append the low 6 bits to what was accumulated so far
    *codep = ( *codep << 6) | ( byte & 0x3fu) ;
  }

  // transition (second part of the table, located after the 256 class entries)
  const uint32_t nNext = utf8d[ 256 + *state + nClass] ;
  *state = nNext ;
  return nNext ;
}
//-----------------------------------------------------------------------------
// Counts the code points contained in a UTF-8 encoded string.
//   sString : byte string to scan, interpreted as UTF-8.
//   nCount  : out, number of code points that were decoded successfully;
//             malformed sequences do not contribute to the count.
// Returns true only if the whole string is well-formed UTF-8: false is
// returned when an invalid byte is met or when the string ends in the middle
// of a multi-byte sequence.
bool inline
CountCodePoints( const std::string& sString, int& nCount)
{
  // reset the character counter
  nCount = 0 ;
  // loop over the bytes of the UTF-8 string
  bool bOk = true ;
  uint32_t codepoint = 0 ;
  uint32_t state = UTF8_ACCEPT ;
  uint32_t stPrev = UTF8_ACCEPT ;
  for ( std::string::size_type i = 0 ; i < sString.length() ; ++ i) {
    switch ( decode( &state, &codepoint, static_cast<uint8_t>( sString[i]))) {
      case UTF8_ACCEPT :
        ++ nCount ;
        break ;
      case UTF8_REJECT :
        bOk = false ;
        state = UTF8_ACCEPT ;
        // If the byte broke a pending sequence it may itself start a valid
        // one: step back so that it is decoded again from the initial state.
        // (stPrev != UTF8_ACCEPT implies i >= 1, so the index cannot wrap.)
        if ( stPrev != UTF8_ACCEPT)
          -- i ;
        break ;
      default :
        // sequence still in progress
        break ;
    }
    stPrev = state ;
  }
  // a string that stops inside a multi-byte sequence is not valid UTF-8
  if ( state != UTF8_ACCEPT)
    bOk = false ;
  return bOk ;
}
//-----------------------------------------------------------------------------
// Decodes a UTF-8 encoded string into a sequence of code points.
//   sString : byte string to decode, interpreted as UTF-8.
//   vCode   : out, cleared and then filled with the decoded code points; each
//             malformed sequence (including one truncated by the end of the
//             string) is represented by U+FFFD.
// Returns true only if the whole string is well-formed UTF-8: false is
// returned when an invalid byte is met or when the string ends in the middle
// of a multi-byte sequence.
bool inline
GetCodePoints( const std::string& sString, UINTVECTOR& vCode)
{
  // empty the output vector; there cannot be more code points than bytes
  vCode.clear() ;
  vCode.reserve( sString.length()) ;
  // loop over the bytes of the UTF-8 string
  bool bOk = true ;
  uint32_t codepoint = 0 ;
  uint32_t state = UTF8_ACCEPT ;
  uint32_t stPrev = UTF8_ACCEPT ;
  for ( std::string::size_type i = 0 ; i < sString.length() ; ++ i) {
    switch ( decode( &state, &codepoint, static_cast<uint8_t>( sString[i]))) {
      case UTF8_ACCEPT :
        // A properly encoded character has been found.
        vCode.push_back( codepoint) ;
        break ;
      case UTF8_REJECT :
        // The byte is invalid, replace it and restart.
        vCode.push_back( 0xFFFD) ;
        bOk = false ;
        state = UTF8_ACCEPT ;
        // If the byte broke a pending sequence it may itself start a valid
        // one: step back so that it is decoded again from the initial state.
        // (stPrev != UTF8_ACCEPT implies i >= 1, so the index cannot wrap.)
        if ( stPrev != UTF8_ACCEPT)
          -- i ;
        break ;
      default :
        // sequence still in progress
        break ;
    }
    stPrev = state ;
  }
  // The string stops inside a multi-byte sequence: report it and emit the
  // replacement character, as is done for any other malformed sequence.
  if ( state != UTF8_ACCEPT) {
    vCode.push_back( 0xFFFD) ;
    bOk = false ;
  }
  return bOk ;
}