Determine whether the current char is the start of a Unicode (UTF-8) sequence. If it is, the value and the length (args 1 & 2) are set and true is returned. Otherwise false is returned; note that the function body below sets both the value and the length to 0 in that case, rather than the current char and a length of 1. Note that to be a UTF-8 sequence, the first byte must be one of: a) 110xxxxx for a 2-byte sequence, with the next byte being of the form 10xxxxxx; b) 1110xxxx for a 3-byte sequence, with the next 2 bytes being of the form 10xxxxxx; c) 11110xxx for a 4-byte sequence, with the next 3 bytes being of the form 10xxxxxx; d) 111110xx for a 5-byte sequence, with the next 4 bytes being of the form 10xxxxxx; e) 1111110x for a 6-byte sequence, with the next 5 bytes being of the form 10xxxxxx.
| Return Type | Function name | Arguments |
|---|---|---|
| bool | IsUnicodeSeq | (uint32_t&, uint32_t&, const char*) |
Declared and defined in file: hzTextproc.cpp
Function Logic:
Function body:
bool IsUnicodeSeq (uint32_t& nEnt, uint32_t& nLen, const char* zi)
{
    // Category: Text Presentation
    //
    // Determine if the current char is the start of a unicode (UTF-8) sequence. If it is, the decoded value and the length of the sequence
    // (args 1 & 2) are set and true is returned. Otherwise both the value and the length are set to 0 and false is returned.
    //
    // Note to be a UTF-8 sequence, the first byte must be either:-
    //  a) 110xxxxx for a 2-byte sequence with the next byte being of the form 10xxxxxx
    //  b) 1110xxxx for a 3-byte sequence with the next 2 bytes being of the form 10xxxxxx
    //  c) 11110xxx for a 4-byte sequence with the next 3 bytes being of the form 10xxxxxx
    //  d) 111110xx for a 5-byte sequence with the next 4 bytes being of the form 10xxxxxx
    //  e) 1111110x for a 6-byte sequence with the next 5 bytes being of the form 10xxxxxx
    //
    // Lead bytes 0xFE and 0xFF match none of these forms and are rejected. Overlong encodings are not rejected; the sequence is decoded
    // purely on its bit layout.
    //
    // Arguments: 1) nEnt The unicode value (0 if no sequence is found)
    //            2) nLen The length in bytes of the unicode sequence encountered (0 if no sequence is found)
    //            3) zi   The char string iterator. May be NULL, in which case false is returned.
    //
    // Returns: True  If the iterator is at the start of a well formed multi-byte sequence
    //          False Otherwise

    const unsigned char* xi = (const unsigned char*) zi ;  // Input viewed as raw bytes (avoids sign issues with plain char)
    uint32_t value ;    // Unicode value accumulated so far
    uint32_t nSeq ;     // Total sequence length implied by the lead byte
    uint32_t n ;        // Continuation byte index

    nEnt = 0 ;
    nLen = 0 ;

    if (!xi)
        return false ;

    // Classify the lead byte: it fixes the sequence length and supplies the most significant payload bits.
    if      (xi[0] < 192)   return false ;                          // ASCII or a stray continuation byte
    else if (xi[0] < 224)   { nSeq = 2 ; value = xi[0] & 0x1F ; }
    else if (xi[0] < 240)   { nSeq = 3 ; value = xi[0] & 0x0F ; }
    else if (xi[0] < 248)   { nSeq = 4 ; value = xi[0] & 0x07 ; }
    else if (xi[0] < 252)   { nSeq = 5 ; value = xi[0] & 0x03 ; }
    else if (xi[0] < 254)   { nSeq = 6 ; value = xi[0] & 0x01 ; }
    else                    return false ;                          // 0xFE and 0xFF are never valid lead bytes

    // Each continuation byte must be of the form 10xxxxxx (128 to 191). Bytes are validated one at a time, before the next is read, so
    // that a sequence truncated by the string's null terminator never causes a read beyond that terminator.
    for (n = 1 ; n < nSeq ; n++)
    {
        if ((xi[n] & 0xC0) != 0x80)
            return false ;
        value = (value << 6) | (uint32_t) (xi[n] & 0x3F) ;
    }

    nEnt = value ;
    nLen = nSeq ;
    return true ;
}