Determine if the current char is the start of a unicode (UTF-8) sequence. If it is, the value and the length (args 1 & 2) are set and true is returned. Otherwise the value is just the current char (either upper or lower ASCII), the length is 1 and false is returned (note: the implementation shown below actually sets both the value and the length to 0 in this case). Note that to be a UTF-8 sequence, the first byte must be one of: a) 110xxxxx for a 2-byte sequence, with the next byte being of the form 10xxxxxx; b) 1110xxxx for a 3-byte sequence, with the next 2 bytes being of the form 10xxxxxx; c) 11110xxx for a 4-byte sequence, with the next 3 bytes being of the form 10xxxxxx; d) 111110xx for a 5-byte sequence, with the next 4 bytes being of the form 10xxxxxx; e) 1111110x for a 6-byte sequence, with the next 5 bytes being of the form 10xxxxxx.

Return Type | Function name | Arguments
bool | IsUnicodeSeq | (uint32_t&, uint32_t&, const char*)

Declared and defined in file: hzTextproc.cpp

Function Logic:

0:START 1:nLen value ubuf 2:unknown 3:xi 4:unknown 5:items ubuf 6:unknown 7:ubuf 8:value ubuf value nLen 9:ubuf[0]<240 10:items ubuf items ubuf 11:unknown 12:ubuf 13:value ubuf value ubuf value nLen 14:ubuf[0]<248 15:items ubuf items ubuf items ubuf 16:unknown 17:ubuf 18:value ubuf value ubuf value ubuf value nLen 19:ubuf[0]<252 20:items ubuf items ubuf items ubuf items ubuf 21:unknown 22:ubuf 23:value ubuf value ubuf value ubuf value ubuf value nLen 24:items ubuf items ubuf items ubuf items ubuf items ubuf 25:unknown 26:ubuf 27:value ubuf value ubuf value ubuf value ubuf value ubuf value nLen 28:nEnt 29:Return nLen?true:false

Function body:

bool IsUnicodeSeq (uint32_t& nEnt, uint32_t& nLen, const char* zi)
{
    //  Category: Text Presentation
    //
    //  Determine if the byte at zi is the start of a multi-byte unicode (UTF-8) sequence. If it is, the decoded value and the length of
    //  the sequence in bytes are written to nEnt and nLen and true is returned. In every other case (plain ASCII, a stray continuation
    //  byte, an invalid lead byte, a truncated or malformed sequence, or a null zi) both nEnt and nLen are set to 0 and false is returned.
    //
    //  To be accepted as a UTF-8 sequence, the first byte must be either:-
    //   a) 110xxxxx for a 2-byte sequence with the next byte being of the form 10xxxxxx
    //   b) 1110xxxx for a 3-byte sequence with the next 2 bytes being of the form 10xxxxxx
    //   c) 11110xxx for a 4-byte sequence with the next 3 bytes being of the form 10xxxxxx
    //   d) 111110xx for a 5-byte sequence with the next 4 bytes being of the form 10xxxxxx
    //   e) 1111110x for a 6-byte sequence with the next 5 bytes being of the form 10xxxxxx
    //
    //  Only the bit patterns above are checked. Overlong encodings, surrogate values and values beyond U+10FFFF are not rejected.
    //
    //  Arguments: 1) nEnt The unicode value (set to 0 if no sequence is found)
    //     2) nLen The length in bytes of the unicode sequence encountered (set to 0 if no sequence is found)
    //     3) zi  The char string iterator. Must be null or point into a null terminated string.
    //
    //  Returns: True If zi is at the start of a well formed multi-byte sequence
    //     False Otherwise

    const unsigned char*    xi ;        //  Input viewed as unsigned bytes
    uint32_t                value ;     //  Unicode value under construction
    uint32_t                nTotal ;    //  Total sequence length implied by the lead byte
    uint32_t                n ;         //  Continuation byte index

    nEnt = 0 ;
    nLen = 0 ;

    if (!zi)
        return false ;

    xi = (const unsigned char*) zi ;

    //  Classify the lead byte: this gives the expected sequence length and the payload bits carried by the lead byte itself
    if (xi[0] < 0xC0)
        return false ;      //  ASCII (0xxxxxxx) or a stray continuation byte (10xxxxxx)
    else if (xi[0] < 0xE0)  { nTotal = 2 ; value = xi[0] & 0x1F ; }
    else if (xi[0] < 0xF0)  { nTotal = 3 ; value = xi[0] & 0x0F ; }
    else if (xi[0] < 0xF8)  { nTotal = 4 ; value = xi[0] & 0x07 ; }
    else if (xi[0] < 0xFC)  { nTotal = 5 ; value = xi[0] & 0x03 ; }
    else if (xi[0] < 0xFE)  { nTotal = 6 ; value = xi[0] & 0x01 ; }
    else
        return false ;      //  0xFE and 0xFF do not match any lead byte pattern

    //  Each continuation byte is validated BEFORE the next one is read. A null terminator is not of the form 10xxxxxx so a sequence
    //  truncated by the end of the string is rejected without reading beyond the terminator.
    for (n = 1 ; n < nTotal ; n++)
    {
        if ((xi[n] & 0xC0) != 0x80)
            return false ;

        value = (value << 6) | (uint32_t) (xi[n] & 0x3F) ;
    }

    nEnt = value ;
    nLen = nTotal ;
    return true ;
}