// // File: hzTextproc.cpp // // Legal Notice: This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com) // // The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free // Software Foundation, either version 3 of the License, or any later version. // // The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses. //
// // Purpose: General text processing functions. //
#include <fstream>
#include <iostream>

#include <ctype.h>
#include <stdarg.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#include "hzChars.h"
#include "hzTextproc.h"
#include "hzChain.h"
#include "hzDirectory.h"
#include "hzProcess.h"
using namespace std ;
/* ** Char type map */
//	Character classification table, indexed by byte value (0-255). Each entry is the bitwise OR of the CTYPE_* flags that apply to that byte. The
//	Is...() functions in this file test these flags after range checking their argument.
//
//	NOTE(review): the classifications given to 0x80-0x9F look like they follow the Windows-1252 code page (e.g. 0x80 is labelled as the euro sign)
//	and 0xA0-0xFF look like Latin-1 - confirm before relying on this.
//
//	NOTE(review): no entry in this table names CTYPE_HYPEN or CTYPE_HTX_TAG. Unless those masks alias one of the flags used here, IsHyphen() and
//	IsTagchar() can never return true - confirm against the CTYPE_* definitions.

int16_t	chartype[256]	=
{
	/* 0x00 NULL     */	CTYPE_BINARY,
	/* 0x01 CTRLA    */	CTYPE_BINARY,
	/* 0x02 CTRLB    */	CTYPE_BINARY,
	/* 0x03 CTRLC    */	CTYPE_BINARY,
	/* 0x04 CTRLD    */	CTYPE_BINARY,
	/* 0x05 CTRLE    */	CTYPE_BINARY,
	/* 0x06 CTRLF    */	CTYPE_BINARY,
	/* 0x07 CTRLG    */	CTYPE_BINARY,
	/* 0x08 CTRLH    */	CTYPE_BINARY,
	/* 0x09 TAB      */	CTYPE_WHITE,
	/* 0x0a NEWLINE  */	CTYPE_WHITE,
	/* 0x0b CTRLK    */	CTYPE_BINARY,
	/* 0x0c CTRLL    */	CTYPE_BINARY,
	/* 0x0d CTRLM    */	CTYPE_WHITE,		//	Carriage return
	/* 0x0e CTRLN    */	CTYPE_BINARY,
	/* 0x0f CTRLO    */	CTYPE_BINARY,
	/* 0x10 CTRLP    */	CTYPE_BINARY,
	/* 0x11 CTRLQ    */	CTYPE_BINARY,
	/* 0x12 CTRLR    */	CTYPE_BINARY,
	/* 0x13 CTRLS    */	CTYPE_BINARY,
	/* 0x14 CTRLT    */	CTYPE_BINARY,
	/* 0x15 CTRLU    */	CTYPE_BINARY,
	/* 0x16 CTRLV    */	CTYPE_BINARY,
	/* 0x17 CTRLW    */	CTYPE_BINARY,
	/* 0x18 CTRLX    */	CTYPE_BINARY,
	/* 0x19 CTRLY    */	CTYPE_BINARY,
	/* 0x1a CTRLZ    */	CTYPE_BINARY,
	/* 0x1b          */	CTYPE_BINARY,
	/* 0x1c          */	CTYPE_BINARY,
	/* 0x1d          */	CTYPE_BINARY,
	/* 0x1e          */	CTYPE_BINARY,
	/* 0x1f          */	CTYPE_BINARY,
	/* 0x20 SPACE    */	CTYPE_WHITE,
	/* 0x21 PLING    */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x22 QUOTE    */	CTYPE_PUNCT,
	/* 0x23 HASH     */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x24 DOLLAR   */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x25 PERCENT  */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x26 AMPSAND  */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x27 SQUOTE   */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x28 OPENPAR  */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x29 CLOSEPAR */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x2a ASTERISK */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x2b PLUS     */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x2c COMMA    */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x2d MINUS    */	CTYPE_PUNCT | CTYPE_URL_NORM,
	/* 0x2e PERIOD   */	CTYPE_PUNCT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x2f FWSLASH  */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x30 DIGIT0   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x31 DIGIT1   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x32 DIGIT2   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x33 DIGIT3   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x34 DIGIT4   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x35 DIGIT5   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x36 DIGIT6   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x37 DIGIT7   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x38 DIGIT8   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x39 DIGIT9   */	CTYPE_DIGIT | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,
	/* 0x3a COLON    */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x3b SCOLON   */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x3c LESS     */	CTYPE_SYMB,
	/* 0x3d EQUAL    */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x3e MORE     */	CTYPE_SYMB,
	/* 0x3f QUERY    */	CTYPE_PUNCT | CTYPE_URL_RESV,
	/* 0x40 AT       */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x41 UC_A     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x42 UC_B     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x43 UC_C     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x44 UC_D     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x45 UC_E     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x46 UC_F     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x47 UC_G     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x48 UC_H     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x49 UC_I     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4a UC_J     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4b UC_K     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4c UC_L     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4d UC_M     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4e UC_N     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x4f UC_O     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x50 UC_P     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x51 UC_Q     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x52 UC_R     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x53 UC_S     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x54 UC_T     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x55 UC_U     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x56 UC_V     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x57 UC_W     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x58 UC_X     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x59 UC_Y     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x5a UC_Z     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x5b SQOPEN   */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x5c BWSLASH  */	CTYPE_SYMB,
	/* 0x5d SQCLOSE  */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x5e HAT      */	CTYPE_SYMB,
	/* 0x5f USCORE   */	CTYPE_ALPHA | CTYPE_URL_NORM,	//	Underscore is classed as alpha
	/* 0x60 BKQUOTE  */	CTYPE_PUNCT,
	/* 0x61 LC_A     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x62 LC_B     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x63 LC_C     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x64 LC_D     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x65 LC_E     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_NUMCHAR | CTYPE_URL_NORM,	//	Only the lower case 'e' carries CTYPE_NUMCHAR
	/* 0x66 LC_F     */	CTYPE_ALPHA | CTYPE_HEXDIGIT | CTYPE_URL_NORM,
	/* 0x67 LC_G     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x68 LC_H     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x69 LC_I     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6a LC_J     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6b LC_K     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6c LC_L     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6d LC_M     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6e LC_N     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x6f LC_O     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x70 LC_P     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x71 LC_Q     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x72 LC_R     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x73 LC_S     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x74 LC_T     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x75 LC_U     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x76 LC_V     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x77 LC_W     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x78 LC_X     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x79 LC_Y     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x7a LC_Z     */	CTYPE_ALPHA | CTYPE_URL_NORM,
	/* 0x7b CUROPEN  */	CTYPE_SYMB,
	/* 0x7c OR       */	CTYPE_SYMB,
	/* 0x7d CURCLSE  */	CTYPE_SYMB,
	/* 0x7e TILDE    */	CTYPE_SYMB | CTYPE_URL_RESV,
	/* 0x7f BLOCK    */	CTYPE_BINARY,
	/* 0x80 euro sign */	CTYPE_SYMB,
	/* 0x81 */	CTYPE_BINARY,
	/* 0x82 */	CTYPE_PUNCT,
	/* 0x83 */	CTYPE_ALPHA,
	/* 0x84 */	CTYPE_PUNCT,
	/* 0x85 */	CTYPE_PUNCT,
	/* 0x86 */	CTYPE_PUNCT,
	/* 0x87 */	CTYPE_PUNCT,
	/* 0x88 */	CTYPE_PUNCT,
	/* 0x89 */	CTYPE_SYMB,
	/* 0x8A */	CTYPE_ALPHA,
	/* 0x8B */	CTYPE_PUNCT,
	/* 0x8C */	CTYPE_ALPHA,
	/* 0x8D */	CTYPE_BINARY,
	/* 0x8E */	CTYPE_ALPHA,
	/* 0x8F */	CTYPE_BINARY,
	/* 0x90 */	CTYPE_BINARY,
	/* 0x91 */	CTYPE_PUNCT,
	/* 0x92 */	CTYPE_PUNCT,
	/* 0x93 */	CTYPE_PUNCT,
	/* 0x94 */	CTYPE_PUNCT,
	/* 0x95 */	CTYPE_PUNCT,
	/* 0x96 */	CTYPE_PUNCT,
	/* 0x97 */	CTYPE_PUNCT,
	/* 0x98 */	CTYPE_SYMB,
	/* 0x99 */	CTYPE_SYMB,
	/* 0x9A */	CTYPE_SYMB,
	/* 0x9B */	CTYPE_SYMB,
	/* 0x9C */	CTYPE_ALPHA,
	/* 0x9D */	CTYPE_BINARY,
	/* 0x9E */	CTYPE_ALPHA,
	/* 0x9F */	CTYPE_ALPHA,
	/* 0xA0 */	CTYPE_WHITE,
	/* 0xA1 */	CTYPE_SYMB,
	/* 0xA2 */	CTYPE_SYMB,
	/* 0xA3 */	CTYPE_SYMB,
	/* 0xA4 */	CTYPE_SYMB,
	/* 0xA5 */	CTYPE_SYMB,
	/* 0xA6 */	CTYPE_SYMB,
	/* 0xA7 */	CTYPE_SYMB,
	/* 0xA8 */	CTYPE_SYMB,
	/* 0xA9 */	CTYPE_SYMB,
	/* 0xAA */	CTYPE_SYMB,
	/* 0xAB */	CTYPE_SYMB,
	/* 0xAC */	CTYPE_SYMB,
	/* 0xAD */	CTYPE_SYMB,
	/* 0xAE */	CTYPE_SYMB,
	/* 0xAF */	CTYPE_SYMB,
	/* 0xB0 */	CTYPE_SYMB,
	/* 0xB1 */	CTYPE_SYMB,
	/* 0xB2 */	CTYPE_SYMB,
	/* 0xB3 */	CTYPE_SYMB,
	/* 0xB4 */	CTYPE_SYMB,
	/* 0xB5 */	CTYPE_SYMB,
	/* 0xB6 */	CTYPE_SYMB,
	/* 0xB7 */	CTYPE_SYMB,
	/* 0xB8 */	CTYPE_SYMB,
	/* 0xB9 */	CTYPE_SYMB,
	/* 0xBA */	CTYPE_SYMB,
	/* 0xBB */	CTYPE_SYMB,
	/* 0xBC */	CTYPE_SYMB,
	/* 0xBD */	CTYPE_SYMB,
	/* 0xBE */	CTYPE_SYMB,
	/* 0xBF */	CTYPE_SYMB,
	/* 0xC0 */	CTYPE_ALPHA,
	/* 0xC1 */	CTYPE_ALPHA,
	/* 0xC2 */	CTYPE_ALPHA,
	/* 0xC3 */	CTYPE_ALPHA,
	/* 0xC4 */	CTYPE_ALPHA,
	/* 0xC5 */	CTYPE_ALPHA,
	/* 0xC6 */	CTYPE_ALPHA,
	/* 0xC7 */	CTYPE_ALPHA,
	/* 0xC8 */	CTYPE_ALPHA,
	/* 0xC9 */	CTYPE_ALPHA,
	/* 0xCA */	CTYPE_ALPHA,
	/* 0xCB */	CTYPE_ALPHA,
	/* 0xCC */	CTYPE_ALPHA,
	/* 0xCD */	CTYPE_ALPHA,
	/* 0xCE */	CTYPE_ALPHA,
	/* 0xCF */	CTYPE_ALPHA,
	/* 0xD0 */	CTYPE_ALPHA,
	/* 0xD1 */	CTYPE_ALPHA,
	/* 0xD2 */	CTYPE_ALPHA,
	/* 0xD3 */	CTYPE_ALPHA,
	/* 0xD4 */	CTYPE_ALPHA,
	/* 0xD5 */	CTYPE_ALPHA,
	/* 0xD6 */	CTYPE_ALPHA,
	/* 0xD7 */	CTYPE_SYMB,
	/* 0xD8 */	CTYPE_ALPHA,
	/* 0xD9 */	CTYPE_ALPHA,
	/* 0xDA */	CTYPE_ALPHA,
	/* 0xDB */	CTYPE_ALPHA,
	/* 0xDC */	CTYPE_ALPHA,
	/* 0xDD */	CTYPE_ALPHA,
	/* 0xDE */	CTYPE_ALPHA,
	/* 0xDF */	CTYPE_ALPHA,
	/* 0xE0 */	CTYPE_ALPHA,
	/* 0xE1 */	CTYPE_ALPHA,
	/* 0xE2 */	CTYPE_ALPHA,
	/* 0xE3 */	CTYPE_ALPHA,
	/* 0xE4 */	CTYPE_ALPHA,
	/* 0xE5 */	CTYPE_ALPHA,
	/* 0xE6 */	CTYPE_ALPHA,
	/* 0xE7 */	CTYPE_ALPHA,
	/* 0xE8 */	CTYPE_ALPHA,
	/* 0xE9 */	CTYPE_ALPHA,
	/* 0xEA */	CTYPE_ALPHA,
	/* 0xEB */	CTYPE_ALPHA,
	/* 0xEC */	CTYPE_ALPHA,
	/* 0xED */	CTYPE_ALPHA,
	/* 0xEE */	CTYPE_ALPHA,
	/* 0xEF */	CTYPE_ALPHA,
	/* 0xF0 */	CTYPE_ALPHA,
	/* 0xF1 */	CTYPE_ALPHA,
	/* 0xF2 */	CTYPE_ALPHA,
	/* 0xF3 */	CTYPE_ALPHA,
	/* 0xF4 */	CTYPE_ALPHA,
	/* 0xF5 */	CTYPE_ALPHA,
	/* 0xF6 */	CTYPE_ALPHA,
	/* 0xF7 */	CTYPE_SYMB,
	/* 0xF8 */	CTYPE_ALPHA,
	/* 0xF9 */	CTYPE_ALPHA,
	/* 0xFA */	CTYPE_ALPHA,
	/* 0xFB */	CTYPE_ALPHA,
	/* 0xFC */	CTYPE_ALPHA,
	/* 0xFD */	CTYPE_ALPHA,
	/* 0xFE */	CTYPE_ALPHA,
	/* 0xFF */	CTYPE_ALPHA
} ;
/* ** Character type functions */
//	FnSet:		Character-Type
//	Category:	Text Processing
//
//	The following two sets of functions take either an int32_t or a char as the input character and return true if the character is of the implied
//	type. The set starting with 'I' takes int32_t. The set starting with 'i' takes char.
//
//	Func:	IsBinary(int32_t)
//	Func:	IsWhite(int32_t)
//	Func:	IsDigit(int32_t)
//	Func:	IsHex(int32_t)
//	Func:	IsAlpha(int32_t)
//	Func:	IsHyphen(int32_t)
//	Func:	IsAlphanum(int32_t)
//	Func:	IsPunct(int32_t)
//	Func:	IsSymb(int32_t)
//	Func:	IsNumchar(int32_t)
//	Func:	IsUrlnorm(int32_t)
//	Func:	IsUrlresv(int32_t)
//	Func:	IsTagchar(int32_t)
//	Func:	IsDomainChar(int32_t)
bool IsBinary (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_BINARY ? true : false ; } bool IsWhite (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_WHITE ? true : false ; } bool IsDigit (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_DIGIT ? true : false ; } bool IsHex (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_HEXDIGIT ? true : false ; } bool IsAlpha (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_ALPHA ? true : false ; } bool IsHyphen (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_HYPEN ? true : false ; } bool IsAlphanum (int32_t c) { return c >= 0 && c < 256 && chartype[c] & (CTYPE_ALPHA | CTYPE_DIGIT) ? true : false ; } bool IsPunct (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_PUNCT ? true : false ; } bool IsSymb (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_SYMB ? true : false ; } bool IsNumchar (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_NUMCHAR ? true : false ; } bool IsUrlnorm (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_URL_NORM ? true : false ; } bool IsUrlresv (int32_t c) { return c >= 0 && c < 256 && chartype[c] & (CTYPE_URL_NORM | CTYPE_URL_RESV) ? true : false ; } bool IsTagchar (int32_t c) { return c >= 0 && c < 256 && chartype[c] & CTYPE_HTX_TAG ? true : false ; }
/* ** Functions */
uint32_t	CstrIncidence	(const char* str, char c)
{
	//	Category:	Text Processing
	//
	//	Tally how many times a given character appears in a null terminated string.
	//
	//	Arguments:	1)	str		The string to scan (may be NULL)
	//				2)	c		The character to look for
	//
	//	Returns:	0+	The number of occurrences of c in str. This is 0 if str is NULL or empty, or if c is the null character.

	uint32_t	nFound = 0 ;	//	Occurrences seen so far

	if (str && c)
	{
		while (*str)
		{
			if (*str++ == c)
				nFound++ ;
		}
	}

	return nFound ;
}
void	SplitChain	(hzVect<hzString>& ar, hzChain& input, char cDelim)
{
	//	Category:	Text Processing
	//
	//	Splits an input chain into a series of strings on the basis of a single delimiting char. The delimiter itself is never part of any
	//	resulting string. Text after the last delimiter is added only if it is non-empty, so a trailing delimiter does not produce a trailing
	//	empty string.
	//
	//	Arguments:	1)	ar		The vector of strings that will be populated by this operation (note it is first cleared)
	//				2)	input	The input chain
	//				3)	cDelim	The delimiter (a value of 0 selects comma)
	//
	//	Returns:	None.

	hzChain		C ;		//	Temporary chain (accumulates the current field)
	chIter		i ;		//	Iterator for input
	hzString	S ;		//	Set from the temp chain upon each delimiter

	ar.Clear() ;

	if (!input.Size())
		return ;

	if (!cDelim)
		cDelim = CHAR_COMMA ;

	for (i = input ; !i.eof() ; i++)
	{
		if (*i == cDelim)
		{
			//	Field complete. The delimiter must not be carried into the next field, hence the continue.
			S = C ;
			ar.Add(S) ;
			C.Clear() ;
			continue ;
		}

		C.AddByte(*i) ;
	}

	//	Remaining text after the last delimiter
	if (C.Size())
	{
		S = C ;
		ar.Add(S) ;
	}
}
void	SplitStrOnChar	(hzArray<hzString>& ar, hzString& input, char cDelim)
{
	//	Category:	Text Processing
	//
	//	Splits an input string into a series of strings on the basis of a single delimiting char. Empty fields (two adjacent delimiters, or a
	//	leading or trailing delimiter) are skipped rather than added as empty strings.
	//
	//	Arguments:	1)	ar		The array of strings that will be populated by this operation (note it is first cleared)
	//				2)	input	The input string
	//				3)	cDelim	The delimiter (a value of 0 selects comma)
	//
	//	Returns:	None

	const char*	i ;				//	Input as a null terminated string
	hzString	S ;				//	Substring of input
	uint32_t	nRef = 0 ;		//	Substring start position (within input)
	uint32_t	nPos = 0 ;		//	Current position (within input)

	ar.Clear() ;

	if (!input)
		return ;

	//	Apply the documented default. Previously this assigned 0, which left the delimiter unset so the input was never split.
	if (!cDelim)
		cDelim = CHAR_COMMA ;

	i = *input ;

	for (nPos = 0 ;; nPos++)
	{
		if (i[nPos] != 0 && i[nPos] != cDelim)
			continue ;

		//	At a delimiter or the end of input: emit the field between nRef and here, if it has content
		S = input.SubString(nRef, nPos - nRef) ;
		if (S)
			ar.Add(S) ;

		if (i[nPos] == 0)
			break ;
		nRef = nPos + 1 ;
	}
}
hzEcode	SplitCstrOnChar	(hzVect<hzString>& ar, const char* input, char cDelim)
{
	//	Category:	Text Processing
	//
	//	Break a null terminated string into fields wherever a given delimiter character occurs. Every field is added to the vector, including
	//	empty ones (so N delimiters always yield N+1 entries).
	//
	//	Arguments:	1)	ar		The vector of hzString populated by this operation (cleared first)
	//				2)	input	The null terminated string to be split
	//				3)	cDelim	The delimiter (a value of 0 selects comma)
	//
	//	Returns:	E_ARGUMENT	If the input is NULL or empty
	//				E_OK		If the input was processed

	const char*	pField ;	//	Start of the field currently being scanned
	const char*	p ;			//	Scan position
	hzString	S ;			//	Value acceptor

	ar.Clear() ;

	if (!input || !input[0])
		return E_ARGUMENT ;

	if (!cDelim)
		cDelim = CHAR_COMMA ;

	pField = input ;

	for (p = input ;; p++)
	{
		if (*p && *p != cDelim)
			continue ;

		//	Field boundary reached. S stays blank for a zero length field.
		if (p > pField)
			S.SetValue(pField, (uint32_t) (p - pField)) ;
		ar.Add(S) ;
		S.Clear() ;

		if (!*p)
			break ;
		pField = p + 1 ;
	}

	return E_OK ;
}
hzEcode	SplitCstrOnCstr	(hzArray<hzString>& ar, const char* input, const char* delim)
{
	//	Category:	Text Processing
	//
	//	Split a null terminated string using another null terminated string as delimiter. Every field is added to the array, including empty
	//	ones, so N occurrences of the delimiter always yield N+1 entries.
	//
	//	Arguments:	1)	ar		The array of hzString that is populated by this operation (cleared first)
	//				2)	input	The input null terminated string to be split
	//				3)	delim	The delimiter string
	//
	//	Returns:	E_ARGUMENT	If either the input or delimiter is not supplied
	//				E_OK		If the input was processed

	const char*	pField ;	//	Start of the field currently being scanned
	const char*	p ;			//	Scan position
	hzString	S ;			//	Value acceptor
	uint32_t	nLen ;		//	Delimiter length

	ar.Clear() ;

	if (!input || !input[0])	return E_ARGUMENT ;
	if (!delim || !delim[0])	return E_ARGUMENT ;

	nLen = strlen(delim) ;
	pField = p = input ;

	while (*p)
	{
		//	A delimiter is present only if all nLen characters match at this position. strncmp stops at the terminator so cannot overrun.
		//	The previous test, CstrCompare(i, delim), was true on a mismatch and compared against the whole remainder of the input.
		if (*p == delim[0] && strncmp(p, delim, nLen) == 0)
		{
			if (p > pField)
				S.SetValue(pField, (uint32_t) (p - pField)) ;
			ar.Add(S) ;
			S.Clear() ;

			p += nLen ;
			pField = p ;
			continue ;
		}

		p++ ;
	}

	//	Final field (blank if the input ended with a delimiter)
	if (p > pField)
		S.SetValue(pField, (uint32_t) (p - pField)) ;
	ar.Add(S) ;

	return E_OK ;
}
hzEcode SplitCSV (hzArray<hzString>& ar, const char* line, char cDelim) { // Category: Text Processing // // Purpose: Splits a line from a .csv file into it's fields. Copes with quoted values automatically, removes double-quote pairs // if present and de-escapes sequences. // // Arguments: 1) ar Either an array of char* or an array of hzString // 2) line The line to be split (char*) // 3) cDelim Delimitor char // // Returns: E_ARGUMENT If either the target array or the line is empty // E_FORMAT If the number of fields does not match expected number // E_OK If successful
hzChain C ; // Working chain const char* i ; // Input iterator hzString S ; // Data field
ar.Clear() ; i = line ; if (!i || !i[0]) return E_ARGUMENT ;
if (!cDelim || cDelim == CHAR_DQUOTE) cDelim = CHAR_COMMA ;
for (;;) { if (*i == CHAR_DQUOTE) { for (i++ ; *i ; i++) { if (*i == CHAR_BKSLASH) { if (i[1] == 'r') { i++ ; C.AddByte(CHAR_CR) ; continue ; } if (i[1] == 'n') { i++ ; C.AddByte(CHAR_NL) ; continue ; } if (i[1] == 't') { i++ ; C.AddByte(CHAR_TAB) ; continue ; } if (i[1] == '"') { i++ ; C.AddByte(CHAR_DQUOTE) ; continue ; } }
if (i[0] == CHAR_DQUOTE && i[1] == CHAR_DQUOTE) { i++ ; C.AddByte(CHAR_DQUOTE) ; continue ; }
if (*i == CHAR_DQUOTE) break ; C.AddByte(*i) ; }
if (*i == CHAR_DQUOTE) i++ ; else break ; }
if (*i == 0 || *i == cDelim) { S = C ; ar.Add(S) ; C.Clear() ; S.Clear() ;
if (*i == 0) break ; i++ ; continue ; }
// Just add char to chain if (*i == CHAR_BKSLASH) { if (i[1] == 'r') { i++ ; C.AddByte(CHAR_CR) ; continue ; } if (i[1] == 'n') { i++ ; C.AddByte(CHAR_NL) ; continue ; } if (i[1] == 't') { i++ ; C.AddByte(CHAR_TAB) ; continue ; } if (i[1] == '"') { i++ ; C.AddByte(CHAR_DQUOTE) ; continue ; } }
C.AddByte(*i) ; i++ ; }
return E_OK ; }
hzEcode	SplitCSV	(char** ar, char* line, uint32_t arSize, char cDelim)
{
	//	Category:	Text Processing
	//
	//	Purpose:	Splits a line from a .csv file into its fields, in place. Each delimiter (and each closing double quote of a quoted value)
	//				in the supplied line is overwritten with a null terminator and the array is filled with pointers to the start of each
	//				field. The enclosing double quotes of a quoted value are excluded from the field. Note this function does not de-escape
	//				sequences and does not treat a doubled double-quote specially.
	//
	//	Arguments:	1)	ar		Array of char* to receive the field pointers
	//				2)	line	The line to be split (modified by this operation)
	//				3)	arSize	Number of entries available in ar. Parsing stops once this many fields have been stored.
	//				4)	cDelim	Delimiter char (0 or a double quote selects comma)
	//
	//	Returns:	E_ARGUMENT	If either the target array or the line is not supplied or the array size is 0
	//				E_FORMAT	If a quoted value is not closed, or is followed by anything other than the delimiter or the end of the line
	//				E_OK		If successful
	//
	//	Note:		Entries of ar beyond the last field found are left untouched and the number of fields found is not reported.

	char*		i ;				//	Input iterator
	uint32_t	nPos = 0 ;		//	Position within array

	if (!ar || !arSize)
		return E_ARGUMENT ;

	if (!line || !line[0])
		return E_ARGUMENT ;

	if (!cDelim || cDelim == CHAR_DQUOTE)
		cDelim = CHAR_COMMA ;

	i = line ;

	while (*i && nPos < arSize)
	{
		if (*i == CHAR_DQUOTE)
		{
			//	Quoted entry: the field starts after the opening quote
			ar[nPos++] = ++i ;

			for (; *i && *i != CHAR_DQUOTE ; i++) ;

			if (!*i)
				return E_FORMAT ;	//	No closing quote before the end of the line

			*i++ = 0 ;				//	Closing quote becomes the field terminator

			if (!*i)
				break ;				//	End of line. Do not step past the terminator.
			if (*i != cDelim)
				return E_FORMAT ;
			i++ ;
			continue ;
		}

		//	Unquoted entry: runs up to the next delimiter or the end of the line
		ar[nPos++] = i ;

		for (; *i && *i != cDelim ; i++) ;

		if (!*i)
			break ;

		//	Terminate the field at the delimiter and move on to the first character of the next field
		*i++ = 0 ;
	}

	return E_OK ;
}
hzEcode	DosifyChain	(hzChain& Z)
{
	//	Category:	Text Processing
	//
	//	'Dosify' a chain (convert instances of newline into carriage return newline). A newline is only given a carriage return if the byte
	//	immediately before it is not already one, so a chain cannot be dosified more than once by mistake. All other bytes, including a
	//	carriage return that is not followed by a newline, are copied unchanged.
	//
	//	Arguments:	1)	Z	The input chain that will be dosified by this operation
	//
	//	Returns:	E_NODATA	If the supplied chain is empty
	//				E_OK		If the operation was successful

	hzChain	F ;						//	Resulting chain
	chIter	zi ;					//	Input chain iterator
	bool	bPrevCR = false ;		//	True if the previous byte copied was a carriage return

	if (!Z.Size())
		return E_NODATA ;

	//	Remember the previous byte rather than looking ahead. Looking ahead meant advancing and dereferencing the iterator without an
	//	end-of-chain check whenever the chain ended in a carriage return.
	for (zi = Z ; !zi.eof() ; zi++)
	{
		if (*zi == CHAR_NL && !bPrevCR)
			F.AddByte(CHAR_CR) ;

		F.AddByte(*zi) ;
		bPrevCR = (*zi == CHAR_CR) ;
	}

	Z.Clear() ;
	Z = F ;

	return E_OK ;
}
hzEcode	DosifyFile	(const hzString& tgt, const hzString& src)
{
	//	Category:	Text Processing
	//
	//	'Dosify' a file by converting all instances of newline (with or without the preceding carriage return) into the carriage return newline
	//	sequence. Blank lines are preserved, lines may be of any length, and a final line that lacks a newline is given a terminating carriage
	//	return newline.
	//
	//	If the target and the source are the same file, the output is written to a temporary file (the target name with ".x" appended) which
	//	replaces the original only if the whole conversion succeeded. On failure the temporary file is removed and the original is untouched.
	//
	//	Arguments:	1)	tgt		The pathname of the target file
	//				2)	src		The pathname of the source file
	//
	//	Returns:	E_ARGUMENT	If either the source or the target file has not been supplied
	//				E_READFAIL	If the source file cannot be read
	//				E_WRITEFAIL	If the target file cannot be created, written to or renamed into place
	//				E_OK		If the operation was successful
	//
	//	In addition, any error returned by OpenInputStrm() when opening the source is passed back unchanged.

	_hzfunc(__func__) ;

	ifstream	is ;				//	Input stream
	ofstream	os ;				//	Output stream
	hzString	target ;			//	Intermediate filename
	char		ch ;				//	Current input byte
	char		last = 0 ;			//	Previous input byte
	bool		bAny = false ;		//	True once at least one byte has been read
	bool		bSame ;				//	Target/Source match indicator
	hzEcode		rc = E_OK ;			//	Return code

	if (!src)	{ hzerr(E_ARGUMENT, "No source file specified") ; return E_ARGUMENT ; }
	if (!tgt)	{ hzerr(E_ARGUMENT, "No target file specified") ; return E_ARGUMENT ; }

	//	If source and target file are the same, write to a temporary file alongside
	if (tgt == src)
		{ bSame = true ; target = tgt + ".x" ; }
	else
		{ bSame = false ; target = tgt ; }

	//	Seek to open source file
	rc = OpenInputStrm(is, src) ;
	if (rc != E_OK)
		return rc ;

	//	Open target for writing
	os.open(*target) ;
	if (os.fail())
		return E_WRITEFAIL ;

	//	Copy byte by byte (the streams do the buffering), inserting a carriage return ahead of any newline that lacks one
	while (is.get(ch))
	{
		if (ch == CHAR_NL && last != CHAR_CR)
			os.put(CHAR_CR) ;
		os.put(ch) ;

		last = ch ;
		bAny = true ;

		if (os.fail())
			{ rc = E_WRITEFAIL ; break ; }
	}

	//	Reaching the end of file sets failbit on the input, only badbit indicates a real read error
	if (rc == E_OK && is.bad())
		rc = E_READFAIL ;

	//	Terminate a final line that had no newline
	if (rc == E_OK && bAny && last != CHAR_NL)
	{
		if (last != CHAR_CR)
			os.put(CHAR_CR) ;
		os.put(CHAR_NL) ;
	}

	is.close() ;
	os.close() ;
	if (rc == E_OK && os.fail())
		rc = E_WRITEFAIL ;

	if (bSame)
	{
		if (rc == E_OK)
		{
			//	rename() replaces the original in a single step, so the original is not unlinked first
			if (rename(*target, *tgt) != 0)
			{
				unlink(*target) ;
				rc = E_WRITEFAIL ;
			}
		}
		else
		{
			//	Conversion failed: discard the partial output and keep the original intact
			unlink(*target) ;
		}
	}

	return rc ;
}
uint32_t	CstrCopy	(char* cpDest, const char* cpSource, uint32_t nMaxlen)
{
	//	Category:	Text Processing
	//
	//	Copy a null terminated string into a destination buffer and null terminate the result. NULL pointers are tolerated. If the source is
	//	NULL or empty nothing at all is written to the destination.
	//
	//	Arguments:	1)	cpDest		Destination buffer
	//				2)	cpSource	Source string
	//				3)	nMaxlen		Maximum number of characters to copy, or 0 to copy the whole string. Note the terminator is written in
	//								addition to the characters copied.
	//
	//	Returns:	0	If either the destination or the source is not supplied, or the source is empty
	//				0+	The number of characters copied (terminator not counted)

	_hzfunc("CstrCopy_a") ;

	uint32_t	nDone = 0 ;		//	Characters copied so far

	if (!cpDest || !cpSource || !*cpSource)
		return 0 ;

	while (cpSource[nDone] && (!nMaxlen || nDone < nMaxlen))
	{
		cpDest[nDone] = cpSource[nDone] ;
		nDone++ ;
	}
	cpDest[nDone] = 0 ;

	return nDone ;
}
uint32_t	CstrOverwrite	(char* cpDest, const char* cpSource, uint32_t nMaxlen)
{
	//	Category:	Text Processing
	//
	//	Write the characters of the source string over the start of the destination string. No null terminator is written, so whatever
	//	followed the overwritten part of the destination remains in place.
	//
	//	Arguments:	1)	cpDest		Destination string
	//				2)	cpSource	Source string
	//				3)	nMaxlen		Maximum number of characters to write, or 0 to write the whole source string
	//
	//	Returns:	0	If either the destination or the source is not supplied, or the source is empty
	//				0+	The number of characters overwritten

	_hzfunc("CstrOverwrite_a") ;

	uint32_t	nDone = 0 ;		//	Characters written so far

	if (!cpDest || !cpSource || !*cpSource)
		return 0 ;

	while (cpSource[nDone] && (!nMaxlen || nDone < nMaxlen))
	{
		cpDest[nDone] = cpSource[nDone] ;
		nDone++ ;
	}

	return nDone ;
}
int32_t	CstrCompareI	(const char* pA, const char* pB, uint32_t nMaxlen)
{
	//	Category:	Text Processing
	//
	//	Purpose:	Compare two strings on a case insensitive basis. This does not crash when given null arguments: a NULL string is treated
	//				as an empty string, and an empty string is less than any non-empty string.
	//
	//	Arguments:	1)	pA		First string
	//				2)	pB		Second string
	//				3)	nMaxlen	Max length to compare (default 0 for no maximum)
	//
	//	Returns:	+1	If pA is lexically greater than pB within the specified length
	//				-1	If pA is lexically less than pB within the specified length
	//				0	If pA and pB are equivalent within the specified length

	_hzfunc("CstrCompareI") ;

	uint32_t	nCount ;	//	Characters compared so far

	if (!pA || !pA[0])
	{
		if (!pB || !pB[0])
			return 0 ;
		return -1 ;
	}

	if (!pB || !pB[0])
		return 1 ;

	//	The loop always stops at the first terminator, so a limit larger than the strings can no longer cause a read beyond their end
	for (nCount = 0 ; *pA && *pB ; nCount++, pA++, pB++)
	{
		if (nMaxlen && nCount == nMaxlen)
			return 0 ;

		if (conv2lower(*pA) != conv2lower(*pB))
			break ;
	}

	//	Limit reached exactly as one of the strings ended
	if (nMaxlen && nCount == nMaxlen)
		return 0 ;

	if (conv2lower(*pA) > conv2lower(*pB))	return 1 ;
	if (conv2lower(*pA) < conv2lower(*pB))	return -1 ;
	return 0 ;
}
int32_t	CstrCompare	(const char* pA, const char* pB, uint32_t nMaxlen)
{
	//	Category:	Text Processing
	//
	//	Purpose:	Compare two strings (cstr) on a case sensitive basis. This does not crash when given null arguments: a NULL string is
	//				treated as an empty string, and an empty string is less than any non-empty string.
	//
	//	Arguments:	1)	pA		First string
	//				2)	pB		Second string
	//				3)	nMaxlen	Max length to compare (default 0 for no maximum)
	//
	//	Returns:	>0	If pA is lexically greater than pB within the specified length
	//				<0	If pA is lexically less than pB within the specified length
	//				0	If pA and pB are equal within the specified length
	//
	//	Note:		When both strings are non-empty the non-zero result is the difference between the first pair of differing characters,
	//				not necessarily +1 or -1. Callers should test the sign only.

	_hzfunc("CstrCompare") ;

	uint32_t	nCount ;	//	Byte counter

	if (!pA || !pA[0])
	{
		if (!pB || !pB[0])
			return 0 ;
		return -1 ;
	}

	if (!pB || !pB[0])
		return 1 ;

	if (!nMaxlen)
	{
		for (; *pA && *pB && *pA == *pB ; pA++, pB++) ;
		return *pA - *pB ;
	}

	for (nCount = 0 ; nCount < nMaxlen && *pA && *pB && *pA == *pB ; nCount++, pA++, pB++) ;

	//	All nMaxlen characters matched. The characters now pointed to lie beyond the limit and must not influence the result.
	if (nCount == nMaxlen)
		return 0 ;

	return *pA - *pB ;
}
bool	CstrContains	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Determine if the haystack string contains the needle string anywhere within it (case sensitive).
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	True	If the needle string occurs in the haystack string
	//				False	Otherwise, including when either string is NULL or empty

	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return false ;

	//	strstr matches the needle at any position. The previous unbounded compare only succeeded when the needle was at the very end.
	return strstr(cpHaystack, cpNeedle) != 0 ;
}
bool	CstrContainsI	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Determine if the supplied sub-string is contained within the supplied string on a case-insensitive basis
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	True	If the needle string occurs in the haystack string
	//				False	Otherwise, including when either string is NULL or empty

	const char*	i ;			//	Input string iterator
	uint32_t	len ;		//	Needle length
	char		lower ;		//	Lower case of first char of needle
	char		upper ;		//	Upper case of first char of needle

	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return false ;

	//	tolower/toupper are only defined for values representable as unsigned char (or EOF), so a char above 0x7f must not be passed
	//	directly on platforms where char is signed.
	lower = (char) tolower((unsigned char) cpNeedle[0]) ;
	upper = (char) toupper((unsigned char) cpNeedle[0]) ;
	len = strlen(cpNeedle) ;

	for (i = cpHaystack ; *i ; i++)
	{
		//	Cheap first character test before the full comparison
		if (*i != lower && *i != upper)
			continue ;

		if (!CstrCompareI(i, cpNeedle, len))
			return true ;
	}

	return false ;
}
int32_t	CstrFirst	(const char* cpStr, char testChar)
{
	//	Category:	Text Processing
	//
	//	Locate the first occurrence of a character within a null terminated string. Tolerates a NULL or empty string.
	//
	//	Arguments:	1)	cpStr		The string to search
	//				2)	testChar	The character to search for
	//
	//	Returns:	-1	If the character does not appear in the string, the string is NULL or empty, or the character is the null character
	//				0+	The offset of the first occurrence of the character within the string

	const char*	p ;		//	Scan position

	if (!cpStr || !testChar)
		return -1 ;

	for (p = cpStr ; *p ; p++)
	{
		if (*p == testChar)
			return (int32_t) (p - cpStr) ;
	}

	return -1 ;
}
int32_t	CstrFirstI	(const char* cpStr, char testChar)
{
	//	Category:	Text Processing
	//
	//	This returns the position of the first instance of the test char in the string. The comparison is case insensitive.
	//
	//	Arguments:	1)	cpStr		The string to test
	//				2)	testChar	The char to test for
	//
	//	Returns:	-1	If the test char does not appear in the string, the string is NULL or empty, or the test char is the null character
	//				0+	Being the position of the first occurrence of the test char in the string

	int32_t	nPosn ;		//	Input string position
	int		nWant ;		//	Lower case form of the test char

	if (!cpStr || !cpStr[0] || !testChar)
		return -1 ;

	//	tolower is used in place of _tolower, as POSIX only defines the latter for arguments that are upper case letters. The cast keeps
	//	the argument within the range tolower accepts where char is signed.
	nWant = tolower((unsigned char) testChar) ;

	for (nPosn = 0 ; cpStr[nPosn] ; nPosn++)
	{
		if (tolower((unsigned char) cpStr[nPosn]) == nWant)
			return nPosn ;
	}

	return -1 ;
}
int32_t	CstrFirst	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Find the first position at which the needle string occurs within the haystack string, on a case-sensitive basis
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	0+	The position of the first occurrence of the needle within the haystack
	//				-1	If the needle does not occur, or either string is NULL or empty

	const char*	pFound ;	//	Location of the needle within the haystack

	//	This must be -1 rather than 0 (false), as 0 is the legitimate result for a needle found at the start of the haystack
	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return -1 ;

	//	strstr matches the needle at any position. The previous unbounded compare only succeeded when the needle was at the very end.
	pFound = strstr(cpHaystack, cpNeedle) ;
	if (!pFound)
		return -1 ;

	return (int32_t) (pFound - cpHaystack) ;
}
int32_t	CstrFirstI	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Find the first position at which the needle string occurs within the haystack string, on a case-insensitive basis
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	0+	The position of the first occurrence of the needle within the haystack
	//				-1	If the needle does not occur, or either string is NULL or empty

	uint32_t	n ;			//	String iterator (position)
	uint32_t	nLen ;		//	Needle length
	char		lower ;		//	Lower case of first char of needle
	char		upper ;		//	Upper case of first char of needle

	//	This must be -1 rather than 0 (false), as 0 is the legitimate result for a needle found at the start of the haystack
	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return -1 ;

	//	The casts keep the argument within the range tolower/toupper accept where char is signed
	lower = (char) tolower((unsigned char) cpNeedle[0]) ;
	upper = (char) toupper((unsigned char) cpNeedle[0]) ;
	nLen = strlen(cpNeedle) ;

	for (n = 0 ; cpHaystack[n] ; n++)
	{
		//	Cheap first character test before the full comparison
		if (cpHaystack[n] != lower && cpHaystack[n] != upper)
			continue ;

		//	Limit the comparison to the needle length. Without the limit the whole remainder of the haystack had to equal the needle,
		//	so only a needle at the very end was ever found.
		if (!CstrCompareI(cpHaystack + n, cpNeedle, nLen))
			return n ;
	}

	return -1 ;
}
int32_t	CstrLast	(const char* cpStr, char testChar)
{
	//	Category:	Text Processing
	//
	//	Locate the last occurrence of a character within a null terminated string. Tolerates a NULL or empty string.
	//
	//	Arguments:	1)	cpStr		The string to search
	//				2)	testChar	The character to search for
	//
	//	Returns:	-1	If the character does not appear in the string, the string is NULL or empty, or the character is the null character
	//				0+	The offset of the last occurrence of the character within the string

	int32_t	nPosn ;		//	Position within the string

	if (!cpStr || !testChar)
		return -1 ;

	//	Find the end, then walk back to the nearest match
	for (nPosn = 0 ; cpStr[nPosn] ; nPosn++) ;

	for (nPosn-- ; nPosn >= 0 ; nPosn--)
	{
		if (cpStr[nPosn] == testChar)
			return nPosn ;
	}

	return -1 ;
}
int32_t	CstrLastI	(const char* cpStr, char testChar)
{
	//	Category:	Text Processing
	//
	//	This returns the position of the last instance of the test char in the string. The comparison is case insensitive.
	//
	//	Arguments:	1)	cpStr		The source string
	//				2)	testChar	The char to test for
	//
	//	Returns:	-1	If the test char does not appear in the string, the string is NULL or empty, or the test char is the null character
	//				0+	Being the position of the last occurrence of the test char in the string

	int32_t	nPosn ;			//	Input string position
	int32_t	nLast = -1 ;	//	Last position found
	int		nWant ;			//	Lower case form of the test char

	if (!cpStr || !cpStr[0] || !testChar)
		return -1 ;

	//	Fold both sides to lower case. The comparison used to be a plain ==, which made this function case sensitive. The cast keeps the
	//	argument within the range tolower accepts where char is signed.
	nWant = tolower((unsigned char) testChar) ;

	for (nPosn = 0 ; cpStr[nPosn] ; nPosn++)
	{
		if (tolower((unsigned char) cpStr[nPosn]) == nWant)
			nLast = nPosn ;
	}

	return nLast ;
}
int32_t	CstrLast	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Find the last position at which the needle string occurs within the haystack string, on a case-sensitive basis. Overlapping
	//	occurrences are considered.
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	0+	The position of the last occurrence of the needle within the haystack
	//				-1	If the needle does not occur, or either string is NULL or empty

	const char*	pFound ;		//	Location of the current occurrence
	int32_t		posn = -1 ;		//	Last position needle found

	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return -1 ;

	//	Step through every occurrence and keep the last. strstr matches at any position, whereas the previous unbounded compare only
	//	succeeded when the needle was at the very end of the haystack.
	for (pFound = strstr(cpHaystack, cpNeedle) ; pFound ; pFound = strstr(pFound + 1, cpNeedle))
		posn = (int32_t) (pFound - cpHaystack) ;

	return posn ;
}
int32_t	CstrLastI	(const char* cpHaystack, const char* cpNeedle)
{
	//	Category:	Text Processing
	//
	//	Find the last position at which the needle string occurs within the haystack string, on a case-insensitive basis
	//
	//	Arguments:	1)	cpHaystack	The source string
	//				2)	cpNeedle	The string being sought
	//
	//	Returns:	0+	The position of the last occurrence of the needle within the haystack
	//				-1	If the needle does not occur, or either string is NULL or empty

	int32_t		n ;				//	String iterator (position)
	int32_t		posn = -1 ;		//	Last position needle found
	uint32_t	nLen ;			//	Needle length
	char		lower ;			//	Lower case of first char of needle
	char		upper ;			//	Upper case of first char of needle

	if (!cpHaystack || !cpHaystack[0] || !cpNeedle || !cpNeedle[0])
		return -1 ;

	//	The casts keep the argument within the range tolower/toupper accept where char is signed
	lower = (char) tolower((unsigned char) cpNeedle[0]) ;
	upper = (char) toupper((unsigned char) cpNeedle[0]) ;
	nLen = strlen(cpNeedle) ;

	for (n = 0 ; cpHaystack[n] ; n++)
	{
		//	Cheap first character test before the full comparison
		if (cpHaystack[n] != lower && cpHaystack[n] != upper)
			continue ;

		//	Limit the comparison to the needle length. Without the limit the whole remainder of the haystack had to equal the needle,
		//	so only a needle at the very end was ever found.
		if (!CstrCompareI(cpHaystack + n, cpNeedle, nLen))
			posn = n ;
	}

	return posn ;
}
//	FnGrp:		FormalNumber
//	Category:	Text Presentation
//
//	Present a number formally, with commas every three digits.
//
//	Variations:	1)	For 64 bit numbers (nMaxlen is capped at 27)
//				2)	For 32 bit numbers (nMaxlen is capped at 16)
//
//	Arguments:	1)	nValue	The number to print
//				2)	nMaxlen	The width to which the output is padded with spaces (0 for no padding, capped as above)
//
//	Returns:	Pointer to the null terminated text representation of the supplied value, held in a scratch-pad buffer
//
//	Func:	FormalNumber(int64_t,uint32_t)
//	Func:	FormalNumber(uint64_t,uint32_t)
//	Func:	FormalNumber(int32_t,uint32_t)
//	Func:	FormalNumber(uint32_t,uint32_t)
const char*	FormalNumber	(int64_t nValue, uint32_t nMaxlen)
{
	//	Present a signed 64 bit number with commas every three digits, optionally right-justified with spaces in a fixed width.
	//
	//	Arguments:	1)	nValue	The number to print
	//				2)	nMaxlen	Total output width including any minus sign (0 for no padding, capped at 27). A width too small for the
	//							number adds no padding; the number is never truncated.
	//
	//	Returns:	Pointer to the null terminated text, written to the scratch pad obtained for this call. A minus sign, if any, is the first
	//				character and precedes the padding.

	_hzfunc("FormalNumber_a") ;

	char		num[32] ;	//	Digits and commas before padding (at most 25 chars for this range)
	char*		i ;			//	Working buffer pointer
	char*		pBuf ;		//	Working buffer
	uint64_t	mag ;		//	Magnitude of the value
	uint32_t	a ;			//	Number of billion billions
	uint32_t	b ;			//	Number of million billions
	uint32_t	c ;			//	Number of million millions
	uint32_t	B ;			//	Number of billions
	uint32_t	M ;			//	Number of millions
	uint32_t	T ;			//	Number of thousands
	uint32_t	U ;			//	Number of units
	uint32_t	nWidth ;	//	Field width left for the digits
	bool		bNeg ;		//	Value is negative

	i = pBuf = _thisfn.ScratchPad(32) ;

	//	Take the magnitude in unsigned arithmetic so the most negative value does not overflow on negation
	bNeg = nValue < 0 ;
	mag = bNeg ? (uint64_t) 0 - (uint64_t) nValue : (uint64_t) nValue ;

	if (bNeg)
		*i++ = CHAR_MINUS ;

	a = (uint32_t) (mag / 1000000000000000000ULL) ;
	b = (uint32_t) ((mag % 1000000000000000000ULL) / 1000000000000000ULL) ;
	c = (uint32_t) ((mag % 1000000000000000ULL) / 1000000000000ULL) ;
	B = (uint32_t) ((mag % 1000000000000ULL) / 1000000000ULL) ;
	M = (uint32_t) ((mag % 1000000000ULL) / 1000000ULL) ;
	T = (uint32_t) ((mag % 1000000ULL) / 1000ULL) ;
	U = (uint32_t) (mag % 1000ULL) ;

	if		(a)	sprintf(num, "%u,%03u,%03u,%03u,%03u,%03u,%03u", a, b, c, B, M, T, U) ;
	else if (b)	sprintf(num, "%u,%03u,%03u,%03u,%03u,%03u", b, c, B, M, T, U) ;
	else if (c)	sprintf(num, "%u,%03u,%03u,%03u,%03u", c, B, M, T, U) ;
	else if (B)	sprintf(num, "%u,%03u,%03u,%03u", B, M, T, U) ;
	else if (M)	sprintf(num, "%u,%03u,%03u", M, T, U) ;
	else if (T)	sprintf(num, "%u,%03u", T, U) ;
	else		sprintf(num, "%u", U) ;

	//	The minus sign takes one column of the requested width
	nWidth = nMaxlen > 27 ? 27 : nMaxlen ;
	if (bNeg && nWidth)
		nWidth-- ;

	//	%*s right-justifies with spaces and never truncates, so a width smaller than the number cannot wrap an unsigned pad count.
	//	Worst case is 27 chars plus the terminator, within the 32 requested above.
	sprintf(i, "%*s", (int) nWidth, num) ;

	return pBuf ;
}
const char*	FormalNumber	(uint64_t nValue, uint32_t nMaxlen)
{
	//	Present an unsigned 64 bit number with commas every three digits, optionally right-justified with spaces in a fixed width.
	//
	//	Arguments:	1)	nValue	The number to print
	//				2)	nMaxlen	Total output width (0 for no padding, capped at 27). A width too small for the number adds no padding; the
	//							number is never truncated.
	//
	//	Returns:	Pointer to the null terminated text, written to the scratch pad obtained for this call

	_hzfunc("FormalNumber(u64)") ;

	char		num[32] ;	//	Digits and commas before padding (at most 26 chars for this range)
	char*		pBuf ;		//	Working buffer
	uint32_t	a ;			//	Number of billion billions
	uint32_t	b ;			//	Number of million billions
	uint32_t	c ;			//	Number of million millions
	uint32_t	B ;			//	Number of billions
	uint32_t	M ;			//	Number of millions
	uint32_t	T ;			//	Number of thousands
	uint32_t	U ;			//	Number of units
	uint32_t	nWidth ;	//	Field width

	pBuf = _thisfn.ScratchPad(32) ;

	//	The value is unsigned so there is no sign to emit
	a = (uint32_t) (nValue / 1000000000000000000ULL) ;
	b = (uint32_t) ((nValue % 1000000000000000000ULL) / 1000000000000000ULL) ;
	c = (uint32_t) ((nValue % 1000000000000000ULL) / 1000000000000ULL) ;
	B = (uint32_t) ((nValue % 1000000000000ULL) / 1000000000ULL) ;
	M = (uint32_t) ((nValue % 1000000000ULL) / 1000000ULL) ;
	T = (uint32_t) ((nValue % 1000000ULL) / 1000ULL) ;
	U = (uint32_t) (nValue % 1000ULL) ;

	if		(a)	sprintf(num, "%u,%03u,%03u,%03u,%03u,%03u,%03u", a, b, c, B, M, T, U) ;
	else if (b)	sprintf(num, "%u,%03u,%03u,%03u,%03u,%03u", b, c, B, M, T, U) ;
	else if (c)	sprintf(num, "%u,%03u,%03u,%03u,%03u", c, B, M, T, U) ;
	else if (B)	sprintf(num, "%u,%03u,%03u,%03u", B, M, T, U) ;
	else if (M)	sprintf(num, "%u,%03u,%03u", M, T, U) ;
	else if (T)	sprintf(num, "%u,%03u", T, U) ;
	else		sprintf(num, "%u", U) ;

	nWidth = nMaxlen > 27 ? 27 : nMaxlen ;

	//	%*s right-justifies with spaces and never truncates, so a width smaller than the number cannot wrap an unsigned pad count.
	//	Worst case is 27 chars plus the terminator, within the 32 requested above.
	sprintf(pBuf, "%*s", (int) nWidth, num) ;

	return pBuf ;
}
const char*	FormalNumber	(int32_t nValue, uint32_t nMaxlen)
{
	//	Present a signed 32 bit number with commas every three digits, optionally right-justified with spaces in a fixed width.
	//
	//	Arguments:	1)	nValue	The number to print
	//				2)	nMaxlen	Total output width including any minus sign (0 for no padding, capped at 16). A width too small for the
	//							number adds no padding; the number is never truncated.
	//
	//	Returns:	Pointer to the null terminated text, written to the scratch pad obtained for this call. A minus sign, if any, is the first
	//				character and precedes the padding.

	_hzfunc("FormalNumber(i32)") ;

	char		num[16] ;	//	Digits and commas before padding (at most 13 chars for this range)
	char*		i ;			//	Working buffer pointer
	char*		pBuf ;		//	Working buffer
	uint32_t	mag ;		//	Magnitude of the value
	uint32_t	B ;			//	Number of billions
	uint32_t	M ;			//	Number of millions
	uint32_t	T ;			//	Number of thousands
	uint32_t	U ;			//	Number of units
	uint32_t	nWidth ;	//	Field width left for the digits
	bool		bNeg ;		//	Value is negative

	//	A full 16 column field plus the terminator is 17 bytes, so ask for more than 16.
	//	NOTE(review): assumes the ScratchPad argument is the usable byte count - confirm.
	i = pBuf = _thisfn.ScratchPad(32) ;

	//	Take the magnitude in unsigned arithmetic so the most negative value does not overflow on negation
	bNeg = nValue < 0 ;
	mag = bNeg ? 0u - (uint32_t) nValue : (uint32_t) nValue ;

	if (bNeg)
		*i++ = CHAR_MINUS ;

	B = mag / 1000000000 ;
	M = (mag % 1000000000) / 1000000 ;
	T = (mag % 1000000) / 1000 ;
	U = mag % 1000 ;

	if		(B)	sprintf(num, "%u,%03u,%03u,%03u", B, M, T, U) ;
	else if (M)	sprintf(num, "%u,%03u,%03u", M, T, U) ;
	else if (T)	sprintf(num, "%u,%03u", T, U) ;
	else		sprintf(num, "%u", U) ;

	//	The minus sign takes one column of the requested width
	nWidth = nMaxlen > 16 ? 16 : nMaxlen ;
	if (bNeg && nWidth)
		nWidth-- ;

	//	%*s right-justifies with spaces and never truncates, so a width smaller than the number cannot wrap an unsigned pad count
	sprintf(i, "%*s", (int) nWidth, num) ;

	return pBuf ;
}
const char*	FormalNumber	(uint32_t nValue, uint32_t nMaxlen)
{
	//	Present an unsigned 32 bit number with commas every three digits, optionally right-justified with spaces in a fixed width.
	//
	//	Arguments:	1)	nValue	The number to print
	//				2)	nMaxlen	Total output width (0 for no padding, capped at 16). A width too small for the number adds no padding; the
	//							number is never truncated.
	//
	//	Returns:	Pointer to the null terminated text, written to the scratch pad obtained for this call

	_hzfunc("FormalNumber(u32)") ;

	char		num[16] ;	//	Digits and commas before padding (at most 13 chars for this range)
	char*		pBuf ;		//	Working buffer
	uint32_t	B ;			//	Number of billions
	uint32_t	M ;			//	Number of millions
	uint32_t	T ;			//	Number of thousands
	uint32_t	U ;			//	Number of units
	uint32_t	nWidth ;	//	Field width

	//	A full 16 column field plus the terminator is 17 bytes, so ask for more than 16.
	//	NOTE(review): assumes the ScratchPad argument is the usable byte count - confirm.
	pBuf = _thisfn.ScratchPad(32) ;

	//	The value is unsigned so there is no sign to emit
	B = nValue / 1000000000 ;
	M = (nValue % 1000000000) / 1000000 ;
	T = (nValue % 1000000) / 1000 ;
	U = nValue % 1000 ;

	if		(B)	sprintf(num, "%u,%03u,%03u,%03u", B, M, T, U) ;
	else if (M)	sprintf(num, "%u,%03u,%03u", M, T, U) ;
	else if (T)	sprintf(num, "%u,%03u", T, U) ;
	else		sprintf(num, "%u", U) ;

	nWidth = nMaxlen > 16 ? 16 : nMaxlen ;

	//	%*s right-justifies with spaces and never truncates, so a width smaller than the number cannot wrap an unsigned pad count
	sprintf(pBuf, "%*s", (int) nWidth, num) ;

	return pBuf ;
}
const char*	FormalMoney	(int32_t nValue)
{
	//	Category:	Text Presentation
	//
	//	Present a sum of money formally: a minus sign or a space, then the amount right-justified in 13 columns with commas every three digits,
	//	a period and two digits for cents. The output is always 14 characters wide.
	//
	//	Argument:	nValue	The sum to print, expressed in cents
	//
	//	Returns:	Pointer to the null terminated text, written to the scratch pad obtained for this call

	_hzfunc("FormalMoney") ;

	char		num[16] ;	//	The amount before padding (at most 13 chars)
	char*		pBuf ;		//	Working buffer
	uint32_t	mag ;		//	Magnitude of the value
	uint32_t	M ;			//	Number of millions (value/100,000,000 cents)
	uint32_t	T ;			//	Number of thousands (value%100,000,000/100,000)
	uint32_t	U ;			//	Number of units
	uint32_t	C ;			//	Number of cents

	pBuf = _thisfn.ScratchPad(16) ;

	//	First column is always the sign position
	pBuf[0] = nValue < 0 ? CHAR_MINUS : CHAR_SPACE ;

	//	Take the magnitude in unsigned arithmetic so the most negative value does not overflow on negation
	mag = nValue < 0 ? 0u - (uint32_t) nValue : (uint32_t) nValue ;

	M = mag / 100000000 ;
	T = (mag % 100000000) / 100000 ;
	U = (mag % 100000) / 100 ;
	C = mag % 100 ;

	if		(M)	sprintf(num, "%u,%03u,%03u.%02u", M, T, U, C) ;
	else if (T)	sprintf(num, "%u,%03u.%02u", T, U, C) ;
	else		sprintf(num, "%u.%02u", U, C) ;

	//	Right-justify in the 13 columns after the sign: 14 chars plus the terminator fit the 16 requested above
	sprintf(pBuf + 1, "%13s", num) ;

	return pBuf ;
}
/* ** Section X: Number to text conversion */
void	_speakdigit	(hzChain& Z, int32_t num)
{
	//	Category:	Text Presentation
	//
	//	Converts a digit 0-9 to its text equivalent and appends this to the supplied chain. Any other value appends nothing.
	//
	//	Arguments:	1)	Z	The chain to add to
	//				2)	num	The digit to convert to a word
	//
	//	Returns:	None

	//	Each case must break: without it every word after the matching one would be appended as well
	switch	(num)
	{
	case 0:	Z << "zero" ;	break ;
	case 1:	Z << "one" ;	break ;
	case 2:	Z << "two" ;	break ;
	case 3:	Z << "three" ;	break ;
	case 4:	Z << "four" ;	break ;
	case 5:	Z << "five" ;	break ;
	case 6:	Z << "six" ;	break ;
	case 7:	Z << "seven" ;	break ;
	case 8:	Z << "eight" ;	break ;
	case 9:	Z << "nine" ;	break ;
	}
}
void	_speakno	(hzChain& Z, int32_t num)
{
	//	Category:	Text Presentation
	//
	//	Converts the supplied number (arg 2) to its textual equivalent and appends this to the supplied chain, e.g. 121 gives "one hundred and
	//	twenty-one". A value of zero appends nothing.
	//
	//	Arguments:	1)	Z	The chain to add to
	//				2)	num	The number to convert to words. SpeakNumber passes three-digit groups (0-999); for larger values the hundreds
	//						count is handed to _speakdigit, which only names 0-9.
	//
	//	Returns:	None

	int32_t	h ;		//	Hundreds
	int32_t	t ;		//	Tens

	if (num > 99)
	{
		h = num / 100 ;
		num -= (h * 100) ;

		_speakdigit(Z, h) ;
		if (!num)
		{
			Z << " hundred" ;
			return ;
		}
		Z << " hundred and " ;
	}

	if (num > 19)
	{
		t = num / 10 ;
		num -= (t * 10) ;

		switch	(t)
		{
		case 2:	Z << "twenty" ;		break ;
		case 3:	Z << "thirty" ;		break ;
		case 4:	Z << "forty" ;		break ;
		case 5:	Z << "fifty" ;		break ;
		case 6:	Z << "sixty" ;		break ;
		case 7:	Z << "seventy" ;	break ;
		case 8:	Z << "eighty" ;		break ;
		case 9:	Z << "ninety" ;		break ;
		}

		if (!num)
			return ;

		//	Separate the tens word from the units word that follows
		Z << "-" ;
	}

	if (num > 9)
	{
		switch	(num)
		{
		case 10:	Z << "ten" ;		break ;
		case 11:	Z << "eleven" ;		break ;
		case 12:	Z << "twelve" ;		break ;
		case 13:	Z << "thirteen" ;	break ;
		case 14:	Z << "fourteen" ;	break ;
		case 15:	Z << "fifteen" ;	break ;
		case 16:	Z << "sixteen" ;	break ;
		case 17:	Z << "seventeen" ;	break ;
		case 18:	Z << "eighteen" ;	break ;
		case 19:	Z << "nineteen" ;	break ;
		}
		return ;
	}

	if (num)
		_speakdigit(Z, num) ;
}
void	SpeakNumber	(hzChain& Z, int32_t nValue)
{
	//	Category:	Text Presentation
	//
	//	Converts the supplied int32_t number (arg 2) to its textual equivalent, as the number would be spoken, and appends this to the supplied
	//	chain. Negative values are prefixed with "Minus " and zero is rendered as "zero".
	//
	//	Arguments:	1)	Z		Output chain appended to by this operation
	//				2)	nValue	The numeric value
	//
	//	Returns:	None

	_hzfunc("SpeakNumber") ;

	uint32_t	mag ;	//	Magnitude of the value
	int32_t		B ;		//	Billions
	int32_t		M ;		//	Millions
	int32_t		T ;		//	Thousands
	int32_t		U ;		//	Units (0-999)

	if (nValue == 0)
	{
		Z << "zero" ;
		return ;
	}

	//	Take the magnitude in unsigned arithmetic so the most negative value does not overflow on negation
	if (nValue < 0)
	{
		Z << "Minus " ;
		mag = 0u - (uint32_t) nValue ;
	}
	else
		mag = (uint32_t) nValue ;

	B = (int32_t) (mag / 1000000000) ;
	M = (int32_t) ((mag % 1000000000) / 1000000) ;
	T = (int32_t) ((mag % 1000000) / 1000) ;
	U = (int32_t) (mag % 1000) ;

	//	After each group, the joining text is only added if a lower group is still to be spoken
	if (B)
	{
		_speakno(Z, B) ;
		if (!(M || T || U))
			{ Z << " billion" ; return ; }
		Z << " billion, " ;
	}

	if (M)
	{
		_speakno(Z, M) ;
		if (!(T || U))
			{ Z << " million" ; return ; }
		Z << " million, " ;
	}

	if (T)
	{
		_speakno(Z, T) ;
		if (!U)
			{ Z << " thousand" ; return ; }
		Z << " thousand and " ;
	}

	if (U)
		_speakno(Z, U) ;
}
hzString	SpeakNumber	(int32_t num)
{
	//	Category:	Text Presentation
	//
	//	Produce the spoken-word form of the supplied int32_t number as a string. The words are assembled in a chain by the chain variant of
	//	SpeakNumber and the chain is then assigned to the string that is returned.
	//
	//	Arguments:	1)	num		The numeric value
	//
	//	Returns:	Instance of hzString by value being the text equivalent of the supplied number

	hzChain		words ;		//	Receives the words
	hzString	spoken ;	//	The words as a string

	SpeakNumber(words, num) ;
	spoken = words ;

	return spoken ;
}
bool	ReadHex	(int32_t& nVal, const char* s)
{
	//	Category:	Text Processing
	//
	//	Assumes the supplied character string pointer is the start of a hexadecimal number and reads the value. Up to 8 chars are read, stopping
	//	early at the end of the string. If every char read matches [0-9], [a-f] or [A-F], true is returned and arg 1 is set to the hex value.
	//	Anything beyond the 8th char is not examined. The input string is not modified.
	//
	//	Arguments:	1)	nVal	The int32_t reference to be populated (set to 0 on failure)
	//				2)	s		The char string assumed to be at the start of a hex number
	//
	//	Returns:	True	If the chars read amounted to a hex value
	//				False	If the string is NULL or empty, or a non-hex char was met

	_hzfunc("ReadHex") ;

	uint32_t	v = 0 ;		//	Value being read (unsigned so that 8 digits cannot overflow)
	uint32_t	d ;			//	Value of current digit
	uint32_t	n ;			//	Number of chars processed
	char		c ;			//	Current char

	nVal = 0 ;
	if (!s || !s[0])
		return false ;

	for (n = 0 ; s[n] && n < 8 ; n++)
	{
		c = s[n] ;

		if		(c >= '0' && c <= '9')	d = c - '0' ;
		else if (c >= 'a' && c <= 'f')	d = 10 + (c - 'a') ;
		else if (c >= 'A' && c <= 'F')	d = 10 + (c - 'A') ;
		else
			return false ;

		v = (v << 4) | d ;
	}

	nVal = (int32_t) v ;
	return true ;
}
uint32_t	StripCRNL	(char* cpLine)
{
	//	Category:	Text Processing
	//
	//	Remove a line ending from the tail of the supplied line, in place. A trailing carriage-return/newline pair is removed as a unit,
	//	otherwise a single trailing carriage return or newline is removed. Characters elsewhere in the line are left alone.
	//
	//	Arguments:	1)	cpLine	Input line (char*), modified in place
	//
	//	Returns:	Value being length of the remaining string (0 if the line is NULL or empty).

	uint32_t	nLen ;		//	Length of the line as supplied

	if (!cpLine || !cpLine[0])
		return 0 ;

	//	Find the end of the line
	for (nLen = 0 ; cpLine[nLen] ; nLen++) ;

	//	CR+NL pair at the tail: blank both
	if (nLen >= 2 && cpLine[nLen - 2] == '\r' && cpLine[nLen - 1] == CHAR_NL)
	{
		cpLine[nLen - 2] = cpLine[nLen - 1] = 0 ;
		return nLen - 2 ;
	}

	//	Lone CR or NL at the tail
	if (cpLine[nLen - 1] == '\r' || cpLine[nLen - 1] == CHAR_NL)
	{
		cpLine[nLen - 1] = 0 ;
		return nLen - 1 ;
	}

	return nLen ;
}
hzString	EnEscape	(const hzString& x)
{
	//	Category:	Text Processing
	//
	//	Replaces real values with their escape sequences where appropriate. Eg "Hello" on one line followed by "World" on another will become
	//	"Hello\nWorld". Carriage return, newline and tab become \r, \n and \t. The other control values 1 to 26 become a hat followed by the
	//	corresponding letter (value 1 gives ^A). All other bytes, including those of 0x80 and above, are copied unchanged.
	//
	//	Arguments:	1)	x	Input string
	//
	//	Returns:	Instance of hzString by value being the escaped translation of the supplied string (empty if the input is empty)

	_hzfunc(__func__) ;

	hzChain		ult ;		//	Working chain
	const char*	i ;			//	Input string iterator
	hzString	result ;	//	Output string

	if (!x.Length())
		return result ;

	for (i = *x ; *i ; i++)
	{
		if (*i == CHAR_CR)	{ ult.AddByte(CHAR_BKSLASH) ; ult.AddByte('r') ; continue ; }
		if (*i == CHAR_NL)	{ ult.AddByte(CHAR_BKSLASH) ; ult.AddByte('n') ; continue ; }
		if (*i == CHAR_TAB)	{ ult.AddByte(CHAR_BKSLASH) ; ult.AddByte('t') ; continue ; }

		//	The lower bound matters: where char is signed, bytes of 0x80 and above are negative and would otherwise be taken for control chars
		if (*i > 0 && *i < 27)
		{
			ult.AddByte(CHAR_HAT) ;
			ult.AddByte((*i + 'A') - 1) ;
			continue ;
		}

		ult.AddByte(*i) ;
	}

	result = ult ;
	return result ;
}
hzString	DeEscape	(const hzString& x)
{
	//	Category:	Text Processing
	//
	//	Replaces escape sequences with the real values. Eg "Hello\nWorld" will have the '\n' replaced by the ASCII value for a newline. Handles
	//	\r, \n, \t, \v, \f, \" and \\, plus a hat followed by an upper case letter (^A gives value 1 and so on). A backslash or hat that does not
	//	start one of these sequences is copied unchanged.
	//
	//	Arguments:	1)	x	Input string
	//
	//	Returns:	Instance of hzString by value being the unescaped translation of the supplied string (empty if the input is empty)

	_hzfunc(__func__) ;

	char*		buf ;		//	Working buffer
	char*		j ;			//	Working buffer iterator
	const char*	i ;			//	Input string iterator
	hzString	result ;	//	Output string
	char		esc ;		//	Real value of the current escape sequence, 0 if none

	if (!x.Length())
		return result ;

	//	The output can never be longer than the input
	j = buf = new char[x.Length() + 1] ;

	for (i = *x ; *i ; i++)
	{
		if (*i == CHAR_BKSLASH)
		{
			esc = 0 ;

			if		(i[1] == 'r')			esc = CHAR_CR ;
			else if (i[1] == 'n')			esc = CHAR_NL ;
			else if (i[1] == 't')			esc = CHAR_TAB ;
			else if (i[1] == 'v')			esc = '\v' ;
			else if (i[1] == 'f')			esc = CHAR_CTRLL ;
			else if (i[1] == '"')			esc = CHAR_DQUOTE ;
			else if (i[1] == CHAR_BKSLASH)	esc = CHAR_BKSLASH ;

			if (esc)
			{
				//	Emit the real value and step over the second char of the sequence
				*j++ = esc ;
				i++ ;
				continue ;
			}
		}

		if (*i == CHAR_HAT && i[1] >= 'A' && i[1] <= 'Z')
		{
			*j++ = (i[1] - 'A') + 1 ;
			i++ ;
			continue ;
		}

		*j++ = *i ;
	}
	*j = 0 ;

	result = buf ;
	delete [] buf ;		//	Array form, to match the new[] above
	return result ;
}
/* ** Tokenization functions */
bool	IsEntity	(uint32_t& uVal, uint32_t& nLen, const char* tok)
{
	//	Category:	Text Processing
	//
	//	Purpose:	Determine if a token starts with an entity. Recognised are the named entities &amp; &gt; &lt; and &nbsp; (reported as the
	//				values 38, 62, 60 and 32), decimal entities of the form &#d..d; and hexadecimal entities of the form &#xh..h; where at
	//				least one digit must be present.
	//
	//	Arguments:	1)	uVal	A reference to the result, set to the value of the entity (0 if not an entity)
	//				2)	nLen	The length of the entity including the ampersand and the semicolon (0 if not an entity)
	//				3)	tok		The token (const char*)
	//
	//	Returns:	True	If the token starts with an entity
	//				False	If the token is NULL or does not start with an entity

	uint32_t	v = 0 ;			//	Value
	uint32_t	c = 0 ;			//	Position in token
	uint32_t	nDigits = 0 ;	//	Number of digits read
	const char*	i ;				//	Pointer into token

	uVal = 0 ;
	nLen = 0 ;

	//	First char must be an ampersand
	if (!tok || tok[0] != '&')
		return false ;

	//	Second char could be start of a named entity. strncmp stops at the token's terminator so a short token is never read beyond its end.
	i = tok + 1 ;
	if (strncmp(i, "amp;", 4) == 0)		{ uVal = 38 ; nLen = 5 ; return true ; }
	if (strncmp(i, "gt;", 3) == 0)		{ uVal = 62 ; nLen = 4 ; return true ; }
	if (strncmp(i, "lt;", 3) == 0)		{ uVal = 60 ; nLen = 4 ; return true ; }
	if (strncmp(i, "nbsp;", 5) == 0)	{ uVal = 32 ; nLen = 6 ; return true ; }

	//	Now we only consider numeric entities and for these the second char must be a hash
	if (tok[1] != '#')
		return false ;

	if (tok[2] == 'x')
	{
		//	Suspect a hexadecimal entity
		for (c = 3, i = tok + 3 ;; c++, i++, nDigits++)
		{
			if		(*i >= '0' && *i <= '9')	v = (v * 16) + (*i - '0') ;
			else if (*i >= 'A' && *i <= 'F')	v = (v * 16) + 10 + (*i - 'A') ;
			else if (*i >= 'a' && *i <= 'f')	v = (v * 16) + 10 + (*i - 'a') ;
			else
				break ;
		}
	}
	else
	{
		//	Suspect a decimal entity
		for (c = 2, i = tok + 2 ; *i >= '0' && *i <= '9' ; c++, i++, nDigits++)
			v = (v * 10) + (*i - '0') ;
	}

	//	Entity confirmed only if there were digits and they are closed by a semicolon
	if (nDigits && *i == ';')
	{
		uVal = v ;
		nLen = c + 1 ;
		return true ;
	}

	return false ;
}
bool AtEntity (uint32_t& uVal, uint32_t& entLen, hzChain::Iter& zi) { // Category: Text Processing // // Determine if the supplied chain iterator is at the start of a numeric entity of the form &#x[hex number]; or &#[dec number]; // // Arguments: 1) uVal The value of the entity. // 2) entLen The length of the entity // 3) zi The chain iterator. // // Returns: True If the chain iterator is at the start of a valid entity // False Otherwise // // Note: This function always leaves the supplied iterator unchanged in accordance with the HadronZoo text-processing rules.
_hzfunc(__func__) ;
chIter xi ; // Input chain iterator uint32_t nLen ; // Length of entity
uVal = entLen = 0 ; nLen = 0 ;
// First char must be an ampersand xi = zi ; if (*xi != CHAR_AMPSAND) return false ;
// Second char could be start of a non-numeric entity xi++ ; if (*xi == 'a' && xi == "amp;") { uVal = 38 ; return 5 ; } if (*xi == 'g' && xi == "gt;") { uVal = 62 ; return 4 ; } if (*xi == 'l' && xi == "lt;") { uVal = 60 ; return 4 ; } if (*xi == 'n' && xi == "nbsp;") { uVal = 32 ; return 6 ; }
// Now we only consider numeric entities and for these the second char must be a hash if (*xi != CHAR_HASH) return false ;
xi++ ; if (*xi == 'x') { // Examining a suspected hex entity
nLen = 4 ; for (xi++ ; !xi.eof() && IsHex(*xi) ; nLen++, xi++) { uVal *= 16 ;
if (*xi >= '0' && *xi <= '9') { uVal += (*xi - '0') ; continue ; } if (*xi >= 'A' && *xi <= 'F') { uVal += 10 ; uVal += (*xi - 'A') ; continue ; } if (*xi >= 'a' && *xi <= 'f') { uVal += 10 ; uVal += (*xi - 'a') ; continue ; }
break ; } } else { // Expecting a decimal entity
nLen = 3 ; for (; !xi.eof() && IsDigit(*xi) ; nLen++, xi++) { uVal *= 10 ; uVal += (*xi - '0') ; } }
if (*xi == ';') { // Entity confirmed
entLen = nLen ; return true ; }
uVal = 0 ; return false ; }
/* ** Tokens: Unicode Sequences */
bool AtUnicodeSeq (uint32_t& uVal, uint32_t& nLen, hzChain::Iter& zi) { // Category: Text Presentation // // Determines if the supplied chain iterator is at the begining of a unicode sequence. If it is then both the value and size are determined. // // Note to be a UTF-8 sequence, the first byte must be either:- // a) 110xxxxx for a 2-byte sequence with the next byte being of the form 10xxxxxx // b) 1110xxxx for a 3-byte sequence with the next 2 bytes being of the form 10xxxxxx // c) 11110xxx for a 4-byte sequence with the next 3 bytes being of the form 10xxxxxx // d) 111110xx for a 5-byte sequence with the next 4 bytes being of the form 10xxxxxx // e) 1111110x for a 6-byte sequence with the next 5 bytes being of the form 10xxxxxx // // Arguments: 1) uVal The unicode value (set by this function) // 2) nLen The length of the unicode sequence encountered (set by this function) // 3) zi The chain or char sting iterator // // Returns: True If the supplied chain iterator is at the begining of a unicode sequence // False Otherwise
_hzfunc(__func__) ;
chIter xi ; // Input chain iterator uchar ubuf[8] ; // Unicode buffer
uVal = 0 ; nLen = 0 ;
if (zi.eof()) return false ;
if (!(*zi & 0x80)) return false ;
// Get first two bytes xi = zi ; ubuf[0] = (uchar) *xi ; xi++ ; ubuf[1] = (uchar) *xi ;
// If 2nd byte is not 0x80 or greater then we have a single byte unicode sequence if (!(ubuf[1] & 0x80)) { nLen = 1 ; uVal = ubuf[0] ; return false ; }
// If this is a utf-8 sequence then between 1 and 5 subsequent bytes will have a value between 128 and 191. If this is // the case we compute a value for the sequence and then look up this value for a printable form.
if ((ubuf[0] & 0xE0) == 0xC0) { // First 2 bits set, 3rd bit clear; Utf-8 sequence is this and next byte if ((ubuf[1] & 0xC0) == 0x80) { nLen = 2 ; uVal = ((ubuf[0] & 0x1F) << 6) + (ubuf[1] & 0x3F) ; return true ; } return false ; }
if ((ubuf[0] & 0xF0) == 0xE0) { // First 3 bits set, 4th bit clear; Utf-8 sequence is this and next 2 bytes xi++ ; ubuf[2] = (uchar) *xi ;
if ((ubuf[1] & 0xC0) == 0x80 && (ubuf[2] & 0xC0) == 0x80) { nLen = 3 ; uVal = ((ubuf[0] & 0x0F) << 12) + ((ubuf[1] & 0x3F) << 6) + (ubuf[2] & 0x3F) ; return true ; } return false ; }
if ((ubuf[0] & 0xF8) == 0xF0) { // First 4 bits set, 5th bit clear; Utf-8 sequence is this and next 3 bytes xi++ ; ubuf[2] = (uchar) *xi ; xi++ ; ubuf[3] = (uchar) *xi ;
if ((ubuf[1] & 0xC0) == 0x80 && (ubuf[2] & 0xC0) == 0x80 && (ubuf[3] & 0xC0) == 0x80) { uVal = ((ubuf[0] & 0x07) << 18) + ((ubuf[1] & 0x3F) << 12) + ((ubuf[2] & 0x3F) << 6) + (ubuf[3] & 0x3F) ; nLen = 4 ; return true ; }
return false ; }
if ((ubuf[0] & 0xFC) == 0xF8) { // First 5 bits set, 6th bit clear; Utf-8 sequence is this and next 4 bytes xi++ ; ubuf[2] = (uchar) *xi ; xi++ ; ubuf[3] = (uchar) *xi ; xi++ ; ubuf[4] = (uchar) *xi ;
if ((ubuf[1] & 0xC0) == 0x80 && (ubuf[2] & 0xC0) == 0x80 && (ubuf[3] & 0xC0) == 0x80 && (ubuf[4] & 0xC0) == 0x80) { uVal = ((ubuf[0] & 0x03) << 24) + ((ubuf[1] & 0x3F) << 18) + ((ubuf[2] & 0x3F) << 12) + ((ubuf[3] & 0x3F) << 6) + (ubuf[4] & 0x3F) ; nLen = 5 ; return true ; }
return false ; }
if ((ubuf[0] & 0xFE) == 0xFC) { // First 6 bits set, 7th bit clear; Utf-8 sequence is this and next 5 bytes xi++ ; ubuf[2] = (uchar) *xi ; xi++ ; ubuf[3] = (uchar) *xi ; xi++ ; ubuf[4] = (uchar) *xi ; xi++ ; ubuf[5] = (uchar) *xi ;
if ((ubuf[1] & 0xC0) == 0x80 && (ubuf[2] & 0xC0) == 0x80 && (ubuf[3] & 0xC0) == 0x80 && (ubuf[4] & 0xC0) == 0x80 && (ubuf[5] & 0xC0)) { uVal = ((ubuf[0] & 0x01) << 30) + ((ubuf[1] & 0x3F) << 24) + ((ubuf[2] & 0x3F) << 18) + ((ubuf[3] & 0x3F) << 12) + ((ubuf[4] & 0x3F) << 6) + (ubuf[5] & 0x3F) ; nLen = 6 ; return true ; } }
return false ; }
bool	IsUnicodeSeq	(uint32_t& nEnt, uint32_t& nLen, const char* zi)
{
	//	Category:	Text Presentation
	//
	//	Determine if the current char is the start of a unicode (UTF-8) sequence. If it is the value and the length (args 1 & 2) are set and
	//	true is returned. Otherwise both are set to 0 and false is returned.
	//
	//	Note to be a UTF-8 sequence, the first byte must be either:-
	//		a) 110xxxxx for a 2-byte sequence with the next byte being of the form 10xxxxxx
	//		b) 1110xxxx for a 3-byte sequence with the next 2 bytes being of the form 10xxxxxx
	//		c) 11110xxx for a 4-byte sequence with the next 3 bytes being of the form 10xxxxxx
	//		d) 111110xx for a 5-byte sequence with the next 4 bytes being of the form 10xxxxxx
	//		e) 1111110x for a 6-byte sequence with the next 5 bytes being of the form 10xxxxxx
	//
	//	Arguments:	1)	nEnt	The unicode value
	//				2)	nLen	The length of the unicode sequence encountered
	//				3)	zi		The null terminated char string, positioned at the char to test
	//
	//	Returns:	True	If the string is at the start of a unicode sequence
	//				False	Otherwise (including a NULL string)

	const unsigned char*	xi ;		//	Input iterator
	uint32_t				value ;		//	Unicode value
	uint32_t				nTrail ;	//	Number of continuation bytes the lead byte calls for
	uint32_t				n ;			//	Continuation byte counter

	nEnt = 0 ;
	nLen = 0 ;

	if (!zi)
		return false ;

	xi = (const unsigned char*) zi ;

	//	Bytes below 192 are either plain ASCII or a continuation byte out of place
	if (xi[0] < 192)
		return false ;

	//	The lead byte fixes the number of continuation bytes and supplies the top bits of the value
	if		(xi[0] < 224)	{ nTrail = 1 ; value = xi[0] & 0x1F ; }
	else if (xi[0] < 240)	{ nTrail = 2 ; value = xi[0] & 0x0F ; }
	else if (xi[0] < 248)	{ nTrail = 3 ; value = xi[0] & 0x07 ; }
	else if (xi[0] < 252)	{ nTrail = 4 ; value = xi[0] & 0x03 ; }
	else if (xi[0] < 254)	{ nTrail = 5 ; value = xi[0] & 0x01 ; }
	else
		return false ;		//	0xFE and 0xFF match none of the lead byte forms listed above

	//	Every continuation byte must be of the form 10xxxxxx. The test is made before moving on to the next byte, so a string that ends part
	//	way through a sequence is rejected at its terminator rather than being read beyond it.
	for (n = 1 ; n <= nTrail ; n++)
	{
		if ((xi[n] & 0xC0) != 0x80)
			return false ;

		value = (value << 6) + (xi[n] & 0x3F) ;
	}

	nEnt = value ;
	nLen = nTrail + 1 ;
	return true ;
}
uint32_t	TestAlphanum	(hzString& word, hzChain::Iter& ci)
{
	//	Category:	Text Presentation
	//
	//	If the supplied chain iterator's current character is alphanumeric, place it and all immediately following alphanumeric characters into
	//	the supplied string. Note the supplied iterator is not advanced by this function.
	//
	//	Arguments:	1)	word	The found alphanumeric sequence (cleared if there is none)
	//				2)	ci		The input chain iterator
	//
	//	Returns:	Length of alphanumeric sequence, 0 if the iterator is at the end of the chain or not at an alphanumeric character

	hzChain::Iter	zi ;	//	Forward iterator
	hzChain			W ;		//	For building word

	word.Clear() ;

	if (ci.eof())
		return 0 ;

	if (!IsAlphanum(*ci))
		return 0 ;

	//	Collect with the same test that admitted the first char, so that digits are gathered as well as letters
	for (zi = ci ; !zi.eof() && IsAlphanum(*zi) ; zi++)
		W.AddByte(*zi) ;

	if (!W.Size())
		return 0 ;

	word = W ;
	return word.Length() ;
}
int32_t CstrCompareW (const char* a, const char* b) { // Category: Text Processing // // Purpose: Compare two strings on a word basis. This ignores case and considers both input strings as a series of alphanumeric // words separated by whitespace. Adjacent whitespace characters are treated as a single space with leading & trailing // whitespace ignored. All punctuation characters and all characters beyond the lower ASCII are ignored. // // Arguments: 1) a First string // 2) b Second string // // Returns: +1 If a is lexically more than b // -1 If a is lexically less than b // 0 If a and b are lexically equivelent
const char* pA = a ; // String A iterator const char* pB = b ; // String B iterator
uint32_t entA ; // Value of char (be it ASCII, entity or unicode sequence) uint32_t entB ; // Same for second input uint32_t len ; // Length of sequence from which ent is derived
if (pA) for (pA++ ; *pA && *pA <= CHAR_SPACE ; pA++) ; if (pB) for (pB++ ; *pB && *pB <= CHAR_SPACE ; pB++) ;
for (;;) { entA = entB = len = 0 ;
// Get next value from A if (pA) { if (*pA && *pA <= CHAR_SPACE) { for (pA++ ; *pA <= CHAR_SPACE ; pA++) ; entA = CHAR_SPACE ; } else { for (; *pA ;) { if (IsEntity(entA, len, pA)) pA += len ; else if (IsUnicodeSeq(entA, len, pA)) pA += len ; else entA = *pA++ ;
if (IsAlphanum(entA)) { entA = conv2lower(entA) ; break ; } } }
if (*pA == 0 && entA == CHAR_SPACE) entA = 0 ; }
if (pB) { if (*pB && *pB <= CHAR_SPACE) { for (pB++ ; *pB && *pB <= CHAR_SPACE ; pB++) ; entB = CHAR_SPACE ; } else { for (; *pB ;) { if (IsEntity(entB, len, pB)) pB += len ; else if (IsUnicodeSeq(entB, len, pB)) pB += len ; else entB = *pB++ ;
if (IsAlphanum(entB)) { entB = conv2lower(entB) ; break ; } } }
if (*pB == 0 && entB == CHAR_SPACE) entB = 0 ; }
if (entA != entB) return entA - entB ;
if (!entA) break ; }
return 0 ; }
/* ** Section X: String substitution */
//	FnGrp:		Ersatz
//	Category:	Text Processing
//
//	Substitute all instances of one string with another. The source of text can be either a hzChain or a hzString, and matching may be applied on either a
//	case sensitive or a case insensitive basis.
//
//	All instances found in the text (arg 1) of the string 'from' (arg 2) are replaced with the string 'to' (arg 3). Matches on 'from' are case sensitive
//	by default but are case insensitive if arg 4 is true.
//
//	Arguments:	1)	Z/S		The chain or string containing the text upon which the substitutions are to be performed. It is modified in place.
//				2)	from	The string for which all instances are to be replaced
//				3)	to		The string that will serve as the replacement
//				4)	bCase	Boolean true for case insensitive. Default false for case sensitive
//
//	Returns:	hzChain variants:	The number of substitutions that were performed
//				hzString variants:	The resulting string, by value
//
//	Func:	Ersatz(hzChain&,hzString&,hzString&,bool)
//	Func:	Ersatz(hzChain&,const char*,const char*,bool)
//	Func:	Ersatz(hzString&,hzString&,hzString&,bool)
//	Func:	Ersatz(hzString&,const char*,const char*,bool)
uint32_t	Ersatz	(hzChain& Z, hzString& from, hzString& to, bool bCase)
{
	//	Replace every instance of 'from' found in the chain with 'to'. The result is built in a second chain which then replaces the content
	//	of Z.
	//
	//	Arguments:	1)	Z		The chain containing the text, modified in place
	//				2)	from	The string for which all instances are to be replaced
	//				3)	to		The string that will serve as the replacement
	//				4)	bCase	True for case insensitive matching, false for case sensitive
	//
	//	Returns:	The number of substitutions performed. 0 if the chain is empty. (uint32_t) -1 if either 'from' or 'to' tests false
	//				(presumably: is empty), in which case the chain is not touched.

	_hzfunc("Ersatz1") ;

	chIter		zi ;			//	Chain iterator
	hzChain		F ;				//	Result
	uint32_t	nSubs = 0 ;		//	Number of substitutions
	bool		bHit ;			//	Iterator is at an instance of 'from'

	if (!from)	return -1 ;
	if (!to)	return -1 ;

	if (!Z.Size())
		return 0 ;

	for (zi = Z ; !zi.eof() ;)
	{
		//	bCase true selects the case insensitive test, in line with the group documentation and the hzString variant of this function.
		//	NOTE(review): presumes Iter::Equiv is the case insensitive match and Iter::operator== the exact match - confirm against hzChain.
		bHit = bCase ? zi.Equiv(from) : (zi == from) ;

		if (bHit)
		{
			nSubs++ ;
			F << to ;
			zi += from.Length() ;
			continue ;
		}

		F.AddByte(*zi) ;
		zi++ ;
	}

	Z.Clear() ;
	Z = F ;

	return nSubs ;
}
uint32_t	Ersatz	(hzChain& Z, const char* from, const char* to, bool bCase)
{
	//	Convenience form taking the search and replacement text as C strings. Both are wrapped in hzString instances and the work is handed to
	//	the hzString form of the chain substitution, whose result is returned as is.

	_hzfunc("Ersatz2") ;

	hzString	strOld = from ;		//	Text to look for, as hzString
	hzString	strNew = to ;		//	Replacement text, as hzString

	return Ersatz(Z, strOld, strNew, bCase) ;
}
hzString	Ersatz	(hzString& S, hzString& from, hzString& to, bool bCase)
{
	//	Replace every instance of 'from' found in the string with 'to'. The result is built in a chain which then replaces the content of S.
	//
	//	Arguments:	1)	S		The string containing the text, modified in place
	//				2)	from	The string for which all instances are to be replaced
	//				3)	to		The string that will serve as the replacement
	//				4)	bCase	True for case insensitive matching, false for case sensitive
	//
	//	Returns:	The resulting string by value. S is returned unchanged if it tests false or if 'from' has zero length.

	_hzfunc("Ersatz3") ;

	hzChain		ult ;		//	For building result
	const char*	i ;			//	Iterator
	uint32_t	nLen ;		//	Length of the string being replaced
	bool		bHit ;		//	Iterator is at an instance of 'from'

	if (!S)
		return S ;

	//	A zero length 'from' would match at every position
	nLen = from.Length() ;
	if (!nLen)
		return S ;

	for (i = *S ; *i ;)
	{
		//	strncmp, unlike memcmp, stops at the terminator of S so the tail of the string is never read beyond its end
		if (bCase)
			bHit = CstrCompareI(i, *from, nLen) == 0 ;
		else
			bHit = strncmp(i, *from, nLen) == 0 ;

		if (bHit)
		{
			//	Step over the whole of the matched text, not just its first char
			ult << to ;
			i += nLen ;
		}
		else
		{
			ult.AddByte(*i) ;
			i++ ;
		}
	}

	S.Clear() ;
	S = ult ;
	return S ;
}
hzString	Ersatz	(hzString& S, const char* from, const char* to, bool bCase)
{
	//	Convenience form taking the search and replacement text as C strings. Both are wrapped in hzString instances and the work is handed to
	//	the hzString form of the string substitution, whose result is returned as is.

	_hzfunc("Ersatz4") ;

	hzString	strOld = from ;		//	Text to look for, as hzString
	hzString	strNew = to ;		//	Replacement text, as hzString

	return Ersatz(S, strOld, strNew, bCase) ;
}