//
// File: hzTokens.cpp
//
// Legal Notice: This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
// The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
// Software Foundation, either version 3 of the License, or any later version.
//
// The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
#include <fstream>
#include <sys/stat.h>
#include "hzChars.h"
#include "hzProcess.h"
#include "hzTextproc.h"
#include "hzDirectory.h"
#include "hzDatabase.h"
#include "hzTokens.h"
/*
** Section 1: Non Member Functions
*/
bool _testInteger (hzString& S, chIter& ci)
{
	// Test whether the supplied chain iterator is positioned on a decimal digit and if so, collect the whole run of consecutive digits. Only the text of the
	// digits is gathered as this serves tokenization, no numeric value is computed.
	//
	// The supplied string is always cleared on entry, so it is empty when false is returned. The supplied iterator is only moved when true is returned, in
	// which case it is advanced by the number of digits gathered.
	//
	// Arguments:	1)	S	String that will receive the run of digits
	//				2)	ci	The chain iterator (advanced past the digits on success)
	//
	// Returns:	True	If the supplied iterator is at a digit
	//			False	Otherwise

	_hzfunc("_testInteger") ;

	hzChain		digits ;	// Accumulates the digits found
	chIter		scan ;		// Look-ahead copy of the supplied iterator

	S.Clear() ;

	scan = ci ;
	while (!scan.eof() && (chartype[*scan] & CTYPE_DIGIT))
	{
		digits.AddByte(*scan) ;
		scan++ ;
	}

	if (digits.Size() == 0)
		return false ;

	// Publish the digits and step the caller's iterator over them
	S = digits ;
	ci += S.Length() ;
	return true ;
}
bool _testNumber (hzString& S, chIter& ci)
{
	// Test whether the sequence at the supplied chain iterator amounts to a number in standard form, namely:-
	//
	//	1) An optional leading sign (+ or -)
	//	2) One or more digits
	//	3) An optional fractional part: a period followed by one or more digits
	//	4) An optional exponent: a lower case 'e', an optional sign and one or more digits
	//
	// On a match the text of the number is placed in the supplied string and the supplied iterator is advanced to the first character beyond the number. As
	// this is only for tokenization, the value of the number is not calculated.
	//
	// Note that a period or an 'e' which is not followed by the required digits causes the whole sequence to be rejected, it is not truncated to the part
	// that did match.
	//
	// Arguments:	1)	S	String to contain the text of the number
	//				2)	ci	The chain iterator
	//
	// Returns:	True	If the supplied iterator is at the start of a valid number
	//			False	Otherwise (neither S nor ci is altered)

	hzChain		W ;			// Working chain for gathering the text of the number
	chIter		x ;			// Internal copy of supplied chain iterator
	uint32_t	nDigits ;	// Number of digits found in the part currently being examined

	x = ci ;

	// Deal with leading sign
	if (!x.eof() && (*x == CHAR_MINUS || *x == CHAR_PLUS))
		{ W.AddByte(*x) ; x++ ; }

	// Expect a series of at least one digit
	for (nDigits = 0 ; !x.eof() && IsDigit(*x) ; nDigits++, x++)
		W.AddByte(*x) ;
	if (!nDigits)
		return false ;

	// Test for a period that is followed by at least one digit
	if (!x.eof() && *x == CHAR_PERIOD)
	{
		// Record the period itself BEFORE stepping over it, otherwise the period is lost and the first fractional digit is stored twice
		W.AddByte(*x) ;
		x++ ;

		for (nDigits = 0 ; !x.eof() && IsDigit(*x) ; nDigits++, x++)
			W.AddByte(*x) ;
		if (!nDigits)
			return false ;
	}

	// Test for the 'e' followed by at least one digit or a +/- followed by at least one digit
	if (!x.eof() && *x == 'e')
	{
		// As with the period, record the 'e' before stepping over it
		W.AddByte(*x) ;
		x++ ;

		if (!x.eof() && (*x == CHAR_MINUS || *x == CHAR_PLUS))
			{ W.AddByte(*x) ; x++ ; }

		for (nDigits = 0 ; !x.eof() && IsDigit(*x) ; nDigits++, x++)
			W.AddByte(*x) ;
		if (!nDigits)
			return false ;
	}

	// At least one digit was found so the number is valid. Commit the result.
	ci = x ;
	S = W ;
	return true ;
}
bool IsHexValue (uint32_t& nLen, const char* pStr)
{
	// Category:	Text Processing
	//
	// Determine if the supplied char string (arg 2) starts with a hexadecimal number. The number may optionally be preceded by a '#' (only recognized when
	// directly followed by a hex digit) and/or by "0x" or "0X". If a hexadecimal number is found, true is returned and the supplied uint32_t reference
	// (arg 1) is set to the LENGTH in bytes of the match, including any prefix. The numeric value is not calculated.
	//
	// The upper case "0X" prefix is accepted so that this function agrees with _testHexnum(), which accepts both forms.
	//
	// Arguments:	1)	nLen	Set by the operation to the length of the hexadecimal number if found, or to 0 otherwise
	//				2)	pStr	The pointer into the test string (may be NULL)
	//
	// Returns:	True	If the supplied cstr starts with a hexadecimal number
	//			False	Otherwise, including when pStr is NULL or a prefix is not followed by any hex digit

	const char*	i = pStr ;		// Input string iterator
	uint32_t	nBytes = 0 ;	// Total bytes matched, prefix included
	uint32_t	nHex = 0 ;		// Hex digits matched after any prefix

	nLen = 0 ;
	if (!i)
		return false ;

	// Optional '#' prefix, only taken when a hex digit follows
	if (*i == CHAR_HASH && IsHex(i[1]))
		{ nBytes++ ; i++ ; }

	// Optional 0x/0X prefix
	if (*i == '0' && (i[1] == 'x' || i[1] == 'X'))
		{ nBytes += 2 ; i += 2 ; }

	// Count the hex digits. The loop stops at the terminating null as that is not a hex digit.
	for (; IsHex(*i) ; nHex++, nBytes++, i++) ;

	if (!nHex)
		return false ;

	nLen = nBytes ;
	return true ;
}
bool _testHexnum (hzString& S, chIter& ci)
{
	// Test whether the sequence at the supplied chain iterator is a prefixed hexadecimal number. The prefix is mandatory and is either a '#' or one of
	// "0x" or "0X". The prefix must be followed by at least one hex digit. On a match the supplied string is set to the prefix plus the hex digits (with
	// the "0X" form of the prefix normalized to "0x") and the supplied iterator is advanced to the first character beyond the number.
	//
	// Arguments:	1)	S	Reference to string to hold discovered hex number
	//				2)	ci	Chain iterator of ongoing input
	//
	// Returns:	True	If the sequence at the iterator amounts to a hexadecimal number
	//			False	Otherwise (neither S nor ci is altered)

	_hzfunc("_testHexnum") ;

	hzChain		W ;				// Working chain for gathering the prefix and digits
	chIter		x = ci ;		// Internal copy of supplied chain iterator
	uint32_t	nDigits = 0 ;	// Number of hex digits found after the prefix

	if (x == CHAR_HASH)
		{ W.AddByte(*x) ; x++ ; }
	else if (x == "0x" || x == "0X")
		{ W << "0x" ; x += 2 ; }
	else
		return false ;

	for (; !x.eof() && IsHex(*x) ; nDigits++, x++)
		W.AddByte(*x) ;

	// The prefix is already in W so W.Size() cannot be used to detect the absence of digits. A prefix standing alone is not a hex number.
	if (!nDigits)
		return false ;

	S = W ;
	ci = x ;
	return true ;
}
/*
** Tokenize Functions
*/
hzEcode TokenizeWords (hzVect<hzToken>& toks, hzChain& C)
{
	// Category:	Text Processing
	//
	// Split the input into word tokens, discarding everything else. Three kinds of sequence are recognized:-
	//
	//	1) A double quoted string. Everything between the quotes becomes a single token (no token is produced if the quotes enclose nothing).
	//	2) A C style block comment. This is skipped in its entirety.
	//	3) A run of alphanumeric characters. This becomes a single token.
	//
	// Any other character, including all characters at or below the space character, is skipped. Every token is given the type TOKEN_ALPHANUM and the line
	// number reported by the iterator at the start of the token. This is suitable for indexation of documents but not for querying a document index as it
	// does not produce boolean expressions of words.
	//
	// Arguments:	1)	toks	The vector of tokens found in the input (cleared on entry)
	//				2)	C		The input chain
	//
	// Returns:	E_NODATA	If the supplied chain is empty
	//			E_OK		If the supplied chain is tokenized

	_hzfunc("TokenizeWords") ;

	chIter		ci ;		// For iteration of input
	hzChain		word ;		// For building token
	hzToken		T ;			// Token
	hzString	S ;			// Token value as a string
	uint32_t	nLine ;		// Line number at the start of the current token

	toks.Clear() ;
	if (!C.Size())
		return E_NODATA ;

	ci = C ;
	while (!ci.eof())
	{
		// Whitespace and control characters
		if (*ci <= CHAR_SPACE)
		{
			ci++ ;
			continue ;
		}

		if (*ci == CHAR_DQUOTE)
		{
			// Quoted string: gather up to, and then step past, the closing quote
			word.Clear() ;
			nLine = ci.Line() ;

			ci++ ;
			while (!ci.eof())
			{
				if (*ci == CHAR_DQUOTE)
				{
					ci++ ;
					break ;
				}
				word.AddByte(*ci) ;
				ci++ ;
			}

			if (word.Size())
			{
				S = word ;
				T.Init(S, nLine, TOKEN_ALPHANUM) ;
				toks.Add(T) ;
			}
		}
		else if (ci == "/*")
		{
			// Block comment: skip to just beyond the terminator, or to the end of input if there isn't one
			ci += 2 ;
			while (!ci.eof())
			{
				if (ci == "*/")
				{
					ci += 2 ;
					break ;
				}
				ci++ ;
			}
		}
		else if (IsAlphanum(*ci))
		{
			// Plain word: gather the run of alphanumerics
			word.Clear() ;
			nLine = ci.Line() ;

			word.AddByte(*ci) ;
			ci++ ;
			while (!ci.eof() && IsAlphanum(*ci))
			{
				word.AddByte(*ci) ;
				ci++ ;
			}

			if (word.Size())
			{
				S = word ;
				T.Init(S, nLine, TOKEN_ALPHANUM) ;
				toks.Add(T) ;
			}
		}
		else
		{
			// Punctuation or anything else of no interest
			ci++ ;
		}
	}

	return E_OK ;
}
hzEcode TokenizeFreetext (hzVect<hzToken>& toks, hzChain& C)
{
	// Category:	Text Processing
	//
	// Tokenize into words in accordance with the rules for freetext indexation. This, like the TokenizeWords() function, ignores all punctuation between
	// words.
	//
	// The intended design is that words are first split on the basis of whitespace only (with trailing punctuation chars ignored) to produce 'raw' words
	// from which other words may be derived. As the raw words are built up, a count of each character type is kept to drive the derivation process. The
	// output would then consist of the raw word (if it survives) followed by any derived word. Examples of the intended output:-
	//
	//	1) $4million	-> dollars 4 million
	//	2) bee-keeper	-> bee-keeper beekeeper bee keeper.
	//
	// The derivation step is NOT yet implemented. As things stand a raw word is only emitted, in lower case and with type TOKEN_ALPHANUM, if it contains at
	// least one alpha and no digits, punctuation, symbols or wide unicode characters. All other raw words are silently dropped.
	//
	// NOTE(review): A raw word is started and continued only by characters for which IsAlphanum() is true. Unless IsAlphanum() accepts punctuation or symbol
	// characters, the IsPunct() and IsSymb() branches below cannot fire and a raw word is in effect just a run of alphanumerics - confirm against hzChars.h.
	//
	// Arguments:	1)	toks	The vector of tokens found in the input (cleared on entry)
	//				2)	C		The input chain
	//
	// Returns:	E_NODATA	If the supplied chain is empty
	//			E_OK		If the supplied chain is tokenized

	_hzfunc("TokenizeFreetext") ;

	hzChain		raw ;		// For building raw word
	chIter		ci ;		// For iteration of input
	hzToken		T ;			// Token
	hzString	word ;		// Word (from raw)
	uint32_t	ucVal ;		// Value of unicode char
	uint32_t	nPunct ;	// Count of punctuation chars in raw word
	uint32_t	nSymb ;		// Count of symbol chars in raw word
	uint32_t	nDigit ;	// Count of digits in raw word
	uint32_t	nAlpha ;	// Count of alphas in raw word
	uint32_t	nWeird ;	// Count of unicode chars with a value above 255 in raw word
	uint32_t	ucLen ;		// Length of unicode char
	uint32_t	nLine ;		// Line number (at start of raw sequence)

	toks.Clear() ;
	if (!C.Size())
		return E_NODATA ;

	ci = C ;
	ci.Line(1) ;

	for (; !ci.eof() ;)
	{
		// Ignore leading spaces and other non alphanumerics
		if (!IsAlphanum(*ci))
			{ ci++ ; continue ; }

		// Now we only have to deal with chars that are part of the raw word
		raw.Clear() ;
		nPunct = nSymb = nDigit = nAlpha = nWeird = 0 ;
		nLine = ci.Line() ;

		for (; !ci.eof() && IsAlphanum(*ci) ;)
		{
			if (IsPunct(*ci))
			{
				// Punctuation directly before whitespace is trailing punctuation. It ends the raw word and is not part of it.
				if (ci[1] <= CHAR_SPACE)
					{ ci++ ; break ; }

				nPunct++ ;
				raw.AddByte(*ci) ;
				ci++ ;
				continue ;
			}

			if (IsSymb(*ci))	{ nSymb++ ; raw.AddByte(*ci) ; ci++ ; continue ; }
			if (IsDigit(*ci))	{ nDigit++ ; raw.AddByte(*ci) ; ci++ ; continue ; }
			if (IsAlpha(*ci))	{ nAlpha++ ; raw.AddByte(*ci) ; ci++ ; continue ; }

			// Do we have a unicode sequence?
			if (AtUnicodeSeq(ucVal, ucLen, ci))
			{
				if (ucVal > 255)
				{
					// Cannot be held in a single byte so count it and step over the WHOLE sequence, not just its first byte. Always advance at
					// least one byte so the loop is guaranteed to progress.
					nWeird++ ;
					ci += (ucLen ? ucLen : 1) ;
				}
				else
					{ raw.AddByte(ucVal & 0xff) ; ci += ucLen ; }
				continue ;
			}

			// Unclassified char: keep it but be sure to move on, otherwise this loop would never terminate
			raw.AddByte(*ci) ;
			ci++ ;
		}

		// Now we have a raw word, do we add it to the token list as-is or do we spawn derivatives?
		if (!raw.Size())
			continue ;

		if (nAlpha)
		{
			if (!nDigit && !nPunct && !nSymb && !nWeird)
			{
				// Pure alpha word, index it in lower case
				word = raw ;
				word.ToLower() ;
				T.Init(word, nLine, TOKEN_ALPHANUM) ;
				toks.Add(T) ;
			}

			// Now must fill in what happens when we have digits, hyphens and what have you.
		}
	}

	return E_OK ;
}
hzEcode TokenizeBool (hzVect<hzToken>& toks, hzChain& C)
{
	// Category:	Text Processing
	//
	// Tokenize supplied chain into tokens expected to form a boolean expression. Whitespace, non-printable characters and both C style block comments and
	// C++ style line comments are discarded. What remains is tested, in this order, for being:-
	//
	//	1) A double quoted string	(TOKEN_STRING, value excludes the quotes)
	//	2) A prefixed hex number	(TOKEN_STRING)
	//	3) A run of digits			(TOKEN_INTEGER)
	//	4) A standard form number	(TOKEN_NUMBER)
	//	5) A punctuation char		(TOKEN_SEPARATOR, always a single char)
	//	6) A run of symbol chars	(TOKEN_OPERATOR)
	//	7) A run of alphanumerics	(TOKEN_OPERATOR)
	//
	// A character that matches none of these is skipped.
	//
	// NOTE(review): Because the integer test precedes the number test, input that starts with a digit never reaches the number test. A value such as 3.14
	// is therefore tokenized as an integer, a separator or operator and a second integer. Confirm whether this ordering is intended.
	//
	// NOTE(review): Tokens of kinds 1 to 4 take their line number from the iterator while kinds 5 to 7 use the local newline count. Confirm that the two
	// agree.
	//
	// Arguments:	1)	toks	The vector of tokens found in the input (cleared on entry)
	//				2)	C		The input chain
	//
	// Returns:	E_NODATA	If the supplied chain is empty
	//			E_OK		If the supplied chain is tokenized

	_hzfunc("hzTokenlist::TokenizeBool") ;

	hzChain		W ;			// For building tokens
	chIter		ci ;		// For iteration of input
	hzToken		T ;			// Token
	hzString	S ;			// For assembling the token value
	uint32_t	nLine ;		// For assigning line numbers to tokens
	char		tmp [4] ;	// For single char separator

	toks.Clear() ;
	if (!C.Size())
		return E_NODATA ;

	ci = C ;
	nLine = 1 ;

	for (; !ci.eof() ;)
	{
		// Increment line number when newlines are encountered
		if (*ci == CHAR_NL)
			{ nLine++ ; ci++ ; continue ; }

		// Strip whitespace and other non-printable chars
		if (IsBinary(*ci) || IsWhite(*ci))
			{ ci++ ; continue ; }

		// Eliminate comments
		if (ci == "/*")
		{
			// Newlines inside the comment still count towards the line number
			for (ci += 2 ; !ci.eof() && ci != "*/" ; ci++)
			{
				if (*ci == CHAR_NL)
					nLine++ ;
			}

			// Step over the terminator, if there was one
			if (!ci.eof())
				ci += 2 ;
			continue ;
		}

		if (ci == "//")
		{
			// Stop ON the newline rather than beyond it, so that the newline test above gets to count it
			for (ci += 2 ; !ci.eof() && *ci != CHAR_NL ; ci++) ;
			continue ;
		}

		// Assume we are at the start of a token - Check for quoted string
		if (*ci == CHAR_DQUOTE)
		{
			for (ci++ ; !ci.eof() && *ci != CHAR_DQUOTE ; ci++)
			{
				if (*ci == CHAR_NL)
					nLine++ ;
				W.AddByte(*ci) ;
			}

			S = W ;
			W.Clear() ;
			T.Init(S, ci.Line(), TOKEN_STRING) ;
			toks.Add(T) ;

			// Step over the closing quote, if there was one
			if (!ci.eof())
				ci++ ;
			continue ;
		}

		// Check for valid hexadecimal value
		if (_testHexnum(S, ci))
		{
			T.Init(S, ci.Line(), TOKEN_STRING) ;
			toks.Add(T) ;
			continue ;
		}

		// Check for integer
		if (_testInteger(S, ci))
		{
			T.Init(S, ci.Line(), TOKEN_INTEGER) ;
			toks.Add(T) ;
			continue ;
		}

		// Check for number (std form)
		if (_testNumber(S, ci))
		{
			T.Init(S, ci.Line(), TOKEN_NUMBER) ;
			toks.Add(T) ;
			continue ;
		}

		// Check for separator
		if (IsPunct(*ci))
		{
			tmp[0] = *ci ;
			tmp[1] = 0 ;
			ci++ ;
			T.Init(tmp, nLine, TOKEN_SEPARATOR) ;
			toks.Add(T) ;
			continue ;
		}

		// Check for operator
		if (IsSymb(*ci))
		{
			for (; !ci.eof() && IsSymb(*ci) ; ci++)
				W.AddByte(*ci) ;
			S = W ;
			W.Clear() ;
			T.Init(S, nLine, TOKEN_OPERATOR) ;
			toks.Add(T) ;
			continue ;
		}

		// Not an operator or separator - must be general entity
		// We have a rule that productions must be written out
		// in full. E.g. 2X must be written as 2*X.
		for (; !ci.eof() && IsAlphanum(*ci) ; ci++)
			W.AddByte(*ci) ;

		if (!W.Size())
		{
			// The char fits none of the classes tested above. Skip it, otherwise the iterator would never advance and this loop would add empty
			// tokens without end.
			ci++ ;
			continue ;
		}

		// NOTE(review): a general entity is given the type TOKEN_OPERATOR here although the comment above says it is not an operator. The other
		// tokenizers use TOKEN_ALPHANUM for words. Left as is in case consumers of the token list depend on it - confirm.
		S = W ;
		W.Clear() ;
		T.Init(S, nLine, TOKEN_OPERATOR) ;
		toks.Add(T) ;
	}

	return E_OK ;
}
/*
** Application level Tokenization Functions
*/
hzEcode TokenizeChain (hzVect<hzToken>& toks, hzChain& C, hzTokMode eMode)
{
	// Category:	Text Processing
	//
	// Tokenize the supplied chain (arg 2) into the supplied vector of tokens (arg 1), using the tokenizer selected by the mode (arg 3). This function
	// only selects the tokenizer, all the work is done by TokenizeWords(), TokenizeFreetext() or TokenizeBool().
	//
	// Arguments:	1)	toks	The vector of tokens found in the input
	//				2)	C		The input chain
	//				3)	eMode	The tokenization regime (either WHITE, FTEXT or BOOL)
	//
	// Returns:	E_RANGE		If the supplied mode is invalid (no tokenizer is called)
	//			Otherwise whatever the selected tokenizer returns, namely:-
	//			E_NODATA	If the supplied chain is empty
	//			E_OK		If the supplied chain is tokenized

	if (eMode == TOK_MO_WHITE)
		return TokenizeWords(toks, C) ;

	if (eMode == TOK_MO_FTEXT)
		return TokenizeFreetext(toks, C) ;

	if (eMode == TOK_MO_BOOL)
		return TokenizeBool(toks, C) ;

	return E_RANGE ;
}
hzEcode TokenizeString (hzVect<hzToken>& toks, const char* pBuf, hzTokMode eMode)
{
	// Category:	Text Processing
	//
	// Tokenize the supplied char string (arg 2) into the supplied vector of tokens (arg 1), using the tokenizer selected by the mode (arg 3). The string
	// is first loaded into a working chain, which is then passed to TokenizeWords(), TokenizeFreetext() or TokenizeBool().
	//
	// Arguments:	1)	toks	The vector of tokens found in the input
	//				2)	pBuf	The input char string
	//				3)	eMode	The tokenization regime (either WHITE, FTEXT or BOOL)
	//
	// Returns:	E_RANGE		If the supplied mode is invalid (no tokenizer is called)
	//			Otherwise whatever the selected tokenizer returns, namely:-
	//			E_NODATA	If the working chain is empty
	//			E_OK		If the input is tokenized

	_hzfunc("TokenizeString") ;

	hzChain		C ;					// Working chain
	hzEcode		rc = E_RANGE ;		// Return code, stands if the mode selects no tokenizer

	C = pBuf ;

	switch (eMode)
	{
	case TOK_MO_WHITE:
		rc = TokenizeWords(toks, C) ;
		break ;

	case TOK_MO_FTEXT:
		rc = TokenizeFreetext(toks, C) ;
		break ;

	case TOK_MO_BOOL:
		rc = TokenizeBool(toks, C) ;
		break ;
	}

	return rc ;
}
hzEcode TokenizeFile (hzVect<hzToken>& toks, const char* fname, hzTokMode eMode)
{
	// Category:	Text Processing
	//
	// Populate supplied vector of tokens (arg 1) by tokenizing the content of the supplied file (named in arg 2) according to the modus operandi
	// specified by arg 3. The whole file is loaded into a working chain, which is then passed to TokenizeWords(), TokenizeFreetext() or TokenizeBool().
	//
	// Note that the vector of tokens is only cleared by the tokenizers. It is left untouched if this function fails before a tokenizer is called.
	//
	// Arguments:	1)	toks	The vector of tokens found in the input
	//				2)	fname	The input filename
	//				3)	eMode	The tokenization regime (either WHITE, FTEXT or BOOL)
	//
	// Returns:	E_ARGUMENT	If no filename is supplied or the file cannot be stat'ed
	//			E_OPENFAIL	If the file cannot be opened for reading
	//			E_NODATA	If the file is empty
	//			E_RANGE		If the supplied mode is invalid
	//			E_OK		If the file content is tokenized

	_hzfunc("TokenizeFile") ;

	std::ifstream	is ;	// Input stream
	FSTAT			fs ;	// File status
	hzChain			C ;		// Working chain

	if (!fname || !fname[0])
	{
		hzerr(E_ARGUMENT, "Cannot tokenize unnamed file") ;
		return E_ARGUMENT ;
	}

	if (stat(fname, &fs) == -1)
	{
		// The %s in the message must be matched by an argument
		hzerr(E_ARGUMENT, "File (%s) does not exist", fname) ;
		return E_ARGUMENT ;
	}

	is.open(fname) ;
	if (is.fail())
	{
		hzerr(E_OPENFAIL, "File %s", fname) ;
		return E_OPENFAIL ;
	}

	// Load the whole file into the working chain
	C << is ;
	is.close() ;

	switch (eMode)
	{
	case TOK_MO_WHITE:	return TokenizeWords(toks, C) ;
	case TOK_MO_FTEXT:	return TokenizeFreetext(toks, C) ;
	case TOK_MO_BOOL:	return TokenizeBool(toks, C) ;
	}

	return E_RANGE ;
}