<pre>//
//  File:   hzTokens.cpp
//
//  Legal Notice:   This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
//  The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
//  Software Foundation, either version 3 of the License, or any later version.
//
//  The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
//  A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//</pre>
			<pre>#include &lt;fstream&gt;</pre>
			<pre>#include &lt;sys/stat.h&gt;</pre>
			<pre>#include "hzChars.h"
#include "hzProcess.h"
#include "hzTextproc.h"
#include "hzDirectory.h"
#include "hzDatabase.h"
#include "hzTokens.h"</pre>
			<pre>/*
**  Section 1:  Non Member Functions
*/</pre>
			<pre>bool    _testInteger    (hzString&amp; S, chIter&amp; ci)
{
    //  Collect the run of consecutive digits found at the supplied chain iterator. Only the text of the digits is gathered as this is for tokenization, the
    //  numeric value is not calculated.
    //
    //  Arguments:  1)  S   String set to the run of contiguous digits (cleared if there are none)
    //              2)  ci  The chain iterator, advanced past the digits only if at least one is found
    //
    //  Returns:    True    If the supplied iterator is at a digit
    //              False   Otherwise</pre>
			<pre>    _hzfunc("_testInteger") ;</pre>
			<pre>    hzChain     digits ;    //  Accumulates the digits
    chIter      scan ;      //  Look-ahead copy of the supplied iterator</pre>
			<pre>    S.Clear() ;
    scan = ci ;
    while (!scan.eof() &amp;&amp; (chartype[*scan] &amp; CTYPE_DIGIT))
    {
        digits.AddByte(*scan) ;
        scan++ ;
    }</pre>
			<pre>    //  Only commit to the caller's iterator once a digit has actually been seen
    if (digits.Size())
    {
        S = digits ;
        ci += S.Length() ;
        return true ;
    }</pre>
			<pre>    return false ;
}</pre>
			<pre>bool    _testNumber (hzString&amp; S, chIter&amp; ci)
{
    //  Test if the sequence at the supplied chain iterator amounts to a number in standard form: An optional sign, one or more digits, an optional fraction
    //  (a period then one or more digits) and an optional exponent (an 'e', an optional sign, then one or more digits). The text of the number is gathered
    //  into the supplied string. As this is only for tokenization, the value of the number is not calculated.
    //
    //  Arguments:  1)  S   String set to the text of the number (left untouched on failure)
    //              2)  ci  The chain iterator, advanced past the number on success only
    //
    //  Returns:    True    If the supplied iterator is at the start of a valid number
    //              False   Otherwise (this includes a period or an 'e' that is not followed by the digits it requires)</pre>
			<pre>    hzChain     W ;             //  Working chain for gathering the text of the number
    chIter      x ;             //  Internal copy of supplied chain iterator
    uint32_t    nDigits = 0 ;   //  Number of digits found in the part currently being examined</pre>
			<pre>    x = ci ;</pre>
			<pre>    //  Deal with leading sign
    if (*x == CHAR_MINUS || *x == CHAR_PLUS)
        { W.AddByte(*x) ; x++ ; }</pre>
			<pre>    //  Expect a series of at least one digit
    for (nDigits = 0 ; IsDigit(*x) ; nDigits++, x++)
        W.AddByte(*x) ;
    if (!nDigits)
        return false ;</pre>
			<pre>    //  Test for a period that is followed by at least one digit
    if (*x == CHAR_PERIOD)
    {
        //  Record the period itself before stepping over it, otherwise the first fraction digit would be gathered twice and the period lost
        W.AddByte(*x) ;
        x++ ;
        for (nDigits = 0 ; IsDigit(*x) ; nDigits++, x++)
            W.AddByte(*x) ;
        if (!nDigits)
            return false ;
    }</pre>
			<pre>    //  Test for the 'e' followed by at least one digit or a +/- followed by at least one digit
    if (*x == 'e')
    {
        //  As with the period, the 'e' is recorded first and only then stepped over
        W.AddByte(*x) ;
        x++ ;
        if (*x == CHAR_MINUS || *x == CHAR_PLUS)
            { W.AddByte(*x) ; x++ ; }</pre>
			<pre>        for (nDigits = 0 ; IsDigit(*x) ; nDigits++, x++)
            W.AddByte(*x) ;
        if (!nDigits)
            return false ;
    }</pre>
			<pre>    //  At least one digit has been gathered by this point so the number is valid
    ci = x ;
    S = W ;
    return true ;
}</pre>
			<pre>bool    IsHexValue  (uint32_t&amp; nLen, const char* pStr)
{
    //  Category:   Text Processing
    //
    //  Determine if the supplied char string (arg 2) begins with a hexadecimal number. The hex digits may be preceded by a # (recognized only when a hex
    //  digit comes directly after it) and then by 0x. If a hexadecimal number is found, true is returned and the supplied uint32_t reference (arg 1) is set
    //  to its length in bytes, any prefix included. The value of the number is not calculated.
    //
    //  Arguments:  1)  nLen    Set by the operation to the length of the hexadecimal number if found, otherwise to 0
    //              2)  pStr    The pointer into the test string
    //
    //  Returns:    True    If the supplied cstr starts with a hexadecimal number
    //              False   Otherwise (including a null pointer)</pre>
			<pre>    const char* pos = pStr ;    //  Current position in the input
    uint32_t    nDigits ;       //  Number of hex digits found after any prefix</pre>
			<pre>    nLen = 0 ;
    if (pos == 0)
        return false ;</pre>
			<pre>    //  A leading hash only counts when a hex digit follows it
    if (pos[0] == CHAR_HASH &amp;&amp; IsHex(pos[1]))
        pos += 1 ;</pre>
			<pre>    if (pos[0] == '0' &amp;&amp; pos[1] == 'x')
        pos += 2 ;</pre>
			<pre>    nDigits = 0 ;
    while (IsHex(*pos))
    {
        nDigits++ ;
        pos++ ;
    }</pre>
			<pre>    if (nDigits == 0)
        return false ;</pre>
			<pre>    //  Everything stepped over, prefix and digits alike, counts towards the length
    nLen = (uint32_t) (pos - pStr) ;
    return true ;
}</pre>
			<pre>bool    _testHexnum (hzString&amp; S, chIter&amp; ci)
{
    //  Test if the sequence at the supplied chain iterator is a prefixed hexadecimal number, i.e. a # or a 0x/0X that is followed by at least one hex digit.
    //  The prefix and digits are gathered into the supplied string, with a 0X prefix recorded as 0x.
    //
    //  Arguments:  1)  S   Reference to string to hold discovered hex number (left untouched on failure)
    //              2)  ci  Chain iterator of ongoing input, advanced past the hex number on success only
    //
    //  Returns:    True    If the sequence at the iterator amounts to a hexadecimal number
    //              False   Otherwise (including a prefix with no hex digit after it)</pre>
			<pre>    _hzfunc("_testHexnum") ;</pre>
			<pre>    hzChain     W ;             //  Working chain for gathering prefix and digits
    chIter      x = ci ;        //  Internal copy of supplied chain iterator
    uint32_t    nSize = 0 ;     //  Number of hex digits found after the prefix</pre>
			<pre>    if (x == CHAR_HASH)
        { W.AddByte(*x) ; x++ ; }
    else if (x == "0x" || x == "0X")
        { W &lt;&lt; "0x" ; x += 2 ; }
    else
        return false ;</pre>
			<pre>    for (; IsHex(*x) ; nSize++, x++)
        W.AddByte(*x) ;</pre>
			<pre>    //  W always holds the prefix by now, so it is the digit count and not the size of W that decides if a number was found
    if (!nSize)
        return false ;</pre>
			<pre>    S = W ;
    ci = x ;
    return true ;
}</pre>
			<pre>/*
**  Tokenize Functions
*/</pre>
			<pre>hzEcode TokenizeWords   (hzVect&lt;hzToken&gt;&amp; toks, hzChain&amp; C)
{
    //  Category:   Text Processing
    //
    //  Tokenize into words and numbers only, ignoring all punctuation. Three things are recognized: Double quoted sequences, whose content becomes a single
    //  token; C style block comments, which are skipped; and runs of alphanumeric characters. Every token produced is of type TOKEN_ALPHANUM and carries the
    //  line number reported by the iterator where the token starts. This is suitable for indexation of documents although it is not suitable for querying
    //  a document index as it does not produce boolean expressions of words.
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input (cleared first)
    //              2)  C       The input chain
    //
    //  Returns:    E_NODATA    If the supplied chain is empty
    //              E_OK        If the supplied chain is tokenized</pre>
			<pre>    _hzfunc("TokenizeWords") ;</pre>
			<pre>    chIter      ci ;        //  For iteration of input
    hzChain     word ;      //  For building token
    hzToken     T ;         //  Token
    hzString    S ;         //  Temp string
    uint32_t    nLine ;     //  Line number on which the current token starts</pre>
			<pre>    toks.Clear() ;
    if (!C.Size())
        return E_NODATA ;</pre>
			<pre>    ci = C ;
    while (!ci.eof())
    {
        if (*ci &lt;= CHAR_SPACE)
        {
            //  Whitespace and control chars separate tokens but are otherwise ignored
            ci++ ;
        }
        else if (*ci == CHAR_DQUOTE)
        {
            //  Quoted sequence: Everything up to the closing quote (or end of input) forms one token
            nLine = ci.Line() ;
            word.Clear() ;
            S.Clear() ;</pre>
			<pre>            ci++ ;
            while (!ci.eof())
            {
                if (*ci == CHAR_DQUOTE)
                    { ci++ ; break ; }</pre>
			<pre>                word.AddByte(*ci) ;
                ci++ ;
            }</pre>
			<pre>            //  An empty pair of quotes produces no token
            if (word.Size())
            {
                S = word ;
                T.Init(S, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }
        }
        else if (ci == "/*")
        {
            //  Block comment: Skip to just beyond the terminator (or to end of input)
            ci += 2 ;
            while (!ci.eof())
            {
                if (ci == "*/")
                    { ci += 2 ; break ; }
                ci++ ;
            }
        }
        else if (IsAlphanum(*ci))
        {
            //  Plain word: Gather the whole alphanumeric run
            nLine = ci.Line() ;
            word.Clear() ;
            S.Clear() ;</pre>
			<pre>            do
            {
                word.AddByte(*ci) ;
                ci++ ;
            }
            while (!ci.eof() &amp;&amp; IsAlphanum(*ci)) ;</pre>
			<pre>            if (word.Size())
            {
                S = word ;
                T.Init(S, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }
        }
        else
        {
            //  Punctuation and anything else is ignored
            ci++ ;
        }
    }</pre>
			<pre>    return E_OK ;
}</pre>
			<pre>hzEcode TokenizeFreetext    (hzVect&lt;hzToken&gt;&amp; toks, hzChain&amp; C)
{
    //  Category:   Text Processing
    //
    //  Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function,
    //  ignores all punctuation between words although it does process punctuation within words.
    //
    //  Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw'
    //  words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps
    //  drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived word.
    //
    //  Examples:-
    //  1)  $4million   -&gt; dollars 4 million
    //  2)  bee-keeper  -&gt; bee-keeper beekeeper bee keeper.
    //
    //  Note that the derivation step is not yet implemented. As the code stands, a raw word is only output (in lower case, as TOKEN_ALPHANUM) if
    //  it contains at least one alpha and no digits, punctuation, symbols or out of range unicode chars. All other raw words are discarded.
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input (cleared first)
    //              2)  C       The input chain
    //
    //  Returns:    E_NODATA    If the supplied chain is empty
    //              E_OK        If the supplied chain is tokenized</pre>
			<pre>    _hzfunc("TokenizeFreetext") ;</pre>
			<pre>    hzChain     raw ;       //  For building raw word
    chIter      ci ;        //  For iteration of input
    chIter      ri ;        //  For iteration of raw word
    hzToken     T ;         //  Token
    hzString    word ;      //  Word (from raw)
    uint32_t    ucVal ;     //  Value of unicode char
    uint32_t    nPunct ;    //  Count of punctuation chars in raw word
    uint32_t    nSymb ;     //  Count of symbol chars in raw word
    uint32_t    nDigit ;    //  Count of digits in raw word
    uint32_t    nAlpha ;    //  Count of alphas in raw word
    uint32_t    nWeird ;    //  Count of unicode chars with a value above 255 in raw word
    uint32_t    ucLen ;     //  Length of unicode char
    uint32_t    nLine ;     //  Line number (at start of raw sequence)</pre>
			<pre>    toks.Clear() ;
    if (!C.Size())
        return E_NODATA ;</pre>
			<pre>    ci = C ;
    ci.Line(1) ;
    for (; !ci.eof() ;)
    {
        //  Ignore leading spaces and other non alphanumerics
        if (!IsAlphanum(*ci))
            { ci++ ; continue ; }</pre>
			<pre>        //  Now we only have to deal with chars that are part of the raw word
        raw.Clear() ;
        nPunct = nSymb = nDigit = nAlpha = nWeird = 0 ;</pre>
			<pre>        nLine = ci.Line() ;
        for (; !ci.eof() &amp;&amp; IsAlphanum(*ci) ;)
        {
            //  NOTE(review): The loop condition only admits alphanumeric chars, so the punctuation and symbol cases below presumably cannot fire until the
            //  condition is widened as part of the unfinished derivation work - confirm against IsAlphanum()
            if (IsPunct(*ci))
            {
                //  Punctuation directly before whitespace is trailing punctuation: Drop it and end the raw word
                if (ci[1] &lt;= CHAR_SPACE)
                    { ci++ ; break ; }</pre>
			<pre>                nPunct++ ;
                raw.AddByte(*ci) ;
                ci++ ;
                continue ;
            }</pre>
			<pre>            if (IsSymb(*ci))    { nSymb++ ;     raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsDigit(*ci))   { nDigit++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsAlpha(*ci))   { nAlpha++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }</pre>
			<pre>            //  Do we have a unicode sequence?
            if (AtUnicodeSeq(ucVal, ucLen, ci))
            {
                //  NOTE(review): For values above 255 only one byte is stepped over rather than ucLen bytes - confirm this is intended
                if (ucVal &gt; 255)
                    { nWeird++ ; ci++ ; }
                else
                    { raw.AddByte(ucVal &amp; 0xff) ; ci += ucLen ; }
                continue ;
            }</pre>
			<pre>            //  Anything else is taken as-is. The iterator must be advanced here or the loop would never terminate
            raw.AddByte(*ci) ;
            ci++ ;
        }</pre>
			<pre>        //  Now we have a raw word, do we add it to the token list as-is or do we spawn derivatives?</pre>
			<pre>        if (!raw.Size())
            continue ;</pre>
			<pre>        if (nAlpha)
        {
            if (!nDigit &amp;&amp; !nPunct &amp;&amp; !nSymb &amp;&amp; !nWeird)
            {
                word = raw ;
                word.ToLower() ;
                T.Init(word, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }</pre>
			<pre>            //  Now must fill in what happens when we have digits, hyphens and what have you.
        }
    }</pre>
			<pre>    return E_OK ;
}</pre>
			<pre>hzEcode TokenizeBool    (hzVect&lt;hzToken&gt;&amp; toks, hzChain&amp; C)
{
    //  Category:   Text Processing
    //
    //  Tokenize supplied chain into tokens expected to form a boolean expression. Whitespace, non-printable chars and comments (both block and line
    //  comments) are discarded. What remains is split into quoted strings, prefixed hexadecimal numbers, integers, numbers in standard form, single
    //  char separators, operators (runs of symbol chars) and general alphanumeric entities.
    //
    //  Each token carries the line number on which it starts, counting from 1. Lines are counted by this function as newlines are passed, including
    //  those found within comments and quoted strings.
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input (cleared first)
    //              2)  C       The input chain
    //
    //  Returns:    E_NODATA    If the supplied chain is empty
    //              E_OK        If the supplied chain is tokenized</pre>
			<pre>    _hzfunc("TokenizeBool") ;</pre>
			<pre>    hzChain     W ;             //  For building tokens
    chIter      ci ;            //  For iteration of input
    hzToken     T ;             //  Token
    hzString    S ;             //  For assembling the token value
    uint32_t    nLine ;         //  For assigning line numbers to tokens
    uint32_t    nStart ;        //  Line on which a quoted string opens
    char        tmp [4] ;       //  For separator</pre>
			<pre>    toks.Clear() ;
    if (!C.Size())
        return E_NODATA ;</pre>
			<pre>    ci = C ;
    nLine = 1 ;
    for (; !ci.eof() ;)
    {
        //  Increment line number when newlines are encountered
        if (*ci == CHAR_NL)
            { nLine++ ; ci++ ; continue ; }</pre>
			<pre>        //  Strip whitespace and other non-printable chars
        if (IsBinary(*ci) || IsWhite(*ci))
            { ci++ ; continue ; }</pre>
			<pre>        //  Eliminate comments. Newlines within a block comment still count towards the line number
        if (ci == "/*")
        {
            for (ci += 2 ; !ci.eof() &amp;&amp; ci != "*/" ; ci++)
            {
                if (*ci == CHAR_NL)
                    nLine++ ;
            }</pre>
			<pre>            //  Only step over the terminator if there is one (the comment may be unterminated)
            if (!ci.eof())
                ci += 2 ;
            continue ;
        }</pre>
			<pre>        if (ci == "//")
        {
            //  Stop at the newline rather than beyond it, so that the top of the loop counts the line
            for (ci += 2 ; !ci.eof() &amp;&amp; *ci != CHAR_NL ; ci++) ;
            continue ;
        }</pre>
			<pre>        //  Assume we are at the start of a token - Check for quoted string
        if (*ci == CHAR_DQUOTE)
        {
            nStart = nLine ;
            for (ci++ ; !ci.eof() &amp;&amp; *ci != CHAR_DQUOTE ; ci++)
            {
                if (*ci == CHAR_NL)
                    nLine++ ;
                W.AddByte(*ci) ;
            }</pre>
			<pre>            //  Step over the closing quote if there is one (the string may be unterminated)
            if (!ci.eof())
                ci++ ;</pre>
			<pre>            S = W ;
            W.Clear() ;
            T.Init(S, nStart, TOKEN_STRING) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Check for valid hexadecimal value
        if (_testHexnum(S, ci))
        {
            T.Init(S, nLine, TOKEN_STRING) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Check for integer
        if (_testInteger(S, ci))
        {
            T.Init(S, nLine, TOKEN_INTEGER) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Check for number (std form). As the integer test comes first, this is only reached by sequences that open with a sign
        if (_testNumber(S, ci))
        {
            T.Init(S, nLine, TOKEN_NUMBER) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Check for separator
        if (IsPunct(*ci))
        {
            tmp[0] = *ci ;
            tmp[1] = 0 ;
            ci++ ;
            T.Init(tmp, nLine, TOKEN_SEPARATOR) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Check for operator
        if (IsSymb(*ci))
        {
            for (; !ci.eof() &amp;&amp; IsSymb(*ci) ; ci++)
                W.AddByte(*ci) ;
            S = W ;
            W.Clear() ;
            T.Init(S, nLine, TOKEN_OPERATOR) ;
            toks.Add(T) ;
            continue ;
        }</pre>
			<pre>        //  Not an operator or separator - must be general entity
        //  We have a rule that productions must be written out
        //  in full. E.g. 2X must be written as 2*X. The system
        //  will interpret 2X as an alpha numeric quantity</pre>
			<pre>        for (; !ci.eof() &amp;&amp; IsAlphanum(*ci) ; ci++)
            W.AddByte(*ci) ;</pre>
			<pre>        //  A byte that fits none of the above categories is skipped. Without this the iterator would never advance and empty tokens would be added forever
        if (!W.Size())
            { ci++ ; continue ; }</pre>
			<pre>        //  NOTE(review): General entities are given type TOKEN_OPERATOR, unchanged from before - confirm this is what consumers of the tokens expect
        S = W ;
        W.Clear() ;
        T.Init(S, nLine, TOKEN_OPERATOR) ;
        toks.Add(T) ;
    }</pre>
			<pre>    return E_OK ;
}</pre>
			<pre>/*
**  Application level Tokenization Functions
*/</pre>
			<pre>hzEcode TokenizeChain   (hzVect&lt;hzToken&gt;&amp; toks, hzChain&amp; C, hzTokMode eMode)
{
    //  Category:   Text Processing
    //
    //  Tokenize the supplied chain (arg 2) into the supplied vector of tokens (arg 1) by handing over to the tokenizer selected by the mode (arg 3)
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input
    //              2)  C       The input chain
    //              3)  eMode   The tokenization regime (either WHITE, FTEXT or BOOL)
    //
    //  Returns:    E_RANGE     If the supplied mode is invalid
    //              Otherwise whatever the selected tokenizer returns, i.e.
    //              E_NODATA    If the supplied chain is empty
    //              E_OK        If the supplied chain is tokenized</pre>
			<pre>    if (eMode == TOK_MO_WHITE)
        return TokenizeWords(toks, C) ;</pre>
			<pre>    if (eMode == TOK_MO_FTEXT)
        return TokenizeFreetext(toks, C) ;</pre>
			<pre>    if (eMode == TOK_MO_BOOL)
        return TokenizeBool(toks, C) ;</pre>
			<pre>    //  No tokenizer exists for the mode
    return E_RANGE ;
}</pre>
			<pre>hzEcode TokenizeString  (hzVect&lt;hzToken&gt;&amp; toks, const char* pBuf, hzTokMode eMode)
{
    //  Category:   Text Processing
    //
    //  Tokenize the supplied char string (arg 2) into the supplied vector of tokens (arg 1). The string is first loaded into a working chain which is then
    //  handed to the tokenizer selected by the mode (arg 3)
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input
    //              2)  pBuf    The input char string
    //              3)  eMode   The tokenization regime (either WHITE, FTEXT or BOOL)
    //
    //  Returns:    E_RANGE     If the supplied mode is invalid
    //              Otherwise whatever the selected tokenizer returns, i.e.
    //              E_NODATA    If the chain built from the string is empty
    //              E_OK        If the string is tokenized</pre>
			<pre>    _hzfunc("TokenizeString") ;</pre>
			<pre>    hzChain C ;                 //  Working chain
    hzEcode rc = E_RANGE ;      //  Return code, stands as E_RANGE unless a tokenizer is selected</pre>
			<pre>    C = pBuf ;</pre>
			<pre>    switch  (eMode)
    {
    case TOK_MO_WHITE:  rc = TokenizeWords(toks, C) ;       break ;
    case TOK_MO_FTEXT:  rc = TokenizeFreetext(toks, C) ;    break ;
    case TOK_MO_BOOL:   rc = TokenizeBool(toks, C) ;        break ;
    }</pre>
			<pre>    return rc ;
}</pre>
			<pre>hzEcode TokenizeFile    (hzVect&lt;hzToken&gt;&amp; toks, const char* fname, hzTokMode eMode)
{
    //  Category:   Text Processing
    //
    //  Populate supplied vector of tokens (arg 1) by tokenizing the supplied file (named in arg 2) according to the modus operandi specified by
    //  arg 3. The whole file is read into a working chain which is then handed to the tokenizer selected by the mode.
    //
    //  Arguments:  1)  toks    The vector of tokens found in the input
    //              2)  fname   The input filename
    //              3)  eMode   The tokenization regime (either WHITE, FTEXT or BOOL)
    //
    //  Returns:    E_ARGUMENT  If no filename is supplied or the call to stat() on the file fails
    //              E_OPENFAIL  If the file cannot be opened for reading
    //              E_RANGE     If the supplied mode is invalid
    //              E_NODATA    If the file content is empty
    //              E_OK        If the file content is tokenized</pre>
			<pre>    _hzfunc("TokenizeFile") ;</pre>
			<pre>    /*
    **  Convert file into tokens
    */</pre>
			<pre>    std::ifstream   is ;    //  Input stream
    FSTAT           fs ;    //  File status
    hzChain         C ;     //  Working chain</pre>
			<pre>    if (!fname || !fname[0])
    {
        hzerr(E_ARGUMENT, "Cannot tokenize unnamed file") ;
        return E_ARGUMENT ;
    }</pre>
			<pre>    if (stat(fname, &amp;fs) == -1)
    {
        //  The filename must be passed to match the string directive in the message
        hzerr(E_ARGUMENT, "File (&#37;s) does not exist", fname) ;
        return E_ARGUMENT ;
    }</pre>
			<pre>    is.open(fname) ;
    if (is.fail())
    {
        hzerr(E_OPENFAIL, "File &#37;s", fname) ;
        return E_OPENFAIL ;
    }</pre>
			<pre>    //  Read the whole file into the chain. The stream is not needed once this is done
    C &lt;&lt; is ;
    is.close() ;</pre>
			<pre>    switch  (eMode)
    {
    case TOK_MO_WHITE:  return TokenizeWords(toks, C) ;
    case TOK_MO_FTEXT:  return TokenizeFreetext(toks, C) ;
    case TOK_MO_BOOL:   return TokenizeBool(toks, C) ;
    }</pre>
			<pre>    return E_RANGE ;
}</pre>