//
// File: hzTokens.h
//
// Legal Notice: This file is part of the HadronZoo C++ Class Library.
//
// Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
// The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
// Software Foundation, either version 3 of the License, or any later version.
//
// The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
//
// See synopsis Tokenization
//
// This package facilitates tokenization of strings into tokens classified as one of the following types:-
//
// 1) Alphanumeric. The token may have chars A-Z, a-z, 0-9 or _
// 2) Operator. The token is an arithmetic or logical operator.
// 3) Value. This can be either of the following:-
// 1) An integer. (has only chars 0-9)
// 2) A Double. (has form of standard number)
// 3) A String Value. (Must be enclosed in quotes)
// 4) Separator. Punctuation. Always a single char.
//
// The operation is as follows:-
//
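// 1) Declare a hzVect<hzToken> to receive the tokens
// 2) Call TokenizeString(), TokenizeChain() or TokenizeFile(), passing the vector, the text to be tokenized and the tokenization mode (hzTokMode).
// 3) Move thru the resulting hzVect of tokens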
//
#ifndef hzTokens_h
#define hzTokens_h
#include "hzTmplVect.h"
// Token size limit. Not referenced anywhere else in this header; presumably the maximum length of a single token - confirm the units (bytes or chars) against the
// tokenizer implementation.
#define MAXTOKENSIZE 256
/*
** Special char types for tokenization purposes
**
** Apart from NOTYPE (zero), every value below is a distinct single bit, so the values can be OR-ed together and tested with a bitwise AND. Whether the tokenizer
** actually combines them for a given char is not visible from this header.
*/
#define NOTYPE 0x0000 // undefined char type
#define BINARY 0x0001 // unprintable, ctl etc
#define WHITE 0x0002 // space, tab or newline
#define DIGIT 0x0004 // 0 - 9 only
#define HEXDIGIT 0x0008 // hexadecimal digit. NOTE(review): comment previously read "0 - 9 only" (same as DIGIT), presumably a copy/paste slip - confirm that a-f/A-F are flagged
#define ALPHA 0x0010 // a - z and A - Z
#define HYPEN 0x0020 // any symbol used as hyphen (the macro name is spelt HYPEN; kept as is because other files may reference it)
#define PUNCT 0x0040 // any punctuation char
#define SYMB 0x0080 // symbols, eg math operators
#define NUMCHAR 0x0100 // any char used in a number
/*
** Token types
*/
enum hzTokenType
{
    // Category: Text Processing
    //
    // Classification assigned to each token. The numeric values are spelt out explicitly; they are the same values the enumerators would receive by default.

    TOKEN_ALPHANUM  = 0,    // Alphanumeric token: a run of chars from [a-z], [A-Z] or [0-9]
    TOKEN_OPERATOR  = 1,    // One of the standard operators
    TOKEN_SEPARATOR = 2,    // A separator
    TOKEN_INTEGER   = 3,    // A run of digits [0-9]
    TOKEN_NUMBER    = 4,    // A number, eg 10, 10.8, 1.08e-2 etc
    TOKEN_HEXVALUE  = 5,    // An integer expressed in hex, must begin with '0x'
    TOKEN_DOUBLE    = 6,    // A number of standard form
    TOKEN_STRING    = 7,    // A string
    TOKEN_COMMENT   = 8,    // A comment, either /* ... */ or //
    TOKEN_UNDEFINED = 9     // None of the above
} ;
enum hzTokMode
{
    // Category: Text Processing
    //
    // Selects the tokenization regime to apply. The numeric values are spelt out explicitly; they are the same values the enumerators would receive by default.

    TOK_MO_WHITE = 0,   // Words are whatever is separated by whitespace
    TOK_MO_FTEXT = 1,   // Words are split according to free text rules
    TOK_MO_BOOL  = 2,   // Boolean expression: operators and operands
    TOK_MO_CPP   = 3    // C++ source: classes, functions, operators and operands
} ;
/*
** hzToken class
*/
class hzToken
{
// Category: Text Processing
//
// Tokens are the meaningful units derived from the input text.
private:
hzString m_Tok ; // The token
uint32_t m_nLine ; // The line it was on in the file
hzTokenType m_eType ; // The token type
public:
hzToken (void)
{
m_nLine = 0 ;
m_eType = TOKEN_UNDEFINED ;
}
~hzToken (void)
{
}
// Set functions
hzEcode Init (const char* txt, uint32_t line, hzTokenType type) { m_Tok = txt ; m_nLine = line ; m_eType = type ; return E_OK ; }
hzEcode Init (const hzString& txt, uint32_t line, hzTokenType type) { m_Tok = txt ; m_nLine = line ; m_eType = type ; return E_OK ; }
// Get functions
const hzString& Value (void) const { return m_Tok ; }
uint32_t LineNo (void) const { return m_nLine ; }
hzTokenType Type (void) const { return m_eType ; }
// Operators
bool operator= (const hzToken& op)
{
m_Tok = op.m_Tok ;
m_nLine = op.m_nLine ;
m_eType = op.m_eType ;
return m_Tok.Length() ? true : false ;
}
bool operator== (const hzToken& op)
{
if (m_nLine != op.m_nLine) return false ;
if (m_eType != op.m_eType) return false ;
return m_Tok == op.m_Tok ;
}
bool operator== (const char* pStr) { return m_Tok == pStr ? true : false ; }
bool operator!= (const char* pStr) { return m_Tok == pStr ? false : true ; }
bool operator! (void) { return m_Tok ? false : true ; }
} ;
/*
** Prototypes
*/
// Tokenization entry points. Only the declarations are visible in this header, so the notes below describe the signatures rather than the implementation.
//
// Arguments: Toklist  Vector of hzToken passed by non-const reference - presumably populated with the tokens found (whether it is cleared first is not visible here; confirm)
//            C / pBuf / filepath  The text source: a hzChain, a C string, or a file path respectively
//            eMode    The tokenization regime to apply (see hzTokMode)
//
// Returns: A hzEcode status. The specific codes returned are defined by the implementation.
hzEcode TokenizeChain (hzVect<hzToken>& Toklist, hzChain& C, hzTokMode eMode) ;
hzEcode TokenizeString (hzVect<hzToken>& Toklist, const char* pBuf, hzTokMode eMode) ;
hzEcode TokenizeFile (hzVect<hzToken>& Toklist, const char* filepath, hzTokMode eMode) ;
#endif // hzTokens_h