// // File: hzRegex.cpp // // Legal Notice: This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com) // // The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free // Software Foundation, either version 3 of the License, or any later version. // // The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses. //
//
//	Implements regular expression matches for filenames or other purposes
//
#include <iostream>
#include <stdarg.h>
#include "hzBasedefs.h" #include "hzChars.h" #include "hzTextproc.h" #include "hzChain.h" #include "hzDate.h" #include "hzProcess.h"
/* ** Functions */
static bool	_cfpRange	(const char* c, char lo, char hi)
{
	//	Report whether the control text at c is exactly the five character token "[lo-hi]".
	//
	//	The characters are compared one at a time and the comparison stops at the first mismatch, so a control string that ends part way through the
	//	token is never read beyond its null terminator.

	return c[0] == '[' && c[1] == lo && c[2] == '-' && c[3] == hi && c[4] == ']' ;
}

static bool	_checkformpart	(const char** test, const char** part)
{
	//	Check that the test string, from its current position, matches the current part of the control string. A part runs up to the next asterisk or
	//	to the end of the control string.
	//
	//	Within a part a '?' matches any single character, the tokens [0-9], [a-z] and [A-Z] match one character in the stated range and any other
	//	character must match literally. A '[' that does not begin one of these tokens is compared as a literal character.
	//
	//	Arguments:	1)	test	In/out. Current position in the test string
	//				2)	part	In/out. Current position in the control string
	//
	//	Returns:	True	If the part matched. *part is left on the asterisk or null that ended the part and *test just after the matched characters.
	//						A part that ends on the null only matches if the test string ends at the same point.
	//				False	Otherwise. *test and *part are left unchanged.

	const char*	c ;		//	Progressive ptr for the control string
	const char*	s ;		//	Progressive ptr for the test string

	for (c = *part, s = *test ;;)
	{
		if (*c == '?')
		{
			//	The single character wildcard needs a real character. Without this check the null terminator would be consumed and s would step
			//	beyond the end of the test string.
			if (!*s)
				break ;
			c++ ;
			s++ ;
			continue ;
		}

		if (*c == '[')
		{
			if (_cfpRange(c, '0', '9') && *s >= '0' && *s <= '9')	{ c += 5 ; s++ ; continue ; }
			if (_cfpRange(c, 'a', 'z') && *s >= 'a' && *s <= 'z')	{ c += 5 ; s++ ; continue ; }
			if (_cfpRange(c, 'A', 'Z') && *s >= 'A' && *s <= 'Z')	{ c += 5 ; s++ ; continue ; }
		}

		//	The part is complete on reaching an asterisk, or when control and test end together
		if (*c == '*' || (*c == 0 && *s == 0))
		{
			*part = c ;
			*test = s ;
			return true ;
		}

		if (*s == *c)
			{ s++ ; c++ ; continue ; }
		break ;
	}

	return false ;
}
bool FormCheckCstr (const char* cpTest, const char* cpCtrl) { // Category: Regular Expression // // Checks if a test string is of the form specified by a control string. // // The method is aimed primarily at filtering filenames and is not intended as a formal regular expression interpreter. The following sequences appearing in the control string // are treated as wildcards as follows:- // // * One or more consequtive asterisks match to a series of zero or more characters of any form // ? Matches to one character // [0-9] Matches to any digit 0-9 // [a-z] Matches to any lower case letter // [A-Z] Matches to any upper case letter // // The control string is firstly broken into parts by an asterisk (or a series of consequtive asterisks), appearing anywhere within it. Each part either ends with an asterisk // or a null terminator. If the control string starts with an asterisk, the first part (part 0) will just comprise the asterisk and be of zero length. Likewise if the control // string ends with an asterisk, the last part will just comprise the asterisk and be of zero length. The test for a match is passed if the test string contains each part and // in the order of appreance in the control string. The test string automatically matches a part of zero length. // // Arguments: 1) cpTest The test string // 2) cpCtrl The control string // // Returns: True If the test string is of the form specified by the control // False Otherwise.
_hzfunc(__func__) ;
const char* c ; // For processing ctrl string into parts const char* s ; // Progressive ptr for test string uint32_t nParts ; // Total number of parts uint32_t nChars ; // Total number of chars to be matched uint32_t curPart ; // Current part bool bMatch ; // Match by _checkformpart()
// If no control string there is no criteria so an automatic pass if (!cpCtrl || !cpCtrl[0]) return true ;
// If no test string this is invalid so an automatic fail if (!cpTest || !cpTest[0]) return false ;
// Break up the control string for (c = cpCtrl ; *c ;) { if (*c == CHAR_ASTERISK) { nParts++ ; for (; *c == CHAR_ASTERISK ; c++) ; } else if (*c == CHAR_SQOPEN) { if (memcmp(c, "[0-9]", 5)) { c += 5 ; nChars++ ; continue ; } if (memcmp(c, "[a-z]", 5)) { c += 5 ; nChars++ ; continue ; } if (memcmp(c, "[A-Z]", 5)) { c += 5 ; nChars++ ; continue ; } } else { c++ ; nChars++ ; } }
// Process the CTRL string for (curPart = 0, c = cpCtrl, s = cpTest ; *c && *s ;) { bMatch = _checkformpart(&s, &c) ; if (!curPart && !bMatch) return false ;
if (!bMatch) s++ ; else { if (*c == CHAR_ASTERISK) { // Advance the control until no longer on an asterisk. Then look for the first occurence of the control char in the test curPart++ ; for (; *c == CHAR_ASTERISK ; c++) ; } } }
if (*c == 0 && *s == 0) return true ; return false ; }
static bool	_fccRange	(const char* c, char lo, char hi)
{
	//	Report whether the control text at c, already known to start with the opening square bracket, continues as "lo-hi]".
	//
	//	The characters are compared one at a time and the comparison stops at the first mismatch, so a control string that ends part way through the
	//	token is never read beyond its null terminator.

	return c[1] == lo && c[2] == '-' && c[3] == hi && c[4] == ']' ;
}

uint32_t	FormCheckChain	(hzChain::Iter& ci, const char* cpCtrl)
{
	//	Category:	Regular Expression
	//
	//	This determines if the content found at the supplied chain iterator is of the form given by the control string. The supplied iterator is copied
	//	and the copy is advanced, so the caller's iterator is not moved by this function.
	//
	//	The control string supports '?' (any one character), the tokens [0-9], [a-z] and [A-Z] (one character in the range) and '*'. An asterisk run
	//	skips chain content up to the first occurrence of the control character that follows the run. There is no backtracking if the remainder of the
	//	control then fails to match at that point.
	//
	//	Arguments:	1)	ci		The input chain as chain iterator
	//				2)	cpCtrl	The control string
	//
	//	Returns:	Number of chars in the chain that comprise the construct of the supplied form. This is 0 if the chain content is not of the form or
	//				if the iterator is already at the end of the chain.
	//
	//	Note:		A null or empty control string returns 1 rather than a length. This is what the original 'return true' evaluated to and is kept for
	//				the benefit of existing callers.

	_hzfunc(__func__) ;

	chIter		z ;			//	Working copy of the supplied chain iterator
	const char*	c ;			//	Progressive ptr for the control string
	uint32_t	len = 0 ;	//	Length of construct

	if (!cpCtrl || !cpCtrl[0])
		return 1 ;

	if (ci.eof())
		return 0 ;

	//	Process the control string
	for (c = cpCtrl, z = ci ; *c && !z.eof() ;)
	{
		if (*c == CHAR_QUERY)
			{ c++ ; len++ ; z++ ; continue ; }

		if (*c == CHAR_SQOPEN)
		{
			if (_fccRange(c, '0', '9') && *z >= '0' && *z <= '9')	{ c += 5 ; len++ ; z++ ; continue ; }
			if (_fccRange(c, 'a', 'z') && *z >= 'a' && *z <= 'z')	{ c += 5 ; len++ ; z++ ; continue ; }
			if (_fccRange(c, 'A', 'Z') && *z >= 'A' && *z <= 'Z')	{ c += 5 ; len++ ; z++ ; continue ; }
		}

		if (*c == CHAR_ASTERISK)
		{
			//	Advance the control until no longer on an asterisk. Then look for the first occurrence of the control char in the chain
			for (; *c == CHAR_ASTERISK ; c++) ;
			for (; !z.eof() && *z != *c ; len++, z++) ;
			continue ;
		}

		if (*c == *z)
			{ c++ ; len++ ; z++ ; continue ; }
		break ;
	}

	//	If the chain ran out with only asterisks left in the control, these match zero characters. A mismatch never leaves c on an asterisk so this
	//	has no effect on the failure path.
	for (; *c == CHAR_ASTERISK ; c++) ;

	if (*c == 0)
		return len ;
	return 0 ;
}
/* ** Filename or Glob Filtering with support for HadronZoo date forms */
struct	_psudoDate
{
	//	Non-compact date for lexical-date comparisons. Each component of the date and time is held in its own member, with zero meaning the component
	//	has not been set.

	int16_t	Y ;		//	Year
	char	M ;		//	Month
	char	D ;		//	Day
	char	h ;		//	Hour
	char	m ;		//	Minute
	char	s ;		//	Second
	char	resv ;	//	Reserved

	//	A new instance starts with every component zero
	_psudoDate	()	: Y(0), M(0), D(0), h(0), m(0), s(0), resv(0)	{}

	void	_clear	()
	{
		//	Reset every component to zero
		Y = 0 ;
		M = 0 ;
		D = 0 ;
		h = 0 ;
		m = 0 ;
		s = 0 ;
		resv = 0 ;
	}
} ;
bool _set_psudo_date (_psudoDate& pd, const char** test, const char** ctrl) { // Support function for FormCheckDate // // Returns: True If the psudo date is set // False Otherwise.
_hzfunc(__func__) ;
const char* t = *test ; const char* c = *ctrl ;
for (c++ ; *c && *t ;) { if (*c == '}') break ;
if (c[0] == 'Y' && c[1] == 'Y' && c[2] == 'Y' && c[3] == 'Y' && IsDigit(t[0]) && IsDigit(t[1]) && IsDigit(t[2]) && IsDigit(t[3])) { pd.Y = ((t[0]-'0')*1000) + ((t[1]-'0')*100) + ((t[2]-'0')*10) + (t[3]-'0') ; t += 4 ; c += 4 ; continue ; }
if (c[0] == 'Y' && c[1] == 'Y' && IsDigit(t[0]) && IsDigit(t[1])) { pd.Y = ((t[0]-'0')*10) + (t[1]-'0') + 2000 ; t += 2 ; c += 2 ; continue ; }
if (c[0] == 'M' && c[1] == 'M' && IsDigit(t[0]) && IsDigit(t[1])) { pd.M = ((t[0]-'0')*10) + (t[1]-'0') ; if (pd.M > 0 && pd.M < 13) { t += 2 ; c += 2 ; continue ; } return false ; }
if (c[0] == 'D' && c[1] == 'D' && IsDigit(t[0]) && IsDigit(t[1])) { pd.D = ((t[0]-'0')*10) + (t[1]-'0') ; if (pd.D < 32) { t += 2 ; c += 2 ; continue ; } return false ; }
if (c[0] == 'h' && c[1] == 'h' && IsDigit(t[0]) && IsDigit(t[1])) { pd.h = ((t[0]-'0')*10) + (t[1]-'0') ; if (pd.h < 24) { t += 2 ; c += 2 ; continue ; } return false ; }
if (c[0] == 'm' && c[1] == 'm' && IsDigit(t[0]) && IsDigit(t[1])) { pd.m = ((t[0]-'0')*10) + (t[1]-'0') ; if (pd.m < 60) { t += 2 ; c += 2 ; continue ; } return false ; }
if (c[0] == 's' && c[1] == 's' && IsDigit(t[0]) && IsDigit(t[1])) { pd.s = ((t[0]-'0')*10) + (t[1]-'0') ; if (pd.s < 60) { t += 2 ; c += 2 ; continue ; } return false ; }
// No other formats accepted yet return false ; }
if (*c != '}') return false ; c++ ;
*test = t ; *ctrl = c ; return true ; }
bool FormCheckDate (hzXDate& xdate, const char* cpTest, const char* cpCtrl) { // Category: Regular Expression // // Purpose: Check if a test string contains a date and is of the form implied by a control string // // The control string is interpreted as a simplfied regular expression. The method is aimed primarily at filtering filenames which are expected to contain // dates and is not intended as a formal regular expression interpreter. The ? will match to any one character and the * will match to a series of zero or // more characters as per the standards. However treatment of the [] constructs is more limited. // // Arguments: 1) xdate The hzXDate instance to be populated in the event of HadronZoo date forms in the control being met in the test. // 2) cpTest The test string // 3) cpCtrl The control or 'form' string // // Returns: True If the test string is of the form specified in the control // False Otherwise. Note that this may be because the test string contains an invalid date or partial date. //
_hzfunc(__func__) ;
_psudoDate pd ; // For showing that anticipated date or partial date in the test string are valid const char* ctrl ; // For processing ctrl string into parts const char* test ; // Progressive ptr for test string uint32_t val ; // For Hex number read
xdate.Clear() ;
if (!cpCtrl || !cpCtrl[0]) return true ; if (!cpTest || !cpTest[0]) return false ;
test = cpTest ; ctrl = cpCtrl ;
// Perform the test for (; *test && *ctrl ;) { if (*ctrl == '*') { // Because we don't know how many chars this pertains to, we recursivly call this function in a loop. We start with test // where it is and advance it one place for each call. The control string for all these calls is set to one place beyond // the asterisk(s).
for (; *ctrl == '*' ; ctrl++) ;
for (; *test ; test++) { if (FormCheckDate(xdate, test, ctrl)) return true ; }
return false ; }
if (*ctrl == '?') { test++ ; ctrl++ ; continue ; }
if (*ctrl == '{') { if (!_set_psudo_date(pd, &test, &ctrl)) return false ; continue ;
}
if (*ctrl == CHAR_BKSLASH) { // This will be treated only as a backslash if followed by another backslash if (ctrl[1] == CHAR_BKSLASH) if (*test == CHAR_BKSLASH) { test++ ; ctrl += 2 ; continue ; }
if (ctrl[1] == 'n') if (*test == CHAR_NL) { test++ ; ctrl += 2 ; continue ; }
// Whitespace if (ctrl[1] == 's') if (*test == CHAR_CTRLL || *test == CHAR_NL || *test == CHAR_CR || *test == CHAR_TAB || *test == CHAR_CTRLK) { test++ ; ctrl += 2 ; continue ; }
// Non white space if (ctrl[1] == 'S') if (*test > CHAR_SPACE) { test++ ; ctrl += 2 ; continue ; }
if (ctrl[1] == 'c') { // \cx // Matches the control character indicated by x. For example, \cM matches a Control-M or carriage return character. // The value of x must be in the range of A-Z or a-z. If not, c is assumed to be a literal c character. if (ctrl[2] >= 'a' && ctrl[2] <= 'z') if (*test == (ctrl[2]-'a')) { test++ ; ctrl += 3 ; continue ; }
if (ctrl[2] >= 'A' && ctrl[2] <= 'Z') if (*test == (ctrl[2]-'A')) { test++ ; ctrl += 3 ; continue ; }
if (*test == 'c') { test++ ; ctrl += 2 ; continue ; } }
if (ctrl[1] == 'x') if (IsHexnum(val, ctrl+1)) if (*test == (uchar) (val & 0xff)) { test++ ; ctrl += 4 ; continue ; }
if (ctrl[1] == 't') if (*test == CHAR_TAB) { test++ ; ctrl += 2 ; continue ; }
// Cope with vertical tab if (ctrl[1] == 'v') if (*test == CHAR_CTRLI) { test++ ; ctrl += 2 ; continue ; }
// Cope with \f, \r, \n if (*test == ctrl[1]) { test++ ; ctrl += 2 ; continue ; }
// Drop thru and just test for the backslash }
// No we have ordinary char in ctrl if (*test == *ctrl) { test++ ; ctrl++ ; continue ; } return false ; }
if (pd.Y || pd.M || pd.D) { if ( pd.Y && pd.M && pd.D) xdate.SetDate(pd.Y, pd.M, pd.D) ;
else if ( pd.Y && pd.M && !pd.D) xdate.SetDate(pd.Y, pd.M, 1) ;
else if ( pd.Y && !pd.M && pd.D) xdate.Clear() ; //invalid else if ( pd.Y && !pd.M && !pd.D) xdate.Clear() ; //invalid
else if (!pd.Y && pd.M && pd.D) { if (xdate.IsSet()) pd.Y = xdate.Year() ; else { xdate.SysDateTime() ; pd.Y = xdate.Year() ; } xdate.SetDate(pd.Y, pd.M, pd.D) ; }
else if (!pd.Y && pd.M && !pd.D) xdate.Clear() ; //invalid else if (!pd.Y && !pd.M && pd.D) xdate.Clear() ; //invalid else if (!pd.Y && !pd.M && !pd.D) xdate.Clear() ; //invalid
// if (!pd.Y || !pd.M || !pd.D) // xdate.SysDateTime() ;
// if (!pd.Y) pd.Y = xdate.Year() ; // if (!pd.M) pd.M = xdate.Month() ; // if (!pd.D) pd.D = xdate.Day() ;
// xdate.SetDate(pd.Y, pd.M, pd.D) ;
if (!xdate.IsSet()) return false ; }
return true ; }
/* ** Inline grep for statistical analysis etc */
hzEcode	Grep	(hzChain& Zo, hzChain& Zi, const char* exp)
{
	//	Category:	Regular Expression
	//
	//	Effect a grep on the supplied exp to the input chain. The input is split into lines on the newline character and each line is tested with
	//	FormCheckCstr(). Every line that passes is placed in the output chain followed by a newline. A final line that is not terminated by a newline
	//	is tested and output in the same way.
	//
	//	Arguments:	1)	Zo	The output chain being a list of matching lines. This is cleared on entry.
	//				2)	Zi	The input chain
	//				3)	exp	The test expression
	//
	//	Returns:	E_ARGUMENT	If the input chain has content but no search expression is supplied
	//				E_OK		In all other circumstances, including an empty input chain

	_hzfunc(__func__) ;

	hzChain::Iter	zi ;	//	For iteration of the input
	hzChain			L ;		//	For isolating a line. This is then tested for the expression. If it passes, it is added as is to the output.
	hzString		S ;		//	Temp string

	Zo.Clear() ;
	if (!Zi.Size())
		return E_OK ;

	if (!exp || !exp[0])
		return E_ARGUMENT ;

	for (zi = Zi ; !zi.eof() ; zi++)
	{
		if (*zi != CHAR_NL)
		{
			L.AddByte(*zi) ;
			continue ;
		}

		//	Now have a line in L
		S = L ;
		L.Clear() ;

		if (FormCheckCstr(*S, exp))
		{
			//	Line has passed so add
			Zo << S ;
			Zo.AddByte(CHAR_NL) ;
		}
	}

	//	Anything left in L is a last line that had no newline after it. Without this step that line would never be tested.
	if (L.Size())
	{
		S = L ;
		L.Clear() ;

		if (FormCheckCstr(*S, exp))
		{
			Zo << S ;
			Zo.AddByte(CHAR_NL) ;
		}
	}

	return E_OK ;
}