//
//  File:   hzRegex.cpp
//
//  Legal Notice:   This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
//  The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
//  Software Foundation, either version 3 of the License, or any later version.
//
//  The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
//  A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
//
//  Implements simplified regular-expression (glob style) matching, for filenames and other purposes
//
#include <iostream>
#include <stdarg.h>
#include <string.h>
#include "hzBasedefs.h"
#include "hzChars.h"
#include "hzTextproc.h"
#include "hzChain.h"
#include "hzDate.h"
#include "hzProcess.h"
/*
**  Functions
*/
static  bool    _checkformpart  (const char** test, const char** part)
{
    //  Support function for FormCheckCstr(). Checks if the test string, at its current position, matches the current part of the control string.
    //
    //  A part runs from *part up to (but not including) the next asterisk or the null terminator. Within a part:-
    //
    //      ?       Matches any one character (but never the null terminator)
    //      [0-9]   Matches any digit 0-9
    //      [a-z]   Matches any lower case letter
    //      [A-Z]   Matches any upper case letter
    //
    //  Any other character, including a '[' that does not open one of the above sequences, must match the test string exactly. A part that ends on the null
    //  terminator only matches if the test string is exhausted at the same point.
    //
    //  Arguments:  1) test     In/out. Current position in the test string
    //              2) part     In/out. Current position in the control string
    //
    //  Returns:    True    If the part matched. *part is left on the asterisk or null that ended the part and *test just beyond the matched characters.
    //              False   Otherwise. Neither *test nor *part is altered.

    const char* c ;         //  Progressive ptr for the control string part
    const char* s ;         //  Progressive ptr for test string

    for (c = *part, s = *test ;;)
    {
        //  An asterisk ends the part, whatever remains in the test string
        if (*c == CHAR_ASTERISK)
        {
            *part = c ;
            *test = s ;
            return true ;
        }

        //  End of control string: only a match if the test string ends here too
        if (*c == 0)
        {
            if (*s)
                break ;
            *part = c ;
            *test = s ;
            return true ;
        }

        //  Everything below consumes one test character, so an exhausted test string is a fail. Without this a '?' would match the terminator and step
        //  the test pointer beyond the end of the string.
        if (*s == 0)
            break ;

        if (*c == CHAR_QUERY)
            { c++ ; s++ ; continue ; }

        if (*c == CHAR_SQOPEN)
        {
            //  strncmp stops at the control string terminator, so a short control string is never read beyond its end
            if (strncmp(c, "[0-9]", 5) == 0 && *s >= '0' && *s <= '9')  { c += 5 ; s++ ; continue ; }
            if (strncmp(c, "[a-z]", 5) == 0 && *s >= 'a' && *s <= 'z')  { c += 5 ; s++ ; continue ; }
            if (strncmp(c, "[A-Z]", 5) == 0 && *s >= 'A' && *s <= 'Z')  { c += 5 ; s++ ; continue ; }
            //  Not a recognised (or not a matching) range so drop thru and treat the '[' as an ordinary char
        }

        if (*s == *c)
            { s++ ; c++ ; continue ; }
        break ;
    }

    return false ;
}
bool    FormCheckCstr   (const char* cpTest, const char* cpCtrl)
{
    //  Category:   Regular Expression
    //
    //  Checks if a test string is of the form specified by a control string.
    //
    //  The method is aimed primarily at filtering filenames and is not intended as a formal regular expression interpreter. The following sequences appearing in the control string
    //  are treated as wildcards as follows:-
    //
    //      *       One or more consecutive asterisks match to a series of zero or more characters of any form
    //      ?       Matches to one character
    //      [0-9]   Matches to any digit 0-9
    //      [a-z]   Matches to any lower case letter
    //      [A-Z]   Matches to any upper case letter
    //
    //  The control string is treated as a series of parts separated by an asterisk (or a series of consecutive asterisks). Each part either ends with an asterisk or the null
    //  terminator. If the control string starts with an asterisk the first part is of zero length, and likewise if it ends with an asterisk the last part is of zero length.
    //  The test is passed if the test string contains each part, in the order of appearance in the control string, subject to the following:-
    //
    //      - The first part must match at the very start of the test string (a zero length first part always does).
    //      - The last part must match at the very end of the test string (a zero length last part always does, so a trailing asterisk accepts whatever remains).
    //      - Each part in between is matched at the first position at which it fits.
    //
    //  Arguments:  1) cpTest   The test string
    //              2) cpCtrl   The control string
    //
    //  Returns:    True    If the test string is of the form specified by the control, or if the control string is null or empty
    //              False   Otherwise. Note that a null or empty test string fails against any non-empty control string, even "*".

    _hzfunc(__func__) ;

    const char*     c ;         //  Current position in the control string
    const char*     s ;         //  Progressive ptr for test string
    uint32_t        curPart ;   //  Current part (0 for the part that is anchored to the start of the test string)
    bool            bMatch ;    //  Match by _checkformpart()

    //  If no control string there is no criteria so an automatic pass
    if (!cpCtrl || !cpCtrl[0])
        return true ;

    //  If no test string this is invalid so an automatic fail
    if (!cpTest || !cpTest[0])
        return false ;

    //  Process the control string a part at a time
    for (curPart = 0, c = cpCtrl, s = cpTest ; *c && *s ;)
    {
        bMatch = _checkformpart(&s, &c) ;

        if (!bMatch)
        {
            //  The first part is anchored to the start of the test string so may not be searched for
            if (!curPart)
                return false ;

            //  Later parts follow an asterisk, so retry one place further along the test string
            s++ ;
            continue ;
        }

        if (*c == CHAR_ASTERISK)
        {
            //  Part matched and was ended by an asterisk. Advance the control until no longer on an asterisk.
            curPart++ ;
            for (; *c == CHAR_ASTERISK ; c++) ;

            //  If the asterisk(s) ended the control string, whatever remains of the test string is matched by them
            if (!*c)
                return true ;
        }
    }

    //  Otherwise the match is only good if both strings have been fully consumed
    if (*c == 0 && *s == 0)
        return true ;
    return false ;
}
uint32_t    FormCheckChain  (hzChain::Iter& ci, const char* cpCtrl)
{
    //  Category:   Regular Expression
    //
    //  This determines if the implied string (at the current supplied chain iterator) is of the supplied form. The match always starts at the supplied position. The
    //  supplied iterator is copied and only the copy is advanced.
    //
    //  The control string may contain the following wildcards:-
    //
    //      *       One or more consecutive asterisks skip chain content up to the first occurrence of the control character that follows them, or to the end of the chain if
    //              the asterisk(s) end the control string. The following control character is compared literally, so a wildcard directly after an asterisk is not supported,
    //              and there is no backtracking if the remainder then fails to match.
    //      ?       Matches to one character
    //      [0-9]   Matches to any digit 0-9
    //      [a-z]   Matches to any lower case letter
    //      [A-Z]   Matches to any upper case letter
    //
    //  Arguments:  1)  ci      The input chain as chain iterator
    //              2)  cpCtrl  The control string
    //
    //  Returns:    Number of chars in the chain that comprise the construct of the supplied form, or 0 if the chain content is not of that form.

    _hzfunc(__func__) ;

    chIter      z ;         //  Working copy of the chain iterator
    const char* c ;         //  Current position in the control string
    uint32_t    len = 0 ;   //  Length of construct

    //  NOTE(review): With no control string the function has always returned 1 (it was coded as 'return true'), even though no chars are consumed. The value is kept
    //  for compatibility with existing callers - confirm whether 0 was intended.
    if (!cpCtrl || !cpCtrl[0])
        return 1 ;

    if (ci.eof())
        return 0 ;

    //  Process the CTRL string
    for (c = cpCtrl, z = ci ; *c && !z.eof() ;)
    {
        if (*c == CHAR_QUERY)
            { c++ ; len++ ; z++ ; continue ; }

        if (*c == CHAR_SQOPEN)
        {
            //  strncmp stops at the control string terminator, so a short control string is never read beyond its end
            if (strncmp(c, "[0-9]", 5) == 0 && *z >= '0' && *z <= '9')  { c += 5 ; len++ ; z++ ; continue ; }
            if (strncmp(c, "[a-z]", 5) == 0 && *z >= 'a' && *z <= 'z')  { c += 5 ; len++ ; z++ ; continue ; }
            if (strncmp(c, "[A-Z]", 5) == 0 && *z >= 'A' && *z <= 'Z')  { c += 5 ; len++ ; z++ ; continue ; }
            //  Otherwise drop thru and treat the '[' as an ordinary char
        }

        if (*c == CHAR_ASTERISK)
        {
            //  Advance the control until no longer on an asterisk. Then look for the first occurrence of the control char in the chain
            for (; *c == CHAR_ASTERISK ; c++) ;
            for (; !z.eof() && *z != *c ; len++, z++) ;
            continue ;
        }

        if (*c == *z)
            { c++ ; len++ ; z++ ; continue ; }
        break ;
    }

    //  If the chain ran out while the control was on trailing asterisk(s), these match to zero chars so the form is still met
    for (; *c == CHAR_ASTERISK ; c++) ;

    if (*c == 0)
        return len ;
    return 0 ;
}
/*
**  Filename or Glob Filtering with support for HadronZoo date forms
*/
struct  _psudoDate
{
    //  Date and time held as one separate member per field, all of which are zero until populated

    int16_t Y ;         //  Year
    char    M ;         //  Month
    char    D ;         //  Day
    char    h ;         //  Hour
    char    m ;         //  Minute
    char    s ;         //  Second
    char    resv ;      //  Reserved

    //  Construct with every field zero
    _psudoDate  () : Y(0), M(0), D(0), h(0), m(0), s(0), resv(0)
    {
    }

    //  Set every field back to zero
    void    _clear  ()
    {
        Y = 0 ;
        M = 0 ;
        D = 0 ;
        h = 0 ;
        m = 0 ;
        s = 0 ;
        resv = 0 ;
    }
} ;
bool    _set_psudo_date (_psudoDate& pd, const char** test, const char** ctrl)
{
    //  Support function for FormCheckDate
    //
    //  Reads date and time fields from the test string, as directed by a date form in the control string. The date form is a series of field codes enclosed in
    //  curly braces. On entry *ctrl is expected to be on the opening brace (the first char is skipped without being checked). The field codes are:-
    //
    //      YYYY    Four digit year
    //      YY      Two digit year, to which 2000 is added
    //      MM      Two digit month (01 to 12)
    //      DD      Two digit day (01 to 31). No check is made here that the day is valid for the month.
    //      hh      Two digit hour (00 to 23)
    //      mm      Two digit minute (00 to 59)
    //      ss      Two digit second (00 to 59)
    //
    //  Nothing else, including separators, is accepted between the braces.
    //
    //  Arguments:  1) pd       The psudo date to be populated. Fields are written as they are read, so this may be partly updated even if the function fails.
    //              2) test     In/out. Current position in the test string
    //              3) ctrl     In/out. Current position in the control string
    //
    //  Returns:    True    If the psudo date is set. *test is then advanced beyond the digits read and *ctrl beyond the closing brace.
    //              False   Otherwise. Neither *test nor *ctrl is altered.

    _hzfunc(__func__) ;

    const char* t = *test ;
    const char* c = *ctrl ;

    for (c++ ; *c && *t ;)
    {
        if (*c == '}')
            break ;

        //  The four digit year is tested ahead of the two digit year, as the code for the former starts with the code for the latter
        if (c[0] == 'Y' && c[1] == 'Y' && c[2] == 'Y' && c[3] == 'Y' && IsDigit(t[0]) && IsDigit(t[1]) && IsDigit(t[2]) && IsDigit(t[3]))
            { pd.Y = ((t[0]-'0')*1000) + ((t[1]-'0')*100) + ((t[2]-'0')*10) + (t[3]-'0') ; t += 4 ; c += 4 ; continue ; }

        if (c[0] == 'Y' && c[1] == 'Y' && IsDigit(t[0]) && IsDigit(t[1]))
            { pd.Y = ((t[0]-'0')*10) + (t[1]-'0') + 2000 ; t += 2 ; c += 2 ; continue ; }

        if (c[0] == 'M' && c[1] == 'M' && IsDigit(t[0]) && IsDigit(t[1]))
        {
            pd.M = ((t[0]-'0')*10) + (t[1]-'0') ;
            if (pd.M > 0 && pd.M < 13)
                { t += 2 ; c += 2 ; continue ; }
            return false ;
        }

        if (c[0] == 'D' && c[1] == 'D' && IsDigit(t[0]) && IsDigit(t[1]))
        {
            //  A day of 00 is rejected, as a month of 00 is. FormCheckDate() takes a zero day to mean no day was given, so if 00 were accepted here it would
            //  be quietly replaced by the first of the month.
            pd.D = ((t[0]-'0')*10) + (t[1]-'0') ;
            if (pd.D > 0 && pd.D < 32)
                { t += 2 ; c += 2 ; continue ; }
            return false ;
        }

        if (c[0] == 'h' && c[1] == 'h' && IsDigit(t[0]) && IsDigit(t[1]))
        {
            pd.h = ((t[0]-'0')*10) + (t[1]-'0') ;
            if (pd.h < 24)
                { t += 2 ; c += 2 ; continue ; }
            return false ;
        }

        if (c[0] == 'm' && c[1] == 'm' && IsDigit(t[0]) && IsDigit(t[1]))
        {
            pd.m = ((t[0]-'0')*10) + (t[1]-'0') ;
            if (pd.m < 60)
                { t += 2 ; c += 2 ; continue ; }
            return false ;
        }

        if (c[0] == 's' && c[1] == 's' && IsDigit(t[0]) && IsDigit(t[1]))
        {
            pd.s = ((t[0]-'0')*10) + (t[1]-'0') ;
            if (pd.s < 60)
                { t += 2 ; c += 2 ; continue ; }
            return false ;
        }

        //  No other formats accepted yet
        return false ;
    }

    //  The loop also ends if either string runs out, so the form is only complete if the control is now on the closing brace
    if (*c != '}')
        return false ;
    c++ ;

    *test = t ;
    *ctrl = c ;
    return true ;
}
bool    FormCheckDate   (hzXDate& xdate, const char* cpTest, const char* cpCtrl)
{
    //  Category:   Regular Expression
    //
    //  Purpose:    Check if a test string contains a date and is of the form implied by a control string
    //
    //  The control string is interpreted as a simplfied regular expression. The method is aimed primarily at filtering filenames which are expected to contain
    //  dates and is not intended as a formal regular expression interpreter. The ? will match to any one character and the * will match to a series of zero or
    //  more characters as per the standards. However treatment of the [] constructs is more limited.
    //
    //  Arguments:  1)  xdate   The hzXDate instance to be populated in the event of HadronZoo date forms in the control being met in the test.
    //              2)  cpTest  The test string
    //              3)  cpCtrl  The control or 'form' string
    //
    //  Returns:    True    If the test string is of the form specified in the control
    //              False   Otherwise. Note that this may be because the test string contains an invalid date or partial date.
    //
    _hzfunc(__func__) ;
    _psudoDate  pd ;        //  For showing that anticipated date or partial date in the test string are valid
    const char* ctrl ;      //  For processing ctrl string into parts
    const char* test ;      //  Progressive ptr for test string
    uint32_t    val ;       //  For Hex number read
    xdate.Clear() ;
    if (!cpCtrl || !cpCtrl[0])  return true ;
    if (!cpTest || !cpTest[0])  return false ;
    test = cpTest ;
    ctrl = cpCtrl ;
    //  Perform the test
    for (; *test && *ctrl ;)
    {
        if (*ctrl == '*')
        {
            //  Because we don't know how many chars this pertains to, we recursivly call this function in a loop. We start with test
            //  where it is and advance it one place for each call. The control string for all these calls is set to one place beyond
            //  the asterisk(s).
            for (; *ctrl == '*' ; ctrl++) ;
            for (; *test ; test++)
            {
                if (FormCheckDate(xdate, test, ctrl))
                    return true ;
            }
            return false ;
        }
        if (*ctrl == '?')
            { test++ ; ctrl++ ; continue ; }
        if (*ctrl == '{')
        {
            if (!_set_psudo_date(pd, &test, &ctrl))
                return false ;
            continue ;
        }
        if (*ctrl == CHAR_BKSLASH)
        {
            //  This will be treated only as a backslash if followed by another backslash
            if (ctrl[1] == CHAR_BKSLASH)
                if (*test == CHAR_BKSLASH)
                    { test++ ; ctrl += 2 ; continue ; }
            if (ctrl[1] == 'n')
                if (*test == CHAR_NL)
                    { test++ ; ctrl += 2 ; continue ; }
            //  Whitespace
            if (ctrl[1] == 's')
                if (*test == CHAR_CTRLL || *test == CHAR_NL || *test == CHAR_CR || *test == CHAR_TAB || *test == CHAR_CTRLK)
                    { test++ ; ctrl += 2 ; continue ; }
            //  Non white space
            if (ctrl[1] == 'S')
                if (*test > CHAR_SPACE)
                    { test++ ; ctrl += 2 ; continue ; }
            if (ctrl[1] == 'c')
            {
                //  \cx
                //  Matches the control character indicated by x. For example, \cM matches a Control-M or carriage return character.
                //  The value of x must be in the range of A-Z or a-z. If not, c is assumed to be a literal c character.
                if (ctrl[2] >= 'a' && ctrl[2] <= 'z')
                    if (*test == (ctrl[2]-'a'))
                        { test++ ; ctrl += 3 ; continue ; }
                if (ctrl[2] >= 'A' && ctrl[2] <= 'Z')
                    if (*test == (ctrl[2]-'A'))
                        { test++ ; ctrl += 3 ; continue ; }
                if (*test == 'c')
                    { test++ ; ctrl += 2 ; continue ; }
            }
            if (ctrl[1] == 'x')
                if (IsHexnum(val, ctrl+1))
                    if (*test == (uchar) (val & 0xff))
                        { test++ ; ctrl += 4 ; continue ; }
            if (ctrl[1] == 't')
                if (*test == CHAR_TAB)
                    { test++ ; ctrl += 2 ; continue ; }
            //  Cope with vertical tab
            if (ctrl[1] == 'v')
                if (*test == CHAR_CTRLI)
                    { test++ ; ctrl += 2 ; continue ; }
            //  Cope with \f, \r, \n
            if (*test == ctrl[1])
                { test++ ; ctrl += 2 ; continue ; }
            //  Drop thru and just test for the backslash
        }
        //  No we have ordinary char in ctrl
        if (*test == *ctrl)
            { test++ ; ctrl++ ; continue ; }
        return false ;
        
    }
    if (pd.Y || pd.M || pd.D)
    {
        if ( pd.Y &&  pd.M &&  pd.D)
            xdate.SetDate(pd.Y, pd.M, pd.D) ;
        else if ( pd.Y &&  pd.M && !pd.D)
            xdate.SetDate(pd.Y, pd.M, 1) ;
        else if ( pd.Y && !pd.M &&  pd.D)   xdate.Clear() ; //invalid
        else if ( pd.Y && !pd.M && !pd.D)   xdate.Clear() ; //invalid
        else if (!pd.Y &&  pd.M &&  pd.D)
        {
            if (xdate.IsSet())
                pd.Y = xdate.Year() ;
            else
                { xdate.SysDateTime() ; pd.Y = xdate.Year() ; }
            xdate.SetDate(pd.Y, pd.M, pd.D) ;
        }
        else if (!pd.Y &&  pd.M && !pd.D)   xdate.Clear() ; //invalid
        else if (!pd.Y && !pd.M &&  pd.D)   xdate.Clear() ; //invalid
        else if (!pd.Y && !pd.M && !pd.D)   xdate.Clear() ; //invalid
        //  if (!pd.Y || !pd.M || !pd.D)
        //      xdate.SysDateTime() ;
        //  if (!pd.Y)  pd.Y = xdate.Year() ;
        //  if (!pd.M)  pd.M = xdate.Month() ;
        //  if (!pd.D)  pd.D = xdate.Day() ;
        //  xdate.SetDate(pd.Y, pd.M, pd.D) ;
        if (!xdate.IsSet())
            return false ;
    }
    return true ;
}
/*
**  Inline grep for statistical analysis etc
*/
hzEcode Grep    (hzChain& Zo, hzChain& Zi, const char* exp)
{
    //  Category:   Regular Expression
    //
    //  Effect a grep on the supplied exp to the input chain. The input is split into lines on the newline character and each line is tested, as a whole, against
    //  the expression by FormCheckCstr(). All matching lines are placed in the output chain, each followed by a newline. A final line that is not terminated by
    //  a newline is tested as well. The output chain is always cleared first.
    //
    //  Arguments:  1)  Zo  The output chain being a list of matching lines
    //              2)  Zi  The input chain
    //              3)  exp The test expression
    //
    //  Returns:    E_ARGUMENT  If no search expression is supplied
    //              E_OK        In all other circumstances

    _hzfunc(__func__) ;

    hzChain::Iter   zi ;    //  For iteration of the input
    hzChain         L ;     //  For isolating a line. This is then tested for the expression. If it passes, it is added as is to the output.
    hzString        S ;     //  Temp string

    Zo.Clear() ;

    //  Validate the expression first so that its absence is reported even if there is no input
    if (!exp || !exp[0])
        return E_ARGUMENT ;

    if (!Zi.Size())
        return E_OK ;

    for (zi = Zi ; !zi.eof() ; zi++)
    {
        if (*zi != CHAR_NL)
            { L.AddByte(*zi) ; continue ; }

        //  Now have a line in L
        S = L ;
        L.Clear() ;

        if (!FormCheckCstr(*S, exp))
            continue ;

        //  Line has passed so add
        Zo << S ;
        Zo.AddByte(CHAR_NL) ;
    }

    //  Anything left in L is a last line that had no terminating newline. Test this in the same way.
    if (L.Size())
    {
        S = L ;
        L.Clear() ;

        if (FormCheckCstr(*S, exp))
        {
            Zo << S ;
            Zo.AddByte(CHAR_NL) ;
        }
    }

    return E_OK ;
}