//
//  File:   hzDocument.h
//
//  Legal Notice: This file is part of the HadronZoo C++ Class Library.
//
//  Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
//  The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
//  Software Foundation, either version 3 of the License, or any later version.
//
//  The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
//  A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
//
//  The hzDocument class is the base for hzDocXml and hzDocHtml classes, instances of which would constitute a single XML or HTML document respectively.
//
#ifndef hzDocument_h
#define hzDocument_h
#include "hzErrcode.h"
#include "hzTmplArray.h"
#include "hzTmplList.h"
#include "hzTmplVect.h"
#include "hzTmplSet.h"
#include "hzTmplMapS.h"
#include "hzTmplMapM.h"
#include "hzEmaddr.h"
#include "hzUrl.h"
#include "hzProcess.h"
#define HDOC_ONLOAD_LINKS   0x01    //  Upon loading of HTML document, populate the m_setLinks and m_vecLinks with links found in page
#define HDOC_ONLOAD_FORMS   0x02    //  Upon loading of HTML document, populate the m_Forms with forms & thier fields found in page
/*
**  SECTION 1:  HTML Tags
*/
enum    hzDoctype
{
    //  Category:   Document
    //
    //  Enumeration for acceptable document types. This is a vastly cut down set compared to MIME types because it is limited to document structures
    //  that can be loaded into instances of a derivative of the hzDocument class.
    DOCTYPE_UNDEFINED,      //  Document type undefined
    DOCTYPE_HTML,           //  Document is HTML
    DOCTYPE_XML,            //  Document is XML
} ;
enum    hzHtagtype
{
    //  Category:   Document
    //
    //  Enumeration for all currently legal HTML5 tags.
    //
    //  Note that this enum, along with enum hzHtagclass, enum hzHtagrule and the hzHtagform class, are related matter, pertenant both to the generation of HTML in Dissemino and
    //  the parsing of any incoming HTML (web scraping)
    //
    //  Note in the below comments, the fields P, C and S respectfully means '', 'content' and 'subtags'. 
    HTAG_NULL,          //  No valid tag
    //  PAGE STRUCTURE              Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_DOCTYPE,               //  <!DOCTYPE>          N N N   Defines the document type
    HTAG_HTML,                  //  <html>              N N N   Defines an HTML document
    HTAG_HEAD,                  //  <head>              N N N   Defines information about the document
    HTAG_TITLE,                 //  <title>             N T N   Defines a title for the document
    HTAG_META,                  //  <meta>              N N N   Defines metadata about an HTML document
    HTAG_BODY,                  //  <body>              N N Y   Defines the document's body
    HTAG_BASE,                  //  <base>              N N ?   Specifies the base URL/target for all relative URLs in a document
    HTAG_BASEFONT,              //  <basefont>          N N ?   (deprecated) Specifies a default color, size, and font for all text in a document
    HTAG_STYLE,                 //  <style>             N N N   Marks a stylesheet
    //  FRAMES                      Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_FRAME,                 //  <frame>             N N Y   (deprecated) Defines a window (a frame) in a frameset
    HTAG_FRAMESET,              //  <frameset>          N N Y   (deprecated) Loads 2 or more frame elements which are separate document
    HTAG_IFRAME,                //  <iframe>            N N Y   (HTML5 vers) Defines an inline frame
    //  PROGRAMING                  Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_PARAM,                 //  <param>             N N ?   ???
    HTAG_SCRIPT,                //  <script>            N N N   Marks start of JavaScript (or other)
    HTAG_NOFRAMES,              //  <noframes>          N X N   (deprecated) Defines an alternate content for users that do not support frames
    HTAG_NOSCRIPT,              //  <noscript>          N X N   The content of a tag-antitag pair gives message if browser does not support scripts
    HTAG_APPLET,                //  <applet>            N T Y   Marks an applet
    //  DATA/LAYOUT                 Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_TABLE,                 //  <table>             N N Y   For starting a table
    HTAG_TCOL,                  //  <col>               N Y Y   For defining alignment in a table's columns (saves doing it in the cols themselves)
    HTAG_TCOLGRP,               //  <colgroup>
    HTAG_TH,                    //  <th>                N X N   Table heading
    HTAG_TR,                    //  <tr>                N N Y   Table row marker
    HTAG_TBL_CEL,               //  <td>                N E Y   Table column cell (may either have a set of child tags OR have text content on the same basis as <p>)
    HTAG_TBODY,                 //  <tbody>             N Y N   Uused to group the body content in an HTML table - used in conjunction with the thead and tfoot elements
    HTAG_THEAD,                 //  <thead>             N Y N   See above
    HTAG_TFOOT,                 //  <tfoot>             N Y N   See above
    HTAG_DIV,                   //  <div>               N Y Y   Defines a functional section in a document
    HTAG_SPAN,                  //  <span>              N N Y   Defines a section in a document
    HTAG_FIELDSET,              //  <fieldset>          N N Y   Tag is used to logically group together elements in a form. (draws a box around the related form elements)
    HTAG_LEGEND,                //  <legend>            N Y N   Tag defines a caption for the fieldset element.
    HTAG_MENU,                  //  <menu>              N Y Y   Marks menu (of items)
    HTAG_DT,                    //  <dt>                X X N   Definition term
    HTAG_DD,                    //  <dd>                X X N   The definition (dd always follows a dt)
    HTAG_DFN,                   //  <dfn>               X X N   This is another form of the definition term tag
    HTAG_DIR,                   //  <dir>               N N Y   Directory list (of <li> tags)
    HTAG_DLIST,                 //  <dl>                N N Y   Definition lists
    HTAG_OLIST,                 //  <ol>                N N Y   Ordered list
    HTAG_ULIST,                 //  <ul>                N N Y   Unordered list
    HTAG_ITEM,                  //  <li>                N Y N   Xlat List item
    HTAG_TIME,                  //  <time>              Y Y N   Cont Either attr of datetime=value or value found in the content
    //  LINKS                       Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_ANCHOR,                //  <a>                 X X N   Marks a link
    HTAG_NAV,                   //  <nav>               X X N   Marks a link
    HTAG_LINK,                  //  <link>              X X N   Marks a link
    //  INPUT/FORMS                 Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_FORM,                  //  <form>              N N Y   Marks start of a form
    HTAG_INPUT,                 //  <input>             X X N   Covers button/checkbox/file/hidden/image/password/radio/reset/submit/text/value
    HTAG_TEXTAREA,              //  <textarea>          N X N   Marks an text area (multiline input box)
    HTAG_SELECT,                //  <select>            N N Y   Selector
    HTAG_OPTGROUP,              //  <optgroup>          N X Y   Groups of select options
    HTAG_OPTION,                //  <option>            N X N   Select options
    HTAG_BUTTON,                //  <button>            X X N   Alternative to <input type=submit ...>
    HTAG_LABEL,                 //  <label>             N Y N   E.g. <form>
                                //                          <label for="male">Male</label><input type="radio" name="sex" id="male" />
                                //                          <label for="female">Female</label><input type="radio" name="sex" id="female" />
                                //                          </form>
    //  INFORMATION                 Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_ABBR,                  //  <abbr>
    HTAG_ACRONYM,               //  <acronym>
    HTAG_ADDRESS,               //  <address>
    //  SYSTEM                      Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_EMBED,                 //  <embed>
    HTAG_NOEMBED,               //  <noembed>
    //  Font control tags. Note these cannot have meaningful content in thier own right and are instead considered part of the content of a tag
    //  that can.
    //  FONT CONTROL                Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_BOLD,                  //  <b>                 X X N   Bold text (must be closed)
    HTAG_ULINE,                 //  <u>                 X X N   Underline
    HATG_ITALIC,                //  <i>                 X X N   Italics
    HTAG_EM,                    //  <em>                X X N   Emphasize
    HTAG_STRONG,                //  <strong>            X X N   Emphasize
    HTAG_CENTER,                //  <center>            N X Y   Center
    HTAG_FONT,                  //  <font>              X X Y   Font setting
    HTAG_BIG,                   //  <big>               X X Y   Larger text
    HTAG_SMALL,                 //  <small>             X X Y   Smaller text
    //  TEXT DESCRIPTION            Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_HEADER,                //  <header>            N X N   Defines a header for a document or section
    HTAG_FOOTER,                //  <footer>            N X N   Defines a footer for a document or section
    HTAG_SECTION,               //  <section>           N X N   Defines a section in a document
    HTAG_ARTICLE,               //  <article>           N X N   Defines an article
    HTAG_ASIDE,                 //  <aside>             N X N   Defines content aside from the page content
    HTAG_DETAILS,               //  <details>           N X N   Defines additional details that the user can view or hide
    HTAG_SUMMARY,               //  <summary>           N X N   Defines a visible heading for a <details> element
    HTAG_DIALOG,                //  <dialog>            N X N   Defines a dialog box or window
    //  TEXT GROUPING               Tagname             P C S   Desc
    //  ----------------------------------------------------
    //HTAG_FB,                  //  <fb>                ? ? ?   Facebook tag - treak as a comment block and ignore
    //HTAG_GOOGLE,              //  <g>                 ? ? ?   Google tag - treak as a comment block and ignore
    HTAG_MISC_ORG,              //  <x:y>               ? ? ?   Third party tag containing a colon
    HTAG_STRIKE,                //  <strike>            ? ? ?   Effectively a comment block
    HTAG_S,                     //  <s>                 ? ? ?   Same as <strike>
    HTAG_DEL,                   //  <del>               X N N   Marks deleted text (contents to be ignored)
    HTAG_INS,                   //  <ins>               ? ? ?   Marks an inserted part! (contents to be agregated to parent)
    HTAG_KBD,                   //  <kbd>               ? ? ?   Keyboard text
    HTAG_QUOTATION,             //  <q>, </q>           ? ? ?   Paragragh (can be closed or be ended by next <p> or by data structure tags
    //  TEXT CONTROL                Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_PARAG,                 //  <p>, </p>           N X Y   Paragragh (can be closed or be ended by next <p> or by data structure tags)
    HTAG_H1,                    //  <h1>                X X N   Size 1 heading
    HTAG_H2,                    //  <h2>                X X N   Size 2 heading
    HTAG_H3,                    //  <h3>                X X N   Size 3 heading
    HTAG_H4,                    //  <h4>                X X N   Size 4 heading
    HTAG_H5,                    //  <h5>                X X N   Size 5 heading
    HTAG_H6,                    //  <h6>                X X N   Size 6 heading
    HATG_BR,                    //  <br>                X X N   Non-breaking line (newline). This has no anti-tag
    HATG_HR,                    //  <hr>                X X N   Defines a thematic change in the content. No anti-tag
    HATG_TT,                    //  <tt>                X X N   Teletype
    HTAG_HGROUP,                //  <hgroup>            X X N   heading group
    HTAG_CODE,                  //  <code>              X X N   For de-marking computer code
    HTAG_SAMP,                  //  <samp>              X X N   For de-marking smaple computer code
    HTAG_CITE,                  //  <cite>              X X N   For marking out text
    HTAG_CAPTION,               //  <caption>           X X N   For marking out text
    HTAG_VAR,                   //  <var>               X X N   For marking out text
    HTAG_PRE,                   //  <pre>               N T N   As-is text (allows multiple spaces and tags)
    HTAG_BQ,                    //  <bq>                ? ? ?   ???
    HTAG_BLOCKQUOTE,            //  <blockquote>        X X Y   Indents encased HTML block
    HTAB_BDO,                   //  <bdo>               X X Y   Controls text direction (left-to-right instead of right-to-left)
    HTAG_SUBSCRIPT,             //  <sub>               X T N   Subscripts (eg chemical formula)
    HTAG_SUPERSCRIPT,           //  <sup>               X T N   For expressing powers
    //  IMAGE                       Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_IMG,                   //  <img>               X N N   Includes an image file
    HTAG_MAP,                   //  <map>               X N N   Uses a series of <area> tags to mark out area of an image.
    HTAG_AREA,                  //  <area>              ? ? ?   Used to mark out areas of an image
    HTAG_OBJECT,                //  <object>            X X Y   Includes objects such as images, audio, videos, Java applets, ActiveX, PDF, and Flash.
    HTAG_HR,                    //  <hr>                X N N   Horizontal line
    HTAG_MARQUEE,               //  <marquee>           ? ? ?   Ticker
    HTAG_CANVAS,                //  <canvas>            N N Y   Used to draw graphics, on the fly, via JavaScript
    HTAG_FIGCAPTION,            //  <figcaption>        X X N   Defines a caption for a <figure> element
    HTAG_FIGURE,                //  <figure>            X X N   Specifies self-contained content
    //  IMAGE SVG                   Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_SVG,                   //  <svg>               N N Y   Used to draw graphics, on the fly, no JS needed
    HTAG_SVG_ANIMATE,           //  <animate>
    HTAG_SVG_ANIMATEMOTION,     //  <animateMotion>
    HTAG_SVG_ANIMATEXFORM,      //  <animateTransform>
    HTAG_SVG_CIRCLE,            //  <circle>            N N N   Used in SVG
    HTAG_SVG_CLIPPATH,          //  <clipPath>
    HTAG_SVG_DEFS,              //  <defs>              N N N   Used in SVG
    HTAG_SVG_DESC,              //  <desc>
    HTAG_SVG_DISCARD,           //  <discard>
    HTAG_SVG_ELLIPSE,           //  <ellipse>
    HTAG_SVG_BLEND,             //  <feBlend>
    HTAG_SVG_COLORMATRIX,       //  <feColorMatrix>
    HTAG_SVG_COMPONENTXFER,     //  <feComponentTransfer>
    HTAG_SVG_COMPOSITE,         //  <feComposite>
    HTAG_SVG_CONVOLVEMATRIX,    //  <feConvolveMatrix>
    HTAG_SVG_DIFFUSELIGHTING,   //  <feDiffuseLighting>
    HTAG_SVG_DISPLACEMENTMAP,   //  <feDisplacementMap>
    HTAG_SVG_DISTANTLIGHT,      //  <feDistantLight>
    HTAG_SVG_DROPSHADOW,        //  <feDropShadow>
    HTAG_SVG_FLOOD,             //  <feFlood>
    HTAG_SVG_FUNC_A,            //  <feFuncA>
    HTAG_SVG_FUNC_B,            //  <feFuncB>
    HTAG_SVG_FUNC_G,            //  <feFuncG>
    HTAG_SVG_FUNC_R,            //  <feFuncR>
    HTAG_SVG_DEGAUSS,           //  <feGaussianBlur>    N N N   Used in SVG
    HTAG_SVG_IMAGE,             //  <feImage>
    HTAG_SVG_MERGE,             //  <feMerge>
    HTAG_SVG_MERGENODE,         //  <feMergeNode>
    HTAG_SVG_MORPHOLOGY,        //  <feMorphology>
    HTAG_SVG_OFFSET,            //  <feOffset>
    HTAG_SVG_POINTLIGHT,        //  <fePointLight>
    HTAG_SVG_SPECLIGHT,         //  <feSpecularLighting>
    HTAG_SVG_SPOTLIGHT,         //  <feSpotLight>
    HTAG_SVG_TITLE,             //  <feTile>
    HTAG_SVG_TURBULENCE,        //  <feTurbulence>
    HTAG_SVG_FILTER,            //  <filter>            N N N   Used in SVG
    HTAG_SVG_FOREIGNOBJECT,     //  <foreignObject>
    HTAG_SVG_GENERIC,           //  <g>
    HTAG_SVG_HATCH,             //  <hatch>
    HTAG_SVG_HATCHPATH,         //  <hatchpath>
    HTAG_SVG_LINE,              //  <line>
    HTAG_SVG_lINEARGRADIENT,    //  <linearGradient>
    HTAG_SVG_MARKER,            //  <marker>
    HTAG_SVG_MASK,              //  <mask>
    HTAG_SVG_METADATA,          //  <metadata>
    HTAG_SVG_MPATH,             //  <mpath>
    HTAG_SVG_PATH,              //  <path>
    HTAG_SVG_PATTERN,           //  <pattern>
    HTAG_SVG_POLYGON,           //  <polygon>           N N N   Used in SVG
    HTAG_SVG_POLYLINE,          //  <polyline>
    HTAG_SVG_RADIALGRADIENT,    //  <radialGradient>
    HTAG_SVG_RECT,              //  <rect>              N N N   Used in SVG
    HTAG_SVG_SET,               //  <set>
    HTAG_SVG_STOP,              //  <stop>
    HTAG_SVG_SWITCH,            //  <switch>
    HTAG_SVG_SYMBOL,            //  <symbol>
    HTAG_SVG_TEXT,              //  <text>
    HTAG_SVG_TEXTPATH,          //  <textPath>
    HTAG_SVG_TSPAN,             //  <tspan>
    HTAG_SVG_USE,               //  <use>
    HTAG_SVG_VIEW,              //  <view>
    //  AUDIO/VIDEO                 Tagname             P C S   Desc
    //  ----------------------------------------------------
    HTAG_AUDIO,                 //  <audio>             X X N   Defines sound content
    HTAG_SOURCE,                //  <source>            X X N   Defines multiple media resources for media elements (<video> and <audio>)
    HTAG_TRACK,                 //  <track>             X X N   Defines text tracks for media elements (<video> and <audio>)
    HTAG_VIDEO,                 //  <video>             X X N   Defines a video or movie
    //  THIRD PARTY
    //  ----------------------------------------------------
    HTAG_FBLIKE,                //  <fb:like>   Defines the Facebook like icon
    HTAG_UNKNOWN                //  Invalid tag
} ;
enum    hzHtagclass
{
    //  Category:   Document
    //
    //  HTML Tag groupings by function
    //
    //  Note that this enum, along with enum hzHtagtype, enum hzHtagrule and the hzHtagform class, are related matter, pertenant both to the generation of HTML
    //  in Dissemino and the parsing of any incoming HTML (web scraping)
    HTCLASS_NUL,        //  No valid class
    HTCLASS_HDR,        //  Page structure tags
    HTCLASS_DAT,        //  Data/layout tags
    HTCLASS_LNK,        //  Link tags
    HTCLASS_INP,        //  Input/form tags. Attrs only, no content
    HTCLASS_INF,        //  Information tags
    HTCLASS_SYS,        //  System tags (embed directives)
    HTCLASS_TXT,        //  Font control tags. Note these cannot have meaningful content in thier own right and are instead treated as part of the content of a
                        //  parent tag that can. This also means that during data extraction from a HTML page, these tags are ignored.
    HTCLASS_IMG,        //  Image tags (no text content)
    HTCLASS_3RD         //  Third party tag (no text content, ignored)
} ;
enum    hzHtagrule
{
    //  Category:   Document
    //
    //  Rules concerning opening and closing of tags
    //
    //  Note that this enum, along with enum hzHtagtype, enum hzHtagclass and the hzHtagform class, are related matter, pertenant both to the generation of HTML
    //  in Dissemino and the parsing of any incoming HTML (web scraping)
    HTRULE_NULL,        //  No valid rule
    HTRULE_PAIRED,      //  HTML Tag must be closed with either the <... /> notation or with the anti-tag
    HTRULE_SINGLE,      //  HTML Tag is not closed as it is its own anti-tag
    HTRULE_OPTION       //  HTML Tag closure is optional (anti-tag exists but is not required)
} ;
class   hzHtagform
{
    //  Category:   Document
    //
    //  Used in the tag lookup table to tie together the name, type, class and rule for each HTML tag
    //
    //  hzTagform is the item of interest when establishing if a tag is a legal HTML(5) tag. It comprises the tag name/type both in text and enumerated form and
    //  the applicable tag class and rule. The function HtmlInit() establishes a lookup table of all known HTML5 tags for the benefit of the HTML parser.
public:
    hzString    name ;  //  Tag name
    hzHtagtype  type ;  //  Tag type
    hzHtagclass klas ;  //  Tag class
    hzHtagrule  rule ;  //  Tag rule
    hzHtagform& operator=   (const hzHtagform& op)
    {
        name = op.name ;
        type = op.type ;
        klas = op.klas ;
        rule = op.rule ;
        return *this ;
    }
    operator const char*    (void) const    { return *name ; }
} ;
class   hzDocHtml ;
class   hzDocMeta
{
    //  Category:   Document
    //
    //  A document description, commonly used as a 'page marker' for webpages in web-scraping programs.
    //
    //  This will have the URL, the scrambled filename for storing the page in the repository and both the date last fetched and the expiry date.
public:
    hzXDate     m_Download ;    //  Date and time of last download
    hzXDate     m_Modified ;    //  Last modified date according to page header
    hzXDate     m_Expires ;     //  When page falls out of date
    hzUrl       m_urlReq ;      //  Requested URL
    hzUrl       m_urlAct ;      //  Actual location of page
    hzString    m_Title ;       //  Title of page (or sub-RSS)
    hzString    m_Desc ;        //  Description (RSS only)
    hzString    m_Filename ;    //  Filename in repository
    hzString    m_Etag ;        //  Entity tag if supplied
    uint32_t    m_Id ;          //  Assigned by webscrape to track order
    hzDoctype   m_Doctype ;     //  Document type (XML/HTML)
    hzDocMeta   (void)
    {
        m_Id = 0 ;
    }
    void    Clear   (void)
    {
        m_urlReq = (char*) 0 ;
        m_urlAct = (char*) 0 ;
        m_Title = (char*) 0 ;
        m_Desc = (char*) 0 ;
        m_Filename = (char*) 0 ;
        m_Etag = (char*) 0 ;
        m_Modified.Clear() ;
        m_Expires.Clear() ;
        m_Doctype = DOCTYPE_UNDEFINED ;
    }
    hzDocMeta&  operator=   (const hzDocMeta& op)
    {
        m_urlReq = op.m_urlReq ;
        m_urlAct = op.m_urlAct ;
        m_Title = op.m_Title ;
        m_Desc = op.m_Desc ;
        m_Filename = op.m_Filename ;
        m_Etag = op.m_Etag ;
        m_Modified = op.m_Modified ;
        m_Expires = op.m_Expires ;
        m_Doctype = op.m_Doctype ;
        m_Id = op.m_Id ;
        return *this ;
    }
    hzUrl&  Locale  (void)
    {
        if (*m_urlAct)
            return m_urlAct ;
        return m_urlReq ;
    }
    hzString    Domain  (void)
    {
        if (*m_urlAct)
            return m_urlAct.Domain() ;
        return m_urlReq.Domain() ;
    }
} ;
class   hzDocument
{
    //  Category:   Document
    //
    //  Pure virtual base class for the HTML document (hzDocHtml) and the XML document (hzDocXml). These two classes were tied together only because web scrape
    //  tools could find themselves dowloading both but all were considered documents.
protected:
    hzDocMeta           m_Info ;    //  Metadata
    hzChain             m_Error ;   //  Error reporting
public:
    hzMapM  <uint32_t,hzPair>   m_NodeAttrs ;   //  I:Many Node id to node attributes
    hzSet   <hzString>          m_Dict ;        //  All strings
    //  Constructor/Destructor
    hzDocument          (void)  {}
    virtual ~hzDocument (void)  {}
    hzEcode Init    (const hzUrl& url) ;
    //const char*   Xlate   (uint32_t strNo) const  { return m_Dict.Xlate(strNo) ; }
    void        SetMeta (const hzDocMeta& dm)   { m_Info = dm ; }
    const hzChain&  Error   (void)  { return m_Error ; }
    virtual hzEcode     Load    (hzChain& Z) = 0 ;
    virtual hzDoctype   Whatami (void) const = 0 ;
} ;
class   hzDocHtml ;
class   hzHtmElem
{
    //  Category:   Document
    //
    //  hzHtmlElem is the internal manifestation of a HTML tag, within a HTML document.
    //
    //  Note that the parent node, first child and next sibling, are represented by 32-bit unsigned numbers. These are addresses in the document hzStrRepos (dictionary). To access
    //  the dictionary, each HTML tag contains a pointer back to the host document which holds the dictionary.
protected:
    hzDocHtml*  m_pHostDoc ;        //  Host document
    hzString    m_Name ;            //  Name of this tag
    uint32_t    m_Parent ;          //  Parent node (for root this is a hzDocHtml, all other nodes this is a hzHtmElem)
    uint32_t    m_Children ;        //  Sub nodes of this node
    uint32_t    m_Sibling ;         //  Next node (in the series m_Chridren belonging to the parent of this)
    uint32_t    m_Uid ;             //  Unique id (within page)
    uint32_t    m_nLine ;           //  Line number of tag in the page source
    uint32_t    m_nAnti ;           //  Line number of anti-tag in the page source
    uint16_t    m_nLevel ;          //  Level of node (root node is 0)
    uint16_t    m_nAttrs ;          //  Number of parameters (not set until page load complete)
    uint32_t    m_nSubnodes ;       //  Number of sub-nodes (not set until page load complete)
    hzHtagtype  m_Type ;            //  Type of HTML tag
    //  Adding subnodes
    hzEcode     _addnode    (hzHtmElem* pNode) ;
    uint32_t    _testnode   (hzVect<hzHtmElem*>& ar, const char* srchExp, uint32_t& nLimit, uint32_t nLevel, bool bLog = false) ;
public:
    hzChain     m_tmpContent ;      //  Content of this tag
    hzString    m_fixContent ;      //  Contents of the tag (after loading if n_tempContent is small)
    //  Constructors/Destructors
    hzHtmElem   (void)
    {
        m_Parent = m_Children = m_Sibling = 0 ;
        m_Uid = -1 ;
        m_nLine = 0 ;
        m_nAnti = 0 ;
        m_nLevel = 0 ;
        m_nAttrs = m_nSubnodes = 0 ; 
        m_Type = HTAG_NULL ;
    }
    virtual ~hzHtmElem  (void)  {}
    //  Initialization
    hzEcode     Init        (hzDocHtml* pRoot, hzHtmElem* pParent, hzString& tagname, hzHtagtype type, uint32_t id, uint32_t line) ;
    void        _setanti    (uint32_t line) { m_nAnti = line ; }
    //  Geting subnodes and params
    void        FindSubnodes    (hzVect<hzHtmElem*>& result, const char* srchExp, bool bLog = false) ;
    hzHtmElem*  GetFirstChild   (void) const ;
    hzHtmElem*  Sibling         (void) const ;
    hzHtmElem*  Parent          (void) const ;
    const hzDocHtml*    GetHostDoc  (void) const    { return m_pHostDoc ; }
    //  Get other node info
    hzDocHtml*  GetTree     (void) ;
    hzString    Name        (void) const    { return m_Name ; }
    hzHtagtype  Type        (void) const    { return m_Type ; }
    uint32_t    Level       (void) const    { return m_nLevel ; }
    uint32_t    Line        (void) const    { return m_nLine ; }
    uint32_t    Anti        (void) const    { return m_nAnti ; }
    uint32_t    GetUid      (void) const    { return m_Uid ; }
} ;
class   hzHtmCol    : public hzHtmElem
{
    //  Category:   Document
    //
    //  Column in a table
public:
    hzString    m_Title ;       //  To appear above each row
    hzString    m_HdrRef ;      //  Link from heading if any
    hzString    m_Value ;       //  Cell value (must update this for each cell value)
    hzString    m_ValRef ;      //  Link from cell if any
    hzString    m_Class ;       //  Style-sheet class (if set this will be default for the table's <td> tags)
    uint32_t    m_BgColor ;     //  Background color (if not set use table value)
    uint32_t    m_FgColor ;     //  Foreground color (if not set use table value)
    uint16_t    m_Margin ;      //  Width of preceeding spaces
    uint16_t    m_Width ;       //  Width in pixels
    hzHtmCol    (void)
    {
        m_BgColor = 0x000000 ;
        m_BgColor = 0xffffff ;
        m_Margin = 5 ;
        m_Width = 10 ;
    }
} ;
class   hzHtmTbl    : public hzHtmElem
{
    //  Category:   Document
    //
    //  In HTML tables have sub-tags only of <tr> (table row). Columns are effected by the <tr> sub-tags. In the first row the columns can be named by the <th>
    //  (table heading) tag, although this is frequently done with the <td> tag instead - meaning that 
    //
    //  When querying a table for a cell value, we have to specify the column-name and the row number. If the table has <th> tags in
    //  the first row, the values of these will be compared to the supplied column name. If the table does not have <th> tags in the
    //  first row and instead has <td> tags, the values of these will be used instead.
    hzVect<hzHtmCol*>   m_Cols ;    //  Columns
    hzString    m_Title ;           //  Title of table
    hzString    m_Class ;           //  Style sheet class
    hzString    m_Url ;             //  To be pre-pending for links
    hzString    m_Empty ;           //  Msg to be displayed when DoFooter() is called with no rows done
    uint32_t    m_BgColor ;         //  Background color
    uint32_t    m_FgColor ;         //  Foreground color
    uint16_t    m_Height ;          //  Total hieght in pixels
    uint16_t    m_Width ;           //  Total width in pixels
    uint16_t    m_Border ;          //  Border width
    uint16_t    m_Cellspace ;       //  Cell spacing
    uint16_t    m_Cellpad ;         //  Cell pading
    uint16_t    m_nCols ;           //  Number of columns
    uint16_t    m_nRows ;           //  Number of data rows
public:
    hzHtmTbl    (void)
    {
        m_BgColor = 0xffffff ;
        m_FgColor = 0x000000 ;
        m_Height = 500 ;
        m_Width = 800 ;
        m_Border = 0 ;
        m_Cellspace = 0 ;
        m_Cellpad = 0 ;
        m_nCols = m_nRows = 0 ;
    }
    ~hzHtmTbl   (void)
    {
    }
    //  Initialization
    void    AddColumn   (hzHtmCol* col)
    {
        m_Cols.Add(col) ;
    }
    //  Set functions
    void        SetTitle    (const char* title)     { m_Title = title ; }
    void        SetClass    (const char* cls)       { m_Class = cls ; }
    void        SetUrl      (const char* url)       { m_Url = url ; }
    void        SetEmpty    (const char* empty)     { m_Empty = empty ; }
    void        SetBgColor  (uint32_t color)        { m_BgColor = color ; }
    void        SetFgColor  (uint32_t color)        { m_FgColor = color ; }
    void        SetHeight   (uint32_t h)            { m_Height = h ; }
    void        SetWidth    (uint32_t w)            { m_Width = w ; }
    //  Get functions
    uint32_t    Colcount    (void) ;
    uint32_t    Rowcount    (void) ;
    hzString&   GetUrl      (void)  { return m_Url ; }
    hzString    GetColl     (uint32_t nCol) ;
    hzString    GetCell     (uint32_t nRow, uint32_t nCol) ;
} ;
/*
**  Section 3:  Entities that comprise a series of HTML tags - The hzDocHtml class
*/
class   hzHtmForm
{
    //  Category:   Document
    //
    //  hzHtmForm is analogous to the hzwForm (web form) class except that the latter is for page generation and so has 'rendering control' data
    //  to enable HTML formation. The hzHtmForm class marks a form found in a downloaded page and notes only the form name and a list of fields
    //  manifest as name-value pairs (name of field plus a pre-set value if provided).
public:
    hzList<hzPair>  fields ;    //  Incident fields
    hzString        name ;      //  Form name
    ~hzHtmForm  (void)
    {
        fields.Clear() ;
    }
} ;
class   hzDocHtml   : public hzDocument
{
    //  Category:   Document
    //
    //  A whole or partial HTML Page or Document
    hzHtmElem*  m_pRoot ;           //  All tags found on level 0
    hzHtmElem*  m_pHead ;           //  All tags found in the header (head is level 1)
    hzHtmElem*  m_pBody ;           //  All tags found in the body (body is level 1)
    hzString    m_CookieSess ;      //  Set by HTML header upon Browse() or LoadHtml() or LoadFile()
    hzString    m_CookiePath ;      //  Set by HTML header upon Browse() or LoadHtml() or LoadFile()
    hzString    m_Title ;           //  Will be filename on export
    hzString    m_EntityTag ;       //  Entity tag from the header if given
    //  Documet building functions
    hzHtmElem*  _proctag    (hzHtmElem* pParent, hzChain::Iter& cur, hzHtagtype type) ;
    hzEcode     _htmPreproc (hzChain& Z) ;
    //  Reporting and export
    void        _report     (hzLogger& xlog, hzHtmElem* node) ;
    hzEcode     _xport      (hzChain& Z, hzHtmElem* node) ;
    //  Support for element selection
    hzEcode     _selectTag  (hzSet<hzHtmElem*>& parents, hzSet<hzHtmElem*>& elements, const hzString& tagspec) ;
    hzEcode     _selectTerm (hzSet<hzHtmElem*>& elements, const hzString& term) ;
    hzEcode     _selectExp  (hzSet<hzHtmElem*>& elements, const hzString& exp) ;
public:
    hzMapM  <hzString,hzHtmElem*>   m_mapTags ; //  All nodes within document
    hzArray <hzHtmElem>     m_arrNodes ;        //  Complete set of nodes, in order of appearence in the document
    hzSet   <hzUrl>         m_setLinks ;        //  Links to other pages occuring in this page's body
    hzSet   <hzEmaddr>      m_Emails ;          //  Email addresses occuring in this page's body
    hzVect  <hzUrl>         m_vecLinks ;        //  All elements in the order they appear
    hzVect  <hzHtmElem*>    m_vecTags ;         //  All elements in the order they appear
    hzVect  <hzString>      m_vecText ;         //  All text sections found in page
    hzList  <hzHtmForm*>    m_Forms ;           //  List of forms appearing in the page (if any)
    hzChain     m_Content ;     //  Full content of web-page
    hzString    m_Base ;        //  Base for URLs begining with /
    hzDocHtml   (void) ;
    ~hzDocHtml  (void) ;
    hzDoctype   Whatami (void) const    { return DOCTYPE_HTML ; }
    hzEcode Load    (hzChain& Z) ;              //  Load HTML document from a hzChain instance
    hzEcode Load    (const char* cpFilename) ;  //  Load HTML document from a file
    hzEcode Import  (const hzString& filepath) ;
    hzEcode Export  (const hzString& filepath) ;
    void    Report  (hzLogger& xlog) ;
    void    Clear   (void) ;
    //  Get functions
    hzHtmElem*  GetRoot     (void)  { return m_pRoot ; }
    hzString&   CookieSess  (void)  { return m_CookieSess ; }
    hzString&   CookiePath  (void)  { return m_CookiePath ; }
    //  Obtain a vector of elements according to tagname and attribute incidence
    hzEcode     FindElements    (hzVect<hzHtmElem*>& elements, hzString& tagname, hzString& attrName, hzString& attrValue) ;
    //  Obtain a vector of elements according to filtering criteria
    hzEcode     FindElements    (hzVect<hzHtmElem*>& elements, const char* srchExp) ;
    hzEcode     FindElements    (hzVect<hzHtmElem*>& elements, const hzString& srchExp) ;
    //  Extract links according to filtering criteria, from a page either as basic (URLs only) or URLs plus tag content
    uint32_t    ExtractLinksBasic   (hzVect<hzUrl>&links, const hzSet<hzString>& domains, const hzString& criteria) ;
    uint32_t    ExtractLinksContent (hzMapS<hzUrl,hzString>&links, const hzSet<hzString>& domains, const hzString& criteria) ;
} ;
#define INIT_START  0   //  Nothing happened yet
/*
**  Classes concerning XML formatting issues
**
**  XML, DTD, XLST and HadronZoo Records.
**
**  XML is a heirarchical data form in which data is held in tags. The data itself boils down to a series of name-value pairs which are 
**  related by virtual of thier position. The values are typeless strings although XML does give direction as to how these strings are
**  to be parsed.
**
**  DTD (Document Type Definition) puts constraints on the XML which will be regared by the XML parser as invalid unless it conforms
**  to the form implied in the DTD.
**
**  XLST (Extensible Language Stylesheet Transformations) are applied to XML documents so that they may be transformed from one form
**  (in one DTD) into another (in a second DTD). XLST describes the steps that must be taken to achieve this and this includes function
**  calls and data typing.
**
**  HadronZoo Records (hzRecord) are like XML, a heirarchical data form except that the values in the name-value pairs are strongly
**  type controlled. The form of the records is strictly defined by hzRecfmt which is alogous to DTD except that is contains typing
**  directives. To populate records from an XML source will require some form of an XLST.
*/
enum    XmlType
{
    //  Category:   Document
    //
    //  Controls the type of data held by a tag
    XML_TYPE_UNDEF,     //  Undefined (default)
    XML_CDATA,          //  Interpret node contents as pure character data requiring no processing or interpretation
    XML_PCDATA          //  Parsed char data - Change chars to entities where appropriate and treat <p> ect as HTML
                        //  tags rather than XML tags (nodes) 
} ;
enum    hzXOccur
{
    //  Category:   Document
    //
    //  This is a quantifier (given by a single character in the DTD) that immediately follows the specified item to which it applies,
    //  to restrict the number of successive occurrences of these items at the specified position in the content of the element; it
    //  and may be either:
    XML_INCID_PLUS,     //  (+) Specifying that there must be one or more occurrences of the item.
    XML_INCID_STAR,     //  (*) Specifying that any number (zero or more) of occurrences is allowed, the item is thus optional.
    XML_INCID_MAKR,     //  (?) Specifying that there must not be more than one occurrence, the item is optional.
    XML_INCID_DFLT      //  ( ) If there is no quantifier, the specified item must occur exactly once at the specified position in the content of the element.
} ;
enum    hzHtagInd
{
    //  Category:   Document
    //
    //  This is returned by the AtHtmlTag function which determines if the supplied chain iterator is at either:-
    HTAG_IND_NULL,      //  Not at a HTML tag or anti-tag
    HTAG_IND_OPEN,      //  At the start of an opening HTML tag
    HTAG_IND_ANTI,      //  At the start of an HTML anti-tag
    HTAG_IND_SELF       //  At the start of a self closing HTML tag
} ;
#if 0
struct  hzAttrSet
{
    //  Category:   Document
    //
    //  This defines what attributes are allowed within an XML tag
    hzAttrSet*  next ;          //  Next attribute if applicable
    hzString    m_Name ;        //  Name of attribute
    hzXOccur    m_Incidence ;   //  Quantity control
    hzAttrSet   (void)
    {
        next = 0 ;
        m_Incidence = XML_INCID_DFLT ;
    }
} ;
struct  hzTagCtrl
{
    //  Category:   Document
    //
    //  This defines what form of data and attributes nodes of this tag can have. hzTagCtrl is part
    //  of the doctype for the hzDocXml class.
    hzAttrSet*  m_pAttrs ;      //  Allowed attributes
    hzString    m_Name ;        //  Name of tag (copy of that in doctype map)
    hzString    m_FQN ;         //  Fully qualified name of tag (copy of that in doctype map)
    XmlType     m_Type ;        //  Type eg CDATA
    hzXOccur    m_Incidence ;   //  Quantity control
    hzTagCtrl   (void)
    {
        m_pAttrs = 0 ;
        m_Type = XML_TYPE_UNDEF ;
        m_Incidence = XML_INCID_DFLT ;
    }
} ;
class   hzDocCtrl
{
    //  Category:   Document
    //
    //  Document controller: This is the internal manifestation of a document type definition (DTD) and defines an object model to which a hzDocuement
    //  instance must conform to be valid.
public:
    hzMapM<hzString,hzTagCtrl*> m_TagsByName ;  //  All tags within document (by tag name only)
    hzMapS<hzString,hzTagCtrl*> m_TagsByFQN ;   //  All tags within document (by fully qualified tag name)
    hzTagCtrl*  m_pRoot ;       //  The root tag, the document has this tag at it's root
    hzString    m_Name ;        //  Name of the doctype
    hzDocCtrl   (void)
    {
        m_pRoot = 0 ;
    }
} ;
#endif
/*
**  SECTION 2:  Classes storing XML data
*/
class   hzDocXml ;
class   hzXmlNode
{
    //  Category:   Document
    //
    //  These form the nodes in the hzXmlObj tree (the XML document)
    hzDocXml*   m_pHostDoc ;    //  Host XML document
    hzString    m_Ptxt ;        //  Node pre-text
    hzString    m_Name ;        //  Node name
    uint32_t    m_Parent ;      //  Parent node. This will be 0 in the root node.
    uint32_t    m_Children ;    //  First child (in a series of children)
    uint32_t    m_Sibling ;     //  Next node (in the series of children). This will be 0 in the last child node.
    uint32_t    m_Uid ;         //  Unique id
    uint32_t    m_nLine ;       //  Line number of tag
    uint32_t    m_nAnti ;       //  Line number of anti-tag
    uint16_t    m_nLevel ;      //  Level of node (root node is 0)
    uint16_t    m_nCol ;        //  Column of node within line wthin document
    uint16_t    m_bXmlesce ;    //  Node may contain HTML tags as part of the content
    uint16_t    m_nAttrs ;      //  Number of attributes
    hzXmlNode*  _findsubnode    (bool& bMatch, const hzString& name, const hzString& attr, const hzString& value) ;
    uint32_t    _testnode       (hzVect<hzXmlNode*>& ar, const char* srchExp, uint32_t& nLimit) ;
public:
    hzString    m_fixContent ;  //  Contents of the tag (after loading if m_tmpContent is small)
    hzXmlNode   (void)
    {
        m_pHostDoc = 0 ;
        m_Parent = m_Children = m_Sibling = 0 ;
        //m_Uid = m_snPtxt = m_snName = m_nLine = m_nAnti = 0 ;
        m_Uid = m_nLine = m_nAnti = 0 ;
        m_nLevel = m_nCol = m_bXmlesce = m_nAttrs = 0 ;
    }
    ~hzXmlNode  (void)  {}
    hzXmlNode&  operator=   (const hzXmlNode& op)
    {
        m_pHostDoc = op.m_pHostDoc ;
        m_Parent = op.m_Parent ;
        m_Children = op.m_Children ;
        m_Sibling = op.m_Sibling ;
        m_Uid = op.m_Uid ;
        //m_snPtxt = op.m_snPtxt ;
        //m_snName = op.m_snName ;
        m_Ptxt = op.m_Ptxt ;
        m_Name = op.m_Name ;
        m_nLine = op.m_nLine ;
        m_nAnti = op.m_nAnti ;
        m_nLevel = op.m_nLevel ;
        m_nCol = op.m_nCol ;
        m_bXmlesce = op.m_bXmlesce ;
        m_nAttrs = op.m_nAttrs ;
        return *this ;
    } 
    hzXmlNode*  Init    (hzDocXml* pHostDoc, hzXmlNode* pParent, const hzString& name, uint32_t nLineNo, uint32_t nCol, bool bXmlesce = false) ;
    void    _setanti    (uint32_t lineNo)   { m_nAnti = lineNo ; }  //  Set line number of anti-tag (mark end of node)
    void    Clear   (void) ;
    //  Adding subnodes
    hzEcode AddNode (hzXmlNode* pNode) ;
    //  Set content
    void    SetCDATA    (hzChain& C) ;
    hzEcode SetPretext  (hzChain& C) ;
    void    SetContent  (hzChain& C) ;
    //  Geting subnodes and params
    void        Export_r        (hzDocXml* pDoc, hzChain& Z, uint32_t& relLine) ;
    void        Export          (hzChain& Z) ;
    hzEcode     SelectSubnodes  (hzVect<hzXmlNode*>& result, hzMapM<hzString,hzXmlNode*>& allsubnodes, const char* srchExp) ;
    void        FindSubnodes    (hzVect<hzXmlNode*>& result, const char* srchExp) ;
    hzXmlNode*  FindSubnode     (const char* srchExp) ;
    bool        IsAncestor      (hzXmlNode* candidate) ;
    bool        IsXmlesce       (void) const    { return m_bXmlesce == 1 ? true : false ; }
    hzXmlNode*  GetFirstChild   (void) const ;
    hzXmlNode*  Sibling         (void) const ;
    hzXmlNode*  Parent          (void) const ;
    uint32_t    Uid             (void)  { return m_Uid ; }
    uint32_t    ParentId        (void)  { return m_Parent ; }
    uint32_t    FirstChildId    (void)  { return m_Children ; }
    uint32_t    SiblingId       (void)  { return m_Sibling ; }
    const char* Xlate           (uint32_t strNo) const ;
    uint32_t    GetNoAttrs      (void) const    { return m_nAttrs ; }
    //  Get other node info
    //hzString  Lineage     (void) const ;
    hzString    Filename    (void) const ;
    const char* Fname       (void) const ;
    const hzDocXml* GetHostDoc  (void) const    { return m_pHostDoc ; }
    uint32_t    GetUid      (void) const    { return m_Uid ; }
    uint32_t    Line        (void) const    { return m_nLine ; }
    uint32_t    Anti        (void) const    { return m_nAnti ; }
    uint32_t    Level       (void) const    { return m_nLevel ; }
    //uint32_t  StrnoName   (void) const    { return m_snName ; }
    //uint32_t  StrnoPtxt   (void) const    { return m_snPtxt ; }
    const char* txtName     (void) const ;
    const char* txtPtxt     (void) const ;
    bool        NameEQ      (const char* testname) const ;
} ;
class   hzAttrset
{
    //  Category:   Document
    //
    //  The hzAttrset class is really an convenient attribute iterator. The tag attributes are stored in the host document in a one-to-many map between tag uids
    //  and attribute, rather than in a list held by the host tag. Given this arrangement, iteration of tag attributes would normally require a first and a last
    //  position within the map, as well as another variable to iterate between the two. In previous version of the HadronZoo library, each tag had an attribute
    //  pointer and each attribute had a next pointer. Attributes could be iterated by control loops of the form:-
    //
    //      for (attr = first_attr ; attr ; attr = attr->next)  {}
    //
    //  The objective of the attribute iterator is to achieve a similar interface. The hzAttrset is initialized to a tag (node) which in turn points to the host
    //  document. The initialization does a lookup in the document map and the Valid() method returns true if there is an attribute. The Advance() method moves
    //  on to the next attribut in the tag so:-
    //
    //      for (attrset = node ; attrset.Valid() ; attrset.Advance())  {}
    const hzDocument*   m_pHostDoc ;    //  This is needed for the node's uid and the document hosting the map
    hzPair      m_Pair ;        //  Name/value pair
    uint32_t    m_NodeUid ;     //  Node Uid (key to node/attr map)
    int32_t     m_Start ;       //  First attribute for the tag in the document 1:many map of tag ids to attrs
    int32_t     m_Final ;       //  Last attribute for the tag in the document 1:many map of tag ids to attrs
    int32_t     m_Current ;     //  Current attribute for the tag in the map
    //  Prevent copies
    hzAttrset   (const hzAttrset&) ;
    hzAttrset&  operator=   (const hzAttrset&) ;
public:
    hzAttrset       (void)
    {
        //m_pHostNode = 0 ;
        m_pHostDoc = 0 ;
        m_NodeUid = 0 ;
        //m_Current = m_Start = m_Final = -1 ;
        //m_Pair.m_A = m_Pair.m_B = 0 ;
    }
    ~hzAttrset  (void)  {}
    //  Set the attribute iterator to the start of the attributes for the XML node
    hzAttrset&  operator=   (hzXmlNode* pNode) ;
    hzAttrset&  operator=   (hzHtmElem* pElem) ;
    //bool  Valid   (void) const    { return m_pHostDoc && m_Pair.m_A ? true : false ; }
    bool    Valid   (void) const    { return m_pHostDoc && m_Pair.name ? true : false ; }
    bool        NameEQ  (const char* cstr) const ;
    bool        ValEQ   (const char* cstr) const ;
    void        Advance (void) ;
    const char* Name    (void) const ;
    const char* Value   (void) const ;
} ;
#define XMLESCE_OFF     0   //  Upon load, assume all tags are XML even if they are known HTML tags.
#define XMLESCE_ON      1   //  Upon load, treat all HTML subtags found within a non-HTML tag as part of the non-HTML tag's content
#define XMLESCE_MIX     2   //  Upon load, where tag content text is mixed with subtags, treat the sections of text as nodes in their own right.
class   hzDocXml    : public hzDocument
{
    //  Category:   Document
    //
    //  The XML tree. This is usually populated by reading a single XML file
    hzSet<hzString>     m_Xmlesce ;     //  List of tags whose nodes are permitted to contain HTML data. The node may still contain normal XML subnodes but any tag with a name that
                                        //  is a legal HTML tag - will be treated as HTML.
    //hzDocCtrl m_Doctype ;             //  The doctype
    hzXmlNode*  m_pRoot ;               //  All tags found on level 0
    hzString    m_Filename ;            //  Full path of loaded file (optional)
    uint32_t    m_FileEpoch ;           //  Modified time of loaded file (optional)
    uint32_t    m_bXmlesce ;            //  Treat all legal HTML tag/antitags as part of node content
    //  Support functions
    int32_t     _proctagopen    (hzXmlNode** ppChild, hzXmlNode* pParent, hzChain::Iter& ci) ;
public:
    hzMapM  <hzString,uint32_t>     m_NodesName ;   //  1:many strings (tag names) to node ids
    hzMapM  <uint32_t,uint32_t>     m_NodesPar ;    //  1:many parent nodes (node uid) to child node ids
    hzArray <hzXmlNode>             m_arrNodes ;    //  All nodes within document
    
    hzDocXml    (void) ;
    ~hzDocXml   (void) ;
    hzDoctype   Whatami     (void) const    { return DOCTYPE_XML ; }
    hzString    Filename    (void) const    { return m_Filename ; }
    const char* Fname       (void) const    { return *m_Filename ; }
    hzXmlNode*  GetRoot     (void) const    { return m_pRoot ; }
    void        AddXmlesce  (hzString& tagname) { m_Xmlesce.Insert(tagname) ; } 
    void        SetXmlesce  (bool bXmlesce)     { m_bXmlesce = bXmlesce ; }
    //  Load and Export
    hzEcode     Load        (hzChain& Z) ;
    hzEcode     Load        (const char* cpFilename) ;
    hzEcode     Export      (hzChain& e) ;
    hzEcode     Export      (const hzString& filepath) ;
    void        Clear       (void) ;
    //  Naviagation
    void        listnodes   (void) ;
    hzXmlNode*  GetNode     (uint32_t nodeId) const ;
    hzEcode     FindNodes   (hzVect<hzXmlNode*>& Nodes, const char* srchExp) ;
    hzString    GetValue    (hzXmlNode* pBasenode, hzString& Nodename, hzString& Info) ;
} ;
class   hzXmlSlct
{
    //  Category:   Document
    //
    //  XML Selector - used to extract information from an XML document, such as the set of links in an RSS Feed page. The assumption being that the XML will be of a known form and
    //  that the information of interest will be found in known nodes (tags) - usually the case with subscription data feeds. hzXmlSlct comprises selection criteria to locate nodes
    //  of interest, together with a method which directs the extraction process.
    //
    //  Node location can be on node name alone, but only if the nodes of interest use a node name that is not used within the document for any other purpose. As this is not always
    //  the case it is often necessary to provide further qualification. This can be done by specifying node ancestry, by requiring particular attributes and if need be, requiring
    //  the attributes to have particular values. In the notation the $ sign is used to separate node names, the -> symbol to state a required attribute and the = sign to specify a
    //  value. Location criteria are thus of the form:-
    //
    //      [acestor1$]...[ancestorN$]node_name[->attr1[=val1]]...[->attrN[=valN]]
    //
    //  The information of interest is usually the content of the node(s) of interest, but it in rare cases it can lay in the attributes. Accordingly, the method of extraction will
    //  either be "node" to extract node content or a series of "->attr" to name the attribute(s), or it can be both.
    //
    //  The string values extracted will in the case of multiple nodes, be aggregated.
    //  Category:   Document
    //
    //  The hzXmlSlct or 'XML selector' class is a configuration device, used to direct extraction of information from XML documents. Instead of hard coding how
    //  an XML document is read (the approach always taken when reading config files themselves), a set of one or more XML selectors defined within the program
    //  configs are applied to acheive the same. Both apporaches have merit. The hard coding is intentionally rigid and will typically invalidate documents that
    //  contain superfluous nodes or attributes. The set of XML selectors will only look for nodes of interest and ignore the rest.
    //
    //  Either way the extraction process is a matter of looking for nodes matching particular criteria and pulling the node content and/or values of particular
    //  node attributes. The XML selector sets out the following:-  
    //
    //      1)  What the node must have:
    //      
    //          a)  The node name. The node must match on node name.
    //          b)  Expected attribtes. If any then the node must have all attributes listed.
    //          c)  Expecte attribute values. If a value as assigned to an expected attribute, then the node must have that attribute with that value.
    //          d)  Particular ancestry:
    //          e)  Particular subnodes: ...
    //
    //      2)  What the node must not have:
    //
    //          a)  Unexpected attributes: If the node has any of the attributes listed, it is excluded
    //          b)  Unexpected attribute values. If the node has any of the attributes listed and the values they are listed with, it is excluded.
    //
    //      3)  Method of extraction:
    //
    //          a)  Node: This meaans the content of the node
    //          b)  Attr: This is written as "->attr_name" and will return the value of the attrubute if the node has it
    //
    //  Note that where XML nodes may have content mixed with subnodes, particular care should be taken when specifying the extraction method. The HadronZoo XML
    //  parser assigns node content to nodes in two ways. Content appearing BEFORE the closing tag but AFTER ANY opening tag is treated as content. However, any
    //  content appearing AFTER the opening tag but BEFORE an opening sub-tag is not. Instead it is assigned as pre-text to the sub-tag. For example:-
    //
    //      <tagA>blurb about A<tagB>blurb about B</tagB>more blurb about A</tagA>
    //
    //  The content of the node named tagB is "blurb about B" as expected but the content of the node named tagA is just "more blurb about A". The first part of
    //  the tagA content has gone missing. It can only be found as the pre-text of the subnode.
public:
    hzString    m_Slct ;    //  Criteria needed to locate node
    hzString    m_Info ;    //  How to obtain data from the node (eg ->'Date' meaning value of an attr called 'Date')
    hzString    m_Filt ;    //  Filter to be applied to the extracted data.
    hzXmlSlct   (void)
    {
    }
    bool    IsValid (void)  { return (!m_Slct || !m_Info)? false : true ; }
    bool    IsNull  (void)  { return (!m_Slct && !m_Info)? true : false ; }
    bool    IsPart  (void)  { return (m_Slct || m_Info)? true : false ; }
    hzXmlSlct&  operator=   (const hzXmlSlct& op)
    {
        m_Slct = op.m_Slct ;
        m_Info = op.m_Info ;
        m_Filt = op.m_Filt ;
        return *this ;
    }
} ;
/*
**  Prototypes
*/
hzEcode     InitHtml        (void) ;
hzHtagInd   AtHtmlTag       (hzString& tag, hzChain::Iter& ci) ;
void        XmlCleanHtags   (hzChain& output, const hzChain& input) ;
hzDoctype   DeriveDoctype   (hzChain& Z) ;
const char* Doctype2Txt     (hzDoctype) ;
hzString    Tagtype2Txt     (hzHtagtype type) ;
hzHtagtype  Txt2Tagtype     (const hzString& tagtype) ;
const hzHtagform&   TagLookup   (const hzString& htag) ;
const hzHtagform&   TagLookup   (hzChain::Iter& ci) ;
#endif  //  hzDocument_h