//
//  File:   hzHttpClient.cpp
//
//  Legal Notice:   This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
//  The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
//  Software Foundation, either version 3 of the License, or any later version.
//
//  The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
//  A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <netdb.h>
#include <sys/stat.h>
#include "hzChars.h"
#include "hzTextproc.h"
#include "hzDirectory.h"
#include "hzCodec.h"
#include "hzHttpClient.h"
#include "hzProcess.h"
using namespace std ;
/*
**  Prototypes
*/
uint32_t    _extractHttpHeader  (hzString& Attr, hzString& Value, hzChain::Iter& ci, bool bConvert) ;
/*
**  Section 1:  hzHttpClient member functions
*/
hzEcode hzHttpClient::Connect   (const hzUrl& url)
{
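    //  Connect to the server named in the supplied URL, using SSL where the URL requires it, then apply 30-second send and recv timeouts.
    //  (Descriptive comment added; grounded in the calls below.)
    //
    //  Arguments:  1)  url     The URL naming the domain and port to connect to
    //
    //  Returns:    E_OK    If the connection was established and both timeouts set
    //              Other codes as returned by the underlying connect and timeout calls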
    _hzfunc("hzHttpClient::Connect") ;
    hzEcode rc ;    //  Return code
    if (url.IsSSL())
        rc = m_Webhost.ConnectSSL(url.Domain(), url.Port()) ;
    else
        rc = m_Webhost.ConnectStd(url.Domain(), url.Port()) ;
    if (rc != E_OK)
        m_Error.Printf("Could not connect to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
    else
    {
        rc = m_Webhost.SetSendTimeout(30) ;
        if (rc != E_OK)
            m_Error.Printf("Could not set send_timeout on connection to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
        else
        {
            rc = m_Webhost.SetRecvTimeout(30) ;
            if (rc != E_OK)
                m_Error.Printf("Could not set recv_timeout on connection to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
        }
    }
    return rc ;
}
hzEcode hzHttpClient::Close (void)
{
    _hzfunc("hzHttpClient::Close") ;
    m_Webhost.Close() ;
    return E_OK ;
}
uint32_t    _extractHttpHeader  (hzString& Param, hzString& Value, hzChain::Iter& ci, bool bConvert)
{
    //  Support function to extract the parameter name and value from an HTTP header line (either that of a request or a response). HTTP header lines are of the form param_name: param_value
    //  and are terminated by a CR/NL.
    //
    //  Arguments:  1)  Param       The hzString to store the parameter name.
    //              2)  Value       The hzString to store the parameter value.
    //              3)  ci          A reference to the chain iterator processing the HTTP request.
    //              4)  bConvert    Flag to convert percent sign followed by two hex digits into single char value
    //
    //  Returns:    Number of characters processed.
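    //
    //  Example (illustrative): given the line "Content-Type: text/html\r\n" the call sets Param to "Content-Type", Value to
    //  "text/html", and returns 25 (the full line length including the CR/NL).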
    _hzfunc("_extractHttpHeader") ;
    chIter      xi ;            //  For iterating line
    hzChain     temp ;          //  For building param and then value
    uint32_t    nCount = 0 ;    //  Returned length of HTTP header line
    uint32_t    nHex ;          //  Hex value
    char        cvHex[4] ;      //  Hex value buffer
    Param.Clear() ;
    Value.Clear() ;
    cvHex[2] = 0 ;
    xi = ci ;
    for (; !xi.eof() ;)
    {
        if (*xi == CHAR_PERCENT)
        {
            if (bConvert)
            {
                xi++ ; cvHex[0] = *xi ;
                xi++ ; cvHex[1] = *xi ;
                xi++ ;
                nCount += 3 ;
                if (IsHexnum(nHex, cvHex))
                    temp.AddByte(nHex) ;
                continue ;
            }
        }
        if (*xi == CHAR_COLON && !Param)
        {
            xi++ ;
            nCount++ ;
            Param = temp ;
            temp.Clear() ;
            if (*xi == CHAR_SPACE)
                for (; !xi.eof() && (*xi == CHAR_SPACE || *xi == CHAR_TAB) ; xi++, nCount++) ;
        }
        if (xi == "\r\n")
            { xi += 2 ; nCount += 2 ; break ; }
        if (*xi == CHAR_NL)
            { xi++ ; nCount++ ; break ; }
        if (*xi < CHAR_SPACE)
            threadLog("Illegal char (%u) in HTTP Header\n", (uchar) *xi) ;
        if (*xi == CHAR_PLUS)
            temp.AddByte(CHAR_SPACE) ;
        else
            temp.AddByte(*xi) ;
        xi++ ;
        nCount++ ;
    }
    Value = temp ;
    return nCount ;
}
hzEcode hzHttpClient::_procHttpResponse (HttpRC& hRet, const hzUrl& url)
{
    //  Support function for the hzHttpClient member functions GetPage() and PostForm(). The purpose is to gather the server response to
    //  an earlier HTTP GET, POST or HEAD request.
    //
    //  Arguments:  1)  hRet    HTTP return code
    //              2)  url     The URL
    //
    //  Returns:    E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If nothing was received
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the response was received without error
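    //
    //  A typical response begins (illustrative):
    //
    //      HTTP/1.1 200 OK\r\n
    //      Content-Type: text/html\r\n
    //      Content-Length: 1234\r\n
    //      \r\n
    //      ...body...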
    _hzfunc("hzHttpClient::_procHttpResponse") ;
    chIter      zi ;                //  To iterate the returned page
    chIter      hi ;                //  To re-iterate lines of interest in the header of the returned page
    chIter      ti ;                //  Temp iterator
    hzChain     Z ;                 //  Request buffer
    hzChain     X ;                 //  Temp buffer
    hzCookie    cookie ;            //  Cookie (to be checked against supplied map of cookies)
    hzString    S ;                 //  Temp string
    hzString    param ;             //  Header parameter name
    hzString    value ;             //  Header parameter value
    uint32_t    nRecv ;             //  Bytes received
    uint32_t    nExpect = 0 ;       //  Size of current chunk
    uint32_t    nLen = 0 ;          //  Content length
    uint32_t    nLine ;             //  Line number (of header)
    uint32_t    nTry ;              //  Number of tries
    uint32_t    nCount ;            //  Number of bytes counted off from those expected
    bool        duHast = false ;    //  Have read a chunking directive or have a content len
    bool        bTerm = false ;     //  Terminate chunking (only set upon a 0 value on a line by itself)
    hzEcode     sRet = E_OK ;       //  Return code
    char        numBuf[4] ;         //  For HTTP return code
    //  Clear variables
    m_CacheCtrl = (char*) 0 ;
    m_Pragma = (char*) 0 ;
    m_Redirect = (char*) 0 ;
    m_KeepAlive = (char*) 0 ;
    m_ContentType = (char*) 0 ;
    m_XferEncoding = (char*) 0 ;
    m_ContEncoding = (char*) 0 ;
    m_Etag = (char*) 0 ;
    m_bConnection = false ;
    m_nContentLen = 0 ;
    m_Content.Clear() ;
    m_Header.Clear() ;
    //  Garner first the header, from the response
    for (nTry = 0 ; nTry < 4 && !m_Header.Size() ; nTry++)
    {
        sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
        if (sRet != E_OK)
        {
            if (sRet == E_NOSOCKET)
                m_Error.Printf("Connection closed by server\n") ;
            else
                m_Error.Printf("Could not recv bytes (nbytes=%d) from page %s (error=%s)\n", nRecv, *url.Resource(), Err2Txt(sRet)) ;
            break ;
        }
        if (!nRecv)
        {
            m_Error.Printf("Got no response, retrying ...\n") ;
            sleep(1) ;
            continue ;
        }
        Z.Append(m_buf, nRecv) ;
        //  Test for presence of \r\n\r\n to mark end of header
        for (zi = Z ; !zi.eof() ; zi++)
        {
            if (*zi != CHAR_CR)
                continue ;
            if (zi == "\r\n\r\n")
            {
                //  Bytes before the header's end are now copied from temp chain Z to the header
                for (ti = Z ; ti != zi ; ti++)
                    m_Header.AddByte(*ti) ;
                zi += 4 ;
                break ;
            }
        }
    }
    if (nTry == 4)
        { m_Error.Printf("Given up!\n") ; return E_NODATA ; }
    if (!m_Header.Size())
        { m_Error.Printf("Given up! Header is empty\n") ; return sRet != E_OK ? sRet : E_NODATA ; }
    /*
    **  Examine header
    */
    //  First part is the HTTP return code
    memset(numBuf, 0, 4) ;
    hi = m_Header ;
    if (hi == "HTTP/")
    {
        for (hi += 5 ; !hi.eof() && *hi > CHAR_SPACE ; hi++) ;
    }
    else
    {
        m_Error.Printf("case 1: 1st line of server response should be HTTP/{version} followed by a 3 digit HTML return code\n") ;
        m_Error.Printf("got %d bytes of header namely:-\n[", m_Header.Size()) ;
        m_Error << m_Header ;
        m_Error << "]\n" ;
        return E_FORMAT ;
    }
    m_Error << "Response\n" << m_Header << "\n--------------------------\n" ;
    hi++ ;  numBuf[0] = *hi ;
    hi++ ;  numBuf[1] = *hi ;
    hi++ ;  numBuf[2] = *hi ;
    hi++ ;  numBuf[3] = 0 ;
    if (*hi != CHAR_SPACE || !IsDigit(numBuf[0]) || !IsDigit(numBuf[1]) || !IsDigit(numBuf[2])) 
    {
        m_Error.Printf("case 2: 1st line of server response should be HTTP/1.1 followed by a 3 digit HTML return code - got [%s]\n\n", numBuf) ;
        return E_FORMAT ;
    }
    hRet = (HttpRC) atoi(numBuf) ;
    for (hi++ ; !hi.eof() && *hi != CHAR_NL ; hi++) ;
    hi++ ;
    //  Next part is the header lines
    for (nLine = 1 ; !hi.eof() ; nLine++, hi += nLen)
    {
        nLen = _extractHttpHeader(param, value, hi, false) ;
        if (nLen == 0)
        {
            for (hi++ ; !hi.eof() && *hi != CHAR_NL ; hi++) ;
            hi++ ;
            m_Error.Printf("Line %d of header rejected (param=%s, value=%s)\n", nLine, *param, *value) ;
            continue ;
        }
        if (param.Equiv("Date"))                { m_Accessed = value ; continue ; }
        if (param.Equiv("Expires"))             { m_Expires = value ; continue ; }
        if (param.Equiv("Last-Modified"))       { m_Modified = value ; continue ; }
        if (param.Equiv("Cache-Control"))       { m_CacheCtrl = value ; continue ; }
        if (param.Equiv("Pragma"))              { m_Pragma = value ; continue ; }
        if (param.Equiv("Location"))            { m_Redirect = value ; continue ; }
        if (param.Equiv("Keep-Alive"))          { m_KeepAlive = value ; continue ; }
        if (param.Equiv("Connection"))          { m_bConnection = value == "close" ? false : true ; continue ; }
        if (param.Equiv("Content-Type"))        { m_ContentType = value ; continue ; }
        if (param.Equiv("Content-Encoding"))    { m_ContEncoding = value ; continue ; }
        if (param.Equiv("Transfer-Encoding"))   { m_XferEncoding = value ; continue ; }
        if (param.Equiv("Alternate-Protocol"))  { m_AltProto = value ; continue ; }
        if (param.Equiv("ETag"))                { m_Etag = value ; continue ; }
        if (param.Equiv("Set-Cookie"))
        {
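            //  Example (illustrative): a header of "Set-Cookie: session=abc123; path=/; HttpOnly" yields
            //  m_Name "session", m_Value "abc123", m_Path "/" and the COOKIE_HTTPONLY flag.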
            //  Get the cookie value
            ti = hi ;
            for (ti += 12 ; !ti.eof() && *ti != CHAR_EQUAL ; ti++)
                X.AddByte(*ti) ;
            cookie.m_Name = X ;
            X.Clear() ;
            for (ti++ ; !ti.eof() && *ti != CHAR_SCOLON ; ti++)
                X.AddByte(*ti) ;
            cookie.m_Value = X ;
            //cookie.m_Value.FnameDecode() ;
            X.Clear() ;
            //  Get the path
            for (ti++ ; !ti.eof() && *ti == CHAR_SPACE ; ti++) ;
            if (ti == "path=")
            {
                for (ti += 5 ; !ti.eof() && *ti > CHAR_SPACE && *ti != CHAR_SCOLON ; ti++)
                    X.AddByte(*ti) ;
                cookie.m_Path = X ;
                X.Clear() ;
            }
            //  Get special directives (eg HttpOnly)
            for (ti++ ; !ti.eof() && *ti == CHAR_SPACE ; ti++) ;
            if (ti == "HttpOnly")
                cookie.m_Flags |= COOKIE_HTTPONLY ;
            m_Cookies.Insert(cookie.m_Name, cookie) ;
            cookie.Clear() ;
            continue ;
        }
        if (param.Equiv("Content-Length"))
        {
            if (*value && value[0])
            {
                duHast = true ;
                m_nContentLen = atoi(*value) ;
            }
            continue ;
        }
    }
    /*
    **  Garner next the body, from the response
    */
    m_Error.Printf("Getting body. xfer=%s, expect=%d, clen=%d\n", *m_XferEncoding, duHast?1:0, m_nContentLen) ;
    if (!duHast)
    {
        //  In chunked encoding the first part (directly after the header and its terminating \r\n\r\n) is a hex number followed
        //  by a \r\n (on a line by itself). This hex number gives the size of the following chunk. At the end of the chunk is
        //  another hex number on a line by itself. Only when this number is zero are we at the end of the page.
        //
        //  While reading the chunk size and chunk we will most probably reach the end of the buffer and have to do a read operation
        //  on the socket.
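        //
        //  Illustration (not from a live capture): the 11-byte body "Hello World" could arrive chunked as
        //
        //      6\r\nHello \r\n5\r\nWorld\r\n0\r\n\r\n
        //
        //  Here "6" announces a 6-byte chunk ("Hello "), "5" a 5-byte chunk ("World"), and the lone "0" terminates the body.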
        m_Error.Printf("Encoding is chunked\n") ;
        nExpect = nCount = 0 ;
        bTerm = false ;
        for (; !bTerm ;)
        {
            //  If we are at the end of the buffer, read more
            for (; zi.eof() ;)
            {
                //  If out of data, get more
                m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
                if (nRecv <= 0)
                    break ;
                m_Error.Printf("Read buffer %d bytes\n", nRecv) ;
                Z.Clear() ;
                Z.Append(m_buf, nRecv) ;
                for (zi = Z ; nExpect && !zi.eof() ; nExpect--, zi++)
                    m_Content.AddByte(*zi) ;
                if (!nExpect)
                    break ;
            }
            if (!nExpect)
            {
                //  We are on the 'chunk size' directive. This will be of the form \r\nXXX\r\n where X is a hex number
                //  Get rid of any \r\n sequences that are beyond the expected chars and before the chunk size directive
                for (; !zi.eof() && (*zi == CHAR_CR || *zi == CHAR_NL) ; zi++) ;
                if (zi.eof())
                {
                    //  If out of input data, get more
                    m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
                    if (nRecv)
                    {
                        m_Error.Printf("Read extras %d bytes\n", nRecv) ;
                        Z.Clear() ;
                        Z.Append(m_buf, nRecv) ;
                        for (zi = Z ; !zi.eof() && (*zi == CHAR_CR || *zi == CHAR_NL) ; zi++) ;
                    }
                }
                duHast = false ;
                for (;;)
                {
                    if (zi.eof())
                    {
                        m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
                        if (nRecv)
                        {
                            m_Error.Printf("Read extras %d bytes\n", nRecv) ;
                            Z.Clear() ;
                            Z.Append(m_buf, nRecv) ;
                            zi = Z ;
                        }
                    }
                    //  Read the chunk size
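                    //  e.g. a directive line of "1a3\r\n" yields nExpect = 0x1a3 = 419 bytes for the next chunk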
                    if (*zi >= '0' && *zi <= '9')   { duHast = true ; nExpect *= 16 ; nExpect += (*zi - '0') ;  zi++ ; continue ; }
                    if (*zi >= 'A' && *zi <= 'F')   { duHast = true ; nExpect *= 16 ; nExpect += (*zi-'A'+10) ; zi++ ; continue ; }
                    if (*zi >= 'a' && *zi <= 'f')   { duHast = true ; nExpect *= 16 ; nExpect += (*zi-'a'+10) ; zi++ ; continue ; }
                    if (zi == "\r\n")
                        { zi += 2 ; break ; }
                    if (*zi == CHAR_CR)
                        { zi++ ; continue ; }
                    if (*zi == CHAR_NL)
                        { zi++ ; break ; }
                    sRet = E_FORMAT ;
                    m_Error.Printf("Unexpected char (%d) in chunking directive - from page %s\n", *zi, *url.Resource()) ;
                    break ;
                }
                if (!duHast)
                {
                    m_Error.Printf("Chunk notice missing\n") ;
                    sRet = E_FORMAT ;
                }
                if (sRet != E_OK)
                    break ;
                if (nExpect == 0)
                    bTerm = true ;
                //m_Error.Printf("Chunk notice %d bytes\n", nExpect) ;
                if (nExpect)
                {
                    //  Play out rest of buffer but make sure we don't exceed the chunk size
                    for (; !zi.eof() && nExpect ; zi++, nExpect--)
                        m_Content.AddByte(*zi) ;
                }
                else
                {
                    //  At end of page, just play out rest of buffer
                    for (; !zi.eof() ; zi++) ;
                        //m_Content.AddByte(*zi) ;
                }
                m_Error.Printf("Chunk complete. Expect = %d\n", nExpect) ;
            }
        }
    }
    else
    {
        //  Not chunked - just read until stated Content-Length is reached
        if (m_nContentLen)
        {
            for (; !zi.eof() ; zi++)
                m_Content.AddByte(*zi) ;
            Z.Clear() ;
            for (; m_Content.Size() < m_nContentLen ;)
            {
                sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
                if (sRet != E_OK)
                {
                    m_Error.Printf("(1) Could not recv bytes from page %s (error=%s)\n", *url.Resource(), Err2Txt(sRet)) ;
                    break ;
                }
                if (nRecv == 0)
                {
                    sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
                    if (sRet != E_OK)
                    {
                        m_Error.Printf("(2) Could not recv bytes from page %s (error=%s)\n", *url.Resource(), Err2Txt(sRet)) ;
                        break ;
                    }
                }
                if (nRecv <= 0)
                {
                    m_Error.Printf("Breaking after recv %d of %d bytes\n", m_Content.Size(), m_nContentLen) ;
                    break ;
                }
                m_Content.Append(m_buf, nRecv) ;
            }
            if (m_Content.Size() < m_nContentLen)
            {
                if (m_Content.Size() == (m_nContentLen - 4))
                    m_Error.Printf("Allowing 4-byte shortfall\n") ;
                else
                    sRet = E_READFAIL ;
            }
        }
    }
    if (hRet == 200)
    {
        if (!m_Content.Size())
        {
            m_Error.Printf("No content (xfer_encoding=%s content_size=%d)\n", *m_XferEncoding, m_nContentLen) ;
            sRet = E_NODATA ;
        }
    }
    if (sRet == E_OK && m_ContEncoding)
    {
        //  Must apply appropriate decoding to content
        if (m_ContEncoding == "gzip")
        {
            X = m_Content ;
            m_Content.Clear() ;
            m_Error.Printf("doing gunzip\n") ;
            sRet = Gunzip(m_Content, X) ;
            if (sRet != E_OK)
                m_Error.Printf("Gunzip failed\n") ;
        }
    }
    m_Error.Printf("URL [%s] Header %d bytes, Content %d bytes (%d)\n\n", *url, m_Header.Size(), m_Content.Size(), m_nContentLen) ;
    if (m_Content.Size() < 2000)
    {
        m_Error << "Content:\n" ;
        m_Error << m_Content ;
        m_Error << "------------------------\n" ;
    }
    return sRet ;
}
hzEcode hzHttpClient::TestPage  (hzChain& Z, const hzUrl& url)
{
    //  Get an HTTP page from a website but do not process it in any way. This is for speed testing only.
    //
    //  Note:   The website (server) must already be connected to.
    //          No account is taken of redirected pages.
    //
    //  Arguments:  1)  Z   The chain into which page content is to be received
    //              2)  url The URL of the page
    //
    //  Returns:    E_ARGUMENT  If no URL was specified
    //              E_NODATA    If nothing was received
    //              E_OK        If the response was received without error
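    //
    //  Usage sketch (hypothetical calling code; assumes hzUrl accepts assignment from a string, as its use with m_Redirect suggests):
    //
    //      hzHttpClient client ;
    //      hzChain      raw ;
    //      hzUrl        url ;
    //      hzEcode      rc ;
    //
    //      url = "http://www.example.com/" ;
    //      if (client.Connect(url) == E_OK)
    //          rc = client.TestPage(raw, url) ;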
    _hzfunc("hzHttpClient::Testpage") ;
    chIter      zi ;            //  To iterate the returned page
    chIter      hi ;            //  To re-iterate lines of interest in the header of the returned page
    chIter      ti ;            //  Temp iterator
    hzChain     X ;             //  Temp buffer
    hzCookie    cookie ;        //  Cookie (drawn from supplied map of cookies)
    hzString    S ;             //  Temp string
    hzString    param ;         //  Header parameter name
    hzString    value ;         //  Header parameter value
    hzString    encoding ;      //  Page content is encoded, eg gzip
    uint32_t    nRecv ;         //  Bytes received
    uint32_t    nTry ;          //  Number of tries
    hzEcode     rc = E_OK ;     //  Return code
    //  Clear buffers
    Z.Clear() ;
    m_Header.Clear() ;
    m_Content.Clear() ;
    if (!url.Domain())
        { m_Error.Printf("TestPage: No host to locate\n") ; return E_ARGUMENT ; }
    /*
    **  Formulate HTTP request
    */
    m_Request.Clear() ;
    if (url.Resource())
        m_Request << "GET " <<  url.Resource() << " HTTP/1.1\r\n" ;
    else
        m_Request << "GET / HTTP/1.1\r\n" ;
    m_Request <<
    "Accept: */*\r\n"
    "Accept-Language: en-gb\r\n" ;
    if (m_AuthBasic)
        m_Request << "Authorization: Basic " << m_AuthBasic << "\r\n" ;
    m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
    m_Request << "Host: " << url.Domain() << "\r\n" ;
    if (m_Referer)
        m_Request << "Referer: " << m_Referer << "\r\n" ;
    m_Request << "Connection: Keep-Alive\r\n\r\n" ;
    /*
    **  Send request
    */
    m_Error << " Sending [" << m_Request << "] to domain " << url.Domain() << "\n" ;
    rc = m_Webhost.Send(m_Request) ;
    if (rc != E_OK)
    {
        m_Error.Printf("Could not send request to domain [%s] (error=%s)\n", *url.Domain(), Err2Txt(rc)) ;
        return rc ;
    }
    //  Garner response
    for (nTry = 0 ; nTry < 4 && !m_Header.Size() ; nTry++)
    {
        rc = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
        if (rc != E_OK)
        {
            if (rc == E_NOSOCKET)
                m_Error.Printf("Connection closed by server\n") ;
            else
                m_Error.Printf("Could not recv bytes (nbytes=%d) from page %s (error=%s)\n", nRecv, *url.Resource(), Err2Txt(rc)) ;
            break ;
        }
        if (!nRecv)
        {
            m_Error.Printf("Got no response, retrying ...\n") ;
            sleep(1) ;
            continue ;
        }
        Z.Append(m_buf, nRecv) ;
    }
    if (rc != E_OK)
    {
        m_Error.Printf("Could not process response from [%s] (error=%s)\n", *url, Err2Txt(rc)) ;
        return rc ;
    }
    m_Referer = url ;
    return rc ;
}
hzEcode hzHttpClient::_getpage  (HttpRC& hRet, const hzUrl& url, const hzString& etag)
{
    //  Get an HTTP page from a website but do not redirect. This is a support function for GetPage().
    //
    //  Arguments:  1) hRet     The HTTP return code from server
    //              2) url      The URL
    //              3) etag     Entity tag
    //
    //  Returns:    E_ARGUMENT  If the URL is not supplied or no domain specified
    //              E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If nothing was received
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the response was received without error
    _hzfunc("hzHttpClient::_getpage") ;
    chIter      zi ;                //  To iterate the returned page
    chIter      hi ;                //  To re-iterate lines of interest in the header of the returned page
    chIter      ti ;                //  Temp iterator
    hzChain     Z ;                 //  Request buffer
    hzChain     X ;                 //  Temp buffer
    hzCookie    cookie ;            //  Cookie (drawn from supplied map of cookies)
    hzString    S ;                 //  Temp string
    hzString    param ;             //  Header parameter name
    hzString    value ;             //  Header parameter value
    hzString    encoding ;          //  Page content is encoded, eg gzip
    uint32_t    x = 0 ;             //  Size of current chunk
    bool        bFirstCookie ;      //  Controls form of cookie header
    hzEcode     rc = E_OK ;
    //  Clear buffers
    m_Header.Clear() ;
    m_Content.Clear() ;
    if (!url.Domain())
        { m_Error.Printf("No host to locate\n") ; return E_ARGUMENT ; }
    /*
    **  Formulate HTTP request
    */
    m_Request.Clear() ;
    if (url.Resource())
        m_Request << "GET " <<  url.Resource() << " HTTP/1.1\r\n" ;
    else
        m_Request << "GET / HTTP/1.1\r\n" ;
    m_Request << "Accept: */*\r\n" ;
    //m_Request << "Accept-Encoding: gzip\r\n" ;
    m_Request << "Accept-Language: en-gb\r\n" ;
    if (m_Cookies.Count())
    {
        m_Request << "Cookie: " ;
        bFirstCookie = false ;
        for (x = 0 ; x < m_Cookies.Count() ; x++)
        {
            cookie = m_Cookies.GetObj(x) ;
            if (bFirstCookie)
                m_Request << "; " ;
            m_Request.Printf("%s=%s", *cookie.m_Name, *cookie.m_Value) ;
            bFirstCookie = true ;
        }
        m_Request << "\r\n" ;
    }
    if (etag)
        m_Request << "If-None-Match: " << etag << "\r\n" ;
    if (m_AuthBasic)
        m_Request << "Authorization: Basic " << m_AuthBasic << "\r\n" ;
    m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
    m_Request << "Host: " << url.Domain() << "\r\n" ;
    if (m_Referer)
        m_Request << "Referer: " << m_Referer << "\r\n" ;
    m_Request << "Connection: keepalive\r\n\r\n" ;
    //  Connect to server
    if (url.IsSSL())
        rc = m_Webhost.ConnectSSL(url.Domain(), url.Port()) ;
    else
        rc = m_Webhost.ConnectStd(url.Domain(), url.Port()) ;
    if (rc != E_OK)
    {
        m_Error.Printf("Could not connect to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
        return rc ;
    }
    //  Send request
    m_Error << " Sending [" << m_Request << "] to domain " << url.Domain() << "\n" ;
    rc = m_Webhost.Send(m_Request) ;
    if (rc != E_OK)
    {
        m_Error.Printf("Could not send request to domain [%s] (error=%s)\n", *url.Domain(), Err2Txt(rc)) ;
        return rc ;
    }
    //  Garner response
    rc = _procHttpResponse(hRet, url) ;
    if (rc != E_OK)
    {
        m_Error.Printf("Could not process response from [%s] (error=%s)\n", *url, Err2Txt(rc)) ;
        return rc ;
    }
    m_Referer = url ;
    m_Webhost.Close() ;
    return rc ;
}
hzEcode hzHttpClient::GetPage   (HttpRC& hRet, const hzUrl& url, const hzString& etag)
{
    //  Get an HTTP page from a website. Note that the whole page is retrieved or abandoned before this function returns. Some servers send pages with
    //  the header 'Transfer-Encoding: chunked' instead of the 'Content-Length:' header. This is done because the size of the page is not known at the
    //  start of transmission. The body part of the message is sent in chunks with the chunk size given (in hex on a line by itself) at the start of
    //  each chunk. Because the chunked approach exists, this function has to handle it, but it is currently not possible for applications to take
    //  advantage of it in the intended way. Applications calling this function must instead wait until it returns with a complete page, however long
    //  that takes.
    //
    //  Note that no assumptions can be made about the packets sent, except that since the connection is TCP they will arrive in order. The header
    //  may comprise a number of whole packets, or a packet may straddle the end of the header and the start of the content.
    //
    //  Arguments:  1) hRet     HTTP return code from the server.
    //              2) url      The URL of the page to retrieve.
    //              3) etag     Page entity tag (as maintained by hzWebhost instance)
    //
    //  Returns:    E_ARGUMENT  If the URL is not supplied or no domain specified
    //              E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If nothing was received
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the response was received without error
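    //
    //  Usage sketch (hypothetical calling code; the URL shown is illustrative):
    //
    //      hzHttpClient client ;
    //      HttpRC       hRet ;
    //      hzUrl        url ;
    //      hzString     etag ;     //  left empty so no If-None-Match header is sent
    //      hzEcode      rc ;
    //
    //      url = "http://www.example.com/index.html" ;
    //      rc = client.GetPage(hRet, url, etag) ;
    //      if (rc == E_OK && hRet == 200)
    //          { /* page body now in client.m_Content */ }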
    _hzfunc("hzHttpClient::GetPage") ;
    hzUrl       dest ;          //  Actual URL for downloading - may be result of a redirection
    hzString    dom ;           //  This is set first to the called URL's domain but afterwards to any redirected domain
    hzString    etag2 ;         //  Set as null for the benefit of _getpage() in the case of redirection
    hzEcode     rc = E_OK ;     //  Return code
    //  Considered a top-level function so we clear the error chain
    m_Error.Clear() ;
    m_Error.Printf("GETTING PAGE %s\n", *url) ;
    dest = url ;
    m_rtRequest = RealtimeNano() ;
    rc = _getpage(hRet, dest, etag) ;
    m_rtResponse = RealtimeNano() ;
    if (rc != E_OK)
    {
        m_Error.Printf("ABORTED (_getpage failure)\n") ;
        return rc ;
    }
    for (; hRet == HTTPMSG_REDIRECT_PERM || hRet == HTTPMSG_REDIRECT_TEMP ;)
    {
        //Clear() ;
        if (!m_Redirect)
            m_Error.Printf("Oops - no URL to redirect to\n") ;
        else
        {
            if (m_Redirect[0] == CHAR_FWSLASH)
                { dom = dest.Domain() ; dest.SetValue(dom, m_Redirect) ; }
            else
                dest = m_Redirect ;
            m_Error.Printf("redirecting to %s\n", *dest) ;
            rc = _getpage(hRet, dest, etag2) ;
            if (rc != E_OK)
            {
                m_Error.Printf("Redirect FAILED (error=%s)\n", Err2Txt(rc)) ;
                return rc ;
            }
        }
    }
    //  Obtain document type. If HTML then also get links
    m_Error.Printf("Got response %d (size %d bytes)\n", hRet, m_Content.Size()) ;
    return rc ;
}
hzEcode hzHttpClient::_postform (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzChain& formData)
{
    //  Support function for hzHttpClient::PostForm(). Compiles the HTTP request and adds the supplied form. The functionality herein would just
    //  appear in PostForm() except for the need to cope with redirection. This requires that the request ...
    //
    //  Arguments:  1)  hRet        Reference to HTTP return code, set by this operation
    //              2)  url         The URL to post the form to
    //              3)  hdrs        Vector of additional HTTP headers
    //              4)  formData    The actual form data
    //
    //  Returns:    E_ARGUMENT  If the URL is not supplied or no domain specified
    //              E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If nothing was received
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the form was posted and the response was received without error
    _hzfunc("hzHttpClient::PostForm") ;
    hzCookie    cookie ;        //  Cookie (drawn from supplied map of cookies)
    hzString    dom ;           //  Domain part of URL
    hzString    res ;           //  Resource part of URL
    uint32_t    nPort ;         //  Port (from URL)
    uint32_t    nIndex ;        //  Form data iterator
    bool        bFirstCookie ;  //  Controls form of cookie header
    hzEcode     rc ;            //  Return code
    m_Request.Clear() ;
    dom = url.Domain() ;
    res = url.Resource() ;
    nPort = url.Port() ;
    if (url.IsSSL())
        m_Request.Printf("POST https://%s%s HTTP/1.1\r\n", *dom, *res) ;
    else
        m_Request.Printf("POST http://%s%s HTTP/1.1\r\n", *dom, *res) ;
    m_Request << "Host: " << dom << "\r\n" ;
    m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
    m_Request << "Accept: */*\r\n" ;
    m_Request << "Accept-Language: en-gb,en;q=0.5\r\n" ;
    //m_Request << "Accept-Encoding: gzip, deflate\r\n" ;
    m_Request << "Content-Type: application/x-www-form-urlencoded; charset=UTF-8\r\n" ;
    if (m_Referer)
        m_Request << "Referer: " << m_Referer << "\r\n" ;
    m_Request.Printf("Content-Length: %d\r\n", formData.Size()) ;
    if (m_Cookies.Count())
    {
        m_Request << "Cookie: " ;
        bFirstCookie = false ;
        for (nIndex = 0 ; nIndex < m_Cookies.Count() ; nIndex++)
        {
            cookie = m_Cookies.GetObj(nIndex) ;
            if (bFirstCookie)
                m_Request << "; " ;
            m_Request.Printf("%s=%s", *cookie.m_Name, *cookie.m_Value) ;
            bFirstCookie = true ;
        }
        m_Request << "\r\n" ;
    }
    if (hdrs.Count())
    {
        for (nIndex = 0 ; nIndex < hdrs.Count() ; nIndex++)
            //m_Request << hdrs.Element(nIndex) ;
            m_Request << hdrs[nIndex] ;
    }
    m_Request << "Connection: keep-alive\r\n" ;
    m_Request << "Pragma: no-cache\r\n" ;
    m_Request << "Cache-Control: no-cache\r\n\r\n" ;
    m_Request << formData ;
    //  Connect to server
    if (url.IsSSL())
        rc = m_Webhost.ConnectSSL(dom, nPort) ;
    else
        rc = m_Webhost.ConnectStd(dom, nPort) ;
    if (rc != E_OK)
    {
        m_Error.Printf("Could not connect to %s on port %d\n", *dom, nPort) ;
        return rc ;
    }
    m_Error.Printf("Connected to %s on port %d\n[\n", *dom, nPort) ;
    m_Error << m_Request ;
    m_Error << "\n-------------------------\n\n" ;
    rc = m_Webhost.Send(m_Request) ;
    if (rc != E_OK)
        m_Error.Printf("Could not send request (error=%s)\n", Err2Txt(rc)) ;
    else
    {
        rc = _procHttpResponse(hRet, url) ;
        if (rc != E_OK)
            m_Error.Printf("Could not get response (error=%s)\n", Err2Txt(rc)) ;
    }
    return rc ;
}
hzEcode hzHttpClient::PostForm  (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& formData)
{
    //  Post a form to the server. Note that this will normally result in an HTTP response. This response must be processed in the same
    //  way (ie values are extracted from lines in the HTTP header).
    //
    //  Arguments:  1) hRet     HTTP return code
    //              2) url      The URL
    //              3) hdrs     Lines in HTTP header
    //              4) formData The form data to be submitted
    //
    //  Returns:    E_ARGUMENT  If the URL is not supplied or no domain specified
    //              E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If nothing was received
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the form was posted and the response was received without error
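    //
    //  Usage sketch (illustrative only; the URL and field names are hypothetical):
    //
    //      hzHttpClient     client ;
    //      hzList<hzPair>   form ;
    //      hzVect<hzString> hdrs ;     //  no extra headers
    //      hzPair           p ;
    //      HttpRC           hRet ;
    //      hzUrl            url ;
    //
    //      url = "https://www.example.com/login" ;
    //      p.name = "username" ; p.value = "alice" ;  form.Add(p) ;
    //      p.name = "password" ; p.value = "secret" ; form.Add(p) ;
    //      hzEcode rc = client.PostForm(hRet, url, hdrs, form) ;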
    _hzfunc("hzHttpClient::PostForm") ;
    hzList<hzPair>::Iter    iD ;    //  Form data iterator
    hzChain     F ;                 //  Form data in submissible form
    hzCookie    cookie ;            //  Cookie (drawn from supplied map of cookies)
    hzPair      P ;                 //  Form data field
    hzUrl       dest ;              //  Url may change due to redirection
    hzString    dom ;               //  Domain part of URL
    hzString    res ;               //  Resource part of URL
    hzString    etag ;              //  Temp string for reading form data
    hzEcode     rc ;                //  Return code
    //  Considered a top-level function so we clear the error chain
    m_Error.Clear() ;
    m_Error.Printf("POSTING FORM %s\n", *url) ;
    //Clear() ;
    m_Header.Clear() ;
    m_Content.Clear() ;
    m_Request.Clear() ;
    if (!formData.Count())
        return E_NODATA ;
    for (iD = formData ; iD.Valid() ; iD++)
    {
        P = iD.Element() ;
        if (F.Size())
            F.AddByte(CHAR_AMPSAND) ;
        F << P.name ;
        F.AddByte(CHAR_EQUAL) ;
        P.value.UrlEncode() ;
        F << P.value ;
    }
    dest = url ;
    rc = _postform(hRet, dest, hdrs, F) ;
    if (rc != E_OK)
    {
        m_Error.Printf("FAILED (error=%s)\n", Err2Txt(rc)) ;
        return rc ;
    }
    for (; hRet == HTTPMSG_REDIRECT_PERM || hRet == HTTPMSG_REDIRECT_TEMP ;)
    {
        if (!m_Redirect)
            m_Error.Printf("Oops - no URL to redirect to\n") ;
        else
        {
            if (m_Redirect[0] == CHAR_FWSLASH)
                { dom = dest.Domain() ; dest.SetValue(dom, m_Redirect) ; }
            else
                dest = m_Redirect ;
            m_Error.Printf("redirecting to %s\n", *dest) ;
            etag = (char*) 0 ;
            rc = _getpage(hRet, dest, etag) ;
            if (rc != E_OK)
            {
                m_Error.Printf("Redirect FAILED (error=%s)\n", Err2Txt(rc)) ;
                break ;
            }
        }
    }
    return rc ;
}
hzEcode hzHttpClient::PostAjax  (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& formData)
{
    //  Post a form to the server but do not seek an HTTP response.
    //
    //  Arguments:  1)  hRet        HTTP return code
    //              2)  url         The URL
    //              3)  hdrs        Lines in HTTP header
    //              4)  formData    The form data to be submitted
    //
    //  Returns:    E_ARGUMENT  If the URL is not supplied or no domain specified
    //              E_NOSOCKET  If the external server has closed the connection
    //              E_NODATA    If no form data was supplied
    //              E_FORMAT    If the response was malformed
    //              E_OK        If the AJAX request was sent without error
    _hzfunc("hzHttpClient::PostAjax") ;
    hzList<hzPair>::Iter    iD ;    //  Form data iterator
    hzChain     F ;                 //  Form data in submissible form
    hzCookie    cookie ;            //  Cookie (drawn from supplied map of cookies)
    hzPair      P ;                 //  Form data field
    hzString    dom ;               //  Domain part of URL
    hzString    res ;               //  Resource part of URL
    hzString    S ;                 //  Temp string for reading form data
    uint32_t    nPort ;             //  Port (from URL)
    uint32_t    nIndex ;            //  Form data iterator
    hzEcode     rc ;                //  Return code
    //Clear() ;
    m_Header.Clear() ;
    m_Content.Clear() ;
    m_Request.Clear() ;
    if (!formData.Count())
        return E_NODATA ;
    for (iD = formData ; iD.Valid() ; iD++)
    {
        P = iD.Element() ;
        if (F.Size())
            F.AddByte(CHAR_AMPSAND) ;
        F << P.name ;
        F.AddByte(CHAR_EQUAL) ;
        F << P.value ;
    }
    dom = url.Domain() ;
    res = url.Resource() ;
    nPort = url.Port() ;
    if (url.IsSSL())
        m_Request.Printf("POST https://%s%s HTTP/1.1\r\n", *dom, *res) ;
    else
        m_Request.Printf("POST http://%s%s HTTP/1.1\r\n", *dom, *res) ;
    //m_Request << "POST " << "http://" << dom << res << " HTTP/1.1\r\n" ;
    m_Request << "Accept: text/*\r\n" ;
    m_Request << "Accept-Language: en-gb\r\n" ;
    //m_Request << "Accept-Encoding:\r\n" ;
    //m_Request << "Accept-Encoding: gzip, deflate\r\n" ;
    for (nIndex = 0 ; nIndex < m_Cookies.Count() ; nIndex++)
    {
        cookie = m_Cookies.GetObj(nIndex) ;
        if (cookie.m_Flags & COOKIE_HTTPONLY)
            continue ;
        m_Request.Printf("Cookie: %s=%s\r\n", *cookie.m_Name, *cookie.m_Value) ;
    }
    //m_Request << "User-Agent: HadronZoo/0.8 (compatible; MSIE 6.0;)\r\n" ;
    m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
    m_Request.Printf("Content-Length: %d\r\n", F.Size()) ;
    m_Request << "Host: " << dom << "\r\n" ;
    if (hdrs.Count())
    {
        for (nIndex = 0 ; nIndex < hdrs.Count() ; nIndex++)
            //m_Request << hdrs.Element(nIndex) ;
            m_Request << hdrs[nIndex] ;
    }
    m_Request << "Connection: close\r\n\r\n" ;
    m_Request << F ;
    S = m_Request ;
    threadLog("Sending [\n%s]\n", *S) ;
    //  Connect to server
    if (url.IsSSL())
        rc = m_Webhost.ConnectSSL(dom, nPort) ;
    else
        rc = m_Webhost.ConnectStd(dom, nPort) ;
    if (rc != E_OK)
        return rc ;
    //  Send request
    rc = m_Webhost.Send(m_Request) ;
    return rc ;
}
/*
**  Section 2, Subsect-A:   hzWebhost private functions
*/
void    hzWebhost::_clear   (void)
{
    //  Clears the hzWebhost for shutdown or for re-initialization for syncing another website
    //
    //  Arguments:  None
    //  Returns:    None
    _hzfunc("hzWebhost::_clear") ;
    hzDocMeta*  pMark ;     //  Document info
    uint32_t    nIndex ;    //  History iterator
    m_Offsite.Clear() ;
    m_Domains.Clear() ;
    m_Roots.Clear() ;
    m_Feeds.Clear() ;
    m_Emails.Clear() ;
    m_Banned.Clear() ;
    for (nIndex = 0 ; nIndex < m_mapHist.Count() ; nIndex++)
    {
        pMark = m_mapHist.GetObj(nIndex) ;
        delete pMark ;
    }
    m_mapHist.Clear() ;
    m_vecHist.Clear() ;
}
hzEcode hzWebhost::_loadstatus  (void)
{
    //  Load visit status file (called upon startup). This way we do not re-fetch pages that have already been loaded unless they are out of date.
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOINIT    If the repository for the webhost has not previously been defined
    //              E_OPENFAIL  If the visit status file could not be opened
    //              E_OK        If the visit status file is read in or was empty
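    //
    //  Illustrative manifest fragment (format inferred from the parsing below; names and values are hypothetical):
    //
    //      <pagelists>
    //          <pagelist name="news">
    //              <page url="http://www.example.com/a.html"/>
    //          </pagelist>
    //      </pagelists>
    //      <history>
    //          <page urlReq="http://www.example.com/a.html" fname="a.html" etag="xyz"/>
    //      </history>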
    _hzfunc("hzWebhost::_loadstatus") ;
    hzDocXml        X ;             //  The manifest as XML document
    hzWebCMD        wc ;            //  Current web command
    hzAttrset       ai ;            //  Attribute iterator
    hzDocMeta*      pMark ;         //  Link meta data
    hzXmlNode*      pRoot ;         //  Root XML node
    hzXmlNode*      pN1 ;           //  Level 1 XML node
    hzXmlNode*      pN2 ;           //  Level 2 XML node
    hzXmlNode*      pN3 ;           //  Level 3 XML node
    _pageList*      pgl ;           //  List of lists of pages
    hzPair          p ;             //  Pair from formdata
    hzUrl           url ;           //  in-page link
    hzString        vs_fname ;      //  Visitation status file
    hzString        anam ;          //  Attribute name
    hzString        aval ;          //  Attribute value
    hzEcode         rc = E_OK ;     //  Return
    m_mapHist.Clear() ;
    m_vecHist.Clear() ;
    if (!m_Repos)
        return hzerr(E_NOINIT, "No repository specified. Cannot determine data state") ;
    vs_fname = m_Repos + "/manifest" ;
    rc = TestFile(vs_fname) ;
    if (rc == E_NOTFOUND)
        { threadLog("No status file found. Repository in virgin state\n") ; return E_OK ; }
    if (rc != E_OK)
        { threadLog("manifest file lookup error (%s)\n", Err2Txt(rc)) ; return rc ; }
    rc = X.Load(vs_fname) ;
    if (rc != E_OK)
        { threadLog("Could not open Visit Status File %s for writing\n", *vs_fname) ; return E_OPENFAIL ; }
    pRoot = X.GetRoot() ;
    for (pN1 = pRoot->GetFirstChild() ; rc == E_OK && pN1 ; pN1 = pN1->Sibling())
    {
        if (pN1->NameEQ("pagelists"))
        {
            for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
            {
                if (pN2->NameEQ("pagelist"))
                {
                    ai = pN2 ;
                    if (ai.Valid())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        pgl = new _pageList() ;
                        if (anam == "name")
                            pgl->name = aval ;
                    }
                    for (pN3 = pN2->GetFirstChild() ; rc == E_OK && pN3 ; pN3 = pN3->Sibling())
                    {
                        if (pN3->NameEQ("page"))
                        {
                            ai = pN3 ;
                            if (ai.Valid())
                            {
                                anam = ai.Name() ; aval = ai.Value() ;
                                if (anam == "url")
                                    pgl->links.Add(aval) ;
                            }
                        }
                    }
                }
            }
        }
        if (pN1->NameEQ("commands"))
        {
            ai = pN1 ;
            if (ai.Valid())
            {
                anam = ai.Name() ; aval = ai.Value() ;
                if (anam == "sofar")
                    m_Sofar = atoi(*aval) ;
            }
        
            for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
            {
                if (pN2->NameEQ("command"))
                    continue ;
                for (ai = pN2 ; ai.Valid() ; ai.Advance())
                {
                    anam = ai.Name() ; aval = ai.Value() ;
                    if      (anam == "url")     wc.m_Url = aval ;
                    else if (anam == "crit")    wc.m_Crit = aval ;
                    else if (anam == "slct")    wc.m_Slct = aval ;
                    else if (anam == "inps")    wc.m_Inputs = aval ;
                    else if (anam == "outs")    wc.m_Output = aval ;
                }
                pN3 = pN2->GetFirstChild() ;
                if (pN3 && pN3->NameEQ("form"))
                {
                    for (ai = pN3 ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        p.name = anam ;
                        p.value = aval ;
                        wc.m_Formdata.Add(p) ;
                    }
                }
            }
        }
        if (pN1->NameEQ("history"))
        {
            for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
            {
                if (pN2->NameEQ("page"))
                {
                    pMark = new hzDocMeta() ;
                    for (ai = pN2 ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        if      (anam == "urlReq")  pMark->m_urlReq = aval ;
                        else if (anam == "urlAct")  pMark->m_urlAct = aval ;
                        else if (anam == "title")   pMark->m_Title = aval ;
                        else if (anam == "desc")    pMark->m_Desc = aval ;
                        else if (anam == "fname")   pMark->m_Filename = aval ;
                        else if (anam == "etag")    pMark->m_Etag = aval ;
                        else if (anam == "dtDnl")   pMark->m_Download.SetDateTime(aval) ;
                        else if (anam == "dtMod")   pMark->m_Modified.SetDateTime(aval) ;
                        else if (anam == "dtExp")   pMark->m_Expires.SetDateTime(aval) ;
                        else if (anam == "type")    pMark->m_Doctype = (hzDoctype) atoi(*aval) ;
                        else
                            threadLog("Unexpected page attribute %s=%s\n", *anam, *aval) ;
                    }
                    m_vecHist.Add(pMark) ;
                }
            }
        }
    }
    return rc ;
}
hzEcode hzWebhost::_savestatus  (void)
{
    //  Write out the visit status file. This keeps a record of which URLs have already been downloaded and to which files, and the expiry
    //  date (after which the page will have to be fetched again).
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOINIT    If the repository for the webhost has not previously been defined
    //              E_OPENFAIL  If the visit status file could not be opened
    //              E_OK        If the visit status file was written out
    _hzfunc("hzWebhost::_status") ;
    hzList<hzUrl>::Iter     li ;    //  Links iterator (for pagelists)
    hzList<hzWebCMD>::Iter  ci ;    //  Iterator for web commands
    hzList<hzPair>::Iter    pi ;    //  Iterator for web commands
    ofstream    os ;                //  Output stream
    hzWebCMD    wc ;                //  Current web command
    hzCookie    cook ;              //  Cookie instance
    hzChain     Z ;                 //  For building status file
    _pageList*  pgl ;               //  Pagelist
    hzDocMeta*  pMark ;             //  Document meta data
    hzPair      p ;                 //  Pair from formdata
    hzString    vs_fname ;          //  Visitation status file
    hzString    S ;                 //  Tmp string
    hzUrl       url ;               //  Link
    uint32_t    nIndex ;            //  Links iterator
    uint32_t    x ;                 //  Links iterator
    hzEcode     rc = E_OK ;         //  Return
    if (!m_Repos)
        return hzerr(E_NOINIT, "No repository specified. Cannot determine data state") ;
    vs_fname = m_Repos + "/manifest" ;
    os.open(*vs_fname) ;
    if (os.fail())
    {
        threadLog("Could not open Visit Status File %s for writing\n", *vs_fname) ;
        return E_OPENFAIL ;
    }
    threadLog("savestat: case 1\n") ;
    if (m_Cookies.Count())
    {
        Z << "<cookies>\n" ;
        for (x = 0 ; x < m_Cookies.Count() ; x++)
        {
            cook = m_Cookies.GetObj(x) ;
            Z.Printf("\t<cookie sig=\"%s\" name=\"%s\" path=\"%s\" flg=\"%d\" expire=\"%s\"/>\n",
                *cook.m_Value, *cook.m_Name, *cook.m_Path, cook.m_Flags, *cook.m_Expires) ;
        }
        Z << "</cookies>\n" ;
    }
    threadLog("savestat: case 2\n") ;
    if (m_Pagelists.Count())
    {
        Z << "<pagelists>\n" ;
        for (x = 0 ; x < m_Pagelists.Count() ; x++)
        {
            pgl = m_Pagelists.GetObj(x) ;
            Z.Printf("\t<pagelist name=\"%s\">\n", *pgl->name) ;
            if (pgl->links.Count())
            {
                for (li = pgl->links ; li.Valid() ; li++)
                {
                    url = li.Element() ;
                    Z.Printf("\t\t<page url=\"%s\">\n", *url.Whole()) ;
                }
            }
            Z << "\t</pagelist>\n" ;
        }
        Z << "</pagelists>\n" ;
    }
    threadLog("savestat: case 3\n") ;
    /*
    **  Do command list and status
    */
    Z.Printf("<commands sofar=\"%d\">\n", m_Sofar) ;
    for (ci = m_Commands ; ci.Valid() ; ci++)
    {
        wc = ci.Element() ;
        if (wc.m_Cmd == WEBCMD_LOAD_PAGE)   Z << "\t<command type=\"WEBCMD_LOAD_PAGE\"" ;
        if (wc.m_Cmd == WEBCMD_LOAD_LIST)   Z << "\t<command type=\"WEBCMD_LOAD_LIST\"" ;
        if (wc.m_Cmd == WEBCMD_SLCT_PAGE)   Z << "\t<command type=\"WEBCMD_SLCT_PAGE\"" ;
        if (wc.m_Cmd == WEBCMD_SLCT_LIST)   Z << "\t<command type=\"WEBCMD_SLCT_LIST\"" ;
        if (wc.m_Cmd == WEBCMD_RGET)        Z << "\t<command type=\"WEBCMD_RGET\"" ;
        if (wc.m_Cmd == WEBCMD_POST)        Z << "\t<command type=\"WEBCMD_POST\"" ;
        if (wc.m_Cmd == WEBCMD_RSS)         Z << "\t<command type=\"WEBCMD_RSS\"" ;
        if (wc.m_Url)       Z.Printf(" url=\"%s\"", *wc.m_Url) ; 
        if (wc.m_Crit)      Z.Printf(" crit=\"%s\"", *wc.m_Crit) ;
        if (wc.m_Slct)      Z.Printf(" slct=\"%s\"", *wc.m_Slct) ;
        if (wc.m_Inputs)    Z.Printf(" inps=\"%s\"", *wc.m_Inputs) ;
        if (wc.m_Output)    Z.Printf(" outs=\"%s\"", *wc.m_Output) ;
        if (!wc.m_Formdata.Count())
            Z << " />\n" ;
        else
        {
            Z << ">\n" ;
            Z << "\t\t<form " ;
            for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
            {
                p = pi.Element() ;
    
                Z.Printf(" %s=\"%s\"", *p.name, *p.value) ;
            }
            Z << " />\n" ;
            Z << "\t</command>\n" ;
        }
    }
    Z << "</commands>\n" ;
    threadLog("savestat: case 4\n") ;
    /*
    **  Do History
    */
    Z << "<history>\n" ;
    for (nIndex = 0 ; nIndex < m_vecHist.Count() ; nIndex++)
    {
        pMark = m_vecHist[nIndex] ;
        Z.Printf("\t<webpage id=\"%d\" type=\"%d\"", pMark->m_Id, (uint32_t) pMark->m_Doctype) ;
        if (pMark->m_urlReq)            Z.Printf("\n\t\turlReq=\"%s\"", *pMark->m_urlReq) ;
        if (pMark->m_urlAct)            Z.Printf("\n\t\turlAct=\"%s\"", *pMark->m_urlAct) ;
        if (pMark->m_Title)             Z.Printf("\n\t\ttitle=\"%s\"",  *pMark->m_Title) ;
        if (pMark->m_Desc)              Z.Printf("\n\t\tdesc=\"%s\"",   *pMark->m_Desc) ;
        if (pMark->m_Filename)          Z.Printf("\n\t\tfname=\"%s\"",  *pMark->m_Filename) ;
        if (pMark->m_Etag)              Z.Printf("\n\t\e-tag=\"%s\"",   *pMark->m_Etag) ;
        if (pMark->m_Download.IsSet())  Z.Printf("\n\t\tdtDnl=\"%s\"",  *pMark->m_Download) ;
        if (pMark->m_Modified.IsSet())  Z.Printf("\n\t\tdtMod=\"%s\"",  *pMark->m_Modified) ;
        if (pMark->m_Expires.IsSet())   Z.Printf("\n\t\tdtExp=\"%s\"",  *pMark->m_Expires) ;
        Z << "/>\n" ;
    }
    Z << "</history>\n" ;
    threadLog("savestat: case 5\n") ;
    if (m_Trace.Size())
    {
        Z << "<trace>\n" ;
        Z << m_Trace ;
        Z << "</trace>\n" ;
    }
    threadLog("savestat: case 6\n") ;
    //Rat4Html(Z) ;
    os << Z ;
    os.close() ;
    return rc ;
}
hzEcode hzWebhost::AddRoot (hzUrl& url, hzString& criteria)
{
    //  Adds a root URL for the target website
    //
    //  Arguments:  1)  url         The root URL of the website
    //              2)  criteria    The resource we want as the entry point
    //
    //  Returns:    E_ARGUMENT  If the URL is not specified
    //              E_OK        If the root is added
    _hzfunc("hzWebhost::AddRoot") ;
    hzPair  X ;     //  URL/search criteria pair
    if (!url)
        return E_ARGUMENT ;
    X.name = url.Whole() ;
    X.value = criteria ;
    m_Roots.Add(X) ;
    return E_OK ;
}
hzEcode hzWebhost::AddRSS  (hzUrl& rss)
{
    //  Adds an RSS feed URL for the target website
    //
    //  Arguments:  1)  rss     The URL of the website's RSS feed
    //
    //  Returns:    E_ARGUMENT  If the URL is not specified
    //              E_OK        If the feed is added
    _hzfunc("hzWebhost::AddRSS") ;
    m_Feeds.Add(rss) ;
    return E_OK ;
}
#define SITEPARAM_USE_FIRST_COOKIE  0x01    //  Use the first cookie provided for the rest of session
#define SITEPARAM_USE_LOGIN_COOKIE  0x02    //  Use the cookie in the login response for the rest of session
hzEcode hzWebhost::AuthBasic    (const char* username, const char* password)
{
    //  Sets the basic authentication string for the website (if the site uses this method). Once set all requests to the target website will be
    //  submitted with this string in the HTTP header.
    //
    //  Arguments:  1)  username    The user account username
    //              2)  password    The user account password
    //
    //  Returns:    E_ARGUMENT  If either the username or password is not supplied
    //              E_OK        If the authentication string is set
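    //
    //  For example (illustrative credentials), AuthBasic("aladdin", "opensesame") causes subsequent requests to carry the header
    //      Authorization: Basic YWxhZGRpbjpvcGVuc2VzYW1l
    //  i.e. the Base64 encoding of "aladdin:opensesame".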
    _hzfunc("hzWebhost::AuthBasic") ;
    hzChain Enc ;           //  The encrypted sequence
    hzChain Raw ;           //  The raw sequence
    if (!username || !username[0] || !password || !password[0])
    {
        threadLog("Must supply both a username and password\n") ;
        return E_ARGUMENT ;
    }
    Raw << username ;
    Raw.AddByte(CHAR_COLON) ;
    Raw << password ;
    Base64Encode(Enc, Raw) ;
    HC.m_AuthBasic = m_AuthBasic = Enc ;
    return E_OK ;
}
hzEcode hzWebhost::Login    (void)
{
    //  Execute the login process. This is always a case of downloading each page listed in m_Authsteps (if any) and then posting to the URL given in m_Authpage (if provided) with
    //  the name-value pairs listed in m_Authform.
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOTFOUND  If the login page was not located
    //              E_WRITEFAIL If the form received was not written to the repository
    //              E_OK        If the login form was posted (not the same thing as a successful login)
    _hzfunc("hzWebhost::Login") ;
    hzList<hzUrl>::Iter     ias ;       //  Iterator for URLs in m_Authsteps
    hzList<hzPair>::Iter    inv ;       //  Iterator for name-value pairs in m_Authform
    hzVect<hzString>        hdrs ;      //  Extra headers, needed for submit form (not generally applicable)
    ofstream    os ;                    //  For exporting to file
    hzDocument* pDoc ;                  //  Downloaded document
    hzPair      P ;                     //  Name-value pair instance
    hzUrl       url ;                   //  URL instance
    hzString    S ;                     //  Temp string
    hzString    etag ;                  //  For GetPage() call
    HttpRC      hRet ;                  //  HTTP return code
    bool        bAuthpage = false ;     //  Set to true if the login form (if used) is correctly listed in m_Authsteps
    hzEcode     rc = E_OK ;             //  Return code
    threadLog("Starting Login Sequence\n") ;
    //  Verify whether we have to log on and if so, that the parameters are in place to support the login
    if (m_Opflags & HZ_WEBSYNC_AUTH_BASIC)
        { threadLog("Basis Authentication. No login process required\n") ; return E_OK ; }
    if (!(m_Opflags & (HZ_WEBSYNC_AUTH_POST | HZ_WEBSYNC_AUTH_GET)))
    {
        threadLog("No Authentication method\n") ;
        if (!m_Authsteps.Count() && !m_Authform.Count())
            { threadLog("No Authentication steps or form submission. No login process required\n") ; return E_OK ; }
    }
    //  Download all pages listed in m_Authsteps (note the download must happen even if the page is in the history because we need the cookies)
    for (ias = m_Authsteps ; rc == E_OK && ias.Valid() ; ias++)
    {
        url = ias.Element() ;
        if (url == m_Authpage)
            bAuthpage = true ;
        rc = HC.GetPage(hRet, url, etag) ;
        if (rc != E_OK)
            { rc = E_NOTFOUND ; threadLog("Could not download %s\n", *url) ; }
    }
    if (rc != E_OK)
        return rc ;
    if (!bAuthpage && m_Authpage)
    {
        pDoc = Download(m_Authpage) ;
        if (!pDoc)
            { threadLog("Could not download %s\n", *url) ; return E_NOTFOUND ; }
    }
    //  Now if there is a login form, post this now
    if (m_Authform.Count())
    {
        //  Write out login form to file
        if (m_Repos)
        {
            S = m_Repos + "/login_form" ;
            os.open(*S) ;
            if (os.fail())
                { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
            os << HC.m_Header ;
            os << "\r\n\r\n" ;
            os << HC.m_Content ;
            os.close() ;
            os.clear() ;
        }
        //  Post the form
        rc = HC.PostForm(hRet, m_Authpage, hdrs, m_Authform) ;
        if (rc != E_OK)
            { threadLog("Could not post form to %s\n", *m_Authpage) ; return rc ; }
        //  Write out the login response
        if (m_Repos)
        {
            S = m_Repos + "/login_response" ;
    
            os.open(*S) ;
            if (os.fail())
                { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
            os << HC.m_Header ;
            os << "\r\n\r\n" ;
            os << HC.m_Content ;
            os.close() ;
        }
    }
    return rc ;
}
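//  Illustrative login configuration. This is a minimal sketch; the member names follow the code above but direct member access, the hzUrl
//  construction and all URLs and field values are assumptions:
//
//      hzWebhost   host ;
//      hzPair      P ;
//
//      host.m_Authsteps.Add(hzUrl("http://www.example.com/login.html")) ;  //  Page(s) fetched first, for cookies
//      host.m_Authpage = hzUrl("http://www.example.com/do_login") ;        //  URL the login form is posted to
//      P.name = "username" ;   P.value = "alice" ;     host.m_Authform.Add(P) ;
//      P.name = "password" ;   P.value = "secret" ;    host.m_Authform.Add(P) ;
//
//      if (host.Login() != E_OK)
//          threadLog("Login form could not be posted\n") ;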
void    hzWebhost::Logout   (void)
{
    //  Execute the logout process.
    //
    //  Arguments:  None
    //  Returns:    None
    _hzfunc("hzWebhost::Logout") ;
    //  STUB
}
hzEcode hzWebhost::Sync (void)
{
    //  Run the series of hzWebCMD directives to sync key pages from a website to a repository
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOINIT    If no repository, no domain or no homepage has been specified
    //              E_NOTFOUND  If the login page was not located
    //              E_WRITEFAIL If the login form received was not written to the repository
    //              E_OPENFAIL  If the visit status file could not be opened
    //              E_OK        If the scrape operation was successful
    _hzfunc("hzWebhost::Sync") ;
    hzMapS  <hzUrl,hzDocument*> cur ;       //  Currently loaded documents
    hzMapS  <hzString,hzString> fvals ;     //  Form values to be submitted
    hzVect  <hzHtmElem*>        elems ;     //  Elements selected by the web selector command
    hzList  <hzWebCMD>::Iter    ci ;        //  Iterator for web commands
    hzList  <hzPair>::Iter      pi ;        //  Iterator for form data
    hzList  <hzUrl>::Iter       si ;        //  Iterator for pagelist
    hzList  <hzHtmForm*>::Iter  fi ;        //  Iterator for forms
    hzSet   <hzUrl>     set_ctrl ;          //  Initial links from processing config params
    hzVect  <hzUrl>     pglinks ;           //  Links encountered within a given page
    hzVect  <hzUrl>     allinks ;           //  Links encountered across all pages
    hzVect  <hzString>  hdrs ;              //  Extra headers, needed for submit form
    hzList  <hzPair>    flist ;             //  Filtered list of form values
    ofstream        os ;                    //  For writing form response
    _pageList*      pgl = 0 ;               //  Primary pagelist instance
    _pageList*      pgl2 = 0 ;              //  Secondary pagelist instance
    hzWebCMD        wc ;                    //  Current web command
    hzDocument*     pDoc ;                  //  Downloaded document
    hzDocHtml*      pHdoc ;                 //  Set if downloaded document is a HTML page.
    hzHtmElem*      pElem ;                 //  HTML element (tag) lifted from page
    hzHtmForm*      pForm ;                 //  Form found in page
    hzPair          P ;                     //  Name value pair
    hzXDate         now ;                   //  Date/time now (for checking if pages have expired)
    hzAttrset       ai ;                    //  HTML element attribute iterator
    hzString        anam ;                  //  Attribute name
    hzString        aval ;                  //  Attribute value
    hzString        S ;                     //  Temp string
    hzUrl           url ;                   //  Temp link
    uint32_t        nStart ;                //  Links iterator
    uint32_t        nLimit ;                //  Links iterator
    uint32_t        nCount ;                //  Links iterator
    uint32_t        n ;                     //  Aggregation iterator
    HttpRC          hRet = HTTPMSG_OK ;     //  HTTP return code
    hzEcode         rc ;                    //  Return code
    threadLog("Called hzWebhost::Sync\n") ;
    //  Check if the repository and the list of commands are set up
    if (!m_Repos)
        { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
    if (!m_Commands.Count())
        { threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }
    //  Read in any existing manifest file
    rc = _loadstatus() ;
    if (rc != E_OK)
        { threadLog("Error on loading status - aborting\n") ; return rc ; }
    //  If resuming execution, start where we left off
    for (n = 0, ci = m_Commands ; n < m_Sofar ; n++, ci++) ;
    //  Execute commands in order
    for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
    {
        pDoc = 0 ;
        wc = ci.Element() ;
        switch  (wc.m_Cmd)
        {
        case WEBCMD_LOAD_PAGE:  //  Get a page (no conditions)
            if (!wc.m_Url)
                { threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }
            threadLog("Doing WEBCMD_LOAD_PAGE\n") ;
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }
            cur.Insert(wc.m_Url, pDoc) ;
            if (pDoc->Whatami() == DOCTYPE_HTML)
            {
                pHdoc = (hzDocHtml*) pDoc ;
                if (pHdoc->m_Forms.Count())
                {
                    //  Add the forms to the m_Forms map in the hzWebhost instance
                    for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
                    {
                        pForm = fi.Element() ;
                        m_Forms.Insert(pForm->name, pForm) ;
                    }
                }
            }
            break ;
        case WEBCMD_LOAD_LIST:  //  Get a list of pages (list supplied in command)
            threadLog("Doing WEBCMD_LOAD_LIST\n") ;
            if (!wc.m_Inputs)
                { threadLog(" - Invalid loadList command - no list of links named\n") ; rc  = E_NOTFOUND ; break ; }
            if (!m_Pagelists.Exists(wc.m_Inputs))
                { threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc  = E_NOTFOUND ; break ; }
            pgl = m_Pagelists[wc.m_Inputs] ;
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                pDoc = Download(url) ;
                if (!pDoc)
                    { threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
                else
                    threadLog(" - Fetched page %s\n", *url) ;
            }
            threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
            break ;
        case WEBCMD_SLCT_PAGE:  //  Select links from a page
            threadLog("Doing WEBCMD_SLCT_PAGE\n") ;
            if (wc.m_Url && wc.m_Inputs)    { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
            if (!wc.m_Url && !wc.m_Inputs)  { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
            if (!wc.m_Output)               { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)   { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globbing criteria\n") ; }
            if (rc != E_OK)
                break ;
            if (cur.Exists(wc.m_Url))
                pDoc = cur[wc.m_Url] ;
            else
                pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }
            pgl = new _pageList() ;
            pgl->name = wc.m_Output ;
            if (pDoc->Whatami() != DOCTYPE_HTML)
                threadLog("Not a HTML document\n") ;
            else
            {
                pHdoc = (hzDocHtml*) pDoc ;
                for (n = 0 ; n < pHdoc->m_vecTags.Count() ; n++)
                {
                    pElem = pHdoc->m_vecTags[n] ;
                    threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        threadLog(" %s=%s", ai.Name(), ai.Value()) ;
                    }
                    threadLog(" />\n") ;
                }
                rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                for (n = 0 ; n < elems.Count() ; n++)
                {
                    pElem = elems[n] ;
                    threadLog("%s. GOT <%s ", *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        threadLog(" %s=%s", *anam, *aval) ;
                        if (anam == "href")
                        {
                            url = aval ;
                            pgl->links.Add(url) ;
                        }
                    }
                    threadLog(" />\n") ;
                }
            }
            threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
            m_Pagelists.Insert(pgl->name, pgl) ;
            break ;
        case WEBCMD_SLCT_LIST:  //  Select links from a set of pages (supplied as a set of links)
            threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ;
            if (!wc.m_Inputs)
                { threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }
            if (!wc.m_Output)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; }
            if (rc != E_OK)
                break ;
            pgl = m_Pagelists[wc.m_Inputs] ;
            if (!pgl)
                { rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }
            pgl2 = new _pageList() ;
            pgl2->name = wc.m_Output ;
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                if (cur.Exists(url))
                    pDoc = cur[url] ;
                else
                    pDoc = Download(url) ;
                if (!pDoc)
                    { rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }
                if (pDoc->Whatami() == DOCTYPE_HTML)
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                    for (n = 0 ; n < elems.Count() ; n++)
                    {
                        pElem = elems[n] ;
                        threadLog("%s. GOT <%s ", *pElem->Name()) ;
                        for (ai = pElem ; ai.Valid() ; ai.Advance())
                        {
                            anam = ai.Name() ; aval = ai.Value() ;
                            threadLog(" %s=%s", *anam, *aval) ;
                            if (anam == "href")
                            {
                                url = aval ;
                                pgl2->links.Add(url) ;
                            }
                        }
                        threadLog(" />\n") ;
                    }
                }
            }
            threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
            m_Pagelists.Insert(pgl2->name, pgl2) ;
            break ;
        case WEBCMD_RGET:   //  Get a root page
            threadLog("Doing WEBCMD_RGET\n") ;
            threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;
            //  Get root page first
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *wc.m_Url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                }
                delete pDoc ;
            }
            //  Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
            for (n = 0 ; n < pglinks.Count() ; n++)
            {
                url = pglinks[n] ;
                if (!set_ctrl.Exists(url))
                    allinks.Add(url) ;
            }
            //  Starting at the site root and for each page, grab all links and go to each link in turn
            threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
            for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
            {
                now.SysDateTime() ;
                pglinks.Clear() ;
                for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
                {
                    url = allinks[nCount] ;
                    threadLog("Cosidering link %s - ", *url.Whole()) ;
                    if (m_mapHist.Exists(url))              { threadLog("historic\n") ; continue ; }
                    if (url == m_Authexit)                  { threadLog("exit-page\n") ; continue ; }
                    if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }
                    //  Page not yet visited so we visit it, put it in the list of pages visited and get the links. Some of these links may add to
                    //  the list of links.
                    threadLog("Fetching\n") ;
                    pDoc = Download(url) ;
                    if (!pDoc)
                        threadLog("case 2. Could not fetch page %s\n", *url) ;
                    else
                    {
                        if (pDoc->Whatami() == DOCTYPE_HTML)
                        {
                            pHdoc = (hzDocHtml*) pDoc ;
                            pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                            //  Re-aggregate the all-links vector
                            for (n = 0 ; n < pglinks.Count() ; n++)
                            {
                                url = pglinks[n] ;
                                if (!set_ctrl.Exists(url))
                                    allinks.Add(url) ;
                            }
                        }
                        delete pDoc ;
                    }
                }
            }
            break ;
        case WEBCMD_POST:   //  Post a form. The form should have been previously downloaded and will be looked for by name
            threadLog("Doing WEBCMD_POST\n") ;
            pForm = m_Forms[wc.m_Output] ;
            if (!pForm)
                threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;
            //  Post the command's form data directly. (Merging it with the form's own field defaults, per the commented-out code below, is currently disabled.)
            /*
            for (pi = pForm->fields ; pi.Valid() ; pi++)
                { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
                { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            for (n = 0 ; n < fvals.Count() ; n++)
            {
                P.name = fvals.GetKey(n) ;
                P.value = fvals.GetObj(n) ;
                flist.Add(P) ;
            }
            */
            rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
            if (rc != E_OK)
                { threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
            if (hRet != HTTPMSG_OK)
                { threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return E_NODATA ; }
            //  Write out the form response
            if (m_Repos)
            {
                url = wc.m_Url ;
                S = m_Repos + "/" + url.Filename() ;
                S += ".response" ;
    
                os.open(*S) ;
                if (os.fail())
                    { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
                os << HC.m_Header ;
                os << "\r\n\r\n" ;
                os << HC.m_Content ;
                os.close() ;
            }
            break ;
        case WEBCMD_RSS:    //  Get an RSS feed
            threadLog("Doing WEBCMD_RSS\n") ;
            //  If XML selectors for RSS feed are not initialized, set them here
            if (!m_tagItem.m_Slct)  { m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
            if (!m_tagTitl.m_Slct)  { m_tagTitl.m_Filt = (char*) 0 ; m_tagTitl.m_Info = "node" ; m_tagTitl.m_Slct = "title" ; }
            if (!m_tagUqid.m_Slct)  { m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
            if (!m_tagLink.m_Slct)  { m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
            if (!m_tagDesc.m_Slct)  { m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
            if (!m_tagDate.m_Slct)  { m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }
            //  Get the feed
            rc = getRss_r(hRet, wc.m_Url, 0) ;
            threadLog("Processed items\n") ;
            break ;
        }
    }
    //  Write out manifest file
    rc = _savestatus() ;
    //  Clear documents
    for (n = 0 ; n < m_Pagelists.Count() ; n++)
    {
        pgl = m_Pagelists.GetObj(n) ;
        delete pgl ;
    }
    for (n = 0 ; n < cur.Count() ; n++)
    {
        pDoc = cur.GetObj(n) ;
        delete pDoc ;
    }
    return rc ;
}
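//  A minimal sketch of driving Sync(). The field names follow the switch above, but direct member access, the URL and the node selector are
//  assumptions:
//
//      hzWebhost   host ;
//      hzWebCMD    wc1, wc2, wc3 ;
//
//      host.m_Repos = "/var/sync/mysite" ;                     //  Hypothetical repository path
//
//      wc1.m_Cmd = WEBCMD_LOAD_PAGE ;                          //  Step 1: Fetch the index page
//      wc1.m_Url = "http://www.example.com/index.html" ;
//
//      wc2.m_Cmd = WEBCMD_SLCT_PAGE ;                          //  Step 2: Select links from that page
//      wc2.m_Url = wc1.m_Url ;
//      wc2.m_Slct = "a" ;                                      //  Hypothetical node selector
//      wc2.m_Output = "articles" ;                             //  Name the resulting pagelist
//
//      wc3.m_Cmd = WEBCMD_LOAD_LIST ;                          //  Step 3: Fetch every link in the pagelist
//      wc3.m_Inputs = "articles" ;
//
//      host.m_Commands.Add(wc1) ; host.m_Commands.Add(wc2) ; host.m_Commands.Add(wc3) ;
//      if (host.Sync() != E_OK)
//          threadLog("Sync failed\n") ;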
hzEcode hzWebhost::Scrape   (void)
{
    //  In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created and existing pages can be
    //  modified on an ad-hoc basis. A scrape captures the current state of the website or a limited portion of it to file.
    //
    //  The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for
    //  links. Links to domains other than the one in question are ignored. Links to such things as images are also ignored. Remaining links not found in the set
    //  of known links are added to this set. The process terminates when all the links have been attempted.
    //
    //  The set of known links will need to comprise the site's home-page and a login page if this exists and if it is not the same as the home page. These will
    //  usually be enough to 'bootstrap' the rest of the site.
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOINIT    If no repository, no domain or no homepage has been specified
    //              E_NOTFOUND  If the login page was not located
    //              E_WRITEFAIL If the login form received was not written to the repository
    //              E_OPENFAIL  If the visit status file could not be opened
    //              E_OK        If the scrape operation was successful
    _hzfunc("hzWebhost::Scrape") ;
    hzMapS<hzString,hzString>   formData ;  //  Set of name value pairs
    hzVect<hzString>            hdrs ;      //  Extra headers, needed for submit form
    hzList<hzPair>::Iter        ci ;        //  Root commands iterator
    hzSet<hzUrl>    set_ctrl ;      //  Initial links from processing config params
    hzVect<hzUrl>   pglinks ;       //  Links encountered within a given page
    hzVect<hzUrl>   allinks ;       //  Links encountered across all pages
    hzVect<hzUrl>   todo ;          //  Links encountered in the pages in ctrl
    ifstream        is ;            //  For reading in visit status file
    ofstream        os ;            //  For writing out visit status file at end of scrape
    hzDocMeta       mark ;          //  Document meta data
    hzChain         Response ;      //  Response from form submission
    hzDocument*     pDoc ;          //  Downloaded document
    hzDocHtml*      pHdoc ;         //  Set if downloaded document is a HTML page.
    hzPair          X ;             //  Root command instance
    hzXDate         now ;           //  Date/time now (for checking if pages have expired)
    hzUrl           url ;           //  Temp link
    hzString        vs_fname ;      //  Visit status filename
    hzString        pagepath ;      //  Filepath for file to store downloaded page
    hzString        S ;             //  Temp string
    hzString        etag ;          //  Temp string
    uint32_t        nStart ;        //  Links iterator
    uint32_t        nLimit ;        //  Links iterator
    uint32_t        nCount ;        //  Links iterator
    uint32_t        n ;             //  Aggregation iterator
    hzEcode         rc = E_OK ;     //  Return code
    threadLog("Called hzWebhost::Scrape\n") ;
    //  Check if repository is set up (website is initialized)
    if (!m_Repos)
        { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
    //  Is there anything to do?
    if (!m_Roots.Count())
        { threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }
    //  Get the home page
    if (m_Homepage)
    {
        pDoc = Download(m_Homepage) ;
        if (!pDoc)
            { threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
        m_docHome = pDoc ;
        threadLog("HOMEPAGE SUCCESS\n") ;
    }
    //  Login
    rc = Login() ;
    if (rc != E_OK)
        { threadLog("Login failed\n") ; return rc ; }
    threadLog("Login SUCCESS\n") ;
    //  Run the root commands to obtain the set of roots. A root command may have either a URL or a 'link criteria' or both. If only a
    //  URL is present, this URL and ALL links found within it are added to the list of pages to process. If only a link criteria is
    //  present, the links found in the HOME page and the LOGIN RESPONSE page are tested against the criteria. If they match the link
    //  is added to the list of pages to process. If both a URL and a link criteria is found then the URL and any matching links found
    //  within it are added to the list of pages to process.
    threadLog("Have %d root commands\n", m_Roots.Count()) ;
    for (ci = m_Roots ; ci.Valid() ; ci++)
    {
        X = ci.Element() ;
        threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;
        //  Get the page
        if (X.name == "homepage")
        {
            //  No page to get, just compare the criteria to the home
            pHdoc = (hzDocHtml*) m_docHome ;
            pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
        }
        else if (X.name == "loginResponse")
        {
            //  No page to get, just compare the criteria to the login response
            pHdoc = (hzDocHtml*) m_resAuth ;
            pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
        }
        else
        {
            url = X.name ;
            if (!url)
                { threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
            etag = (char*) 0 ;
            pDoc = Download(url) ;
            if (!pDoc)
                threadLog("case 1. Could not fetch page %s\n", *url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
                    threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
                }
                delete pDoc ;
            }
        }
        //  Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
        for (n = 0 ; n < pglinks.Count() ; n++)
        {
            url = pglinks[n] ;
            if (!set_ctrl.Exists(url))
                allinks.Add(url) ;
        }
    }
    /*
    **  Starting at the site root and for each page, grab all links and go to each link in turn
    */
    threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
    for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
    {
        now.SysDateTime() ;
        todo.Clear() ;
        for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
        {
            url = allinks[nCount] ;
            threadLog("Cosidering link %s - ", *url.Whole()) ;
            if (m_mapHist.Exists(url))              { threadLog("historic\n") ; continue ; }
            if (url == m_Authexit)                  { threadLog("exit-page\n") ; continue ; }
            if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }
            //  Page not yet visited so we visit it, put it in the list of pages visited and get the links. Some of these links may add to
            //  the list of links.
            threadLog("Fetching\n") ;
            pDoc = Download(url) ;
            threadLog("Fetched page %p\n", pDoc) ;
            if (!pDoc)
                threadLog("case 2. Could not fetch page %s\n", *url) ;
            else
            {
                if (pDoc->Whatami() == DOCTYPE_HTML)
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
                    //  Re-aggregate the all-links vector
                    for (n = 0 ; n < pglinks.Count() ; n++)
                    {
                        url = pglinks[n] ;
                        if (!set_ctrl.Exists(url))
                            allinks.Add(url) ;
                    }
                }
                delete pDoc ;
            }
        }
    }
    //  Write out manifest file
    rc = _savestatus() ;
    return rc ;
}
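//  Illustrative usage of Scrape(). A minimal sketch: the repository path, homepage and root URL are hypothetical, and direct assignment to
//  m_Repos and m_Homepage is assumed:
//
//      hzWebhost   host ;
//      hzUrl       root = "http://www.example.com/news.html" ;
//
//      host.m_Repos = "/var/scrape/mysite" ;
//      host.m_Homepage = hzUrl("http://www.example.com/") ;
//      host.AddRoot(root, "*.html") ;              //  Criteria applied to links found in the root page
//      if (host.Scrape() != E_OK)
//          threadLog("Scrape failed\n") ;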
hzEcode hzWebhost::getRss_r (HttpRC& hRet, const hzUrl& feed, uint32_t nLevel)
{
    //  Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, then tested to ensure it really is
    //  XML. The RSS feed is assumed to contain only links. These links may be to HTML pages or to other, subordinate, RSS feeds. The HTML
    //  pages are end points of the process: they are downloaded and any links they contain are recorded but not followed. The sub-RSS feeds
    //  are then processed by a recursive call to this function.
    //
    //  Arguments:  1) hRet     Set by this operation
    //              2) feed     The RSS URL
    //              3) nLevel   RSS Hierarchy
    //
    //  Returns:    E_NODATA    If the download failed
    //              E_TYPE      If the downloaded material does not appear to be XML
    //              E_FORMAT    If the downloaded material could not be loaded into an XML document
    //              E_ARGUMENT  If the RSS tags are not defined
    //              E_NOTFOUND  If no tags were found in the RSS
    //              E_OK        If the RSS data was collected
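    //  For reference, the tag selectors (m_tagItem etc, given defaults in GetRSS and Sync) target a conventional RSS 2.0 item. The values
    //  below are illustrative only:
    //
    //      <item>
    //          <title>Article headline</title>
    //          <link>http://www.example.com/article.html</link>
    //          <guid>http://www.example.com/article.html</guid>
    //          <description>Article summary</description>
    //          <pubDate>Mon, 06 Sep 2021 16:45:00 GMT</pubDate>
    //      </item>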
    _hzfunc("hzWebhost::getRss_r") ;
    hzVect<hzXmlNode*>  linx ;      //  Links found in (this) RSS feed page
    hzVect<hzUrl>       todo ;      //  Links found in RSS feed page (additions to this are controlled by the set above)
    hzDocXml        X ;             //  For loading of RSS feed pages and extraction of links
    hzXmlNode*      pN1 ;           //  Nodes (containing <item>)
    hzXmlNode*      pN2 ;           //  Nodes (containing <item> subnodes of title, link, description)
    hzDocMeta*      pMark ;         //  Document meta data
    hzDocument*     pDoc ;          //  Document found at URL (could be XML or HTML)
    hzUrl           page ;          //  Temp link
    hzString        desc ;          //  RSS article description
    hzString        dstr ;          //  RSS article date
    hzString        uqid ;          //  Unique ID of RSS item
    hzString        title ;         //  RSS article title
    uint32_t        nIndex ;        //  Links iterator
    hzEcode         rc = E_OK ;     //  Return code
    //  Fetch the current RSS document
    pDoc = Download(feed) ;
    if (!pDoc)
        { threadLog("Could not fetch URL %s\n", *feed) ; return E_NODATA ; }
    //  If not an XML document then it is just a page. Nothing further.
    if (pDoc->Whatami() != DOCTYPE_XML)
        { threadLog("case 1. Fetched feed (%s) is not of doctype XML\n", *feed) ; delete pDoc ; return E_TYPE ; }
    //  The document object is no longer needed; the raw content is reparsed as XML from HC.m_Content below
    delete pDoc ;
    nLevel++ ;
    //  Load current RSS document into XML document tree
    rc = X.Load(HC.m_Content) ;
    if (rc != E_OK)
        return hzerr(rc, "Could not load feed %s", *feed) ;
    //  The page is an RSS document so select the <item> tags
    rc = X.FindNodes(linx, m_tagItem.m_Slct) ;
    threadLog("Found %d <item> tags in feed %s\n", linx.Count(), *feed) ;
    if (rc != E_OK)
        return rc ;
    for (nIndex = 0 ; nIndex < linx.Count() ; nIndex++)
    {
        threadLog("case 1\n") ;
        pN1 = linx[nIndex] ;
        title = (char*) 0 ; desc = (char*) 0 ; page = (char*) 0 ; uqid = (char*) 0 ; dstr = (char*) 0 ;
        for (pN2 = pN1->GetFirstChild() ; pN2 ; pN2 = pN2->Sibling())
        {
            threadLog("case 2\n") ;
            if (pN2->NameEQ(*m_tagTitl.m_Slct)) { title = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagDesc.m_Slct)) { desc = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagLink.m_Slct)) { page = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagUqid.m_Slct)) { uqid = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagDate.m_Slct)) { dstr = pN2->m_fixContent ; continue ; }
        }
        threadLog("case 3\n") ;
        if (!page)
            { threadLog("case 1: title=%s; link=null uqid=%s\n", *title, *uqid) ; page = uqid ; }
        if (!page)
            { threadLog("case 2: title=%s; link=null uqid=%s\n", *title, *uqid) ; continue ; }
        threadLog("title=%s; link=%s\n", *title, *page) ;
        if (m_mapHist.Exists(page))
            threadLog("Exists in history, page %s\n", *page) ;
        else
        {
            pMark = new hzDocMeta() ;
            pMark->m_Title = title ;
            pMark->m_Desc = desc ;
            pMark->m_urlReq = page ;
            if (dstr)
                pMark->m_Modified.SetDateTime(*dstr) ;
            todo.Add(page) ;
            threadLog("Queuing page %s\n", *page) ;
        }
    }
    //  Fetch all the new links found above by recursive call
    for (nIndex = 0 ; nIndex < todo.Count() ; nIndex++)
    {
        page = todo[nIndex] ;
        //pMark = m_mapHist[page] ;
        threadLog("Processing %s\n", *page) ;
        rc = getRss_r(hRet, page, nLevel) ;
    }
    return rc ;
}
hzEcode hzWebhost::GetRSS   (void)
{
    //  In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created
    //  and existing pages can be modified on an ad-hoc basis. The RSS feeds allow greater ease when syncing an external website to the
    //  local machine. By periodically reading one or more RSS feeds one can obtain a set of links which can generally be taken as the
    //  set of pages deemed 'current' by the website. By comparing these links to a history file of already fetched links, new pages
    //  can be added to a repository as they appear on the site. The RSS feeds are just XML files containing links.
    //
    //  This function will obtain all the RSS feeds from the site, garner all the links from them and then download any pages from the
    //  links that are not already in the site history. The feeds themselves are not saved as these will be fetched again.
    //
    //  Arguments:  None
    //
    //  Returns:    E_NOINIT    If the repository for the webhost has not previously been defined
    //              E_OPENFAIL  If the visit status file could not be opened
    //              E_NODATA    If the download failed
    //              E_TYPE      If the downloaded material does not appear to be XML
    //              E_FORMAT    If the downloaded material could not be loaded into an XML document
    //              E_ARGUMENT  If the RSS tags are not defined
    //              E_NOTFOUND  If no tags were found in the RSS
    //              E_OK        If the RSS data was collected
    _hzfunc("hzWebhost::GetRSS") ;
    hzList<hzUrl>::Iter fi ;        //  RSS feeds iterator
    hzUrl       feed ;              //  Temp link
    HttpRC      hRet ;              //  HTTP return code
    hzEcode     rc = E_OK ;         //  Return code
    threadLog("Called\n") ;
    //  Login
    rc = Login() ;
    if (rc != E_OK)
        { threadLog("Login failed\n") ; return rc ; }
    //  Check there is at least one RSS feed to process. Note the login above is done regardless of whether we are already logged in because we need the cookie
    if (!m_Feeds.Count())
        { threadLog("Website has no starting point (URL) for an RSS feed.\n") ; return E_NOINIT ; }
    //  If XML selectors for RSS feed are not initialized, set them here
    if (!m_tagItem.m_Slct)  { m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
    if (!m_tagTitl.m_Slct)  { m_tagTitl.m_Filt = (char*) 0 ; m_tagTitl.m_Info = "node" ; m_tagTitl.m_Slct = "title" ; }
    if (!m_tagUqid.m_Slct)  { m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
    if (!m_tagLink.m_Slct)  { m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
    if (!m_tagDesc.m_Slct)  { m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
    if (!m_tagDate.m_Slct)  { m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }
    /*
    **  Fetch all the feed XML documents from the RSS source(s)
    */
    for (fi = m_Feeds ; fi.Valid() ; fi++)
    {
        feed = fi.Element() ;
        //  Get the feed
        rc = getRss_r(hRet, feed, 0) ;
        threadLog("Processed items\n") ;
    }
    //  Write out visit status file
    rc = _savestatus() ;
    return rc ;
}
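//  Illustrative usage of GetRSS(). A minimal sketch; the feed URL and repository path are hypothetical and direct member access is assumed:
//
//      hzWebhost   host ;
//      hzUrl       feed = "http://www.example.com/rss.xml" ;
//
//      host.m_Repos = "/var/rss/mysite" ;      //  Hypothetical repository path
//      host.AddRSS(feed) ;
//      if (host.GetRSS() != E_OK)
//          threadLog("RSS collection failed\n") ;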
hzDocument* hzWebhost::Download (const hzUrl& url)
{
    //  Fetch the page found at the supplied URL and return as a document (either XML or HTML).
    //
    //  Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if the time to
    //  live has expired. If the page is not downloaded again, this function will reload it from the stored content.
    //
    //  Arguments:  1) url      The URL of the file/resource to download
    //
    //  Returns:    Pointer to newly allocated document. Must be deleted after use.
    _hzfunc("hzWebhost::Download") ;
    static uint32_t nlast = 0 ;     //  Last point reached (for download resumption)
    ofstream    os ;                //  To write out page contents
    hzDocument* pDoc = 0 ;          //  Document downloaded
    hzDocXml*   pXdoc = 0 ;         //  XML Document downloaded
    hzDocHtml*  pHdoc = 0 ;         //  HTML Document downloaded
    hzDocMeta*  pMark ;             //  Document meta data
    hzXDate     now ;               //  Date & Time now
    hzString    S ;                 //  Temp string
    HttpRC      hc ;                //  HTTP server return code
    hzEcode     rc ;                //  Return code
    bool        bHist = false ;     //  Set if url is already in history and downloaded again because of being out of date
    char        numbuf [8] ;        //  Working buffer
    /*
    **  Check the URL and, unless this is a forced fetch, consult the site history
    */
    if (!url)
        { threadLog("No supplied address\n") ; return 0 ; }
    threadLog("FETCHING PAGE: %s\n", *url) ;
    now.SysDateTime() ;
    if (!(m_Opflags & WEBFLG_FORCE))
    {
        if (m_mapHist.Exists(url))
        {
            //  The requested URL exists in the repository already. We check if it has expired and if not we terminate with OK
            pMark = m_mapHist[url] ;
            bHist = true ;
            threadLog("Page %s is historic\n", *url) ;
            //  Create a document of the right type (XML or HTML)
            if (pMark->m_Doctype == DOCTYPE_HTML)
                pDoc = pHdoc = new hzDocHtml() ;
            else if (pMark->m_Doctype == DOCTYPE_XML)
                pDoc = pXdoc = new hzDocXml() ;
            else
                pDoc = pHdoc = new hzDocHtml() ;
            pDoc->SetMeta(*pMark) ;
            //  If the expiry is known and the page has expired, fall through and download the page afresh
            if (pMark->m_Expires.IsSet() && pMark->m_Expires < now)
            {
                delete pDoc ;
                pDoc = 0 ;
                pXdoc = 0 ;
                pHdoc = 0 ;
            }
            else
            {
                //  The expiry date is either unknown or has not yet passed, so reload the previously fetched content
                if (!HC.m_Content.Size())
                {
                    threadLog("Case 1 Bloody thing is empty!\n") ;
                    return 0 ;
                }
                rc = pDoc->Load(HC.m_Content) ;
                threadLog("DOWNLOAD PREVIOUS (error=%s)\n\n", Err2Txt(rc)) ;
                return pDoc ;
            }
        }
    }
    //  The requested URL is either not in the history, or is in the history but has expired. Create the document meta for it and download it.
    S = url.Filename() ;
    pMark = new hzDocMeta() ;
    pMark->m_urlReq = url ;
    pMark->m_urlAct = url ;
    pMark->m_Id = m_mapHist.Count() ;
    sprintf(numbuf, "/%04d", pMark->m_Id) ;
    pMark->m_Filename = m_Repos + numbuf + S ;
    /*
    **  Get page content and process it into a tree
    */
    threadLog("GETTIG PAGE: %s\n", *url) ;
    rc = HC.GetPage(hc, url, pMark->m_Etag) ;
    if (rc != E_OK)
    {
        threadLog("FAILED (error=%s) synopsis\n", Err2Txt(rc)) ;
        threadLog(HC.m_Error) ;
        return 0 ;
    }
    if (HC.m_Redirect)
        pMark->m_urlAct = HC.m_Redirect ;
    pMark->m_Modified = HC.m_Modified ;
    threadLog("HTTP Return code = %d, cookie (value %s, path %s)\n", (uint32_t) hc, *m_CookieSess, *m_CookiePath) ;
    /*
    **  Write out the page content to the repository file
    */
    if (m_Repos)
    {
        os.open(*pMark->m_Filename) ;
        if (os.fail())
            threadLog("Cannot write out header file %s\n", *pMark->m_Filename) ;
        else
        {
            os << HC.m_Content ;
            os.close() ;
        }
        os.clear() ;
    }
    /*
    **  Process the page content according to its document type, derived below (XML or HTML)
    */
    threadLog("PROCESSING Content: %d bytes\n", HC.m_Content.Size()) ;
    if (!HC.m_Content.Size())
    {
        threadLog("Case 2 Bloody thing is empty!\n") ;
        return 0 ;
    }
    pMark->m_Doctype = DeriveDoctype(HC.m_Content) ;
    rc = E_NODATA ;
    if (pMark->m_Doctype == DOCTYPE_XML)
    {
        //  XML
        pDoc = pXdoc = new hzDocXml() ;
        pXdoc->Init(url) ;
        rc = pXdoc->Load(HC.m_Content) ;
    }
    else
    {
        //  HTML
        pDoc = pHdoc = new hzDocHtml() ;
        pHdoc->Init(url) ;
        rc = pHdoc->Load(HC.m_Content) ;
        if (rc != E_OK)
            threadLog("Case 2 Bloody thing failed (error=%s)!\n", Err2Txt(rc)) ;
    }
    if (rc != E_OK)
    {
        threadLog("Load page failed error=%s\n", Err2Txt(rc)) ;
        //delete pDoc ;
        //return 0 ;
    }
    pDoc->SetMeta(*pMark) ;
    //  Place the URL in the site's history
    m_mapHist.Insert(pMark->m_urlReq, pMark) ;
    threadLog("Inserted URL %s\n", *pMark->m_urlReq) ;
    if (pMark->m_urlAct != pMark->m_urlReq)
    {
        m_mapHist.Insert(pMark->m_urlAct, pMark) ;
        threadLog("Inserted URL %s\n", *pMark->m_urlAct) ;
    }
    if (!bHist)
        m_vecHist.Add(pMark) ;
    if (pXdoc)
        threadLog("DOWNLOAD SUCCESS XML Page %s. Now have %d (%d) items in history\n\n", *url, m_mapHist.Count(), nlast) ;
    if (pHdoc)
        threadLog("DOWNLOAD SUCCESS Page %s has %d links. Now have %d (%d) items in history\n\n", *url, pHdoc->m_vecLinks.Count(), m_mapHist.Count(), nlast) ;
    threadLog(HC.m_Error) ;
    return pDoc ;
}
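//  Illustrative usage of Download(). A minimal sketch (the URL is hypothetical and host is assumed to be an initialized hzWebhost). Note the
//  caller owns the returned document:
//
//      hzUrl       url = "http://www.example.com/page.html" ;
//      hzDocument* pDoc = host.Download(url) ;
//
//      if (pDoc)
//      {
//          if (pDoc->Whatami() == DOCTYPE_HTML)
//              threadLog("Fetched HTML page %s\n", *url) ;
//          delete pDoc ;   //  Must be deleted after use (see function notes above)
//      }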