//
// File: hzHttpClient.cpp
//
// Legal Notice: This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com)
//
// The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free
// Software Foundation, either version 3 of the License, or any later version.
//
// The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses.
//
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <netdb.h>
#include <sys/stat.h>
#include "hzChars.h"
#include "hzTextproc.h"
#include "hzDirectory.h"
#include "hzCodec.h"
#include "hzHttpClient.h"
#include "hzProcess.h"
using namespace std ;
/*
** Prototypes
*/
uint32_t _extractHttpHeader (hzString& Param, hzString& Value, hzChain::Iter& ci, bool bConvert) ;
/*
** Section 1: hzHttpClient member functions
*/
hzEcode hzHttpClient::Connect (const hzUrl& url)
{
_hzfunc("hzHttpClient::Connect") ;
hzEcode rc ; // Return code
if (url.IsSSL())
rc = m_Webhost.ConnectSSL(url.Domain(), url.Port()) ;
else
rc = m_Webhost.ConnectStd(url.Domain(), url.Port()) ;
if (rc != E_OK)
m_Error.Printf("Could not connect to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
else
{
rc = m_Webhost.SetSendTimeout(30) ;
if (rc != E_OK)
m_Error.Printf("Could not set send_timeout on connection to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
else
{
rc = m_Webhost.SetRecvTimeout(30) ;
if (rc != E_OK)
m_Error.Printf("Could not set recv_timeout on connection to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
}
}
return rc ;
}
hzEcode hzHttpClient::Close (void)
{
_hzfunc("hzHttpClient::Close") ;
m_Webhost.Close() ;
return E_OK ;
}
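// Example: a minimal connect-and-fetch sketch (illustrative only, not part of the library). It assumes hzHttpClient is default constructible
// and that hzUrl can be assigned from a string literal; the URL shown is hypothetical. TestPage() (defined below) requires that the connection
// has already been made.
//
//	hzHttpClient	client ;	// HTTP client instance
//	hzChain			page ;		// Raw response bytes
//	hzUrl			url ;		// Target URL
//
//	url = "http://www.example.com/index.html" ;
//	if (client.Connect(url) == E_OK)
//	{
//		client.TestPage(page, url) ;
//		client.Close() ;
//	}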
uint32_t _extractHttpHeader (hzString& Param, hzString& Value, hzChain::Iter& ci, bool bConvert)
{
// Support function to extract the parameter name and value from an HTTP header line (either that of a request or a response). HTTP header lines are of the form param_name: param_value
// and are terminated by a CR/NL
//
// Arguments: 1) Param The hzString to store the parameter name.
// 2) Value The hzString to store the parameter value.
// 3) ci A reference to the chain iterator processing the HTTP request.
// 4) bConvert Flag to convert a percent sign followed by two hex digits into a single character value
//
// Returns: Number of characters processed.
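//
// For illustration: given the header line "Content-Type: text/html\r\n", this function would set Param to "Content-Type", Value to "text/html"
// and return 25 (the 23 characters up to the CR/NL plus the CR/NL itself).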
_hzfunc("_extractHttpHeader") ;
chIter xi ; // For iterating line
hzChain temp ; // For building param and then value
uint32_t nCount = 0 ; // Returned length of HTTP header line
uint32_t nHex ; // Hex value
char cvHex[4] ; // Hex value buffer
Param.Clear() ;
Value.Clear() ;
cvHex[2] = 0 ;
xi = ci ;
for (; !xi.eof() ;)
{
if (*xi == CHAR_PERCENT)
{
if (bConvert)
{
xi++ ; cvHex[0] = *xi ;
xi++ ; cvHex[1] = *xi ;
xi++ ;
nCount += 3 ;
if (IsHexnum(nHex, cvHex))
temp.AddByte(nHex) ;
continue ;
}
}
if (*xi == CHAR_COLON && !Param)
{
xi++ ;
nCount++ ;
Param = temp ;
temp.Clear() ;
if (*xi == CHAR_SPACE)
for (; !xi.eof() && (*xi == CHAR_SPACE || *xi == CHAR_TAB) ; xi++, nCount++) ;
}
if (xi == "\r\n")
{ xi += 2 ; nCount += 2 ; break ; }
if (*xi == CHAR_NL)
{ xi++ ; nCount++ ; break ; }
if (*xi < CHAR_SPACE)
threadLog("Illegal char (%u) in HTTP Header\n", (uchar) *xi) ;
if (*xi == CHAR_PLUS)
temp.AddByte(CHAR_SPACE) ;
else
temp.AddByte(*xi) ;
xi++ ;
nCount++ ;
}
Value = temp ;
return nCount ;
}
hzEcode hzHttpClient::_procHttpResponse (HttpRC& hRet, const hzUrl& url)
{
// Support function for the hzHttpClient member functions GetPage() and PostForm(). The purpose is to gather the server response to
// an earlier HTTP GET, POST or HEAD request.
//
// Arguments: 1) hRet HTTP return code
// 2) url The URL
//
// Returns: E_NOSOCKET If the external server has closed the connection
// E_NODATA If nothing was received
// E_FORMAT If the response was malformed
// E_OK If the response was received without error
_hzfunc("hzHttpClient::_procHttpResponse") ;
chIter zi ; // To iterate the returned page
chIter hi ; // To re-iterate lines of interest in the header of the returned page
chIter ti ; // Temp iterator
hzChain Z ; // Receive buffer (raw bytes from the socket)
hzChain X ; // Temp buffer
hzCookie cookie ; // Cookie (to be checked against supplied map of cookies)
hzString S ; // Temp string
hzString param ; // Header parameter name
hzString value ; // Header parameter value
uint32_t nRecv ; // Bytes received
uint32_t nExpect = 0 ; // Size of current chunk
uint32_t nLen = 0 ; // Length of current header line
uint32_t nLine ; // Line number (of header)
uint32_t nTry ; // Number of tries
uint32_t nCount ; // Number of bytes counted off from those expected
bool duHast = false ; // Have read a chunking directive or have a content len
bool bTerm = false ; // Terminate chunking (only set upon a 0 value on a line by itself)
hzEcode sRet = E_OK ; // Return code
char numBuf[4] ; // For HTTP return code
// Clear variables
m_CacheCtrl = (char*) 0 ;
m_Pragma = (char*) 0 ;
m_Redirect = (char*) 0 ;
m_KeepAlive = (char*) 0 ;
m_ContentType = (char*) 0 ;
m_XferEncoding = (char*) 0 ;
m_ContEncoding = (char*) 0 ;
m_Etag = (char*) 0 ;
m_bConnection = false ;
m_nContentLen = 0 ;
m_Content.Clear() ;
m_Header.Clear() ;
// First, garner the header from the response
for (nTry = 0 ; nTry < 4 && !m_Header.Size() ; nTry++)
{
sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (sRet != E_OK)
{
if (sRet == E_NOSOCKET)
m_Error.Printf("Connection closed by server\n") ;
else
m_Error.Printf("Could not recv bytes (nbytes=%d) from page %s (error=%s)\n", nRecv, *url.Resource(), Err2Txt(sRet)) ;
break ;
}
if (!nRecv)
{
m_Error.Printf("Got no response, retrying ...\n") ;
sleep(1) ;
continue ;
}
Z.Append(m_buf, nRecv) ;
// Test for presence of \r\n\r\n to mark end of header
for (zi = Z ; !zi.eof() ; zi++)
{
if (*zi != CHAR_CR)
continue ;
if (zi == "\r\n\r\n")
{
// Bytes before the header's end are now copied from temp chain Z to the header
for (ti = Z ; ti != zi ; ti++)
m_Header.AddByte(*ti) ;
zi += 4 ;
break ;
}
}
}
if (nTry == 4)
{ m_Error.Printf("Given up!\n") ; return E_NODATA ; }
if (!m_Header.Size())
{ m_Error.Printf("Given up! Header is empty\n") ; return E_NODATA ; }
/*
** Examine header
*/
// First part is the HTTP return code
memset(numBuf, 0, 4) ;
hi = m_Header ;
if (hi == "HTTP/")
{
for (hi += 5 ; !hi.eof() && *hi > CHAR_SPACE ; hi++) ;
}
else
{
m_Error.Printf("case 1: 1st line of server response should be HTTP/{version} followed by a 3 digit HTML return code\n") ;
m_Error.Printf("got %d bytes of header namely:-\n[", m_Header.Size()) ;
m_Error << m_Header ;
m_Error << "]\n" ;
return E_FORMAT ;
}
m_Error << "Response\n" << m_Header << "\n--------------------------\n" ;
hi++ ; numBuf[0] = *hi ;
hi++ ; numBuf[1] = *hi ;
hi++ ; numBuf[2] = *hi ;
hi++ ; numBuf[3] = 0 ;
if (*hi != CHAR_SPACE || !IsDigit(numBuf[0]) || !IsDigit(numBuf[1]) || !IsDigit(numBuf[2]))
{
m_Error.Printf("case 2: 1st line of server response should be HTTP/1.1 followed by a 3 digit HTML return code - got [%s]\n\n", numBuf) ;
return E_FORMAT ;
}
hRet = (HttpRC) atoi(numBuf) ;
for (hi++ ; !hi.eof() && *hi != CHAR_NL ; hi++) ;
hi++ ;
// Next part is the header lines
for (nLine = 1 ; !hi.eof() ; nLine++, hi += nLen)
{
nLen = _extractHttpHeader(param, value, hi, false) ;
if (nLen == 0)
{
for (hi++ ; !hi.eof() && *hi != CHAR_NL ; hi++) ;
hi++ ;
m_Error.Printf("Line %d of header rejected (param=%s, value=%s)\n", nLine, *param, *value) ;
continue ;
}
if (param.Equiv("Date")) { m_Accessed = value ; continue ; }
if (param.Equiv("Expires")) { m_Expires = value ; continue ; }
if (param.Equiv("Last-Modified")) { m_Modified = value ; continue ; }
if (param.Equiv("Cache-Control")) { m_CacheCtrl = value ; continue ; }
if (param.Equiv("Pragma")) { m_Pragma = value ; continue ; }
if (param.Equiv("Location")) { m_Redirect = value ; continue ; }
if (param.Equiv("Keep-Alive")) { m_KeepAlive = value ; continue ; }
if (param.Equiv("Connection")) { m_bConnection = value == "close" ? false : true ; continue ; }
if (param.Equiv("Content-Type")) { m_ContentType = value ; continue ; }
if (param.Equiv("Content-Encoding")) { m_ContEncoding = value ; continue ; }
if (param.Equiv("Transfer-Encoding")) { m_XferEncoding = value ; continue ; }
if (param.Equiv("Alternate-Protocol")) { m_AltProto = value ; continue ; }
if (param.Equiv("ETag")) { m_Etag = value ; continue ; }
if (param.Equiv("Set-Cookie"))
{
// Get the cookie value
ti = hi ;
for (ti += 12 ; !ti.eof() && *ti != CHAR_EQUAL ; ti++)
X.AddByte(*ti) ;
cookie.m_Name = X ;
X.Clear() ;
for (ti++ ; !ti.eof() && *ti != CHAR_SCOLON ; ti++)
X.AddByte(*ti) ;
cookie.m_Value = X ;
//cookie.m_Value.FnameDecode() ;
X.Clear() ;
// Get the path
for (ti++ ; !ti.eof() && *ti == CHAR_SPACE ; ti++) ;
if (ti == "path=")
{
for (ti += 5 ; !ti.eof() && *ti > CHAR_SPACE ; ti++)
X.AddByte(*ti) ;
cookie.m_Path = X ;
X.Clear() ;
}
// Get special directives (eg HttpOnly)
for (ti++ ; !ti.eof() && *ti == CHAR_SPACE ; ti++) ;
if (ti == "HttpOnly")
cookie.m_Flags |= COOKIE_HTTPONLY ;
m_Cookies.Insert(cookie.m_Name, cookie) ;
cookie.Clear() ;
continue ;
}
if (param.Equiv("Content-Length"))
{
if (*value && value[0])
{
duHast = true ;
m_nContentLen = atoi(*value) ;
}
continue ;
}
}
/*
** Next, garner the body from the response
*/
m_Error.Printf("Getting body. xfer=%s, expect=%d, clen=%d\n", *m_XferEncoding, duHast?1:0, m_nContentLen) ;
if (!duHast)
{
// In chunked encoding the first part (directly after the header and the terminating \r\n\r\n) will be a hex number followed
// by a \r\n (on a line by itself). This hex number gives the size of the following chunk. At the end of the chunk will be
// another hex number on a line by itself. Only when this number is zero are we at the end of the page.
//
// While reading the chunk size and chunk, we will most probably reach the end of the buffer and have to do a read operation
// on the socket.
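//
// For illustration, a chunked body of the form "4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n" assembles to the nine-byte content "Wikipedia", the
// final zero-length chunk marking the end of the page.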
m_Error.Printf("Encoding is chunked\n") ;
nExpect = nCount = 0 ;
bTerm = false ;
for (; !bTerm ;)
{
// If we are at the end of the buffer, read more
for (; zi.eof() ;)
{
// If out of data, get more
m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (nRecv <= 0)
break ;
m_Error.Printf("Read buffer %d bytes\n", nRecv) ;
Z.Clear() ;
Z.Append(m_buf, nRecv) ;
for (zi = Z ; nExpect && !zi.eof() ; nExpect--, zi++)
m_Content.AddByte(*zi) ;
if (!nExpect)
break ;
}
if (!nExpect)
{
// We are on the 'chunk size' directive. This will be of the form \r\nXXX\r\n where X is a hex number
// Get rid of any \r\n sequences that are beyond the expected chars and before the chunk size directive
for (; !zi.eof() && (*zi == CHAR_CR || *zi == CHAR_NL) ; zi++) ;
if (zi.eof())
{
// If out of input data, get more
m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (nRecv)
{
m_Error.Printf("Read extras %d bytes\n", nRecv) ;
Z.Clear() ;
Z.Append(m_buf, nRecv) ;
for (zi = Z ; !zi.eof() && (*zi == CHAR_CR || *zi == CHAR_NL) ; zi++) ;
}
}
duHast = false ;
for (;;)
{
if (zi.eof())
{
m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (nRecv)
{
m_Error.Printf("Read extras %d bytes\n", nRecv) ;
Z.Clear() ;
Z.Append(m_buf, nRecv) ;
zi = Z ;
}
}
// Read the chunk size
if (*zi >= '0' && *zi <= '9') { duHast = true ; nExpect *= 16 ; nExpect += (*zi - '0') ; zi++ ; continue ; }
if (*zi >= 'A' && *zi <= 'F') { duHast = true ; nExpect *= 16 ; nExpect += (*zi-'A'+10) ; zi++ ; continue ; }
if (*zi >= 'a' && *zi <= 'f') { duHast = true ; nExpect *= 16 ; nExpect += (*zi-'a'+10) ; zi++ ; continue ; }
if (zi == "\r\n")
{ zi += 2 ; break ; }
if (*zi == CHAR_CR)
{ zi++ ; continue ; }
if (*zi == CHAR_NL)
{ zi++ ; break ; }
sRet = E_FORMAT ;
m_Error.Printf("Unexpected char (%d) in chunking directive - from page %s\n", *zi, *url.Resource()) ;
break ;
}
if (!duHast)
{
m_Error.Printf("Chunk notice missing\n") ;
sRet = E_FORMAT ;
}
if (sRet != E_OK)
break ;
if (nExpect == 0)
bTerm = true ;
//m_Error.Printf("Chunk notice %d bytes\n", nExpect) ;
if (nExpect)
{
// Play out rest of buffer but make sure we don't exceed the chunk size
for (; !zi.eof() && nExpect ; zi++, nExpect--)
m_Content.AddByte(*zi) ;
}
else
{
// At end of page, just play out rest of buffer
for (; !zi.eof() ; zi++) ;
//m_Content.AddByte(*zi) ;
}
m_Error.Printf("Chunk complete. Expect = %d\n", nExpect) ;
}
}
}
else
{
// Not chunked - just read until stated Content-Length is reached
if (m_nContentLen)
{
for (; !zi.eof() ; zi++)
m_Content.AddByte(*zi) ;
Z.Clear() ;
for (; m_Content.Size() < m_nContentLen ;)
{
sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (sRet != E_OK)
{
m_Error.Printf("(1) Could not recv bytes from page %s (error=%s)\n", *url.Resource(), Err2Txt(sRet)) ;
break ;
}
if (nRecv == 0)
{
sRet = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (sRet != E_OK)
{
m_Error.Printf("(2) Could not recv bytes from page %s (error=%s)\n", *url.Resource(), Err2Txt(sRet)) ;
break ;
}
}
if (nRecv <= 0)
{
m_Error.Printf("Breaking after recv %d of %d bytes\n", m_Content.Size(), m_nContentLen) ;
break ;
}
m_Content.Append(m_buf, nRecv) ;
}
if (m_Content.Size() < m_nContentLen)
{
if (m_Content.Size() == (m_nContentLen - 4))
m_Error.Printf("Allowing 4-byte shortfall\n") ;
else
sRet = E_READFAIL ;
}
}
}
if (hRet == 200)
{
if (!m_Content.Size())
{
m_Error.Printf("No content (xfer_encoding=%s content_size=%d)\n", *m_XferEncoding, m_nContentLen) ;
sRet = E_NODATA ;
}
}
if (sRet == E_OK && m_ContEncoding)
{
// Must apply appropriate decoding to content
if (m_ContEncoding == "gzip")
{
X = m_Content ;
m_Content.Clear() ;
m_Error.Printf("doing gunzip\n") ;
sRet = Gunzip(m_Content, X) ;
if (sRet != E_OK)
m_Error.Printf("Gunzip failed\n") ;
}
}
m_Error.Printf("URL [%s] Header %d bytes, Content %d bytes (%d)\n\n", *url, m_Header.Size(), m_Content.Size(), m_nContentLen) ;
if (m_Content.Size() < 2000)
{
m_Error << "Content:\n" ;
m_Error << m_Content ;
m_Error << "------------------------\n" ;
}
return sRet ;
}
hzEcode hzHttpClient::TestPage (hzChain& Z, const hzUrl& url)
{
// Get an HTTP page from a website but do not process it in any way. This is for speed testing only.
//
// Note: The website (server) must already be connected to.
// No account is taken of redirected pages.
//
// Arguments: 1) Z The chain into which page content is to be received
// 2) url The URL of the page
//
// Returns: E_ARGUMENT If no URL was specified
// E_NODATA If nothing was received
// E_OK If the response was received without error
_hzfunc("hzHttpClient::Testpage") ;
chIter zi ; // To iterate the returned page
chIter hi ; // To re-iterate lines of interest in the header of the returned page
chIter ti ; // Temp iterator
hzChain X ; // Temp buffer
hzCookie cookie ; // Cookie (drawn from supplied map of cookies)
hzString S ; // Temp string
hzString param ; // Header parameter name
hzString value ; // Header parameter value
hzString encoding ; // Page content is encoded, eg gzip
uint32_t nRecv ; // Bytes received
uint32_t nTry ; // Number of tries
hzEcode rc = E_OK ; // Return code
// Clear buffers
Z.Clear() ;
m_Header.Clear() ;
m_Content.Clear() ;
if (!url.Domain())
{ m_Error.Printf("TestPage: No host to locate\n") ; return E_ARGUMENT ; }
/*
** Formulate HTTP request
*/
m_Request.Clear() ;
if (url.Resource())
m_Request << "GET " << url.Resource() << " HTTP/1.1\r\n" ;
else
m_Request << "GET / HTTP/1.1\r\n" ;
m_Request <<
"Accept: */*\r\n"
"Accept-Language: en-gb\r\n" ;
if (m_AuthBasic)
m_Request << "Authorization: Basic " << m_AuthBasic << "\r\n" ;
m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
m_Request << "Host: " << url.Domain() << "\r\n" ;
if (m_Referer)
m_Request << "Referer: " << m_Referer << "\r\n" ;
m_Request << "Connection: Keep-Alive\r\n\r\n" ;
/*
** Send request
*/
m_Error << " Sending [" << m_Request << "] to domain " << url.Domain() << "\n" ;
rc = m_Webhost.Send(m_Request) ;
if (rc != E_OK)
{
m_Error.Printf("Could not send request to domain [%s] (error=%s)\n", *url.Domain(), Err2Txt(rc)) ;
return rc ;
}
// Garner response
for (nTry = 0 ; nTry < 4 && !m_Header.Size() ; nTry++)
{
rc = m_Webhost.Recv(m_buf, nRecv, HZ_MAXPACKET) ;
if (rc != E_OK)
{
if (rc == E_NOSOCKET)
m_Error.Printf("Connection closed by server\n") ;
else
m_Error.Printf("Could not recv bytes (nbytes=%d) from page %s (error=%s)\n", nRecv, *url.Resource(), Err2Txt(rc)) ;
break ;
}
if (!nRecv)
{
m_Error.Printf("Got no response, retrying ...\n") ;
sleep(1) ;
continue ;
}
Z.Append(m_buf, nRecv) ;
}
if (rc != E_OK)
{
m_Error.Printf("Could not process response from [%s] (error=%s)\n", *url, Err2Txt(rc)) ;
return rc ;
}
m_Referer = url ;
return rc ;
}
hzEcode hzHttpClient::_getpage (HttpRC& hRet, const hzUrl& url, const hzString& etag)
{
// Get an HTTP page from a website but do not follow redirects. This is a support function for GetPage()
//
// Arguments: 1) hRet The HTTP return code from server
// 2) url The URL
// 3) etag Entity tag
//
// Returns: E_ARGUMENT If the URL is not supplied or no domain specified
// E_NOSOCKET If the external server has closed the connection
// E_NODATA If nothing was received
// E_FORMAT If the response was malformed
// E_OK If the response was received without error
_hzfunc("hzHttpClient::_getpage") ;
chIter zi ; // To iterate the returned page
chIter hi ; // To re-iterate lines of interest in the header of the returned page
chIter ti ; // Temp iterator
hzChain Z ; // Request buffer
hzChain X ; // Temp buffer
hzCookie cookie ; // Cookie (drawn from supplied map of cookies)
hzString S ; // Temp string
hzString param ; // Header parameter name
hzString value ; // Header parameter value
hzString encoding ; // Page content is encoded, eg gzip
uint32_t x = 0 ; // Cookie iterator
bool bFirstCookie ; // Set once a cookie has been written, so subsequent cookies are preceded by "; "
hzEcode rc = E_OK ;
// Clear buffers
m_Header.Clear() ;
m_Content.Clear() ;
if (!url.Domain())
{ m_Error.Printf("No host to locate\n") ; return E_ARGUMENT ; }
/*
** Formulate HTTP request
*/
m_Request.Clear() ;
if (url.Resource())
m_Request << "GET " << url.Resource() << " HTTP/1.1\r\n" ;
else
m_Request << "GET / HTTP/1.1\r\n" ;
m_Request << "Accept: */*\r\n" ;
//m_Request << "Accept-Encoding: gzip\r\n" ;
m_Request << "Accept-Language: en-gb\r\n" ;
if (m_Cookies.Count())
{
m_Request << "Cookie: " ;
bFirstCookie = false ;
for (x = 0 ; x < m_Cookies.Count() ; x++)
{
cookie = m_Cookies.GetObj(x) ;
if (bFirstCookie)
m_Request << "; " ;
m_Request.Printf("%s=%s", *cookie.m_Name, *cookie.m_Value) ;
bFirstCookie = true ;
}
m_Request << "\r\n" ;
}
if (etag)
m_Request << "If-None-Match: " << etag << "\r\n" ;
if (m_AuthBasic)
m_Request << "Authorization: Basic " << m_AuthBasic << "\r\n" ;
m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
m_Request << "Host: " << url.Domain() << "\r\n" ;
if (m_Referer)
m_Request << "Referer: " << m_Referer << "\r\n" ;
m_Request << "Connection: keepalive\r\n\r\n" ;
// Connect to server
if (url.IsSSL())
rc = m_Webhost.ConnectSSL(url.Domain(), url.Port()) ;
else
rc = m_Webhost.ConnectStd(url.Domain(), url.Port()) ;
if (rc != E_OK)
{
m_Error.Printf("Could not connect to domain [%s] on port %d (error=%s)\n", *url.Domain(), url.Port(), Err2Txt(rc)) ;
return rc ;
}
// Send request
m_Error << " Sending [" << m_Request << "] to domain " << url.Domain() << "\n" ;
rc = m_Webhost.Send(m_Request) ;
if (rc != E_OK)
{
m_Error.Printf("Could not send request to domain [%s] (error=%s)\n", *url.Domain(), Err2Txt(rc)) ;
return rc ;
}
// Garner response
rc = _procHttpResponse(hRet, url) ;
if (rc != E_OK)
{
m_Error.Printf("Could not process response from [%s] (error=%s)\n", *url, Err2Txt(rc)) ;
return rc ;
}
m_Referer = url ;
m_Webhost.Close() ;
return rc ;
}
hzEcode hzHttpClient::GetPage (HttpRC& hRet, const hzUrl& url, const hzString& etag)
{
// Get an HTTP page from a website. Note that the whole page is retrieved or abandoned before this function returns. Some servers send pages with
// the header 'Transfer-Encoding: chunked' instead of the 'Content-Length:' header. This is done because the size of the page is not known at the
// start of transmission. The body part of the message is sent in chunks with the chunk size given (in hex on a line by itself) at the start of
// each chunk. Because of the existence of the chunked approach, this function has to handle it, but it is currently not possible for applications
// to take advantage of it in the intended way. Instead, applications calling this function have to wait until it returns with a complete page, however
// long!
//
// Note that no assumptions can be made about packets that are sent, except that since the connection is TCP they will arrive in order. The header
// may comprise a number of whole packets, or a packet may straddle the end of the header and the start of the content.
//
// Arguments: 1) hRet HTTP return code from the server.
// 2) url The URL of the page to retrieve.
// 3) etag Page entity tag (as maintained by hzWebhost instance)
//
// Returns: E_ARGUMENT If the URL is not supplied or no domain specified
// E_NOSOCKET If the external server has closed the connection
// E_NODATA If nothing was received
// E_FORMAT If the response was malformed
// E_OK If the response was received without error
_hzfunc("hzHttpClient::GetPage") ;
hzUrl dest ; // Actual URL for downloading - may be result of a redirection
hzString dom ; // This is set first to the called URL's domain but afterwards to any redirected domain
hzString etag2 ; // Set as null for the benefit of _getpage() in the case of redirection
hzEcode rc = E_OK ; // Return code
// Considered a top-level function so we clear the error chain
m_Error.Clear() ;
m_Error.Printf("GETTING PAGE %s\n", *url) ;
dest = url ;
m_rtRequest = RealtimeNano() ;
rc = _getpage(hRet, dest, etag) ;
m_rtResponse = RealtimeNano() ;
if (rc != E_OK)
{
m_Error.Printf("ABORTED (_getpage failure)\n") ;
return rc ;
}
for (; hRet == HTTPMSG_REDIRECT_PERM || hRet == HTTPMSG_REDIRECT_TEMP ;)
{
//Clear() ;
if (!m_Redirect)
{ m_Error.Printf("Oops - no URL to redirect to\n") ; break ; }
else
{
if (m_Redirect[0] == CHAR_FWSLASH)
{ dom = dest.Domain() ; dest.SetValue(dom, m_Redirect) ; }
else
dest = m_Redirect ;
m_Error.Printf("redirecting to %s\n", *dest) ;
rc = _getpage(hRet, dest, etag2) ;
if (rc != E_OK)
{
m_Error.Printf("Redirect FAILED (error=%s)\n", Err2Txt(rc)) ;
return rc ;
}
}
}
// Obtain document type. If HTML then also get links
m_Error.Printf("Got response %d (size %d bytes)\n", hRet, m_Content.Size()) ;
return rc ;
}
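// Example: a conditional fetch sketch (illustrative only; the URL and entity tag values are hypothetical, and hzString is assumed assignable
// from a string literal as elsewhere in this file). When an etag is supplied, _getpage() adds an If-None-Match header, so an unchanged page is
// typically answered with HTTP 304 and a previously stored copy can be reused.
//
//	hzHttpClient	client ;
//	hzUrl			url ;
//	hzString		etag = "\"686897696a7c876b7e\"" ;
//	HttpRC			hRet ;
//
//	url = "http://www.example.com/news.html" ;
//	if (client.GetPage(hRet, url, etag) == E_OK)
//	{
//		if (hRet == 304)
//			;	// Not modified - reuse the stored copy
//		else if (hRet == HTTPMSG_OK)
//			;	// Fresh content received
//	}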
hzEcode hzHttpClient::_postform (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzChain& formData)
{
// Support function for hzHttpClient::PostForm(). Compiles the HTTP request and adds the supplied form. The functionality herein would just
// appear in PostForm() except for the need to cope with redirection. This requires that the request ...
//
// Arguments: 1) hRet Reference to HTTP return code, set by this operation
// 2) url The URL to post the form to
// 3) hdrs Vector of additional HTTP headers
// 4) formData The actual form data
//
// Returns: E_ARGUMENT If the URL is not supplied or no domain specified
// E_NOSOCKET If the external server has closed the connection
// E_NODATA If nothing was received
// E_FORMAT If the response was malformed
// E_OK If the form was posted and the response was received without error
_hzfunc("hzHttpClient::PostForm") ;
hzCookie cookie ; // Cookie (drawn from supplied map of cookies)
hzString dom ; // Domain part of URL
hzString res ; // Resource part of URL
uint32_t nPort ; // Port (from URL)
uint32_t nIndex ; // Form data iterator
bool bFirstCookie ; // Set once a cookie has been written, so subsequent cookies are preceded by "; "
hzEcode rc ; // Return code
m_Request.Clear() ;
dom = url.Domain() ;
res = url.Resource() ;
nPort = url.Port() ;
if (url.IsSSL())
m_Request.Printf("POST https://%s%s HTTP/1.1\r\n", *dom, *res) ;
else
m_Request.Printf("POST http://%s%s HTTP/1.1\r\n", *dom, *res) ;
m_Request << "Host: " << dom << "\r\n" ;
m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
m_Request << "Accept: */*\r\n" ;
m_Request << "Accept-Language: en-gb,en;q=0.5\r\n" ;
//m_Request << "Accept-Encoding: gzip, deflate\r\n" ;
m_Request << "Content-Type: application/x-www-form-urlencoded; charset=UTF-8\r\n" ;
if (m_Referer)
m_Request << "Referer: " << m_Referer << "\r\n" ;
m_Request.Printf("Content-Length: %d\r\n", formData.Size()) ;
if (m_Cookies.Count())
{
m_Request << "Cookie: " ;
bFirstCookie = false ;
for (nIndex = 0 ; nIndex < m_Cookies.Count() ; nIndex++)
{
cookie = m_Cookies.GetObj(nIndex) ;
if (bFirstCookie)
m_Request << "; " ;
m_Request.Printf("%s=%s", *cookie.m_Name, *cookie.m_Value) ;
bFirstCookie = true ;
}
m_Request << "\r\n" ;
}
if (hdrs.Count())
{
for (nIndex = 0 ; nIndex < hdrs.Count() ; nIndex++)
//m_Request << hdrs.Element(nIndex) ;
m_Request << hdrs[nIndex] ;
}
m_Request << "Connection: keep-alive\r\n" ;
m_Request << "Pragma: no-cache\r\n" ;
m_Request << "Cache-Control: no-cache\r\n\r\n" ;
m_Request << formData ;
// Connect to server
if (url.IsSSL())
rc = m_Webhost.ConnectSSL(dom, nPort) ;
else
rc = m_Webhost.ConnectStd(dom, nPort) ;
if (rc != E_OK)
{
m_Error.Printf("Could not connect to %s on port %d\n", *dom, nPort) ;
return rc ;
}
m_Error.Printf("Connected to %s on port %d\n[\n", *dom, nPort) ;
m_Error << m_Request ;
m_Error << "\n-------------------------\n\n" ;
rc = m_Webhost.Send(m_Request) ;
if (rc != E_OK)
m_Error.Printf("Could not send request (error=%s)\n", Err2Txt(rc)) ;
else
{
rc = _procHttpResponse(hRet, url) ;
if (rc != E_OK)
m_Error.Printf("Could not get response (error=%s)\n", Err2Txt(rc)) ;
}
return rc ;
}
hzEcode hzHttpClient::PostForm (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& formData)
{
// Post a form to the server. Note that this will normally result in an HTTP response. This response must be processed in the same
// way as any other (i.e. values are extracted from lines in the HTTP header).
//
// Arguments: 1) hRet HTTP return code
// 2) url The URL
// 3) hdrs Lines in HTTP header
// 4) formData The form data to be submitted
//
// Returns: E_ARGUMENT If the URL is not supplied or no domain specified
// E_NOSOCKET If the external server has closed the connection
// E_NODATA If no form data was supplied
// E_FORMAT If the response was malformed
// E_OK If the form was posted and the response was received without error
_hzfunc("hzHttpClient::PostForm") ;
hzList<hzPair>::Iter iD ; // Form data iterator
hzChain F ; // Form data in submissible form
hzCookie cookie ; // Cookie (drawn from supplied map of cookies)
hzPair P ; // Form data field
hzUrl dest ; // Url may change due to redirection
hzString dom ; // Domain part of URL
hzString res ; // Resource part of URL
hzString etag ; // Null etag passed to _getpage() on redirects
hzEcode rc ; // Return code
// Considered a top-level function so we clear the error chain
m_Error.Clear() ;
m_Error.Printf("POSTING FORM %s\n", *url) ;
//Clear() ;
m_Header.Clear() ;
m_Content.Clear() ;
m_Request.Clear() ;
if (!formData.Count())
return E_NODATA ;
for (iD = formData ; iD.Valid() ; iD++)
{
P = iD.Element() ;
if (F.Size())
F.AddByte(CHAR_AMPSAND) ;
F << P.name ;
F.AddByte(CHAR_EQUAL) ;
P.value.UrlEncode() ;
F << P.value ;
}
dest = url ;
rc = _postform(hRet, dest, hdrs, F) ;
if (rc != E_OK)
{
m_Error.Printf("FAILED (error=%s)\n", Err2Txt(rc)) ;
return rc ;
}
for (; hRet == HTTPMSG_REDIRECT_PERM || hRet == HTTPMSG_REDIRECT_TEMP ;)
{
if (!m_Redirect)
{ m_Error.Printf("Oops - no URL to redirect to\n") ; break ; }
else
{
if (m_Redirect[0] == CHAR_FWSLASH)
{ dom = dest.Domain() ; dest.SetValue(dom, m_Redirect) ; }
else
dest = m_Redirect ;
m_Error.Printf("redirecting to %s\n", *dest) ;
etag = (char*) 0 ;
rc = _getpage(hRet, dest, etag) ;
if (rc != E_OK)
{
m_Error.Printf("Redirect FAILED (error=%s)\n", Err2Txt(rc)) ;
break ;
}
}
}
return rc ;
}
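// Example: a form submission sketch (illustrative only; the field names, values and URL are hypothetical). PostForm() URL-encodes each value,
// submits the fields as application/x-www-form-urlencoded and, per the loop above, follows any redirect in the response with a GET.
//
//	hzHttpClient		client ;
//	hzVect<hzString>	hdrs ;		// No additional headers
//	hzList<hzPair>		form ;		// Form fields
//	hzPair				P ;
//	hzUrl				url ;
//	HttpRC				hRet ;
//
//	P.name = "username" ;	P.value = "alice" ;		form.Add(P) ;
//	P.name = "password" ;	P.value = "secret" ;	form.Add(P) ;
//	url = "https://www.example.com/login" ;
//	if (client.PostForm(hRet, url, hdrs, form) == E_OK && hRet == HTTPMSG_OK)
//		;	// Login response header and content are now held by the client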
hzEcode hzHttpClient::PostAjax (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& formData)
{
// Post a form to the server but do not seek an HTTP response.
//
// Arguments: 1) hRet HTTP return code
// 2) url The URL
// 3) hdrs Lines in HTTP header
// 4) formData The form data to be submitted
//
// Returns: E_ARGUMENT If the URL is not supplied or no domain specified
// E_NOSOCKET If the external server has closed the connection
// E_NODATA If no form data was supplied
// E_FORMAT If the response was malformed
// E_OK If the AJAX request was sent without error
_hzfunc("hzHttpClient::PostAjax") ;
hzList<hzPair>::Iter iD ; // Form data iterator
hzChain F ; // Form data in submissible form
hzCookie cookie ; // Cookie (drawn from supplied map of cookies)
hzPair P ; // Form data field
hzString dom ; // Domain part of URL
hzString res ; // Resource part of URL
hzString S ; // Temp string (holds the request for logging)
uint32_t nPort ; // Port (from URL)
uint32_t nIndex ; // Form data iterator
hzEcode rc ; // Return code
//Clear() ;
m_Header.Clear() ;
m_Content.Clear() ;
m_Request.Clear() ;
if (!formData.Count())
return E_NODATA ;
for (iD = formData ; iD.Valid() ; iD++)
{
P = iD.Element() ;
if (F.Size())
F.AddByte(CHAR_AMPSAND) ;
F << P.name ;
F.AddByte(CHAR_EQUAL) ;
F << P.value ;
}
dom = url.Domain() ;
res = url.Resource() ;
nPort = url.Port() ;
if (url.IsSSL())
m_Request.Printf("POST https://%s%s HTTP/1.1\r\n", *dom, *res) ;
else
m_Request.Printf("POST http://%s%s HTTP/1.1\r\n", *dom, *res) ;
//m_Request << "POST " << "http://" << dom << res << " HTTP/1.1\r\n" ;
m_Request << "Accept: text/*\r\n" ;
m_Request << "Accept-Language: en-gb\r\n" ;
//m_Request << "Accept-Encoding:\r\n" ;
//m_Request << "Accept-Encoding: gzip, deflate\r\n" ;
for (nIndex = 0 ; nIndex < m_Cookies.Count() ; nIndex++)
{
cookie = m_Cookies.GetObj(nIndex) ;
if (cookie.m_Flags & COOKIE_HTTPONLY)
continue ;
m_Request.Printf("Cookie: %s=%s\r\n", *cookie.m_Name, *cookie.m_Value) ;
}
//m_Request << "User-Agent: HadronZoo/0.8 (compatible; MSIE 6.0;)\r\n" ;
m_Request << "User-Agent: HadronZoo/0.8 Linux 2.6.18\r\n" ;
m_Request.Printf("Content-Length: %d\r\n", F.Size()) ;
m_Request << "Host: " << dom << "\r\n" ;
if (hdrs.Count())
{
for (nIndex = 0 ; nIndex < hdrs.Count() ; nIndex++)
//m_Request << hdrs.Element(nIndex) ;
m_Request << hdrs[nIndex] ;
}
m_Request << "Connection: close\r\n\r\n" ;
m_Request << F ;
S = m_Request ;
threadLog("Sending [\n%s]\n", *S) ;
// Connect to server
if (url.IsSSL())
rc = m_Webhost.ConnectSSL(dom, nPort) ;
else
rc = m_Webhost.ConnectStd(dom, nPort) ;
if (rc != E_OK)
return rc ;
// Send request
rc = m_Webhost.Send(m_Request) ;
return rc ;
}
/*
** Section 2, Subsect-A: hzWebhost private functions
*/
void hzWebhost::_clear (void)
{
// Clears the hzWebhost for shutdown or for re-initialization for syncing another website
//
// Arguments: None
// Returns: None
_hzfunc("hzWebhost::_clear") ;
hzDocMeta* pMark ; // Document info
uint32_t nIndex ; // History iterator
m_Offsite.Clear() ;
m_Domains.Clear() ;
m_Roots.Clear() ;
m_Feeds.Clear() ;
m_Emails.Clear() ;
m_Banned.Clear() ;
for (nIndex = 0 ; nIndex < m_mapHist.Count() ; nIndex++)
{
pMark = m_mapHist.GetObj(nIndex) ;
delete pMark ;
}
m_mapHist.Clear() ;
m_vecHist.Clear() ;
}
hzEcode hzWebhost::_loadstatus (void)
{
// Load visit status file (called upon startup). This way we do not re-fetch pages that have already been loaded unless they are out of date.
//
// Arguments: None
//
// Returns: E_NOINIT If the repository for the webhost has not previously been defined
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the visit status file is read in or was empty
_hzfunc("hzWebhost::_loadstatus") ;
hzDocXml X ; // The manifest as XML document
hzWebCMD wc ; // Current web command
hzAttrset ai ; // Attribute iterator
hzDocMeta* pMark ; // Link meta data
hzXmlNode* pRoot ; // Root XML node
hzXmlNode* pN1 ; // Level 1 XML node
hzXmlNode* pN2 ; // Level 2 XML node
hzXmlNode* pN3 ; // Level 3 XML node
_pageList* pgl ; // Page list instance
hzPair p ; // Pair from formdata
hzUrl url ; // in-page link
hzString vs_fname ; // Visit status file
hzString anam ; // Attribute name
hzString aval ; // Attribute value
hzEcode rc = E_OK ; // Return
m_mapHist.Clear() ;
m_vecHist.Clear() ;
if (!m_Repos)
return hzerr(E_NOINIT, "No repository specified. Cannot determine data state") ;
vs_fname = m_Repos + "/manifest" ;
rc = TestFile(vs_fname) ;
if (rc == E_NOTFOUND)
{ threadLog("No status file found. Repository in virgin state\n") ; return E_OK ; }
if (rc != E_OK)
{ threadLog("manifest file lookup error (%s)\n", Err2Txt(rc)) ; return rc ; }
rc = X.Load(vs_fname) ;
if (rc != E_OK)
{ threadLog("Could not open Visit Status File %s for writing\n", *vs_fname) ; return E_OPENFAIL ; }
pRoot = X.GetRoot() ;
for (pN1 = pRoot->GetFirstChild() ; rc == E_OK && pN1 ; pN1 = pN1->Sibling())
{
if (pN1->NameEQ("pagelists"))
{
for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
{
if (pN2->NameEQ("pagelist"))
{
ai = pN2 ;
if (ai.Valid())
{
anam = ai.Name() ; aval = ai.Value() ;
pgl = new _pageList() ;
if (anam == "name")
pgl->name = aval ;
}
for (pN3 = pN2->GetFirstChild() ; rc == E_OK && pN3 ; pN3 = pN3->Sibling())
{
if (pN3->NameEQ("page"))
{
ai = pN3 ;
if (ai.Valid())
{
anam = ai.Name() ; aval = ai.Value() ;
if (anam == "url")
pgl->links.Add(aval) ;
}
}
}
}
}
}
if (pN1->NameEQ("commands"))
{
ai = pN1 ;
if (ai.Valid())
{
anam = ai.Name() ; aval = ai.Value() ;
if (anam == "sofar")
m_Sofar = atoi(*aval) ;
}
for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
{
if (pN2->NameEQ("command"))
continue ;
for (ai = pN2 ; ai.Valid() ; ai.Advance())
{
anam = ai.Name() ; aval = ai.Value() ;
if (anam == "url") wc.m_Url = aval ;
else if (anam == "crit") wc.m_Crit = aval ;
else if (anam == "slct") wc.m_Slct = aval ;
else if (anam == "inps") wc.m_Inputs = aval ;
else if (anam == "outs") wc.m_Output = aval ;
}
pN3 = pN2->GetFirstChild() ;
if (pN3 && pN3->NameEQ("form"))
{
for (ai = pN3 ; ai.Valid() ; ai.Advance())
{
anam = ai.Name() ; aval = ai.Value() ;
p.name = anam ;
p.value = aval ;
wc.m_Formdata.Add(p) ;
}
}
}
}
if (pN1->NameEQ("history"))
{
for (pN2 = pN1->GetFirstChild() ; rc == E_OK && pN2 ; pN2 = pN2->Sibling())
{
if (pN2->NameEQ("page"))
{
pMark = new hzDocMeta() ;
for (ai = pN2 ; ai.Valid() ; ai.Advance())
{
anam = ai.Name() ; aval = ai.Value() ;
if (anam == "urlReq") pMark->m_urlReq = aval ;
else if (anam == "urlAct") pMark->m_urlAct = aval ;
else if (anam == "title") pMark->m_Title = aval ;
else if (anam == "desc") pMark->m_Desc = aval ;
else if (anam == "fname") pMark->m_Filename = aval ;
else if (anam == "etag") pMark->m_Etag = aval ;
else if (anam == "dtDnl") pMark->m_Download.SetDateTime(aval) ;
else if (anam == "dtMod") pMark->m_Modified.SetDateTime(aval) ;
else if (anam == "dtExp") pMark->m_Expires.SetDateTime(aval) ;
else if (anam == "type") pMark->m_Doctype = (hzDoctype) atoi(*aval) ;
else
threadLog("Unexpected page attribute %s=%s\n", *anam, *aval) ;
}
m_vecHist.Add(pMark) ;
}
}
}
}
return rc ;
}
hzEcode hzWebhost::_savestatus (void)
{
// Write out visit status file. This keeps a record of which URLs have already been downloaded and to which files, and the expiry
// date (after which the page will have to be fetched again)
//
// Arguments: None
//
// Returns: E_NOINIT If the repository for the webhost has not previously been defined
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the visit status file was written
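//
// For illustration, the manifest written below contains the following sections (attribute values are examples only):
//
//	<cookies>    - one <cookie sig=".." name=".." path=".."/> per stored cookie
//	<pagelists>  - one <pagelist name=".."> per named page list, each holding <page url=".."/> entries
//	<commands>   - one <command type=".." url=".."/> per web command, plus the 'sofar' progress counter
//	<history>    - one entry per downloaded document (urlReq, urlAct, fname, etag and dates)
//	<trace>      - any accumulated trace text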
_hzfunc("hzWebhost::_status") ;
hzList<hzUrl>::Iter li ; // Links iterator (for pagelists)
hzList<hzWebCMD>::Iter ci ; // Iterator for web commands
hzList<hzPair>::Iter pi ; // Iterator for web commands
ofstream os ; // Output stream
hzWebCMD wc ; // Current web command
hzCookie cook ; // Cookie instance
hzChain Z ; // For building status file
_pageList* pgl ; // Pagelist
hzDocMeta* pMark ; // Document meta data
hzPair p ; // Pair from formdata
hzString vs_fname ; // Visit status file
hzString S ; // Tmp string
hzUrl url ; // Link
uint32_t nIndex ; // History iterator
uint32_t x ; // Cookie and pagelist iterator
hzEcode rc = E_OK ; // Return
if (!m_Repos)
return hzerr(E_NOINIT, "No repository specified. Cannot determine data state") ;
vs_fname = m_Repos + "/manifest" ;
os.open(*vs_fname) ;
if (os.fail())
{
threadLog("Could not open Visit Status File %s for writing\n", *vs_fname) ;
return E_OPENFAIL ;
}
threadLog("savestat: case 1\n") ;
if (m_Cookies.Count())
{
Z << "<cookies>\n" ;
for (x = 0 ; x < m_Cookies.Count() ; x++)
{
cook = m_Cookies.GetObj(x) ;
Z.Printf("\t<cookie sig=\"%s\" name=\"%s\" path=\"%s\" flg=\"%d\" expire=\"%s\"/>\n",
*cook.m_Value, *cook.m_Name, *cook.m_Path, cook.m_Flags, *cook.m_Expires) ;
}
Z << "</cookies>\n" ;
}
threadLog("savestat: case 2\n") ;
if (m_Pagelists.Count())
{
Z << "<pagelists>\n" ;
for (x = 0 ; x < m_Pagelists.Count() ; x++)
{
pgl = m_Pagelists.GetObj(x) ;
Z.Printf("\t<pagelist name=\"%s\">\n", *pgl->name) ;
if (pgl->links.Count())
{
for (li = pgl->links ; li.Valid() ; li++)
{
url = li.Element() ;
Z.Printf("\t\t<page url=\"%s\">\n", *url.Whole()) ;
}
}
Z << "\t</pagelist>\n" ;
}
Z << "</pagelists>\n" ;
}
threadLog("savestat: case 3\n") ;
/*
** Do command list and status
*/
Z.Printf("<commands sofar=\"%d\">\n", m_Sofar) ;
for (ci = m_Commands ; ci.Valid() ; ci++)
{
wc = ci.Element() ;
if (wc.m_Cmd == WEBCMD_LOAD_PAGE) Z << "\t<command type=\"WEBCMD_LOAD_PAGE\"" ;
if (wc.m_Cmd == WEBCMD_LOAD_LIST) Z << "\t<command type=\"WEBCMD_LOAD_LIST\"" ;
if (wc.m_Cmd == WEBCMD_SLCT_PAGE) Z << "\t<command type=\"WEBCMD_SLCT_PAGE\"" ;
if (wc.m_Cmd == WEBCMD_SLCT_LIST) Z << "\t<command type=\"WEBCMD_SLCT_LIST\"" ;
if (wc.m_Cmd == WEBCMD_RGET) Z << "\t<command type=\"WEBCMD_RGET\"" ;
if (wc.m_Cmd == WEBCMD_POST) Z << "\t<command type=\"WEBCMD_POST\"" ;
if (wc.m_Cmd == WEBCMD_RSS) Z << "\t<command type=\"WEBCMD_RSS\"" ;
if (wc.m_Url) Z.Printf(" url=\"%s\"", *wc.m_Url) ;
if (wc.m_Crit) Z.Printf(" crit=\"%s\"", *wc.m_Crit) ;
if (wc.m_Slct) Z.Printf(" slct=\"%s\"", *wc.m_Slct) ;
if (wc.m_Inputs) Z.Printf(" inps=\"%s\"", *wc.m_Inputs) ;
if (wc.m_Output) Z.Printf(" outs=\"%s\"", *wc.m_Output) ;
if (!wc.m_Formdata.Count())
Z << " />\n" ;
else
{
Z << ">\n" ;
Z << "\t\t<form " ;
for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
{
p = pi.Element() ;
Z.Printf(" %s=\"%s\"", *p.name, *p.value) ;
}
Z << " />\n" ;
Z << "\t</command>\n" ;
}
}
Z << "</commands>\n" ;
threadLog("savestat: case 4\n") ;
/*
** Do History
*/
Z << "<history>\n" ;
for (nIndex = 0 ; nIndex < m_vecHist.Count() ; nIndex++)
{
pMark = m_vecHist[nIndex] ;
Z.Printf("\t<webpage id=\"%d\" type=\"%d\"", pMark->m_Id, (uint32_t) pMark->m_Doctype) ;
if (pMark->m_urlReq) Z.Printf("\n\t\turlReq=\"%s\"", *pMark->m_urlReq) ;
if (pMark->m_urlAct) Z.Printf("\n\t\turlAct=\"%s\"", *pMark->m_urlAct) ;
if (pMark->m_Title) Z.Printf("\n\t\ttitle=\"%s\"", *pMark->m_Title) ;
if (pMark->m_Desc) Z.Printf("\n\t\tdesc=\"%s\"", *pMark->m_Desc) ;
if (pMark->m_Filename) Z.Printf("\n\t\tfname=\"%s\"", *pMark->m_Filename) ;
if (pMark->m_Etag) Z.Printf("\n\t\e-tag=\"%s\"", *pMark->m_Etag) ;
if (pMark->m_Download.IsSet()) Z.Printf("\n\t\tdtDnl=\"%s\"", *pMark->m_Download) ;
if (pMark->m_Modified.IsSet()) Z.Printf("\n\t\tdtMod=\"%s\"", *pMark->m_Modified) ;
if (pMark->m_Expires.IsSet()) Z.Printf("\n\t\tdtExp=\"%s\"", *pMark->m_Expires) ;
Z << "/>\n" ;
}
Z << "</history>\n" ;
threadLog("savestat: case 5\n") ;
if (m_Trace.Size())
{
Z << "<trace>\n" ;
Z << m_Trace ;
Z << "</trace>\n" ;
}
threadLog("savestat: case 6\n") ;
Z << "</manifest>\n" ;
//Rat4Html(Z) ;
os << Z ;
os.close() ;
return rc ;
}
hzEcode hzWebhost::AddRoot (hzUrl& url, hzString& criteria)
{
// Adds a root URL for the target website
//
// Arguments: 1) url The root URL of the website
// 2) criteria The resource we want as the entry point
//
// Returns: E_ARGUMENT If the URL is not specified
// E_OK If the root is added
_hzfunc("hzWebhost::AddRoot") ;
hzPair X ; // URL/Search criteria pair
if (!url)
return E_ARGUMENT ;
X.name = url.Whole() ;
X.value = criteria ;
m_Roots.Add(X) ;
return E_OK ;
}
hzEcode hzWebhost::AddRSS (hzUrl& rss)
{
// Adds an RSS feed URL for the target website
//
// Arguments: 1) rss The URL of the website's RSS feed
//
// Returns: E_ARGUMENT If the URL is not specified
// E_OK If the root is added
_hzfunc("hzWebhost::AddRSS") ;
m_Feeds.Add(rss) ;
return E_OK ;
}
#define SITEPARAM_USE_FIRST_COOKIE 0x01 // Use the first cookie provided for the rest of session
#define SITEPARAM_USE_LOGIN_COOKIE 0x02 // Use the cookie in the login response for the rest of session
hzEcode hzWebhost::AuthBasic (const char* username, const char* password)
{
// Sets the basic authentication string for the website (if the site uses this method). Once set all requests to the target website will be
// submitted with this string in the HTTP header.
//
// Arguments: 1) username The user account username
// 2) password The user account password
//
// Returns: E_ARGUMENT If either the username or password is not supplied
// E_OK If the root is added
_hzfunc("hzWebhost::AuthBasic") ;
hzChain Enc ; // The encrypted sequence
hzChain Raw ; // The raw sequence
if (!username || !username[0] || !password || !password[0])
{
threadLog("Must supply both a username and password\n") ;
return E_ARGUMENT ;
}
Raw << username ;
Raw.AddByte(CHAR_COLON) ;
Raw << password ;
Base64Encode(Enc, Raw) ;
HC.m_AuthBasic = m_AuthBasic = Enc ;
return E_OK ;
}
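// Example: AuthBasic("user", "pass") base64-encodes "user:pass" to "dXNlcjpwYXNz", so every subsequent request to the site carries the header
// "Authorization: Basic dXNlcjpwYXNz" (credentials shown are illustrative only).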
hzEcode hzWebhost::Login (void)
{
// Execute the login process. This is always a case of downloading each page listed in m_Authsteps (if any) and then posting to the URL given in m_Authpage (if provided) with
// the name-value pairs listed in m_Authform.
//
// Arguments: None
//
// Returns: E_NOTFOUND If the login page was not located
// E_WRITEFAIL If the form received was not written to the repository
// E_OK If the login form was posted (not the same thing as a successful login)
_hzfunc("hzWebhost::Login") ;
hzList<hzUrl>::Iter ias ; // Iterator for URLs in m_Authsteps
hzList<hzPair>::Iter inv ; // Iterator for name-value pairs in m_Authform
hzVect<hzString> hdrs ; // Extra headers, needed for submit form (not generally applicable)
ofstream os ; // For exporting to file
hzDocument* pDoc ; // Downloaded document
hzPair P ; // Name-value pair instance
hzUrl url ; // URL instance
hzString S ; // Temp string
hzString etag ; // For GetPage() call
HttpRC hRet ; // HTTP return code
bool bAuthpage = false ; // Set to true if the login form (if used) is correctly listed in m_Authsteps
hzEcode rc = E_OK ; // Return code
threadLog("Starting Login Sequence\n") ;
// Verify whether we have to log on and, if so, that the parameters are in place to support the login
if (m_Opflags & HZ_WEBSYNC_AUTH_BASIC)
{ threadLog("Basis Authentication. No login process required\n") ; return E_OK ; }
if (!(m_Opflags & (HZ_WEBSYNC_AUTH_POST | HZ_WEBSYNC_AUTH_GET)))
{
threadLog("No Authentication method\n") ;
if (!m_Authsteps.Count() && !m_Authform.Count())
{ threadLog("No Authentication steps or form submission. No login process required\n") ; return E_OK ; }
}
// Download all pages listed in m_Authsteps (note the download must happen even if the page is in the history because we need the cookies)
for (ias = m_Authsteps ; rc == E_OK && ias.Valid() ; ias++)
{
url = ias.Element() ;
if (url == m_Authpage)
bAuthpage = true ;
rc = HC.GetPage(hRet, url, etag) ;
if (rc != E_OK)
{ rc = E_NOTFOUND ; threadLog("Could not download %s\n", *url) ; }
}
if (rc != E_OK)
return rc ;
if (!bAuthpage && m_Authpage)
{
pDoc = Download(m_Authpage) ;
if (!pDoc)
{ threadLog("Could not download %s\n", *url) ; return E_NOTFOUND ; }
}
// Now if there is a login form, post this now
if (m_Authform.Count())
{
// Write out login form to file
if (m_Repos)
{
S = m_Repos + "/login_form" ;
os.open(*S) ;
if (os.fail())
{ threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
os << HC.m_Header ;
os << "\r\n\r\n" ;
os << HC.m_Content ;
os.close() ;
os.clear() ;
}
// Post the form
rc = HC.PostForm(hRet, m_Authpage, hdrs, m_Authform) ;
if (rc != E_OK)
{ threadLog("Could not post form to %s\n", *m_Authpage) ; return rc ; }
// Write out the login response
if (m_Repos)
{
S = m_Repos + "/login_response" ;
os.open(*S) ;
if (os.fail())
{ threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
os << HC.m_Header ;
os << "\r\n\r\n" ;
os << HC.m_Content ;
os.close() ;
}
}
return rc ;
}
void hzWebhost::Logout (void)
{
// Execute the logout process.
//
// Arguments: None
// Returns: None
_hzfunc("hzWebhost::Logout") ;
// STUB
}
hzEcode hzWebhost::Sync (void)
{
// Run the series of hzWebCMD directives to sync key pages from a website to a repository
//
// Arguments: None
//
// Returns: E_NOINIT If no repository, no domain or no homepage has been specified
// E_NOTFOUND If the login page was not located
// E_WRITEFAIL If the login form received was not written to the repository
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the scrape operation was successful
_hzfunc("hzWebhost::Sync") ;
hzMapS <hzUrl,hzDocument*> cur ; // Currently loaded documents
hzMapS <hzString,hzString> fvals ; // Form values to be submitted
hzVect <hzHtmElem*> elems ; // Elements selected by the web selector command
hzList <hzWebCMD>::Iter ci ; // Iterator for web commands
hzList <hzPair>::Iter pi ; // Iterator for form data
hzList <hzUrl>::Iter si ; // Iterator for pagelist
hzList <hzHtmForm*>::Iter fi ; // Iterator for forms
hzSet <hzUrl> set_ctrl ; // Initial links from processing config params
hzVect <hzUrl> pglinks ; // Links encountered within a given pages
hzVect <hzUrl> allinks ; // Links encountered within a given pages
hzVect <hzString> hdrs ; // Extra headers, needed for submit form
hzList <hzPair> flist ; // Filtered list of form values
ofstream os ; // For writing form response
_pageList* pgl = 0 ; // Primary pagelist instance
_pageList* pgl2 = 0 ; // Secondary pagelist instance
hzWebCMD wc ; // Current web command
hzDocument* pDoc ; // Downloaded document
hzDocHtml* pHdoc ; // Set if downloaded document is a HTML page.
hzHtmElem* pElem ; // HTML element (tag) lifted from page
hzHtmForm* pForm ; // Form found in page
hzPair P ; // Name value pair
hzXDate now ; // Date/time now (for checking if pages have expired)
hzAttrset ai ; // HTML element attribute iterator
hzString anam ; // Attribute name
hzString aval ; // Attribute value
hzString S ; // Temp string
hzUrl url ; // Temp link
uint32_t nStart ; // Links iterator
uint32_t nLimit ; // Links iterator
uint32_t nCount ; // Links iterator
uint32_t n ; // Aggregation iterator
HttpRC hRet = HTTPMSG_OK ; // HTTP return code
hzEcode rc ; // Return code
threadLog("Called hzWebhost::Sync\n") ;
// Check if repository and list of command is set up
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
if (!m_Commands.Count())
{ threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }
// Read in any existing manifest file
rc = _loadstatus() ;
if (rc != E_OK)
{ threadLog("Error on loading status - aborting\n") ; return rc ; }
// If resuming execution, start where we left off
for (n = 0, ci = m_Commands ; n < m_Sofar ; n++, ci++) ;
// Execute commands in order
for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
{
pDoc = 0 ;
wc = ci.Element() ;
switch (wc.m_Cmd)
{
case WEBCMD_LOAD_PAGE: // Get a page (no conditions)
if (!wc.m_Url)
{ threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }
threadLog("Doing WEBCMD_LOAD_PAGE\n") ;
pDoc = Download(wc.m_Url) ;
if (!pDoc)
{ threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }
cur.Insert(wc.m_Url, pDoc) ;
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
if (pHdoc->m_Forms.Count())
{
// Add the forms to the m_Forms map in the hzWebhost instance
for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
{
pForm = fi.Element() ;
m_Forms.Insert(pForm->name, pForm) ;
}
}
}
break ;
case WEBCMD_LOAD_LIST: // Get a list of pages (list supplied in command)
threadLog("Doing WEBCMD_LOAD_LIST\n") ;
if (!wc.m_Inputs)
{ threadLog(" - Invalid loadList command - no list of links named\n") ; rc = E_NOTFOUND ; break ; }
if (!m_Pagelists.Exists(wc.m_Inputs))
{ threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc = E_NOTFOUND ; break ; }
pgl = m_Pagelists[wc.m_Inputs] ;
for (si = pgl->links ; si.Valid() ; si++)
{
url = si.Element() ;
pDoc = Download(url) ;
if (!pDoc)
{ threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
else
threadLog(" - Fetched page %s\n", *url) ;
}
threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
break ;
case WEBCMD_SLCT_PAGE: // Select links from a page
threadLog("Doing WEBCMD_SLCT_PAGE\n") ;
if (wc.m_Url && wc.m_Inputs) { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
if (!wc.m_Url && !wc.m_Inputs) { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
if (!wc.m_Output) { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
if (!wc.m_Slct && !wc.m_Crit) { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globing criteria\n") ; }
if (rc != E_OK)
break ;
if (cur.Exists(wc.m_Url))
pDoc = cur[wc.m_Url] ;
else
pDoc = Download(wc.m_Url) ;
if (!pDoc)
{ rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }
pgl = new _pageList() ;
pgl->name = wc.m_Output ;
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Not a HTML document\n") ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
for (n = 0 ; n < pHdoc->m_vecTags.Count() ; n++)
{
pElem = pHdoc->m_vecTags[n] ;
threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
for (ai = pElem ; ai.Valid() ; ai.Advance())
{
threadLog(" %s=%s", ai.Name(), ai.Value()) ;
}
threadLog(" />\n") ;
}
rc = pHdoc->FindElements(elems, wc.m_Slct) ;
for (n = 0 ; n < elems.Count() ; n++)
{
pElem = elems[n] ;
threadLog("%s. GOT <%s ", *pElem->Name()) ;
for (ai = pElem ; ai.Valid() ; ai.Advance())
{
anam = ai.Name() ; aval = ai.Value() ;
threadLog(" %s=%s", *anam, *aval) ;
if (anam == "href")
{
url = aval ;
pgl->links.Add(url) ;
}
}
threadLog(" />\n") ;
}
}
threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
m_Pagelists.Insert(pgl->name, pgl) ;
break ;
case WEBCMD_SLCT_LIST: // Select links from a set of pages (supplied as a set of links)
threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ;
if (!wc.m_Inputs)
{ threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }
if (!wc.m_Output)
{ rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
if (!wc.m_Slct && !wc.m_Crit)
{ rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; }
if (rc != E_OK)
break ;
pgl2 = new _pageList() ;
pgl2->name = wc.m_Output ;
// Begin
pgl = m_Pagelists[wc.m_Inputs] ;
if (!pgl)
{ rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }
for (si = pgl->links ; si.Valid() ; si++)
{
url = si.Element() ;
if (cur.Exists(url))
pDoc = cur[url] ;
else
pDoc = Download(url) ;
if (!pDoc)
{ rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
rc = pHdoc->FindElements(elems, wc.m_Slct) ;
for (n = 0 ; n < elems.Count() ; n++)
{
pElem = elems[n] ;
threadLog("%s. GOT <%s ", *pElem->Name()) ;
for (ai = pElem ; ai.Valid() ; ai.Advance())
{
anam = ai.Name() ; aval = ai.Value() ;
threadLog(" %s=%s", *anam, *aval) ;
if (anam == "href")
{
url = aval ;
pgl2->links.Add(url) ;
}
}
threadLog(" />\n") ;
}
}
}
threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
m_Pagelists.Insert(pgl2->name, pgl2) ;
break ;
case WEBCMD_RGET: // Get a root page
threadLog("Doing WEBCMD_RGET\n") ;
threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;
// Get root page first
pDoc = Download(wc.m_Url) ;
if (!pDoc)
threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
else
{
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Page %s not HTML\n", *wc.m_Url) ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
}
delete pDoc ;
}
// Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
for (n = 0 ; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
// Starting at the site root and for each page, grab all links and go to each link in turn
threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
{
now.SysDateTime() ;
pglinks.Clear() ;
for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
{
url = allinks[nCount] ;
threadLog("Cosidering link %s - ", *url.Whole()) ;
if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; }
if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; }
if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; }
// Page not yet visited so we visit it, put it in the list of pages visited and get the links. Some of these links may add to
// the list of links.
threadLog("Fetching\n") ;
pDoc = Download(url) ;
if (!pDoc)
threadLog("case 2. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
// Re-aggregate the all-links vector
for (n = 0 ; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
}
delete pDoc ;
}
}
}
break ;
case WEBCMD_POST: // Post a form. The form should have been previously downloaded and will be looked for by name
threadLog("Doing WEBCMD_POST\n") ;
pForm = m_Forms[wc.m_Output] ;
if (!pForm)
threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;
// Take the command's formdata and use it to populate the form's set of fields
/*
for (pi = pForm->fields ; pi.Valid() ; pi++)
{ P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
{ P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
for (n = 0 ; n < fvals.Count() ; n++)
{
P.name = fvals.GetKey(n) ;
P.value = fvals.GetObj(n) ;
flist.Add(P) ;
}
*/
rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
if (rc != E_OK)
{ threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
if (hRet != HTTPMSG_OK)
{ threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ; }
// Write out the login response
if (m_Repos)
{
url = wc.m_Url ;
S = m_Repos + "/" + url.Filename() ;
S += ".response" ;
os.open(*S) ;
if (os.fail())
{ threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
os << HC.m_Header ;
os << "\r\n\r\n" ;
os << HC.m_Content ;
os.close() ;
}
break ;
case WEBCMD_RSS: // Get an RSS feed
threadLog("Doing WEBCMD_RSS\n") ;
// If XML selectors for RSS feed are not initialized, set them here
if (!m_tagItem.m_Slct) { m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
if (!m_tagUqid.m_Slct) { m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
if (!m_tagLink.m_Slct) { m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
if (!m_tagDesc.m_Slct) { m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
if (!m_tagDate.m_Slct) { m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }
// Get the feed
rc = getRss_r(hRet, wc.m_Url, 0) ;
threadLog("Processed items\n") ;
break ;
}
}
// Write out manifest file
rc = _savestatus() ;
// Clear documents
for (n = 0 ; n < m_Pagelists.Count() ; n++)
{
pgl = m_Pagelists.GetObj(n) ;
delete pgl ;
}
for (n = 0 ; n < cur.Count() ; n++)
{
pDoc = cur.GetObj(n) ;
delete pDoc ;
}
return rc ;
}
hzEcode hzWebhost::Scrape (void)
{
// In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created and existing pages can be
// modified on an ad-hoc basis. A scrape captures the current state of the website or a limited portion of it to file.
//
// The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for
// links. Links to domains other than the one in question are ignored. Links to such things as images are also ignored. Remaining links not found in the set
// of known links are added to this set. The process terminates when all the links have been attempted.
//
// The set of known links will need to comprise the site's home-page and a login page if this exists and if it is not the same as the home page. These will
// usually be enough to 'bootstrap' the rest of the site.
//
// Arguments: None
//
// Returns: E_NOINIT If no repository, no domain or no homepage has been specified
// E_NOTFOUND If the login page was not located
// E_WRITEFAIL If the login form received could not be written to the repository
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the scrape operation was successful
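//
// Illustrative usage (a minimal sketch only; the member assignments shown are assumptions about caller-side setup, not confirmed API):
//
//		hzWebhost site ;
//		site.m_Repos    = "/var/cache/example_site" ;	// Hypothetical repository directory
//		site.m_Homepage = "http://www.example.com/" ;	// Home page used to bootstrap the link set
//		// ... add at least one root command to site.m_Roots ...
//		if (site.Scrape() != E_OK)
//			threadLog("Scrape of example site failed\n") ;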
_hzfunc("hzWebhost::Scrape") ;
hzMapS<hzString,hzString> formData ; // Set of name value pairs
hzVect<hzString> hdrs ; // Extra headers, needed for submit form
hzList<hzPair>::Iter ci ; // Root commands iterator
hzSet<hzUrl> set_ctrl ; // Initial links from processing config params
hzVect<hzUrl> pglinks ; // Links encountered within a given pages
hzVect<hzUrl> allinks ; // Links encountered within a given pages
hzVect<hzUrl> todo ; // Links encountered in the pages in ctrl
ifstream is ; // For reading in visit status file
ofstream os ; // For writing out visit status file at end of scrape
hzDocMeta mark ; // Document meta data
hzChain Response ; // Response from form submission
hzDocument* pDoc ; // Downloaded document
hzDocHtml* pHdoc ; // Set if downloaded document is a HTML page.
hzPair X ; // Root command instance
hzXDate now ; // Date/time now (for checking if pages have expired)
hzUrl url ; // Temp link
hzString vs_fname ; // Visit status filename
hzString pagepath ; // Filepath for file to store downloaded page
hzString S ; // Temp string
hzString etag ; // Temp string
uint32_t nStart ; // Start of the current pass through the all-links vector
uint32_t nLimit ; // Number of links at the start of the current pass
uint32_t nCount ; // Links iterator within the current pass
uint32_t n ; // Aggregation iterator
hzEcode rc = E_OK ; // Return code
threadLog("Called hzWebhost::Scrape\n") ;
// Check if repository is set up (website is initialized)
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
// Is there anything to do?
if (!m_Roots.Count())
{ threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }
// Get the home page
// if (*m_Homepage)
if (m_Homepage)
{
//etag = 0 ;
//etag = (char*) 0 ;
pDoc = Download(m_Homepage) ;
if (!pDoc)
{ threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
m_docHome = pDoc ;
threadLog("HOMEPAGE SUCCESS\n") ;
}
// Login
rc = Login() ;
if (rc != E_OK)
{ threadLog("Login failed\n") ; return rc ; }
threadLog("Login SUCCESS\n") ;
// Run the root commands to obtain the set of roots. A root command may have either a URL or a 'link criteria' or both. If only a
// URL is present, this URL and ALL links found within it are added to the list of pages to process. If only a link criteria is
// present, the links found in the HOME page and the LOGIN RESPONSE page are tested against the criteria. If they match the link
// is added to the list of pages to process. If both a URL and a link criteria are present then the URL and any matching links found
// within it are added to the list of pages to process.
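// For example (illustrative values only), a root command of name="homepage" with value="/news/" would take links matching "/news/"
// from the home page, while name="http://www.example.com/archive" with value="/2014/" would fetch the archive page and take its
// matching links. The exact matching semantics are those of ExtractLinksBasic.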
threadLog("Have %d root commands\n", m_Roots.Count()) ;
for (ci = m_Roots ; ci.Valid() ; ci++)
{
X = ci.Element() ;
threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;
// Get the page
if (X.name == "homepage")
{
// No page to get, just compare the criteria to the home page
pHdoc = (hzDocHtml*) m_docHome ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else if (X.name == "loginResponse")
{
// No page to get, just compare the criteria to the login response
pHdoc = (hzDocHtml*) m_resAuth ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else
{
url = X.name ;
if (!url)
{ threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
etag = (char*) 0 ;
pDoc = Download(url) ;
if (!pDoc)
threadLog("case 1. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Page %s not HTML\n", *url) ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
}
delete pDoc ;
}
}
// Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
for (n = 0 ; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
}
/*
** Starting at the site root and for each page, grab all links and go to each link in turn
*/
threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
{
now.SysDateTime() ;
todo.Clear() ;
for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
{
url = allinks[nCount] ;
threadLog("Cosidering link %s - ", *url.Whole()) ;
if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; }
if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; }
if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; }
// Page not yet visited so we visit it, put it in the list of pages visited and extract its links. Some of these links may
// add to the overall list of links.
threadLog("Fetching\n") ;
pDoc = Download(url) ;
threadLog("Fetched page %p\n", pDoc) ;
if (!pDoc)
threadLog("case 2. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
// Re-aggregate the all-links vector
for (n = 0 ; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
}
delete pDoc ;
}
}
/*
for (nAdded = nX = 0 ; nX < todo.Count() ; nX++)
{
//url = todo.GetObj(nX) ;
url = todo[nX] ; //.GetObj(nX) ;
if (set_ctrl.Exists(url))
continue ;
nAdded++ ;
set_ctrl.Insert(url) ;
}
todo.Clear() ;
if (!nAdded)
break ;
*/
}
// Write out manifest file
rc = _savestatus() ;
return rc ;
}
hzEcode hzWebhost::getRss_r (HttpRC& hRet, const hzUrl& feed, uint32_t nLevel)
{
// Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document. There it is tested to ensure it is an
// XML document. The RSS feed is assumed to contain only links. These links may be to HTML pages or other (sub RSS feeds). The HTML pages
// are end points of the process. They are downloaded but any links they may contain are recorded but not followed. The sub-RSS feeds are
// then processed by recursive call to this function.
//
// Arguments: 1) hRet Set by this operation
// 2) feed The RSS URL
// 3) nLevel RSS Hierarchy
//
// Returns: E_NODATA If the download failed
// E_TYPE If the downloaded material does not appear to be XML
// E_FORMAT If the downloaded material could not be loaded into an XML document
// E_ARGUMENT If the RSS tags are not defined
// E_NOTFOUND If no tags were found in the RSS
// E_OK If the RSS data was collected
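//
// For reference, a typical RSS 2.0 <item> matched by the default selectors (item, guid, link, description, pubDate) looks like
// the following (illustrative values):
//
//		<item>
//			<title>Example headline</title>
//			<link>http://www.example.com/articles/1234</link>
//			<guid>http://www.example.com/articles/1234</guid>
//			<description>Short summary of the article</description>
//			<pubDate>Mon, 06 Sep 2014 09:00:00 GMT</pubDate>
//		</item>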
_hzfunc("hzWebhost::getRss_r") ;
hzVect<hzXmlNode*> linx ; // Links found in (this) RSS feed page
hzVect<hzUrl> todo ; // Links found in RSS feed page (additions to this are controlled by the set above)
hzDocXml X ; // For loading of RSS feed pages and extraction of links
hzXmlNode* pN1 ; // Nodes (containing <item>)
hzXmlNode* pN2 ; // Nodes (containing <item> subnodes of title, link, description)
hzDocMeta* pMark ; // Document meta data
hzDocument* pDoc ; // Document found at URL (could be XML or HTML)
hzUrl page ; // Temp link
hzString desc ; // RSS article description
hzString dstr ; // RSS article date
hzString uqid ; // Unique ID of RSS item
hzString title ; // RSS article title
uint32_t nIndex ; // Links iterator
hzEcode rc = E_OK ; // Return code
// Fetch the current RSS document
pDoc = Download(feed) ;
if (!pDoc)
{ threadLog("Could not fetch URL %s\n", *feed) ; return E_NODATA ; }
// If not an XML document then it is just a page. Nothing further.
if (pDoc->Whatami() != DOCTYPE_XML)
{ threadLog("case 1. Fetched feed (%s) is not of doctype XML\n", *feed) ; return E_TYPE ; }
nLevel++ ;
// Load current RSS document into XML document tree
rc = X.Load(HC.m_Content) ;
if (rc != E_OK)
return hzerr(rc, "Could not load feed %s", *feed) ;
// The page is an RSS document so select the <item> tags
rc = X.FindNodes(linx, m_tagItem.m_Slct) ;
threadLog("Found %d <item> tags in feed %s\n", linx.Count(), *feed) ;
if (rc != E_OK)
return rc ;
for (nIndex = 0 ; nIndex < linx.Count() ; nIndex++)
{
threadLog("case 1\n") ;
pN1 = linx[nIndex] ;
title = (char*) 0 ; desc = (char*) 0 ; page = (char*) 0 ; uqid = (char*) 0 ; dstr = (char*) 0 ;
for (pN2 = pN1->GetFirstChild() ; pN2 ; pN2 = pN2->Sibling())
{
threadLog("case 2\n") ;
if (pN2->NameEQ(*m_tagTitl.m_Slct)) { title = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDesc.m_Slct)) { desc = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagLink.m_Slct)) { page = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagUqid.m_Slct)) { uqid = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDate.m_Slct)) { dstr = pN2->m_fixContent ; continue ; }
}
threadLog("case 3\n") ;
if (!page)
{ threadLog("case 1: title=%s; link=null uqid=%s\n", *title, *uqid) ; page = uqid ; }
if (!page)
{ threadLog("case 2: title=%s; link=null uqid=%s\n", *title, *uqid) ; continue ; }
threadLog("title=%s; link=%s\n", *title, *page) ;
if (m_mapHist.Exists(page))
threadLog("Exists in history, page %s\n", *page) ;
else
{
pMark = new hzDocMeta() ;
pMark->m_Title = title ;
pMark->m_Desc = desc ;
pMark->m_urlReq = page ;
if (dstr)
pMark->m_Modified.SetDateTime(*dstr) ;
//todo.Insert(page) ;
todo.Add(page) ;
threadLog("Adding to history, page %s\n", *page) ;
}
}
// Fetch all the new links found above by recursive call
for (nIndex = 0 ; nIndex < todo.Count() ; nIndex++)
{
page = todo[nIndex] ;
//pMark = m_mapHist[page] ;
threadLog("Processing %s\n", *page) ;
rc = getRss_r(hRet, page, nLevel) ;
}
return rc ;
}
hzEcode hzWebhost::GetRSS (void)
{
// In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created
// and existing pages can be modified on an ad-hoc basis. The RSS feeds allow greater ease when syncing an external website to the
// local machine. By periodically reading one or more RSS feeds one can obtain a set of links which can generally be taken as the
// set of pages deemed 'current' by the website. By comparing these links to a history file of already fetched links, new pages
// can be added to a repository as they appear on the site. The RSS feeds are just XML files containing links.
//
// This function will obtain all the RSS feeds from the site, garner all the links from them and then download any pages from the
// links that are not already in the site history. The feeds themselves are not saved as these will be fetched again.
//
// Arguments: None
//
// Returns: E_NOINIT If the repository for the webhost has not previously been defined
// E_OPENFAIL If the visit status file could not be opened
// E_NODATA If the download failed
// E_TYPE If the downloaded material does not appear to be XML
// E_FORMAT If the downloaded material could not be loaded into an XML document
// E_ARGUMENT If the RSS tags are not defined
// E_NOTFOUND If no tags were found in the RSS
// E_OK If the RSS data was collected
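//
// Illustrative usage (a sketch only; how m_Feeds is populated is an assumption and the Add() call may differ from the real hzList API):
//
//		hzWebhost site ;
//		site.m_Repos = "/var/cache/example_site" ;						// Hypothetical repository directory
//		site.m_Feeds.Add(hzUrl("http://www.example.com/rss.xml")) ;	// Register one feed (assumed list insert method)
//		hzEcode rc = site.GetRSS() ;
//		if (rc != E_OK)
//			threadLog("RSS sync failed (error=%s)\n", Err2Txt(rc)) ;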
_hzfunc("hzWebhost::GetRSS") ;
hzList<hzUrl>::Iter fi ; // RSS feeds iterator
hzUrl feed ; // Temp link
HttpRC hRet ; // HTTP return code
hzEcode rc = E_OK ; // Return code
threadLog("Called\n") ;
// Login
rc = Login() ;
if (rc != E_OK)
{ threadLog("Login failed\n") ; return rc ; }
// Is there anything to do? There must be at least one RSS feed defined
if (!m_Feeds.Count())
{ threadLog("Website has no starting point (URL) for an RSS feed.\n") ; return E_NOINIT ; }
// If XML selectors for RSS feed are not initialized, set them here
if (!m_tagItem.m_Slct) { m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
if (!m_tagUqid.m_Slct) { m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
if (!m_tagLink.m_Slct) { m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
if (!m_tagDesc.m_Slct) { m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
if (!m_tagDate.m_Slct) { m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }
/*
** Fetch all the feed XML documents from the RSS source(s)
*/
for (fi = m_Feeds ; fi.Valid() ; fi++)
{
feed = fi.Element() ;
// Get the feed
rc = getRss_r(hRet, feed, 0) ;
threadLog("Processed items\n") ;
}
// Write out visit status file
rc = _savestatus() ;
return rc ;
}
hzDocument* hzWebhost::Download (const hzUrl& url)
{
// Fetch the page found at the supplied URL and return as a document (either XML or HTML).
//
// Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if its time to
// live has expired. If the page is not downloaded again then this function rebuilds the document from the cached content.
//
// Arguments: 1) url The URL of the file/resource to download
//
// Returns: Pointer to newly allocated document. Must be deleted after use.
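//
// Illustrative usage (a sketch only, assuming an already initialized hzWebhost instance named 'site'):
//
//		hzDocument* pDoc = site.Download(hzUrl("http://www.example.com/index.html")) ;
//		if (pDoc)
//		{
//			if (pDoc->Whatami() == DOCTYPE_HTML)
//			{
//				hzDocHtml* pHdoc = (hzDocHtml*) pDoc ;
//				// ... extract links or other content from pHdoc ...
//			}
//			delete pDoc ;	// Caller owns the returned document
//		}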
_hzfunc("hzWebhost::Download") ;
static uint32_t nlast = 0 ; // Last point reached (for download resumption)
ofstream os ; // To write out page contents
hzDocument* pDoc = 0 ; // Document downloaded
hzDocXml* pXdoc = 0 ; // XML Document downloaded
hzDocHtml* pHdoc = 0 ; // HTML Document downloaded
hzDocMeta* pMark ; // Document meta data
hzXDate now ; // Date & Time now
hzString S ; // Temp string
HttpRC hc ; // HTTP server return code
hzEcode rc ; // Return code
bool bHist = false ; // Set if url is already in history and downloaded again because of being out of date
char numbuf [16] ; // Working buffer for the document id suffix
/*
** Check URL, insert in visited links if not already there
*/
if (!url)
{ threadLog("No supplied address\n") ; return 0 ; }
threadLog("FETCHING PAGE: %s\n", *url) ;
now.SysDateTime() ;
if (!(m_Opflags & WEBFLG_FORCE))
{
if (m_mapHist.Exists(url))
{
// The requested URL exists in the repository already, so rebuild the document from the cached content instead of downloading again
pMark = m_mapHist[url] ;
bHist = true ;
threadLog("Page %s is historic\n", *url) ;
// Create a document of the right type (XML or HTML)
if (pMark->m_Doctype == DOCTYPE_HTML)
pDoc = pHdoc = new hzDocHtml() ;
else if (pMark->m_Doctype == DOCTYPE_XML)
pDoc = pXdoc = new hzDocXml() ;
else
pDoc = pHdoc = new hzDocHtml() ;
pDoc->SetMeta(*pMark) ;
// Check if expiry is known and if so if it has expired
if (pMark->m_Expires.IsSet())
{
if (pMark->m_Expires < now)
{
// Set the markers and return
if (pMark->m_Doctype == DOCTYPE_XML)
{
// XML
pDoc = pXdoc = new hzDocXml() ;
pDoc->SetMeta(*pMark) ;
rc = pDoc->Load(HC.m_Content) ;
}
else
{
// HTML
pDoc = pHdoc = new hzDocHtml() ;
pDoc->SetMeta(*pMark) ;
rc = pDoc->Load(HC.m_Content) ;
}
threadLog("DOWNLOAD PREVIOUS (error=%s)\n\n", Err2Txt(rc)) ;
return pDoc ;
}
}
// At this point either the expiry date is unknown or it has not yet been reached. Load from the cached content
if (!HC.m_Content.Size())
{
threadLog("Case 1 Bloody thing is empty!\n") ;
return 0 ;
}
rc = pDoc->Load(HC.m_Content) ;
if (rc != E_OK)
threadLog("LOAD failed (error=%s)\n\n", Err2Txt(rc)) ;
return pDoc ;
}
}
// The requested URL is not in the history. Create the document meta for it and download it.
S = url.Filename() ;
pMark = new hzDocMeta() ;
pMark->m_urlReq = url ;
pMark->m_urlAct = url ;
pMark->m_Id = m_mapHist.Count() ;
sprintf(numbuf, "/%04d", pMark->m_Id) ;
pMark->m_Filename = m_Repos + numbuf + S ;
/*
** Get page content and process it into a tree
*/
threadLog("GETTIG PAGE: %s\n", *url) ;
rc = HC.GetPage(hc, url, pMark->m_Etag) ;
if (rc != E_OK)
{
threadLog("FAILED (error=%s) synopsis\n", Err2Txt(rc)) ;
threadLog(HC.m_Error) ;
return 0 ;
}
if (HC.m_Redirect)
pMark->m_urlAct = HC.m_Redirect ;
pMark->m_Modified = HC.m_Modified ;
threadLog("HTTP Return code = %d, cookie (value %s, path %s)\n", (uint32_t) hc, *m_CookieSess, *m_CookiePath) ;
/*
** Write out header to .hdr file and content to .con file
*/
if (m_Repos)
{
os.open(*pMark->m_Filename) ;
if (os.fail())
threadLog("Cannot write out header file %s\n", *pMark->m_Filename) ;
else
{
os << HC.m_Content ;
os.close() ;
}
os.clear() ;
}
/*
** Add the page but only process pages that are of a known HTML type .htm, .html, .shtml, .xhtml etc
*/
threadLog("PROCESSING Content: %d bytes\n", HC.m_Content.Size()) ;
if (!HC.m_Content.Size())
{
threadLog("Case 2 Bloody thing is empty!\n") ;
return 0 ;
}
pMark->m_Doctype = DeriveDoctype(HC.m_Content) ;
rc = E_NODATA ;
if (pMark->m_Doctype == DOCTYPE_XML)
{
// XML
pDoc = pXdoc = new hzDocXml() ;
pXdoc->Init(url) ;
rc = pXdoc->Load(HC.m_Content) ;
}
else
{
// HTML
pDoc = pHdoc = new hzDocHtml() ;
pHdoc->Init(url) ;
rc = pHdoc->Load(HC.m_Content) ;
if (rc != E_OK)
threadLog("Case 2 Bloody thing failed (error=%s)!\n", Err2Txt(rc)) ;
}
if (rc != E_OK)
{
threadLog("Load page failed error=%s\n", Err2Txt(rc)) ;
//delete pDoc ;
//return 0 ;
}
pDoc->SetMeta(*pMark) ;
// Place the URL in the site's history
m_mapHist.Insert(pMark->m_urlReq, pMark) ;
threadLog("Inserted URL %s\n", *pMark->m_urlReq) ;
if (pMark->m_urlAct != pMark->m_urlReq)
{
m_mapHist.Insert(pMark->m_urlAct, pMark) ;
threadLog("Inserted URL %s\n", *pMark->m_urlAct) ;
}
if (!bHist)
m_vecHist.Add(pMark) ;
if (pXdoc)
threadLog("DOWNLOAD SUCCESS XML Page %s. Now have %d (%d) items in history\n\n", *url, m_mapHist.Count(), nlast) ;
if (pHdoc)
threadLog("DOWNLOAD SUCCESS Page %s has %d links. Now have %d (%d) items in history\n\n", *url, pHdoc->m_vecLinks.Count(), m_mapHist.Count(), nlast) ;
threadLog(HC.m_Error) ;
return pDoc ;
}