// // File: hzHttpClient.h // // Legal Notice: This file is part of the HadronZoo C++ Class Library. // // Copyright 2025 HadronZoo Project (http://www.hadronzoo.com) // // The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free // Software Foundation, either version 3 of the License, or any later version. // // The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses. //
//	The hzHttpClient suite, in conjunction with the hzDocument classes, allows applications to act as HTTP clients to external websites. The following
//	classes are provided:-
//
//		1)	hzCookie.		Used to maintain session state.
//		2)	hzHttpClient.	Manages the HTTP client connection to the remote website and provides, inter alia, the functions GetPage() and PostForm().
//		3)	hzWebhost.		A hzWebhost instance is a view of the external website as a whole and is used to manage client sessions with said website.
#ifndef hzHttpClient_h #define hzHttpClient_h
#include "hzTcpClient.h" #include "hzTmplList.h" #include "hzTmplVect.h" #include "hzTmplMapS.h" #include "hzHttpProto.h" #include "hzDocument.h"
enum	hzAuthmode
{
	//	Category:	Internet
	//
	//	Authentication methods. The enumerators carry the default consecutive values, starting at zero.

	HZ_AUTH_NONE,			//	No authentication regime in place
	HZ_AUTH_BASIC,			//	A base64 password is passed to the server with each HTTP request
	HZ_AUTH_FORM_POST,		//	A form submission of username and password by HTTP POST
	HZ_AUTH_FORM_GET		//	A form submission of username and password by HTTP GET. Last entry: no trailing comma, which is ill-formed before C++11
} ;
#define COOKIE_HTTPONLY 0x01
struct hzCookie { // Category: Internet // // A cookie is a session tracking device set by a server and returned by the client in subsequent HTTP request.
hzXDate m_Expires ; // If not set the cookie is permanent, otherwise stop using it after this date hzString m_Path ; // Send the cookie if the path is equal to or greater than this hzString m_Name ; // The cookie name hzString m_Value ; // The cookie value uint32_t m_Flags ; // Operational flags (eg HttpOnly)
hzCookie (void) { m_Flags = 0 ; }
void Clear (void) { m_Expires.Clear() ; m_Path.Clear() ; m_Name.Clear() ; m_Value.Clear() ;
m_Flags = 0 ; }
hzCookie& operator= (const hzCookie& op) { m_Expires = op.m_Expires ; m_Path = op.m_Path ; m_Name = op.m_Name ; m_Value = op.m_Value ; m_Flags = op.m_Flags ;
return *this ; }
// Serialization operator const char* (void) const ; } ;
class hzHttpClient { // Category: Internet // // The hzHttpClient class enables programs to operate as HTTP clients (e.g. webbots). It consists of a hzTcpClient, a set of cookies and a set of values derived from browsing // the target HTTP service (website). The class provides functions to GET resources (pages) and POST forms. Nothing is known or assumed about the pages being requested or the // forms being posted.
hzEcode _procHttpResponse (HttpRC& hRet, const hzUrl& url) ;
hzEcode _getpage (HttpRC& hRet, const hzUrl& url, const hzString& etag) ; hzEcode _postform (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& headers, const hzChain& formData) ;
public: hzMapS<hzString,hzCookie> m_Cookies ; // Cookies applicable to the target website
hzTcpClient m_Webhost ; // Current connection hzChain m_Error ; // Error logging hzChain m_Request ; // Chain formed by GetHttpPage() to send a request to a server hzChain m_Header ; // Header of server's response hzChain m_Content ; // HTML page content of server's response hzXDate m_Accessed ; // Date of last access (downloaded). Derived from 'Date:' field in header. hzXDate m_Modified ; // Date of last modification (from header). Default will be date/time of download. hzXDate m_Expires ; // Date page expires (after which it must be re-loaded) uint64_t m_rtRequest ; // Request sent time uint64_t m_rtResponse ; // Response complete time hzString m_CacheCtrl ; // Cache control hzString m_Host ; // Domain name or host currently connected to hzString m_Pragma ; // Pragma directive hzString m_Redirect ; // Where to go to actually load the page hzString m_KeepAlive ; // Keep-Alive parameters (currently ignored) hzString m_ContentType ; // Content type hzString m_ContEncoding ; // Eg 'gzip' hzString m_XferEncoding ; // The only expected value here is 'chunked' hzString m_AltProto ; // The only expected value here is 'chunked' hzString m_Encoding ; // Content encoding eg UTF-8 hzString m_Etag ; // Entity tag if present in the header hzString m_AuthBasic ; // If set, this is supplied with each GetPage() call. hzUrl m_Referer ; // Last page (used in constructing 'Referer:' header) uint32_t m_nTimeout ; // Maximum timeout uint32_t m_nMaxConnects ; // Maximum connections (we only ever use one) uint32_t m_nContentLen ; // Content length uint32_t m_Retcode ; // HTML return code bool m_bConnection ; // True if server returns Keep-Alive bool m_bChunked ; // Server's response was chunked (diagnostics only) char m_buf[HZ_MAXPACKET+4] ; // Buffer for IP packets
hzHttpClient (void) { m_buf[0] = 0 ; m_bConnection = false ; m_bChunked = false ; m_nTimeout = 0 ; m_nMaxConnects = 0 ; m_nContentLen = 0 ; m_Retcode = 0 ; } ~hzHttpClient (void) { }
hzEcode Connect (const hzUrl& url) ; hzEcode Close (void) ;
hzEcode TestPage (hzChain& Z, const hzUrl& url) ; hzEcode GetPage (HttpRC& hRet, const hzUrl& url, const hzString& etag) ; hzEcode PostForm (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& data) ; hzEcode PostAjax (HttpRC& hRet, const hzUrl& url, hzVect<hzString>& hdrs, const hzList<hzPair>& data) ; hzEcode UpgradeWS (HttpRC& hRet, const hzUrl& url) ; } ;
// // See synopsis Web Syncing/Scraping (automated downloading) // //
enum	webcmd
{
	//	Category:	Internet
	//
	//	Commands available for automating the HTTP client. The values are spelt out but are the same as the default numbering.

	WEBCMD_UNDEF		= 0,	//	No step defined
	WEBCMD_LOAD_PAGE	= 1,	//	Load a single page (regardless of history)
	WEBCMD_LOAD_LIST	= 2,	//	Load pages from a list of links
	WEBCMD_SLCT_PAGE	= 3,	//	Select links from a page
	WEBCMD_SLCT_LIST	= 4,	//	Select links from a set of pages (set of links)
	WEBCMD_RGET			= 5,	//	Recursive get from a root page
	WEBCMD_POST			= 6,	//	Post a form
	WEBCMD_RSS			= 7		//	Get an RSS feed
} ;
//	Web syncing operational flags that affect website login
//
//		HZ_WEBSYNC_AUTH_BASIC	A base64 password is passed to the server with each HTTP request
//		HZ_WEBSYNC_AUTH_POST	A form submission of username and password by HTTP POST
//		HZ_WEBSYNC_AUTH_GET		A form submission of username and password by HTTP GET
#define HZ_WEBSYNC_AUTH_BASIC	0x0001
#define HZ_WEBSYNC_AUTH_POST	0x0002
#define HZ_WEBSYNC_AUTH_GET		0x0004

//	Web syncing operational flags that affect timeout handling
//
//		HZ_WEBSYNC_NOWAIT		If this is set, the hzWebhost::Sync function exits on timeout. The calling application then needs to check if the sync
//								operation ran to completion and if not then recall it after a suitable delay.
#define HZ_WEBSYNC_NOWAIT		0x0008

//	Per-page fetch flags
//
//		WEBFLG_FORCE			Fetch the page even if in the history
//		WEBFLG_NOSTORE			Don't record the page content in a file
#define WEBFLG_FORCE			0x01
#define WEBFLG_NOSTORE			0x02
class hzWebCMD { // Category: Internet // // Automatic syncing or downloading from a website is effected by executing a series of steps or commands. Each command is parameterized by a hzWebCMD class instance. // // The hzHttpClient class enables programs to operate as HTTP clients (e.g. webbots). It consists of a hzTcpClient, a set of cookies and a set of values derived from browsing // As a minimum, the hzWebCMD must contain a URL and a HTTP command such as GET. If the required action is to POST a form then the URL and the command must be accompanied by // a series of one or more name-value pairs serving as the form data. Note that the hzWebCMD can also specify recursive downloading.
public: hzList<hzPair> m_Formdata ; // List of name value pairs to submit to the site's login form (given as m_Authpage see below)
hzUrl m_Url ; // Page to fetch hzString m_Crit ; // This concerns the (globing) form the link (URL) must take in order to qualify for download. hzString m_Slct ; // This creates a set of nodes based on tagname and attributes hzString m_Inputs ; // List of pages to fetch (input list) hzString m_Output ; // Name of object, eg form or list of links (if applicable) uint32_t m_Flags ; // Reserved webcmd m_Cmd ; // The command to execute
hzWebCMD (void) { m_Flags = 0 ; } ~hzWebCMD (void) { Clear() ; }
void Clear (void) { m_Formdata.Clear() ;
m_Url = (char*) 0 ; m_Crit = m_Slct = m_Inputs = m_Output = (char*) 0 ; m_Flags = 0 ; m_Cmd = WEBCMD_UNDEF ; }
hzWebCMD& operator= (const hzWebCMD& op) { m_Formdata = op.m_Formdata ;
m_Url = op.m_Url ; m_Crit = op.m_Crit ; m_Slct = op.m_Slct ; m_Inputs = op.m_Inputs ; m_Output = op.m_Output ; m_Flags = op.m_Flags ; m_Cmd = op.m_Cmd ;
return *this ; } } ;
class hzWebhost { // Category: Internet // // The hzWebhost class facilitates automated downloading from the set of documents available at any given domain. This is a parameter driven and generally recursive process. // Starting from a list of one or more 'root' pages such as the home page, pages are downloaded and any links these may contain to other pages are garnered. Then subject to // specified limiting criteria, these latter pages are downloaded. The process terminates when all discovered pages are downloaded. // // By default links are limited to other pages on the same website or on other websites listed as related to this. Other criteria may apply such as date of file and file type // these pages are alse read in. // // Where authentication is required, the authentication sequence is normally by login form submission. The login form will be downloaded from a particular URL, the username // and password filled in and sent back to the URL indicated in the form (this may or may not be the same). NOTE this will not work where anti-robot mechanisms are in place, // such as google recaptcha forms. // // Note also that the sequence of pages to visit may have to include a seemingly pointless visit to a page (normally the home page), purely for the client to be issued with a // cookie in order for the login to be accepted.
struct _nodeList { // List of nodes selected within a downloaded page
hzList<hzHtmElem*> nodes ; // List of nodes
hzString name ; // Name of list uint32_t sofar ; // Count of node fetched
_nodeList (void) { sofar = 0 ; } } ;
struct _pageList { // Config for page to visit
hzList<hzUrl> links ; // List of URLs
hzString name ; // Name of list uint32_t sofar ; // Count of pages fetched
_pageList (void) { sofar = 0 ; } } ;
hzEcode getRss_r (HttpRC& hRet, const hzUrl& feed, uint32_t nLevel) ;
hzEcode _loadstatus (void) ; // Load log of which files have been downloaded hzEcode _savestatus (void) ; // Write out log of which files have been downloaded void _clear (void) ; // Clears history and resets
public: hzMapS<hzString,hzCookie> m_Cookies ; // All cookies needed for the session with server hzMapS<hzUrl,hzDocMeta*> m_mapHist ; // Links to other pages occuring in this page's body hzMapS<hzString,hzHtmForm*> m_Forms ; // Map of forms found in loaded pages. hzMapS<hzString,_nodeList*> m_Nodelists ; // Map of lists of selected nodes hzMapS<hzString,_pageList*> m_Pagelists ; // Map of lists of selected links hzVect<hzDocMeta*> m_vecHist ; // Links to other pages occuring in this page's body hzList<hzWebCMD> m_Commands ; // List of commands to effect a SYNC operation
hzSet<hzUrl> m_Offsite ; // Links discovered that are to pages in other domains or websites hzSet<hzEmaddr> m_Emails ; // Email addresses occuring in this page's body hzSet<hzString> m_Banned ; // Filter for banning visitation. Links meeting this are not visted, stored or processed. hzSet<hzString> m_Domains ; // Allowed domains for the site and it's links hzList<hzUrl> m_Authsteps ; // Initial URL requests that must be made for cookie collecting before the login form can be submitted. hzList<hzPair> m_Authform ; // List of name value pairs to submit to the site's login form (given as m_Authpage see below) hzList<hzPair> m_Roots ; // List of root commands (Webscraping only) hzList<hzUrl> m_Feeds ; // List of root commands (Webscraping only)
hzHttpClient HC ; // HTTP client instance
// XML selectors (for selecting info from RSS feeds) hzXmlSlct m_tagItem ; // For of extraction of a item hzXmlSlct m_tagTitl ; // For of extraction of a title hzXmlSlct m_tagDesc ; // For of extraction of a description hzXmlSlct m_tagUqid ; // For of extraction of a unique item id hzXmlSlct m_tagLink ; // For of extraction of a link hzXmlSlct m_tagDate ; // For of extraction of a date
// Session data hzDocument* m_docHome ; // Home page hzDocument* m_docAuth ; // Login page hzDocument* m_resAuth ; // Login page response hzDocument* m_resLast ; // Last page downloaded hzChain m_Trace ; // List of data items garnered (XML format) hzChain m_Styles ; // Stylesheet hzString m_Username ; // For controlled access to site hzString m_Password ; // For controlled access to site hzString m_AuthBasic ; // If set, this is supplied with each GetPage() call. hzString m_Repos ; // Target directory for download pages. hzString m_CookieSess ; // Session cookie (set when new cookie is offered by server, used in all subsequent requests) hzString m_CookiePath ; // Session cookie (set when new cookie is offered by server, used in all subsequent requests) hzString m_Name ; // Canonical name of site eg 'positive news'
// Common significant addresses (usually set by config file) hzUrl m_Homepage ; // Root URL for site hzUrl m_Authpage ; // Login page for site (if applicable, may be same as home) hzUrl m_Authexit ; // Logout URL hzUrl m_ContactUs ; // Used to post messages to websites
// General uint32_t m_Opflags ; // Operational flags uint32_t m_Sofar ; // Count of Sync commands executed
hzWebhost (void) { m_docHome = 0 ; m_docAuth = 0 ; m_resAuth = 0 ; m_resLast = 0 ; m_Opflags = 0 ; m_Sofar = 0 ; }
~hzWebhost (void) { _clear() ;
if (m_docHome) delete m_docHome ; if (m_docAuth) delete m_docAuth ; if (m_resAuth) delete m_resAuth ; if (m_resLast) delete m_resLast ; }
//hzEcode Init (const hzString& repos, const hzUrl& url, hzAuthmode authmode = HZ_AUTH_NONE) ; hzEcode AuthBasic (const char* username, const char* password) ; hzEcode AddRoot (hzUrl& url, hzString& criteria) ; hzEcode AddRSS (hzUrl& rss) ;
hzEcode AddBan (hzString& pageEnding) { if (!pageEnding) return E_NODATA ; return m_Banned.Insert(pageEnding) ; }
hzDocument* Download (const hzUrl& url) ;
hzEcode Login (void) ; void Logout (void) ; hzEcode Sync (void) ; hzEcode Scrape (void) ; hzEcode GetRSS (void) ; } ;
#endif // hzHttpClient_h