Run the series of hzWebCMD directives to sync key pages from a website to a repository. Arguments: None.
| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::Sync | (void) |
Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp
Function Logic:
Function body:
hzEcode hzWebhost::Sync (void)
{
	// Run the series of hzWebCMD directives to sync key pages from a website to a repository
	//
	// Arguments:	None
	//
	// Returns:	E_NOINIT	If no repository, no domain or no homepage has been specified
	//		E_NOTFOUND	If the login page was not located
	//		E_WRITEFAIL	If the login form received was not written to the repository
	//		E_OPENFAIL	If the visit status file could not be opened
	//		E_OK		If the scrape operation was successful

	_hzfunc("hzWebhost::Sync") ;

	hzMapS <hzUrl,hzDocument*>	cur ;		// Currently loaded documents
	hzMapS <hzString,hzString>	fvals ;		// Form values to be submitted
	hzVect <hzHtmElem*>	elems ;			// Elements selected by the web selector command
	hzList <hzWebCMD>::Iter	ci ;			// Iterator for web commands
	hzList <hzPair>::Iter	pi ;			// Iterator for form data
	hzList <hzUrl>::Iter	si ;			// Iterator for pagelist
	hzList <hzHtmForm*>::Iter	fi ;		// Iterator for forms
	hzSet <hzUrl>		set_ctrl ;		// Initial links from processing config params
	hzVect <hzUrl>		pglinks ;		// Links encountered within a given page
	hzVect <hzUrl>		allinks ;		// Links encountered within all pages
	hzVect <hzString>	hdrs ;			// Extra headers, needed for submit form
	hzList <hzPair>		flist ;			// Filtered list of form values
	ofstream		os ;			// For writing form response
	_pageList*		pgl = 0 ;		// Primary pagelist instance
	_pageList*		pgl2 = 0 ;		// Secondary pagelist instance
	hzWebCMD		wc ;			// Current web command
	hzDocument*		pDoc ;			// Downloaded document
	hzDocHtml*		pHdoc ;			// Set if downloaded document is a HTML page
	hzHtmElem*		pElem ;			// HTML element (tag) lifted from page
	hzHtmForm*		pForm ;			// Form found in page
	hzPair			P ;			// Name value pair
	hzXDate			now ;			// Date/time now (for checking if pages have expired)
	hzAttrset		ai ;			// HTML element attribute iterator
	hzString		anam ;			// Attribute name
	hzString		aval ;			// Attribute value
	hzString		S ;			// Temp string
	hzUrl			url ;			// Temp link
	uint32_t		nStart ;		// Links iterator (start of current pass)
	uint32_t		nLimit ;		// Links iterator (limit of current pass)
	uint32_t		nCount ;		// Links iterator (current position)
	uint32_t		n ;			// Aggregation iterator
	HttpRC			hRet = HTTPMSG_OK ;	// HTML return code
	hzEcode			rc ;			// Return code

	threadLog("Called hzWebhost::Sync\n") ;

	// Check if repository and list of commands is set up
	if (!m_Repos)
		{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
	if (!m_Commands.Count())
		{ threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }

	// Read in any existing manifest file
	rc = _loadstatus() ;
	if (rc != E_OK)
		{ threadLog("Error on loading status - aborting\n") ; return rc ; }

	// If resuming execution, start where we left off (skip the first m_Sofar commands)
	for (n = 0, ci = m_Commands ; n < m_Sofar ; n++, ci++) ;

	// Execute commands in order. Stop on the first error or non-OK HTTP response.
	for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
	{
		pDoc = 0 ;
		wc = ci.Element() ;

		switch	(wc.m_Cmd)
		{
		case WEBCMD_LOAD_PAGE:	// Get a page (no conditions)

			if (!wc.m_Url)
				{ threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }

			threadLog("Doing WEBCMD_LOAD_PAGE\n") ;
			pDoc = Download(wc.m_Url) ;
			if (!pDoc)
				{ threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }

			cur.Insert(wc.m_Url, pDoc) ;

			if (pDoc->Whatami() == DOCTYPE_HTML)
			{
				pHdoc = (hzDocHtml*) pDoc ;

				if (pHdoc->m_Forms.Count())
				{
					// Add the forms to the m_Forms map in the hzWebhost instance
					for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
					{
						pForm = fi.Element() ;
						m_Forms.Insert(pForm->name, pForm) ;
					}
				}
			}
			break ;

		case WEBCMD_LOAD_LIST:	// Get a list of pages (list supplied in command)

			threadLog("Doing WEBCMD_LOAD_LIST\n") ;
			if (!wc.m_Inputs)
				{ threadLog(" - Invalid loadList command - no list of links named\n") ; rc = E_NOTFOUND ; break ; }

			if (!m_Pagelists.Exists(wc.m_Inputs))
				{ threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc = E_NOTFOUND ; break ; }

			pgl = m_Pagelists[wc.m_Inputs] ;
			for (si = pgl->links ; si.Valid() ; si++)
			{
				url = si.Element() ;

				pDoc = Download(url) ;
				if (!pDoc)
					{ threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
				else
					threadLog(" - Fetched page %s\n", *url) ;
			}
			threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
			break ;

		case WEBCMD_SLCT_PAGE:	// Select links from a page

			threadLog("Doing WEBCMD_SLCT_PAGE\n") ;

			//	Exactly one source (URL or input list), an output name and at least one selection criterion are required
			if (wc.m_Url && wc.m_Inputs)	{ rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
			if (!wc.m_Url && !wc.m_Inputs)	{ rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
			if (!wc.m_Output)		{ rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
			if (!wc.m_Slct && !wc.m_Crit)	{ rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globing criteria\n") ; }

			if (rc != E_OK)
				break ;

			//	Use the cached document if we already have it, otherwise download
			if (cur.Exists(wc.m_Url))
				pDoc = cur[wc.m_Url] ;
			else
				pDoc = Download(wc.m_Url) ;
			if (!pDoc)
				{ rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }

			pgl = new _pageList() ;
			pgl->name = wc.m_Output ;

			if (pDoc->Whatami() != DOCTYPE_HTML)
				threadLog("Not a HTML document\n") ;
			else
			{
				pHdoc = (hzDocHtml*) pDoc ;

				//	Diagnostic dump of every tag in the page
				for (n = 0 ; n < pHdoc->m_vecTags.Count() ; n++)
				{
					pElem = pHdoc->m_vecTags[n] ;

					threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
					for (ai = pElem ; ai.Valid() ; ai.Advance())
					{
						threadLog(" %s=%s", ai.Name(), ai.Value()) ;
					}
					threadLog(" />\n") ;
				}

				//	Harvest href values from the elements matching the selector
				rc = pHdoc->FindElements(elems, wc.m_Slct) ;

				for (n = 0 ; n < elems.Count() ; n++)
				{
					pElem = elems[n] ;

					//	FIX: original format string had two %s conversions but only one argument (undefined behavior)
					threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
					for (ai = pElem ; ai.Valid() ; ai.Advance())
					{
						anam = ai.Name() ; aval = ai.Value() ;
						threadLog(" %s=%s", *anam, *aval) ;

						if (anam == "href")
						{
							url = aval ;
							pgl->links.Add(url) ;
						}
					}
					threadLog(" />\n") ;
				}
			}

			threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
			m_Pagelists.Insert(pgl->name, pgl) ;
			break ;

		case WEBCMD_SLCT_LIST:	// Select links from a set of pages (supplied as a set of links)

			threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ;
			if (!wc.m_Inputs)
				{ threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }

			if (!wc.m_Output)
				{ rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
			if (!wc.m_Slct && !wc.m_Crit)
				{ rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; }

			if (rc != E_OK)
				break ;

			//	FIX: look up the source pagelist before allocating the output list. The original allocated
			//	pgl2 first and leaked it when the lookup failed and the case broke out.
			pgl = m_Pagelists[wc.m_Inputs] ;
			if (!pgl)
				{ rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }

			pgl2 = new _pageList() ;
			pgl2->name = wc.m_Output ;

			for (si = pgl->links ; si.Valid() ; si++)
			{
				url = si.Element() ;

				if (cur.Exists(url))
					pDoc = cur[url] ;
				else
					pDoc = Download(url) ;
				if (!pDoc)
					{ rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }

				if (pDoc->Whatami() == DOCTYPE_HTML)
				{
					pHdoc = (hzDocHtml*) pDoc ;
					rc = pHdoc->FindElements(elems, wc.m_Slct) ;

					for (n = 0 ; n < elems.Count() ; n++)
					{
						pElem = elems[n] ;

						//	FIX: original format string had two %s conversions but only one argument (undefined behavior)
						threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
						for (ai = pElem ; ai.Valid() ; ai.Advance())
						{
							anam = ai.Name() ; aval = ai.Value() ;
							threadLog(" %s=%s", *anam, *aval) ;

							if (anam == "href")
							{
								url = aval ;
								pgl2->links.Add(url) ;
							}
						}
						threadLog(" />\n") ;
					}
				}
			}

			threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
			m_Pagelists.Insert(pgl2->name, pgl2) ;
			break ;

		case WEBCMD_RGET:	// Get a root page

			threadLog("Doing WEBCMD_RGET\n") ;
			threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;

			// Get root page first
			pDoc = Download(wc.m_Url) ;
			if (!pDoc)
				threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
			else
			{
				if (pDoc->Whatami() != DOCTYPE_HTML)
					threadLog("Page %s not HTML\n", *wc.m_Url) ;
				else
				{
					pHdoc = (hzDocHtml*) pDoc ;
					pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
				}
				delete pDoc ;
			}

			// Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
			for (n = 0 ; n < pglinks.Count() ; n++)
			{
				url = pglinks[n] ;
				if (!set_ctrl.Exists(url))
					allinks.Add(url) ;
			}

			// Starting at the site root and for each page, grab all links and go to each link in turn
			threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;

			for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
			{
				now.SysDateTime() ;
				pglinks.Clear() ;

				for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
				{
					url = allinks[nCount] ;

					//	FIX: corrected misspelled log message ("Cosidering")
					threadLog("Considering link %s - ", *url.Whole()) ;
					if (m_mapHist.Exists(url))	{ threadLog("historic\n") ; continue ; }
					if (url == m_Authexit)		{ threadLog("exit-page\n") ; continue ; }

					if (!m_Domains.Exists(url.Domain()))	{ threadLog("URL %s outside domain\n", *url) ; continue ; }

					// Page not yet visited so we visit it, put it in list of pages visited and get the links. Some of these links may add to
					// the list of links.
					threadLog("Fetching\n") ;
					pDoc = Download(url) ;
					if (!pDoc)
						threadLog("case 2. Could not fetch page %s\n", *url) ;
					else
					{
						if (pDoc->Whatami() == DOCTYPE_HTML)
						{
							pHdoc = (hzDocHtml*) pDoc ;
							pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;

							// Re-aggregate the all-links vector
							for (n = 0 ; n < pglinks.Count() ; n++)
							{
								url = pglinks[n] ;
								if (!set_ctrl.Exists(url))
									allinks.Add(url) ;
							}
						}
						delete pDoc ;
					}
				}
			}
			break ;

		case WEBCMD_POST:	// Post a form. The form should have been previously downloaded and will be looked for by name

			threadLog("Doing WEBCMD_POST\n") ;
			pForm = m_Forms[wc.m_Output] ;
			if (!pForm)
				threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;

			// Take the command's formdata and use it to populate the form's set of fields
			/*
			**	for (pi = pForm->fields ; pi.Valid() ; pi++)
			**		{ P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
			**	for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
			**		{ P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
			**
			**	for (n = 0 ; n < fvals.Count() ; n++)
			**	{
			**		P.name = fvals.GetKey(n) ;
			**		P.value = fvals.GetObj(n) ;
			**		flist.Add(P) ;
			**	}
			*/

			rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
			if (rc != E_OK)
				{ threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
			if (hRet != HTTPMSG_OK)
			{
				// NOTE(review): rc is still E_OK at this point so a bad HTTP response returns success — confirm intended error code
				threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ;
			}

			// Write out the login response
			if (m_Repos)
			{
				url = wc.m_Url ;
				S = m_Repos + "/" + url.Filename() ;
				S += ".response" ;

				os.open(*S) ;
				if (os.fail())
					{ threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
				os << HC.m_Header ;
				os << "\r\n\r\n" ;
				os << HC.m_Content ;
				os.close() ;
			}
			break ;

		case WEBCMD_RSS:	// Get an RSS feed

			threadLog("Doing WEBCMD_RSS\n") ;

			// If XML selectors for RSS feed are not initialized, set them here
			if (!m_tagItem.m_Slct)	{ m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
			if (!m_tagUqid.m_Slct)	{ m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
			if (!m_tagLink.m_Slct)	{ m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
			if (!m_tagDesc.m_Slct)	{ m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
			if (!m_tagDate.m_Slct)	{ m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }

			// Get the feed
			rc = getRss_r(hRet, wc.m_Url, 0) ;
			threadLog("Processed items\n") ;
			break ;
		}
	}

	// Write out manifest file
	rc = _savestatus() ;

	// Clear documents
	// NOTE(review): these loops delete the pointed-to objects but do not clear m_Pagelists/cur, leaving
	// dangling pointers in the m_Pagelists member after Sync returns — confirm the maps are cleared elsewhere
	for (n = 0 ; n < m_Pagelists.Count() ; n++)
	{
		pgl = m_Pagelists.GetObj(n) ;
		delete pgl ;
	}

	for (n = 0 ; n < cur.Count() ; n++)
	{
		pDoc = cur.GetObj(n) ;
		delete pDoc ;
	}

	return rc ;
}