| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::Sync | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
Function Logic: Validates that the webhost has a repository and a command list, loads the saved status, then iterates over the command list, resuming after any commands completed in a previous run. Each command is dispatched on its type: WEBCMD_LOAD_PAGE fetches a single page, WEBCMD_LOAD_LIST fetches a named list of pages, WEBCMD_SLCT_PAGE and WEBCMD_SLCT_LIST harvest links from a page or a list of pages into a named page list, WEBCMD_RGET recursively fetches in-domain pages from a root page, WEBCMD_POST submits a previously downloaded form, and WEBCMD_RSS processes an RSS feed. On completion the status is saved and all page lists and cached documents are freed.
Function body:
```cpp
hzEcode hzWebhost::Sync (void)
{
    _hzfunc("hzWebhost::Sync") ;

    hzMapS <hzUrl,hzDocument*> cur ;
    hzMapS <hzString,hzString> fvals ;
    hzVect <hzHtmElem*> elems ;
    hzList <hzWebCMD>::Iter ci ;
    hzList <hzPair>::Iter pi ;
    hzList <hzUrl>::Iter si ;
    hzList <hzHtmForm*>::Iter fi ;
    hzSet <hzUrl> set_ctrl ;
    hzVect <hzUrl> pglinks ;
    hzVect <hzUrl> allinks ;
    hzVect <hzString> hdrs ;
    hzList <hzPair> flist ;

    ofstream os ;
    _pageList* pgl = 0 ;
    _pageList* pgl2 = 0 ;
    hzWebCMD wc ;
    hzDocument* pDoc ;
    hzDocHtml* pHdoc ;
    hzHtmElem* pElem ;
    hzHtmForm* pForm ;
    hzPair P ;
    hzXDate now ;
    hzAttrset ai ;
    hzString anam ;
    hzString aval ;
    hzString S ;
    hzUrl url ;
    uint32_t nStart ;
    uint32_t nLimit ;
    uint32_t nCount ;
    uint32_t n ;
    HttpRC hRet = HTTPMSG_OK ;
    hzEcode rc ;
threadLog("Called hzWebhost::Sync\n") ;
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
if (!m_Commands.Count())
{ threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }
rc = _loadstatus() ;
if (rc != E_OK)
{ threadLog("Error on loading status - aborting\n") ; return rc ; }
    for (n = 0, ci = m_Commands ; n < m_Sofar ; n++, ci++) ;

    for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
    {
        pDoc = 0 ;
        wc = ci.Element() ;

        switch (wc.m_Cmd)
        {
        case WEBCMD_LOAD_PAGE:  // Get a page (no conditions)
            if (!wc.m_Url)
                { threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }
            threadLog("Doing WEBCMD_LOAD_PAGE\n") ;

            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }
            cur.Insert(wc.m_Url, pDoc) ;

            if (pDoc->Whatami() == DOCTYPE_HTML)
            {
                pHdoc = (hzDocHtml*) pDoc ;
                if (pHdoc->m_Forms.Count())
                {
                    for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
                    {
                        pForm = fi.Element() ;
                        m_Forms.Insert(pForm->name, pForm) ;
                    }
                }
            }
            break ;
        case WEBCMD_LOAD_LIST:  // Get a list of pages (list supplied in command)
            threadLog("Doing WEBCMD_LOAD_LIST\n") ;
            if (!wc.m_Inputs)
                { threadLog(" - Invalid loadList command - no list of links named\n") ; rc = E_NOTFOUND ; break ; }
            if (!m_Pagelists.Exists(wc.m_Inputs))
                { threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc = E_NOTFOUND ; break ; }

            pgl = m_Pagelists[wc.m_Inputs] ;
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                pDoc = Download(url) ;
                if (!pDoc)
                    { threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
                else
                    threadLog(" - Fetched page %s\n", *url) ;
            }
            threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
            break ;
        case WEBCMD_SLCT_PAGE:  // Select links from a page
            threadLog("Doing WEBCMD_SLCT_PAGE\n") ;
            if (wc.m_Url && wc.m_Inputs)    { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
            if (!wc.m_Url && !wc.m_Inputs)  { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
            if (!wc.m_Output)               { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)   { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globbing criteria\n") ; }
            if (rc != E_OK)
                break ;

            if (cur.Exists(wc.m_Url))
                pDoc = cur[wc.m_Url] ;
            else
                pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }

            pgl = new _pageList() ;
            pgl->name = wc.m_Output ;

            if (pDoc->Whatami() != DOCTYPE_HTML)
                threadLog("Not an HTML document\n") ;
            else
            {
                pHdoc = (hzDocHtml*) pDoc ;

                for (n = 0 ; n < pHdoc->m_vecTags.Count() ; n++)
                {
                    pElem = pHdoc->m_vecTags[n] ;
                    threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        threadLog(" %s=%s", ai.Name(), ai.Value()) ;
                    }
                    threadLog(" />\n") ;
                }

                rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                for (n = 0 ; n < elems.Count() ; n++)
                {
                    pElem = elems[n] ;
                    threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        threadLog(" %s=%s", *anam, *aval) ;
                        if (anam == "href")
                        {
                            url = aval ;
                            pgl->links.Add(url) ;
                        }
                    }
                    threadLog(" />\n") ;
                }
            }

            threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
            m_Pagelists.Insert(pgl->name, pgl) ;
            break ;
        case WEBCMD_SLCT_LIST:  // Select links from a set of pages (supplied as a set of links)
            threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Inputs) ;
            if (!wc.m_Inputs)
                { threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }
            if (!wc.m_Output)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globbing criteria\n") ; }
            if (rc != E_OK)
                break ;

            pgl2 = new _pageList() ;
            pgl2->name = wc.m_Output ;

            pgl = m_Pagelists[wc.m_Inputs] ;
            if (!pgl)
                { rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }

            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                if (cur.Exists(url))
                    pDoc = cur[url] ;
                else
                    pDoc = Download(url) ;
                if (!pDoc)
                    { rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }

                if (pDoc->Whatami() == DOCTYPE_HTML)
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                    for (n = 0 ; n < elems.Count() ; n++)
                    {
                        pElem = elems[n] ;
                        threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                        for (ai = pElem ; ai.Valid() ; ai.Advance())
                        {
                            anam = ai.Name() ; aval = ai.Value() ;
                            threadLog(" %s=%s", *anam, *aval) ;
                            if (anam == "href")
                            {
                                url = aval ;
                                pgl2->links.Add(url) ;
                            }
                        }
                        threadLog(" />\n") ;
                    }
                }
            }

            threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
            m_Pagelists.Insert(pgl2->name, pgl2) ;
            break ;
        case WEBCMD_RGET:   // Get a root page
            threadLog("Doing WEBCMD_RGET\n") ;
            threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;

            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *wc.m_Url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                }
                delete pDoc ;
            }

            for (n = 0 ; n < pglinks.Count() ; n++)
            {
                url = pglinks[n] ;
                if (!set_ctrl.Exists(url))
                    allinks.Add(url) ;
            }
            threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
            for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
            {
                now.SysDateTime() ;
                pglinks.Clear() ;

                for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
                {
                    url = allinks[nCount] ;
                    threadLog("Considering link %s - ", *url.Whole()) ;
                    if (m_mapHist.Exists(url))  { threadLog("historic\n") ; continue ; }
                    if (url == m_Authexit)      { threadLog("exit-page\n") ; continue ; }
                    if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }

                    threadLog("Fetching\n") ;
                    pDoc = Download(url) ;
                    if (!pDoc)
                        threadLog("case 2. Could not fetch page %s\n", *url) ;
                    else
                    {
                        if (pDoc->Whatami() == DOCTYPE_HTML)
                        {
                            pHdoc = (hzDocHtml*) pDoc ;
                            pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                            for (n = 0 ; n < pglinks.Count() ; n++)
                            {
                                url = pglinks[n] ;
                                if (!set_ctrl.Exists(url))
                                    allinks.Add(url) ;
                            }
                        }
                        delete pDoc ;
                    }
                }
            }
            break ;
        case WEBCMD_POST:   // Post a form. The form should have been previously downloaded and will be looked for by name
            threadLog("Doing WEBCMD_POST\n") ;
            pForm = m_Forms[wc.m_Output] ;
            if (!pForm)
                threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;
            /*
            **  for (pi = pForm->fields ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **  for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **
            **  for (n = 0 ; n < fvals.Count() ; n++)
            **  {
            **      P.name = fvals.GetKey(n) ;
            **      P.value = fvals.GetObj(n) ;
            **      flist.Add(P) ;
            **  }
            */

            rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
            if (rc != E_OK)
                { threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
            if (hRet != HTTPMSG_OK)
                { threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ; }

            if (m_Repos)
            {
                url = wc.m_Url ;
                S = m_Repos + "/" + url.Filename() ;
                S += ".response" ;

                os.open(*S) ;
                if (os.fail())
                    { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
                os << HC.m_Header ;
                os << "\r\n\r\n" ;
                os << HC.m_Content ;
                os.close() ;
            }
            break ;
        case WEBCMD_RSS:    // Get an RSS feed
            threadLog("Doing WEBCMD_RSS\n") ;
            if (!m_tagItem.m_Slct)  { m_tagItem.m_Filt = (char*) 0 ; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
            if (!m_tagUqid.m_Slct)  { m_tagUqid.m_Filt = (char*) 0 ; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
            if (!m_tagLink.m_Slct)  { m_tagLink.m_Filt = (char*) 0 ; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
            if (!m_tagDesc.m_Slct)  { m_tagDesc.m_Filt = (char*) 0 ; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
            if (!m_tagDate.m_Slct)  { m_tagDate.m_Filt = (char*) 0 ; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }

            rc = getRss_r(hRet, wc.m_Url, 0) ;
            threadLog("Processed items\n") ;
            break ;
        }
    }

    rc = _savestatus() ;
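    //  Release all page lists and cached documents before returning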
    for (n = 0 ; n < m_Pagelists.Count() ; n++)
    {
        pgl = m_Pagelists.GetObj(n) ;
        delete pgl ;
    }
    for (n = 0 ; n < cur.Count() ; n++)
    {
        pDoc = cur.GetObj(n) ;
        delete pDoc ;
    }

    return rc ;
}
```
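The sketch below shows how Sync might be invoked. It is a minimal example, not taken from the library: the wrapper function runSync is hypothetical, and it assumes the host has already been given a repository and a command list by the class's setup interface (not shown in this section), since the guards above return E_NOINIT otherwise.

```cpp
//  Minimal calling sketch (hypothetical wrapper, not part of the library).
//  Assumes 'host' was configured elsewhere with a repository and at least
//  one web command - Sync() returns E_NOINIT otherwise.
hzEcode runSync (hzWebhost& host)
{
    hzEcode rc ;

    rc = host.Sync() ;
    if (rc != E_OK)
        threadLog("Sync failed (error %d)\n", (int) rc) ;
    return rc ;
}
```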