Return Type | Function Name | Arguments |
---|---|---|
hzEcode | hzWebhost::Sync | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
Function Logic:
Function body:
hzEcode hzWebhost::Sync (void) { _hzfunc("hzWebhost::Sync") ; hzMapS <hzUrl,hzDocument*> cur ; hzMapS <hzString,hzString> fvals ; hzVect <hzHtmElem*> elems ; hzList <hzWebCMD>::Iter ci ; hzList <hzPair>::Iter pi ; hzList <hzUrl>::Iter si ; hzList <hzHtmForm*>::Iter fi ; hzSet <hzUrl> set_ctrl ; hzVect <hzUrl> pglinks ; hzVect <hzUrl> allinks ; hzVect <hzString> hdrs ; hzList <hzPair> flist ; ofstream os ; _pageList* pgl = 0; _pageList* pgl2 = 0; hzWebCMD wc ; hzDocument* pDoc ; hzDocHtml* pHdoc ; hzHtmElem* pElem ; hzHtmForm* pForm ; hzPair P ; hzXDate now ; hzAttrset ai ; hzString anam ; hzString aval ; hzString S ; hzUrl url ; uint32_t nStart ; uint32_t nLimit ; uint32_t nCount ; uint32_t n ; HttpRC hRet = HTTPMSG_OK ; hzEcode rc ; threadLog("Called hzWebhost::Sync\n") ; if (!m_Repos) { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; } if (!m_Commands.Count()) { threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; } rc = _loadstatus() ; if (rc != E_OK) { threadLog("Error on loading status - aborting\n") ; return rc ; } for (n = 0,ci = m_Commands ; n < m_Sofar ; n++, ci++) ; for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++) { pDoc = 0; wc = ci.Element() ; switch (wc.m_Cmd) { case WEBCMD_LOAD_PAGE: // Get a page (no conditions) if (!wc.m_Url) { threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; } threadLog("Doing WEBCMD_LOAD_PAGE\n") ; pDoc = Download(wc.m_Url) ; if (!pDoc) { threadLog("case 1. 
Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; } cur.Insert(wc.m_Url, pDoc) ; if (pDoc->Whatami() == DOCTYPE_HTML) { pHdoc = (hzDocHtml*) pDoc ; if (pHdoc->m_Forms.Count()) { for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++) { pForm = fi.Element() ; m_Forms.Insert(pForm->name, pForm) ; } } } break ; case WEBCMD_LOAD_LIST: // Get a list of pages (list supplied in command) threadLog("Doing WEBCMD_LOAD_LIST\n") ; if (!wc.m_Inputs) { threadLog(" - Invalid loadList command - no list of links named\n") ; rc = E_NOTFOUND ; break ; } if (!m_Pagelists.Exists(wc.m_Inputs)) { threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc = E_NOTFOUND ; break ; } pgl = m_Pagelists[wc.m_Inputs] ; for (si = pgl->links ; si.Valid() ; si++) { url = si.Element() ; pDoc = Download(url) ; if (!pDoc) { threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; } else threadLog(" - Fetched page %s\n", *url) ; } threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ; break ; case WEBCMD_SLCT_PAGE: // Select links from a page threadLog("Doing WEBCMD_SLCT_PAGE\n") ; if (wc.m_Url && wc.m_Inputs) { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; } if (!wc.m_Url && !wc.m_Inputs) { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; } if (!wc.m_Output) { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; } if (!wc.m_Slct && !wc.m_Crit) { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globing criteria\n") ; } if (rc != E_OK) break ; if (cur.Exists(wc.m_Url)) pDoc = cur[wc.m_Url] ; else pDoc = Download(wc.m_Url) ; if (!pDoc) { rc = E_NOTFOUND ; threadLog("case 2. 
Could not fetch page %s\n", *wc.m_Url) ; break ; } pgl = new _pageList() ; pgl->name = wc.m_Output ; if (pDoc->Whatami() != DOCTYPE_HTML) threadLog("Not a HTML document\n") ; else { pHdoc = (hzDocHtml*) pDoc ; for (n = 0; n < pHdoc->m_vecTags.Count() ; n++) { pElem = pHdoc->m_vecTags[n] ; threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ; for (ai = pElem ; ai.Valid() ; ai.Advance()) { threadLog(" %s=%s", ai.Name(), ai.Value()) ; } threadLog(" />\n") ; } rc = pHdoc->FindElements(elems, wc.m_Slct) ; for (n = 0; n < elems.Count() ; n++) { pElem = elems[n] ; threadLog("%s. GOT <%s ", *pElem->Name()) ; for (ai = pElem ; ai.Valid() ; ai.Advance()) { anam = ai.Name() ; aval = ai.Value() ; threadLog(" %s=%s", *anam, *aval) ; if (anam == "href") { url = aval ; pgl->links.Add(url) ; } } threadLog(" />\n") ; } } threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ; m_Pagelists.Insert(pgl->name, pgl) ; break ; case WEBCMD_SLCT_LIST: // Select links from a set of pages (supplied as a set of links) threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ; if (!wc.m_Inputs) { threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; } if (!wc.m_Output) { rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; } if (!wc.m_Slct && !wc.m_Crit) { rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; } if (rc != E_OK) break ; pgl2 = new _pageList() ; pgl2->name = wc.m_Output ; pgl = m_Pagelists[wc.m_Inputs] ; if (!pgl) { rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; } for (si = pgl->links ; si.Valid() ; si++) { url = si.Element() ; if (cur.Exists(url)) pDoc = cur[url] ; else pDoc = Download(url) ; if (!pDoc) { rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; } if (pDoc->Whatami() == DOCTYPE_HTML) { pHdoc = (hzDocHtml*) pDoc ; rc = pHdoc->FindElements(elems, wc.m_Slct) ; for (n = 0; n < 
elems.Count() ; n++) { pElem = elems[n] ; threadLog("%s. GOT <%s ", *pElem->Name()) ; for (ai = pElem ; ai.Valid() ; ai.Advance()) { anam = ai.Name() ; aval = ai.Value() ; threadLog(" %s=%s", *anam, *aval) ; if (anam == "href") { url = aval ; pgl2->links.Add(url) ; } } threadLog(" />\n") ; } } } threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ; m_Pagelists.Insert(pgl2->name, pgl2) ; break ; case WEBCMD_RGET: // Get a root page threadLog("Doing WEBCMD_RGET\n") ; threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ; pDoc = Download(wc.m_Url) ; if (!pDoc) threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ; else { if (pDoc->Whatami() != DOCTYPE_HTML) threadLog("Page %s not HTML\n", *wc.m_Url) ; else { pHdoc = (hzDocHtml*) pDoc ; pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ; } delete pDoc ; } for (n = 0; n < pglinks.Count() ; n++) { url = pglinks[n] ; if (!set_ctrl.Exists(url)) allinks.Add(url) ; } threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ; for (nStart = 0; nStart < allinks.Count() ; nStart = nCount) { now.SysDateTime() ; pglinks.Clear() ; for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++) { url = allinks[nCount] ; threadLog("Cosidering link %s - ", *url.Whole()) ; if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; } if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; } if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; } threadLog("Fetching\n") ; pDoc = Download(url) ; if (!pDoc) threadLog("case 2. Could not fetch page %s\n", *url) ; else { if (pDoc->Whatami() == DOCTYPE_HTML) { pHdoc = (hzDocHtml*) pDoc ; pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ; for (n = 0; n < pglinks.Count() ; n++) { url = pglinks[n] ; if (!set_ctrl.Exists(url)) allinks.Add(url) ; } } delete pDoc ; } } } break ; case WEBCMD_POST: // Post a form. 
The form should have been previously downloaded and will be looked for by name threadLog("Doing WEBCMD_POST\n") ; pForm = m_Forms[wc.m_Output] ; if (!pForm) threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ; /* ** for (pi = pForm->fields ; pi.Valid() ; pi++) ** { P = pi.Element() ; fvals.Insert(P.name, P.value) ; } ** for (pi = wc.m_Formdata ; pi.Valid() ; pi++) ** { P = pi.Element() ; fvals.Insert(P.name, P.value) ; } ** ** for (n = 0 ; n < fvals.Count() ; n++) ** { ** P.name = fvals.GetKey(n) ; ** P.value = fvals.GetObj(n) ; ** flist.Add(P) ; ** } ** */ rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ; if (rc != E_OK) { threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; } if (hRet != HTTPMSG_OK) { threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ; } if (m_Repos) { url = wc.m_Url ; S = m_Repos + "/" + url.Filename() ; S += ".response" ; os.open(*S) ; if (os.fail()) { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; } os << HC.m_Header ; os << "\r\n\r\n" ; os << HC.m_Content ; os.close() ; } break ; case WEBCMD_RSS: // Get an RSS feed threadLog("Doing WEBCMD_RSS\n") ; if (!m_tagItem.m_Slct) { m_tagItem.m_Filt = (char*) 0; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; } if (!m_tagUqid.m_Slct) { m_tagUqid.m_Filt = (char*) 0; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; } if (!m_tagLink.m_Slct) { m_tagLink.m_Filt = (char*) 0; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; } if (!m_tagDesc.m_Slct) { m_tagDesc.m_Filt = (char*) 0; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; } if (!m_tagDate.m_Slct) { m_tagDate.m_Filt = (char*) 0; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; } rc = getRss_r(hRet, wc.m_Url, 0); threadLog("Processed items\n") ; break ; } } rc = _savestatus() ; for (n = 0; n < m_Pagelists.Count() ; n++) { pgl = m_Pagelists.GetObj(n) ; delete pgl ; } for (n = 0; n < cur.Count() ; n++) { pDoc = cur.GetObj(n) ; delete 
pDoc ; } return rc ; }