Return Type | Function Name | Arguments |
---|---|---|
hzEcode | hzWebhost::Scrape | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
Function Logic:
Validate that a repository and at least one root command have been configured, download the homepage if one is set, then log in. Stage one iterates the root commands, extracting links matching each command's criteria from the homepage, the login response, or a downloaded page, and queues any link not yet seen. Stage two repeatedly sweeps the queued links, skipping historic, logout, and off-domain URLs, downloading each remaining page and appending its links to the queue; the sweep repeats until no new links are discovered. The crawl status is saved before returning.
Function body:
```cpp
hzEcode hzWebhost::Scrape (void)
{
	_hzfunc("hzWebhost::Scrape") ;

	hzMapS<hzString,hzString>	formData ;
	hzVect<hzString>	hdrs ;
	hzList<hzPair>::Iter	ci ;
	hzSet<hzUrl>	set_ctrl ;
	hzVect<hzUrl>	pglinks ;
	hzVect<hzUrl>	allinks ;
	hzVect<hzUrl>	todo ;
	ifstream	is ;
	ofstream	os ;
	hzDocMeta	mark ;
	hzChain		Response ;
	hzDocument*	pDoc ;
	hzDocHtml*	pHdoc ;
	hzPair		X ;
	hzXDate		now ;
	hzUrl		url ;
	hzString	vs_fname ;
	hzString	pagepath ;
	hzString	S ;
	hzString	etag ;
	uint32_t	nStart ;
	uint32_t	nLimit ;
	uint32_t	nCount ;
	uint32_t	n ;
	hzEcode		rc = E_OK ;

	threadLog("Called hzWebhost::Scrape\n") ;

	//	Preconditions: a repository and at least one root command must be configured
	if (!m_Repos)
		{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }

	if (!m_Roots.Count())
		{ threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }

	//	Download the homepage if one is configured
	if (m_Homepage)
	{
		pDoc = Download(m_Homepage) ;
		if (!pDoc)
			{ threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }

		m_docHome = pDoc ;
		threadLog("HOMEPAGE SUCCESS\n") ;
	}

	//	Authenticate before crawling
	rc = Login() ;
	if (rc != E_OK)
		{ threadLog("Login failed\n") ; return rc ; }
	threadLog("Login SUCCESS\n") ;

	//	STAGE ONE: Process the root commands, extracting links from the homepage,
	//	the login response, or a freshly downloaded page as directed
	threadLog("Have %d root commands\n", m_Roots.Count()) ;

	for (ci = m_Roots ; ci.Valid() ; ci++)
	{
		X = ci.Element() ;
		threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;

		if (X.name == "homepage")
		{
			pHdoc = (hzDocHtml*) m_docHome ;
			pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
		}
		else if (X.name == "loginResponse")
		{
			pHdoc = (hzDocHtml*) m_resAuth ;
			pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
		}
		else
		{
			url = X.name ;
			if (!url)
				{ threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
			etag = (char*) 0 ;

			pDoc = Download(url) ;
			if (!pDoc)
				threadLog("case 1. Could not fetch page %s\n", *url) ;
			else
			{
				if (pDoc->Whatami() != DOCTYPE_HTML)
					threadLog("Page %s not HTML\n", *url) ;
				else
				{
					pHdoc = (hzDocHtml*) pDoc ;
					pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
					threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
				}
				delete pDoc ;
			}
		}

		//	Queue extracted links (set_ctrl is only populated by the disabled
		//	block at the end, so this test currently always passes)
		for (n = 0 ; n < pglinks.Count() ; n++)
		{
			url = pglinks[n] ;
			if (!set_ctrl.Exists(url))
				allinks.Add(url) ;
		}
	}

	/*
	**	STAGE TWO: Starting at the site root and for each page, grab all links and go to each link in turn
	*/

	threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;

	for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
	{
		now.SysDateTime() ;
		todo.Clear() ;

		for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
		{
			url = allinks[nCount] ;
			threadLog("Considering link %s - ", *url.Whole()) ;

			//	Skip historic links, the logout page, and off-domain URLs
			if (m_mapHist.Exists(url))
				{ threadLog("historic\n") ; continue ; }
			if (url == m_Authexit)
				{ threadLog("exit-page\n") ; continue ; }
			if (!m_Domains.Exists(url.Domain()))
				{ threadLog("URL %s outside domain\n", *url) ; continue ; }

			threadLog("Fetching\n") ;
			pDoc = Download(url) ;
			threadLog("Fetched page %p\n", pDoc) ;

			if (!pDoc)
				threadLog("case 2. Could not fetch page %s\n", *url) ;
			else
			{
				if (pDoc->Whatami() == DOCTYPE_HTML)
				{
					//	Note: X.value still holds the criteria of the last root command processed above
					pHdoc = (hzDocHtml*) pDoc ;
					pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;

					//	Append newly found links so the outer loop sweeps them on the next pass
					for (n = 0 ; n < pglinks.Count() ; n++)
					{
						url = pglinks[n] ;
						if (!set_ctrl.Exists(url))
							allinks.Add(url) ;
					}
				}
				delete pDoc ;
			}
		}

		/*
		**	for (nAdded = nX = 0 ; nX < todo.Count() ; nX++)
		**	{
		**		//url = todo.GetObj(nX) ;
		**		url = todo[nX] ;	//.GetObj(nX) ;
		**
		**		if (set_ctrl.Exists(url))
		**			continue ;
		**		nAdded++ ;
		**		set_ctrl.Insert(url) ;
		**	}
		**
		**	todo.Clear() ;
		**
		**	if (!nAdded)
		**		break ;
		*/
	}

	rc = _savestatus() ;
	return rc ;
}
```
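The heart of stage two is a frontier-growth loop: the inner pass snapshots the link count in nLimit, appends any newly discovered links beyond that snapshot, and the outer loop resumes from nStart = nCount until a pass adds nothing new. Below is a minimal standalone sketch of the same pattern using standard containers; fetchLinks and the example URLs are illustrative assumptions standing in for Download() and ExtractLinksBasic(), not HadronZoo API.

```cpp
#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <vector>

//	Stub standing in for Download() + ExtractLinksBasic(): returns the links
//	found on a page. Purely illustrative data.
static std::vector<std::string> fetchLinks(const std::string& url)
{
	if (url == "http://example.com/")
		return { "http://example.com/a", "http://example.com/b" } ;
	if (url == "http://example.com/a")
		return { "http://example.com/b", "http://example.com/c" } ;
	return { } ;
}

int main()
{
	std::vector<std::string> allinks = { "http://example.com/" } ;	// seed link, as the root commands provide
	std::set<std::string>    history ;	// plays the role of m_mapHist

	//	Outer loop resumes at the first unprocessed index and stops once an
	//	inner pass appends no links beyond the snapshot taken in 'limit'.
	for (std::size_t start = 0, count = 0 ; start < allinks.size() ; start = count)
	{
		const std::size_t limit = allinks.size() ;	// snapshot; appends land past it
		for (count = start ; count < limit ; count++)
		{
			const std::string page = allinks[count] ;
			if (!history.insert(page).second)
				continue ;	// already visited ("historic")
			for (const std::string& link : fetchLinks(page))
				allinks.push_back(link) ;	// grow the frontier for the next pass
		}
	}

	std::cout << "Visited " << history.size() << " pages\n" ;	// prints 4 with the stub data
	return 0 ;
}
```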
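At the call site, only the return codes visible in the body above need handling. A short caller-side sketch follows, assuming the hzWebhost instance has already been configured elsewhere (the setup API is outside this page's scope); threadLog is used here as it is in the body above.

```cpp
#include "hzHttpClient.h"

//	'host' is assumed to be configured (repository, root commands, domains)
//	before this call; those setters are not documented on this page.
hzEcode runScrape(hzWebhost& host)
{
	hzEcode rc = host.Scrape() ;

	if (rc == E_NOINIT)
		threadLog("Scrape aborted: repository, roots or homepage unavailable\n") ;
	else if (rc != E_OK)
		threadLog("Scrape failed during login or while saving status\n") ;
	return rc ;
}
```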