| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::Scrape | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
Function Logic:
Scrape() performs a two-stage crawl of the configured website. After verifying that a repository and at least one root URL have been set, it downloads the homepage (if one is configured) and logs in. Stage one processes each root command, extracting links from the homepage, the login response or a freshly downloaded page according to the command's criteria. Stage two repeatedly downloads every queued link that lies within the permitted domains and has not already been fetched, extracting further links as it goes; the crawl ends when a pass adds no new links. The crawl status is then saved and the result code returned.
Function body:
hzEcode hzWebhost::Scrape (void)
{
_hzfunc("hzWebhost::Scrape") ;
hzMapS<hzString,hzString> formData ;
hzVect<hzString> hdrs ;
hzList<hzPair>::Iter ci ;
hzSet<hzUrl> set_ctrl ;
hzVect<hzUrl> pglinks ;
hzVect<hzUrl> allinks ;
hzVect<hzUrl> todo ;
ifstream is ;
ofstream os ;
hzDocMeta mark ;
hzChain Response ;
hzDocument* pDoc ;
hzDocHtml* pHdoc ;
hzPair X ;
hzXDate now ;
hzUrl url ;
hzString vs_fname ;
hzString pagepath ;
hzString S ;
hzString etag ;
uint32_t nStart ;
uint32_t nLimit ;
uint32_t nCount ;
uint32_t n ;
hzEcode rc = E_OK ;
threadLog("Called hzWebhost::Scrape\n") ;
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
if (!m_Roots.Count())
{ threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }
if (m_Homepage)
{
pDoc = Download(m_Homepage) ;
if (!pDoc)
{ threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
m_docHome = pDoc ;
threadLog("HOMEPAGE SUCCESS\n") ;
}
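// Authenticate with the site; abort the scrape if the login fails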
rc = Login() ;
if (rc != E_OK)
{ threadLog("Login failed\n") ; return rc ; }
threadLog("Login SUCCESS\n") ;
threadLog("Have %d root commands\n", m_Roots.Count()) ;
for (ci = m_Roots ; ci.Valid() ; ci++)
{
X = ci.Element() ;
threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;
if (X.name == "homepage")
{
pHdoc = (hzDocHtml*) m_docHome ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else if (X.name == "loginResponse")
{
pHdoc = (hzDocHtml*) m_resAuth ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else
{
url = X.name ;
if (!url)
{ threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
etag = (char*) 0;
pDoc = Download(url) ;
if (!pDoc)
threadLog("case 1. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Page %s not HTML\n", *url) ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
}
delete pDoc ;
}
}
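// Add extracted links to the master list unless they are already in the control set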
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
}
/*
** Starting at the site root, grab all links from each page and follow each link in turn
** */
threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
for (nStart = 0; nStart < allinks.Count() ; nStart = nCount)
{
now.SysDateTime() ;
todo.Clear() ;
for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
{
url = allinks[nCount] ;
threadLog("Cosidering link %s - ", *url.Whole()) ;
if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; }
if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; }
if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; }
threadLog("Fetching\n") ;
pDoc = Download(url) ;
threadLog("Fetched page %p\n", pDoc) ;
if (!pDoc)
threadLog("case 2. Could not fetch page %s\n", *url) ;
else
{
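// Only HTML documents are parsed for further links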
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
allinks.Add(url) ;
}
}
delete pDoc ;
}
}
/*
** for (nAdded = nX = 0 ; nX < todo.Count() ; nX++)
** {
** //url = todo.GetObj(nX) ;
** url = todo[nX] ; //.GetObj(nX) ;
**
** if (set_ctrl.Exists(url))
** continue ;
** nAdded++ ;
** set_ctrl.Insert(url) ;
** }
**
** todo.Clear() ;
**
** if (!nAdded)
** break ;
** */
}
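// Persist the crawl status before returning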
rc = _savestatus() ;
return rc ;
}
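For reference, a minimal calling sketch follows. CrawlSite is a hypothetical wrapper name; only hzWebhost::Scrape(), hzEcode, E_OK, threadLog() and the hzHttpClient.h header are taken from the listing above. The hzWebhost instance is assumed to have been configured elsewhere (repository, permitted domains, root URLs), since that configuration API does not appear in this listing.
#include "hzHttpClient.h"
hzEcode CrawlSite (hzWebhost& host)
{
	// Sketch only: host is assumed to be configured already (repository, domains,
	// root URLs); Scrape() itself returns E_NOINIT if that is not the case.
	hzEcode rc ;
	rc = host.Scrape() ;
	if (rc != E_OK)
		threadLog("Scrape failed\n") ;
	return rc ;
}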