Return Type | Function name | Arguments
hzEcode | hzWebhost::Scrape | (void)

Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp

Function Logic:

0:START 1:!m_Repos 2:Return E_NOINIT 3:!m_Roots.Count() 4:Return E_NOINIT 5:m_Homepage 6:hzWebhost::Download pDoc 7:!pDoc 8:Return E_NOINIT 9:m_docHome 10:hzWebhost::Login rc 11:rc!=E_OK 12:Return rc 13:hzList::Count ci 14:ci.Valid(); 15:hzList::Iter::Element X 16:X.name==homepage 17:pHdoc hzDocHtml::ExtractLinksBasic 18:X.name==loginResponse 19:pHdoc hzDocHtml::ExtractLinksBasic 20:url 21:!url 22:etag hzWebhost::Download pDoc 23:!pDoc 24:pDoc->Whatami()!=DOCTYPE_HTML 25:pHdoc hzDocHtml::ExtractLinksBasic hzVect::Count 26:pDoc 27:n 28:url hzSet::Exists 29:!set_ctrl.Exists(url) 30:hzVect::Add 31:hzVect::Count hzVect::Count 32:nStart 33:hzXDate::SysDateTime hzVect::Clear nStart nCount 34:nCount 35:url hzUrl::Whole hzMapS::Exists 36:m_mapHist.Exists(url) 37:url==m_Authexit 38:hzUrl::Domain hzSet::Exists 39:!m_Domains.Exists(url.Domain()) 40:hzWebhost::Download pDoc 41:!pDoc 42:pDoc->Whatami()==DOCTYPE_HTML 43:pHdoc hzDocHtml::ExtractLinksBasic n 44:n 45:url hzSet::Exists 46:!set_ctrl.Exists(url) 47:hzVect::Add 48:pDoc 49:hzWebhost::_savestatus rc 50:Return rc

Function body:

hzEcode hzWebhost::Scrape (void)
{
    //  Crawl the website, starting from the configured root commands.
    //
    //  Stage one: download the homepage (if one is configured), log in, then process each root command in m_Roots. A root command is a name/value
    //  pair: the name selects the source page ("homepage", "loginResponse", or a URL to download) and the value is passed to ExtractLinksBasic()
    //  as the link selection criteria. All links found are pooled in 'allinks'.
    //
    //  Stage two: walk 'allinks', downloading each link that is not already in the history, is not the authentication exit page and lies within
    //  the permitted domains. Links found in downloaded HTML pages are appended to 'allinks', so the walk continues until no new links turn up.
    //
    //  Arguments: None
    //
    //  Returns:   E_NOINIT    If there is no repository, no root commands, or the homepage could not be downloaded
    //             The error code from Login() if the login fails
    //             Otherwise the result of _savestatus()

    _hzfunc("hzWebhost::Scrape") ;

    hzList<hzPair>::Iter    ci ;        //  Root command iterator
    hzSet<hzUrl>    set_ctrl ;          //  URLs already placed in allinks (duplicate filter)
    hzVect<hzUrl>   pglinks ;           //  Links extracted from the page currently being processed
    hzVect<hzUrl>   allinks ;           //  All links collected so far, in order of discovery
    hzDocument*     pDoc ;              //  Downloaded document
    hzDocHtml*      pHdoc ;             //  Same document viewed as HTML
    hzPair          X ;                 //  Current root command
    hzUrl           url ;               //  URL being fetched
    hzUrl           link ;              //  URL extracted from a page
    uint32_t        nStart ;            //  First unprocessed position in allinks
    uint32_t        nLimit ;            //  Size of allinks at the start of the current pass
    uint32_t        nCount ;            //  Position in allinks within the current pass
    uint32_t        n ;                 //  Index into pglinks
    hzEcode         rc = E_OK ;         //  Return code

    threadLog("Called hzWebhost::Scrape\n") ;

    //  Preconditions
    if (!m_Repos)
        { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
    if (!m_Roots.Count())
        { threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }

    //  Fetch and retain the homepage if one is configured
    if (m_Homepage)
    {
        pDoc = Download(m_Homepage) ;
        if (!pDoc)
            { threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
        m_docHome = pDoc ;
        threadLog("HOMEPAGE SUCCESS\n") ;
    }

    rc = Login() ;
    if (rc != E_OK)
        { threadLog("Login failed\n") ; return rc ; }
    threadLog("Login SUCCESS\n") ;

    //  STAGE ONE: Run the root commands to seed the list of links
    threadLog("Have %d root commands\n", m_Roots.Count()) ;

    for (ci = m_Roots ; ci.Valid() ; ci++)
    {
        X = ci.Element() ;
        threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;

        //  Start each command with an empty page-link list so links from a previous command are not merged (or counted) a second time
        pglinks.Clear() ;

        if (X.name == "homepage")
        {
            //  m_docHome is only set above when a homepage is configured, so guard against dereferencing a null document
            if (!m_docHome)
                { threadLog("Root command 'homepage' given but no homepage document is held\n") ; continue ; }

            pHdoc = (hzDocHtml*) m_docHome ;
            pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
        }
        else if (X.name == "loginResponse")
        {
            //  Likewise there may be no login response document to extract links from
            if (!m_resAuth)
                { threadLog("Root command 'loginResponse' given but no login response document is held\n") ; continue ; }

            pHdoc = (hzDocHtml*) m_resAuth ;
            pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
        }
        else
        {
            //  Any other name is taken to be the URL of a page to download
            url = X.name ;
            if (!url)
                { threadLog("Root command invalid page %s\n", *X.name) ; continue ; }

            pDoc = Download(url) ;
            if (!pDoc)
                threadLog("case 1. Could not fetch page %s\n", *url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
                    threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
                }
                delete pDoc ;
            }
        }

        //  Pool the links found by this command, recording each in set_ctrl so that it is only ever queued once
        for (n = 0 ; n < pglinks.Count() ; n++)
        {
            link = pglinks[n] ;
            if (set_ctrl.Exists(link))
                continue ;
            set_ctrl.Insert(link) ;
            allinks.Add(link) ;
        }
    }

    //  STAGE TWO: Starting with the pooled links, fetch each page in turn and grab its links. Each pass of the outer loop covers the links
    //  that were present when the pass began; links appended during the pass are handled by the next pass.
    threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;

    for (nStart = 0 ; nStart < allinks.Count() ; nStart = nCount)
    {
        for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
        {
            url = allinks[nCount] ;
            threadLog("Cosidering link %s - ", *url.Whole()) ;

            if (m_mapHist.Exists(url))              { threadLog("historic\n") ; continue ; }
            if (url == m_Authexit)                  { threadLog("exit-page\n") ; continue ; }
            if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }

            threadLog("Fetching\n") ;
            pDoc = Download(url) ;
            threadLog("Fetched page %p\n", pDoc) ;

            if (!pDoc)
                { threadLog("case 2. Could not fetch page %s\n", *url) ; continue ; }

            if (pDoc->Whatami() == DOCTYPE_HTML)
            {
                pHdoc = (hzDocHtml*) pDoc ;

                //  NOTE(review): X still holds the LAST root command here, so its criteria are applied to every crawled page. This matches
                //  the previous behaviour - confirm it is intended.
                pglinks.Clear() ;
                pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;

                //  Queue only links not seen before. Without recording them in set_ctrl, pages that link to each other would be re-queued
                //  without limit.
                for (n = 0 ; n < pglinks.Count() ; n++)
                {
                    link = pglinks[n] ;
                    if (set_ctrl.Exists(link))
                        continue ;
                    set_ctrl.Insert(link) ;
                    allinks.Add(link) ;
                }
            }
            delete pDoc ;
        }
    }

    rc = _savestatus() ;
    return rc ;
}