In general, a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages are created and existing pages are modified on an ad hoc basis. A scrape captures the current state of the website, or a limited portion of it, to file. The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for links. Links to domains other than the one in question are ignored, as are links to such things as images. Remaining links not already in the set of known links are added to it, and the process terminates when all the links have been attempted. The set of known links needs to start with the site's home page, plus a login page if one exists and is not the same as the home page. These are usually enough to 'bootstrap' the rest of the site.
| Return Type | Function Name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::Scrape | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
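The crawl described above reduces to a simple worklist algorithm: a growing list of known links, an index of how many have been attempted, and filters on domain and content type. The sketch below illustrates just that loop using standard containers; it is not the library implementation (which follows under 'Function body'), and the helpers fetchPage and extractLinks are stand-ins for the real Download() and ExtractLinksBasic() calls.

```cpp
#include <set>
#include <string>
#include <vector>

// Stubbed helpers, assumed for illustration only: real code would perform an
// HTTP GET and an HTML parse, as Download() and ExtractLinksBasic() do below.
static std::string fetchPage (const std::string& url) { return std::string() ; }
static std::vector<std::string> extractLinks (const std::string& html) { return {} ; }

// Crawl every page reachable from the seed links, staying within one domain.
static void crawl (std::vector<std::string> known, const std::string& domain)
{
	std::set<std::string> seen(known.begin(), known.end()) ;

	// 'known' grows as pages are parsed; the loop ends once every entry has been attempted
	for (size_t n = 0 ; n < known.size() ; n++)
	{
		std::string page = fetchPage(known[n]) ;

		for (const std::string& link : extractLinks(page))
		{
			if (link.find(domain) == std::string::npos)
				continue ;	// foreign domain: ignore
			if (seen.count(link))
				continue ;	// already known: ignore

			// A real crawler would also skip images and other non-page links here
			seen.insert(link) ;
			known.push_back(link) ;
		}
	}
}
```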
Function body:
```cpp
hzEcode hzWebhost::Scrape (void)
{
// In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created and existing pages can be
// modified on an ad-hoc basis. A scrape captures the current state of the website or a limited portion of it to file.
//
// The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for
// links. Links to domains other than the one in question are ignored. Links to such things as images are also ignored. Remaining links not found in the set
// of known links are added to this set. The process terminates when all the links have been attempted.
//
// The set of known links will need to comprise the site's home-page and a login page if this exists and if it is not the same as the home page. These will
// usually be enough to 'bootstrap' the rest of the site.
//
// Arguments: None
//
// Returns: E_NOINIT If no repository, no domain or no homepage has been specified
// E_NOTFOUND If the login page was not located
// E_WRITEFAIL If the login form received was not written to the repository
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the scrape operation was successful
_hzfunc("hzWebhost::Scrape") ;
hzMapS<hzString,hzString> formData ; // Set of name value pairs
hzVect<hzString> hdrs ; // Extra headers, needed for submit form
hzList<hzPair>::Iter ci ; // Root commands iterator
hzSet<hzUrl> set_ctrl ; // Initial links from processing config params
hzVect<hzUrl> pglinks ; // Links encountered within a given page
hzVect<hzUrl> allinks ; // All links encountered across all pages
hzVect<hzUrl> todo ; // Links encountered in the pages in ctrl
ifstream is ; // For reading in visit status file
ofstream os ; // For writing out visit status file at end of scrape
hzDocMeta mark ; // Document meta data
hzChain Response ; // Response from form submission
hzDocument* pDoc ; // Downloaded document
hzDocHtml* pHdoc ; // Set if downloaded document is an HTML page
hzPair X ; // Root command instance
hzXDate now ; // Date/time now (for checking if pages have expired)
hzUrl url ; // Temp link
hzString vs_fname ; // Visit status filename
hzString pagepath ; // Filepath for file to store downloaded page
hzString S ; // Temp string
hzString etag ; // Temp string
uint32_t nStart ; // Start of current batch of links
uint32_t nLimit ; // End of current batch of links (allinks count at batch start)
uint32_t nCount ; // Links iterator
uint32_t n ; // Aggregation iterator
hzEcode rc = E_OK ; // Return code
threadLog("Called hzWebhost::Scrape\n") ;
// Check if repository is set up (website is initialized)
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
// Is there anything to do?
if (!m_Roots.Count())
{ threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }
// Get the home page
if (m_Homepage)
{
pDoc = Download(m_Homepage) ;
if (!pDoc)
{ threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
m_docHome = pDoc ;
threadLog("HOMEPAGE SUCCESS\n") ;
}
// Login
rc = Login() ;
if (rc != E_OK)
{ threadLog("Login failed\n") ; return rc ; }
threadLog("Login SUCCESS\n") ;
// Run the root commands to obtain the set of roots. A root command may have a URL, a 'link criteria' or both. If only a
// URL is present, this URL and ALL links found within it are added to the list of pages to process. If only a link criteria
// is present, the links found in the HOME page and the LOGIN RESPONSE page are tested against the criteria; those that match
// are added to the list of pages to process. If both a URL and a link criteria are present, then the URL and any matching
// links found within it are added to the list of pages to process.
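// Two hypothetical root commands, for illustration only (the criteria syntax is whatever ExtractLinksBasic accepts):
//
// name="homepage" value="news/*" -> scan the already-downloaded home page for links matching news/*
// name="archive.html" value="*.html" -> fetch archive.html and add any links within it that match *.html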
threadLog("Have %d root commands\n", m_Roots.Count()) ;
for (ci = m_Roots ; ci.Valid() ; ci++)
{
X = ci.Element() ;
threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;
// Get the page
if (X.name == "homepage")
{
// No page to get, just compare the criteria to the home
pHdoc = (hzDocHtml*) m_docHome ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else if (X.name == "loginResponse")
{
// No page to get, just compare the criteria to the login response
pHdoc = (hzDocHtml*) m_resAuth ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else
{
url = X.name ;
if (!url)
{ threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
etag = (char*) 0;
pDoc = Download(url) ;
if (!pDoc)
threadLog("case 1. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Page %s not HTML\n", *url) ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
}
delete pDoc ;
}
}
// Now aggregate the links from this page into the vector of all links from all pages, using set_ctrl to avoid repeats.
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
{
set_ctrl.Insert(url) ;
allinks.Add(url) ;
}
}
}
// Starting at the site root and for each page, grab all links and go to each link in turn
threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
for (nStart = 0; nStart < allinks.Count() ; nStart = nCount)
{
now.SysDateTime() ;
todo.Clear() ;
for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
{
url = allinks[nCount] ;
threadLog("Cosidering link %s - ", *url.Whole()) ;
if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; }
if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; }
if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; }
// Page not yet visited, so we visit it, add it to the list of pages visited and extract its links. Some of these
// links may add to the list of links.
threadLog("Fetching\n") ;
pDoc = Download(url) ;
threadLog("Fetched page %p\n", pDoc) ;
if (!pDoc)
threadLog("case 2. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
// Re-aggregate the all-links vector, again using set_ctrl to avoid repeats. Note that X still holds the last
// root command processed above, so its criteria applies to every page fetched in this stage.
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
{
set_ctrl.Insert(url) ;
allinks.Add(url) ;
}
}
}
delete pDoc ;
}
}
}
// Write out manifest file
rc = _savestatus() ;
return rc ;
}
```
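For reference, a minimal usage sketch. Only Scrape(), the hzEcode type and the return values documented above are taken from the code; the construction and configuration steps are assumptions, so consult hzHttpClient.h for the actual setup interface (repository, domains, homepage and root commands must all be in place before the call, or E_NOINIT is returned).

```cpp
#include "hzHttpClient.h"

int main (void)
{
	hzWebhost host ;	// assumed default-constructible, for illustration only

	// ... configure the repository, domain list, homepage, login details and
	// root commands here, using the interface declared in hzHttpClient.h ...

	hzEcode rc = host.Scrape() ;
	if (rc != E_OK)
		return 1 ;		// e.g. E_NOINIT, E_NOTFOUND, E_WRITEFAIL or E_OPENFAIL
	return 0 ;
}
```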