In general, a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages are created and existing pages are modified on an ad hoc basis. A scrape captures the current state of the website, or a limited portion of it, to file. The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for links. Links to domains other than the one in question are ignored, as are links to such things as images. Remaining links not already in the set of known links are added to it, and the process terminates when all the links have been attempted. The set of known links needs to start with the site's home page, plus a login page if one exists and is not the same as the home page. These are usually enough to 'bootstrap' the rest of the site.
| Return Type | Function Name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::Scrape | (void) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
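The crawl described above reduces to a simple worklist algorithm: a growing list of known links, an index of how many have been attempted, and filters on domain and content type. The sketch below illustrates just that loop using standard containers; it is not the library implementation (which follows under 'Function body'), and the helpers fetchPage and extractLinks are stand-ins for the real Download() and ExtractLinksBasic() calls.

```cpp
#include <set>
#include <string>
#include <vector>

// Stubbed helpers, assumed for illustration only: real code would perform an
// HTTP GET and an HTML parse, as Download() and ExtractLinksBasic() do below.
static std::string fetchPage (const std::string& url) { return std::string() ; }
static std::vector<std::string> extractLinks (const std::string& html) { return {} ; }

// Crawl every page reachable from the seed links, staying within one domain.
static void crawl (std::vector<std::string> known, const std::string& domain)
{
	std::set<std::string> seen(known.begin(), known.end()) ;

	// 'known' grows as pages are parsed; the loop ends once every entry has been attempted
	for (size_t n = 0 ; n < known.size() ; n++)
	{
		std::string page = fetchPage(known[n]) ;

		for (const std::string& link : extractLinks(page))
		{
			if (link.find(domain) == std::string::npos)
				continue ;	// foreign domain: ignore
			if (seen.count(link))
				continue ;	// already known: ignore

			// A real crawler would also skip images and other non-page links here
			seen.insert(link) ;
			known.push_back(link) ;
		}
	}
}
```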
Function body:
```cpp
hzEcode hzWebhost::Scrape (void)
{
// In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created and existing pages can be
// modified on an ad-hoc basis. A scrape captures the current state of the website or a limited portion of it to file.
//
// The scraping process runs through a set of known links for the website, downloading the page for each in turn. Each downloaded page is then examined for
// links. Links to domains other than the one in question are ignored. Links to such things as images are also ignored. Remaining links not found in the set
// of known links are added to this set. The process terminates when all the links have been attempted.
//
// The set of known links will need to comprise the site's home-page and a login page if this exists and if it is not the same as the home page. These will
// usually be enough to 'bootstrap' the rest of the site.
//
// Arguments: None
//
// Returns: E_NOINIT If no repository, no domain or no homepage has been specified
// E_NOTFOUND If the login page was not located
// E_WRITEFAIL If the login form received was not written to the repository
// E_OPENFAIL If the visit status file could not be opened
// E_OK If the scrape operation was successful
_hzfunc("hzWebhost::Scrape") ;
hzMapS<hzString,hzString> formData ; // Set of name value pairs
hzVect<hzString> hdrs ; // Extra headers, needed for submit form
hzList<hzPair>::Iter ci ; // Root commands iterator
hzSet<hzUrl> set_ctrl ; // Initial links from processing config params
hzVect<hzUrl> pglinks ; // Links encountered within a given page
hzVect<hzUrl> allinks ; // All links encountered across all pages
hzVect<hzUrl> todo ; // Links encountered in the pages in ctrl
ifstream is ; // For reading in visit status file
ofstream os ; // For writing out visit status file at end of scrape
hzDocMeta mark ; // Document meta data
hzChain Response ; // Response from form submission
hzDocument* pDoc ; // Downloaded document
hzDocHtml* pHdoc ; // Set if downloaded document is an HTML page
hzPair X ; // Root command instance
hzXDate now ; // Date/time now (for checking if pages have expired)
hzUrl url ; // Temp link
hzString vs_fname ; // Visit status filename
hzString pagepath ; // Filepath for file to store downloaded page
hzString S ; // Temp string
hzString etag ; // Temp string
uint32_t nStart ; // Start of current batch of links
uint32_t nLimit ; // End of current batch of links (allinks count at batch start)
uint32_t nCount ; // Links iterator
uint32_t n ; // Aggregation iterator
hzEcode rc = E_OK ; // Return code
threadLog("Called hzWebhost::Scrape\n") ;
// Check if repository is set up (website is initialized)
if (!m_Repos)
{ threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
// Is there anything to do?
if (!m_Roots.Count())
{ threadLog("Website has no starting point (URL) for a WEB SCRAPE.\n") ; return E_NOINIT ; }
// Get the home page
if (m_Homepage)
{
pDoc = Download(m_Homepage) ;
if (!pDoc)
{ threadLog("Could not download page %s\n", *m_Homepage) ; return E_NOINIT ; }
m_docHome = pDoc ;
threadLog("HOMEPAGE SUCCESS\n") ;
}
// Login
rc = Login() ;
if (rc != E_OK)
{ threadLog("Login failed\n") ; return rc ; }
threadLog("Login SUCCESS\n") ;
// Run the root commands to obtain the set of roots. A root command may have a URL, a 'link criteria' or both. If only a
// URL is present, this URL and ALL links found within it are added to the list of pages to process. If only a link criteria
// is present, the links found in the HOME page and the LOGIN RESPONSE page are tested against the criteria; those that match
// are added to the list of pages to process. If both a URL and a link criteria are present, then the URL and any matching
// links found within it are added to the list of pages to process.
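// Two hypothetical root commands, for illustration only (the criteria syntax is whatever ExtractLinksBasic accepts):
//
// name="homepage" value="news/*" -> scan the already-downloaded home page for links matching news/*
// name="archive.html" value="*.html" -> fetch archive.html and add any links within it that match *.html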
threadLog("Have %d root commands\n", m_Roots.Count()) ;
for (ci = m_Roots ; ci.Valid() ; ci++)
{
X = ci.Element() ;
threadLog("Page=%s Crit=%s\n", *X.name, *X.value) ;
// Get the page
if (X.name == "homepage")
{
// No page to get, just compare the criteria to the home
pHdoc = (hzDocHtml*) m_docHome ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else if (X.name == "loginResponse")
{
// No page to get, just compare the criteria to the login response
pHdoc = (hzDocHtml*) m_resAuth ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
}
else
{
url = X.name ;
if (!url)
{ threadLog("Root command invalid page %s\n", *X.name) ; continue ; }
etag = (char*) 0;
pDoc = Download(url) ;
if (!pDoc)
threadLog("case 1. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() != DOCTYPE_HTML)
threadLog("Page %s not HTML\n", *url) ;
else
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
threadLog("Got page content, extracted %d links\n", pglinks.Count()) ;
}
delete pDoc ;
}
}
// Now aggregate the links from this page into the vector of all links from all pages, using set_ctrl to avoid repeats.
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
{
set_ctrl.Insert(url) ;
allinks.Add(url) ;
}
}
}
// Starting at the site root and for each page, grab all links and go to each link in turn
threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;
for (nStart = 0; nStart < allinks.Count() ; nStart = nCount)
{
now.SysDateTime() ;
todo.Clear() ;
for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
{
url = allinks[nCount] ;
threadLog("Cosidering link %s - ", *url.Whole()) ;
if (m_mapHist.Exists(url)) { threadLog("historic\n") ; continue ; }
if (url == m_Authexit) { threadLog("exit-page\n") ; continue ; }
if (!m_Domains.Exists(url.Domain())) { threadLog("URL %s outside domain\n", *url) ; continue ; }
// Page not yet visited, so we visit it, add it to the list of pages visited and extract its links. Some of these
// links may add to the list of links.
threadLog("Fetching\n") ;
pDoc = Download(url) ;
threadLog("Fetched page %p\n", pDoc) ;
if (!pDoc)
threadLog("case 2. Could not fetch page %s\n", *url) ;
else
{
if (pDoc->Whatami() == DOCTYPE_HTML)
{
pHdoc = (hzDocHtml*) pDoc ;
pHdoc->ExtractLinksBasic(pglinks, m_Domains, X.value) ;
// Re-aggregate the all-links vector, again using set_ctrl to avoid repeats. Note that X still holds the last
// root command processed above, so its criteria applies to every page fetched in this stage.
for (n = 0; n < pglinks.Count() ; n++)
{
url = pglinks[n] ;
if (!set_ctrl.Exists(url))
{
set_ctrl.Insert(url) ;
allinks.Add(url) ;
}
}
}
delete pDoc ;
}
}
}
// Write out manifest file
rc = _savestatus() ;
return rc ;
}
```
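For reference, a minimal usage sketch. Only Scrape(), the hzEcode type and the return values documented above are taken from the code; the construction and configuration steps are assumptions, so consult hzHttpClient.h for the actual setup interface (repository, domains, homepage and root commands must all be in place before the call, or E_NOINIT is returned).

```cpp
#include "hzHttpClient.h"

int main (void)
{
	hzWebhost host ;	// assumed default-constructible, for illustration only

	// ... configure the repository, domain list, homepage, login details and
	// root commands here, using the interface declared in hzHttpClient.h ...

	hzEcode rc = host.Scrape() ;
	if (rc != E_OK)
		return 1 ;		// e.g. E_NOINIT, E_NOTFOUND, E_WRITEFAIL or E_OPENFAIL
	return 0 ;
}
```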