Fetch the page found at the supplied URL and return it as a document (either XML or HTML). Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if its time to live has expired. If the page is not downloaded then this function will reload it from file. Returns a pointer to a newly allocated document, which must be deleted after use.
| Return Type | Function name | Arguments |
|---|---|---|
| hzDocument* | hzWebhost::Download | (hzUrl&) |
Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp
Function Logic:
Function body:
hzDocument* hzWebhost::Download (hzUrl& url)
{
	//	Fetch the page found at the supplied URL and return it as a document (either XML or HTML).
	//
	//	Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if its time to
	//	live has expired. If the page is not downloaded then this function will reload it from file.
	//
	//	Arguments:	1)	url	The URL of the file/resource to download
	//
	//	Returns:	Pointer to newly allocated document. Must be deleted after use. Null on failure or empty content.

	_hzfunc("hzWebhost::Download") ;

	static uint32_t nlast = 0 ;	//	Last point reached (for download resumption). NOTE(review): logged below but never advanced - confirm intent.

	ofstream	os ;		//	To write out page contents
	hzDocument*	pDoc = 0 ;	//	Document downloaded
	hzDocXml*	pXdoc = 0 ;	//	XML Document downloaded
	hzDocHtml*	pHdoc = 0 ;	//	HTML Document downloaded
	hzDocMeta*	pMark ;		//	Document meta data
	hzXDate		now ;		//	Date & Time now
	hzString	S ;		//	Temp string
	HttpRC		hc ;		//	HTTP server return code
	hzEcode		rc ;		//	Return code
	bool		bHist = false ;	//	Set if url is already in history and downloaded again because of being out of date
	char		numbuf [16] ;	//	Working buffer (was 8 bytes: "/%04d" overflows once m_Id exceeds six digits)

	/*
	**	Check URL, insert in visited links if not already there
	*/

	if (!url)
		{ threadLog("No supplied address\n") ; return 0 ; }
	threadLog("FETCHING PAGE: %s\n", *url) ;

	now.SysDateTime() ;

	if (!(m_Opflags & WEBFLG_FORCE))
	{
		if (m_mapHist.Exists(url))
		{
			//	The requested URL exists in the repository already. We check if it has expired and if not we terminate with OK
			pMark = m_mapHist[url] ;
			bHist = true ;
			threadLog("Page %s is historic\n", *url) ;

			//	Create a document of the right type (XML or HTML). Unknown doctypes default to HTML.
			if (pMark->m_Doctype == DOCTYPE_XML)
				pDoc = pXdoc = new hzDocXml() ;
			else
				pDoc = pHdoc = new hzDocHtml() ;
			pDoc->SetMeta(*pMark) ;

			//	Check if expiry is known and if so if it has expired
			if (pMark->m_Expires.IsSet())
			{
				if (pMark->m_Expires < now)
				{
					//	Expired: load the held content into the document allocated above and return it.
					//	(Previously a second document of the identical type was allocated here, leaking the first.)
					rc = pDoc->Load(HC.m_Content) ;
					threadLog("DOWNLOAD PREVIOUS (error=%s)\n\n", Err2Txt(rc)) ;
					return pDoc ;
				}
			}

			//	At this point either the expiry date is unknown or it is known and has not expired. Load from held content.
			if (!HC.m_Content.Size())
			{
				threadLog("Case 1 Bloody thing is empty!\n") ;
				delete pDoc ;	//	was leaked here before returning null
				return 0 ;
			}

			rc = pDoc->Load(HC.m_Content) ;
			if (rc != E_OK)
				threadLog("LOAD failed (error=%s)\n\n", Err2Txt(rc)) ;
			return pDoc ;
		}
	}

	//	The requested URL is not in the history. Create the document meta for it and download it.
	S = url.Filename() ;

	pMark = new hzDocMeta() ;
	pMark->m_urlReq = url ;
	pMark->m_urlAct = url ;
	pMark->m_Id = m_mapHist.Count() ;

	//	%04u matches the unsigned m_Id (was %d with a uint32_t argument); snprintf bounds the write.
	snprintf(numbuf, sizeof(numbuf), "/%04u", pMark->m_Id) ;
	pMark->m_Filename = m_Repos + numbuf + S ;

	/*
	**	Get page content and process it into a tree
	*/

	threadLog("GETTING PAGE: %s\n", *url) ;
	rc = HC.GetPage(hc, url, pMark->m_Etag) ;
	if (rc != E_OK)
	{
		threadLog("FAILED (error=%s) synopsis\n", Err2Txt(rc)) ;
		threadLog(HC.m_Error) ;
		delete pMark ;	//	not yet inserted in history: free to avoid leak
		return 0 ;
	}

	if (HC.m_Redirect)
		pMark->m_urlAct = HC.m_Redirect ;
	pMark->m_Modified = HC.m_Modified ;

	threadLog("HTTP Return code = %d, cookie (value %s, path %s)\n", (uint32_t) hc, *m_CookieSess, *m_CookiePath) ;

	/*
	**	Write out header to .hdr file and content to .con file
	*/

	if (m_Repos)
	{
		os.open(*pMark->m_Filename) ;
		if (os.fail())
			threadLog("Cannot write out header file %s\n", *pMark->m_Filename) ;
		else
		{
			os << HC.m_Content ;
			os.close() ;
		}
		os.clear() ;
	}

	/*
	**	Add the page but only process pages that are of a known HTML type .htm, .html, .shtml, .xhtml etc
	*/

	threadLog("PROCESSING Content: %d bytes\n", HC.m_Content.Size()) ;
	if (!HC.m_Content.Size())
	{
		threadLog("Case 2 Bloody thing is empty!\n") ;
		delete pMark ;	//	not yet inserted in history: free to avoid leak
		return 0 ;
	}

	pMark->m_Doctype = DeriveDoctype(HC.m_Content) ;

	rc = E_NODATA ;
	if (pMark->m_Doctype == DOCTYPE_XML)
	{
		//	XML
		pDoc = pXdoc = new hzDocXml() ;
		pXdoc->Init(url) ;
		rc = pXdoc->Load(HC.m_Content) ;
	}
	else
	{
		//	HTML (unknown doctypes are treated as HTML)
		pDoc = pHdoc = new hzDocHtml() ;
		pHdoc->Init(url) ;
		rc = pHdoc->Load(HC.m_Content) ;
		if (rc != E_OK)
			threadLog("Case 2 Bloody thing failed (error=%s)!\n", Err2Txt(rc)) ;
	}

	if (rc != E_OK)
	{
		//	Load failed: log and fall through - the partially-loaded document is still recorded and returned with its meta set.
		threadLog("Load page failed error=%s\n", Err2Txt(rc)) ;
	}

	pDoc->SetMeta(*pMark) ;

	//	Place the URL in the site's history, under both requested and actual URL if a redirect occurred
	m_mapHist.Insert(pMark->m_urlReq, pMark) ;
	threadLog("Inserted URL %s\n", *pMark->m_urlReq) ;
	if (pMark->m_urlAct != pMark->m_urlReq)
	{
		m_mapHist.Insert(pMark->m_urlAct, pMark) ;
		threadLog("Inserted URL %s\n", *pMark->m_urlAct) ;
	}
	if (!bHist)
		m_vecHist.Add(pMark) ;

	if (pXdoc)
		threadLog("DOWNLOAD SUCCESS XML Page %s. Now have %d (%d) items in history\n\n", *url, m_mapHist.Count(), nlast) ;
	if (pHdoc)
		threadLog("DOWNLOAD SUCCESS Page %s has %d links. Now have %d (%d) items in history\n\n", *url, pHdoc->m_vecLinks.Count(), m_mapHist.Count(), nlast) ;

	threadLog(HC.m_Error) ;
	return pDoc ;
}