Return Type | Function name | Arguments |
---|---|---|
hzDocument* | hzWebhost::Download | (const hzUrl&) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
Function Logic:
The function validates the supplied URL and, unless the WEBFLG_FORCE option flag is set, checks the visited-page history (m_mapHist). For a historic page it allocates a hzDocHtml or hzDocXml according to the recorded document type, applies the stored metadata, reloads the cached content and returns it. Otherwise it creates a new hzDocMeta record, fetches the page with HC.GetPage(), writes the content to the repository file if a repository directory is set, parses the content into an HTML or XML document tree according to DeriveDoctype(), records the page in the history map (under both the requested and the redirected URL where these differ) and the history vector, and returns the document. Returns NULL (0) if no URL is supplied, the fetch fails, or the content is empty.
Function body:
hzDocument* hzWebhost::Download (const hzUrl& url)
{
    _hzfunc("hzWebhost::Download") ;

    static uint32_t nlast = 0 ;

    ofstream     os ;
    hzDocument*  pDoc = 0 ;
    hzDocXml*    pXdoc = 0 ;
    hzDocHtml*   pHdoc = 0 ;
    hzDocMeta*   pMark ;
    hzXDate      now ;
    hzString     S ;
    HttpRC       hc ;
    hzEcode      rc ;
    bool         bHist = false ;
    char         numbuf[12] ;   //  Holds the "/NNNN" page-id prefix; sized beyond four digits for safety

    /*
    **  Check URL, insert in visited links if not already there
    */

    if (!url)
        { threadLog("No supplied address\n") ; return 0 ; }

    threadLog("FETCHING PAGE: %s\n", *url) ;
    now.SysDateTime() ;

    if (!(m_Opflags & WEBFLG_FORCE))
    {
        if (m_mapHist.Exists(url))
        {
            //  Page has been visited before: rebuild the document from its recorded metadata
            pMark = m_mapHist[url] ;
            bHist = true ;
            threadLog("Page %s is historic\n", *url) ;

            if (pMark->m_Doctype == DOCTYPE_HTML)
                pDoc = pHdoc = new hzDocHtml() ;
            else if (pMark->m_Doctype == DOCTYPE_XML)
                pDoc = pXdoc = new hzDocXml() ;
            else
                pDoc = pHdoc = new hzDocHtml() ;
            pDoc->SetMeta(*pMark) ;

            if (pMark->m_Expires.IsSet())
            {
                if (pMark->m_Expires < now)
                {
                    //  Expired entry: reuse the document allocated above (the previous version
                    //  re-allocated here, leaking the first instance)
                    rc = pDoc->Load(HC.m_Content) ;
                    threadLog("DOWNLOAD PREVIOUS (error=%s)\n\n", Err2Txt(rc)) ;
                    return pDoc ;
                }
            }

            if (!HC.m_Content.Size())
                { threadLog("Case 1 Bloody thing is empty!\n") ; return 0 ; }

            rc = pDoc->Load(HC.m_Content) ;
            if (rc != E_OK)
                threadLog("LOAD failed (error=%s)\n\n", Err2Txt(rc)) ;
            return pDoc ;
        }
    }

    //  Not historic (or fetch is forced): create a metadata record and a repository filename
    S = url.Filename() ;

    pMark = new hzDocMeta() ;
    pMark->m_urlReq = url ;
    pMark->m_urlAct = url ;
    pMark->m_Id = m_mapHist.Count() ;
    sprintf(numbuf, "/%04d", pMark->m_Id) ;
    pMark->m_Filename = m_Repos + numbuf + S ;

    /*
    **  Get page content and process it into a tree
    */

    threadLog("GETTING PAGE: %s\n", *url) ;

    rc = HC.GetPage(hc, url, pMark->m_Etag) ;
    if (rc != E_OK)
    {
        threadLog("FAILED (error=%s) synopsis\n", Err2Txt(rc)) ;
        threadLog(HC.m_Error) ;
        return 0 ;
    }

    if (HC.m_Redirect)
        pMark->m_urlAct = HC.m_Redirect ;
    pMark->m_Modified = HC.m_Modified ;

    threadLog("HTTP Return code = %d, cookie (value %s, path %s)\n", (uint32_t) hc, *m_CookieSess, *m_CookiePath) ;

    /*
    **  Write the page content out to the repository file, if a repository is set
    */

    if (m_Repos)
    {
        os.open(*pMark->m_Filename) ;
        if (os.fail())
            threadLog("Cannot write out file %s\n", *pMark->m_Filename) ;
        else
        {
            os << HC.m_Content ;
            os.close() ;
        }
        os.clear() ;
    }

    /*
    **  Add the page but only process pages that are of a known HTML type: .htm, .html, .shtml, .xhtml etc
    */

    threadLog("PROCESSING Content: %d bytes\n", HC.m_Content.Size()) ;

    if (!HC.m_Content.Size())
        { threadLog("Case 2 Bloody thing is empty!\n") ; return 0 ; }

    pMark->m_Doctype = DeriveDoctype(HC.m_Content) ;

    rc = E_NODATA ;
    if (pMark->m_Doctype == DOCTYPE_XML)
    {
        pDoc = pXdoc = new hzDocXml() ;
        pXdoc->Init(url) ;
        rc = pXdoc->Load(HC.m_Content) ;
    }
    else
    {
        pDoc = pHdoc = new hzDocHtml() ;
        pHdoc->Init(url) ;
        rc = pHdoc->Load(HC.m_Content) ;
        if (rc != E_OK)
            threadLog("Case 2 Bloody thing failed (error=%s)!\n", Err2Txt(rc)) ;
    }

    if (rc != E_OK)
        threadLog("Load page failed error=%s\n", Err2Txt(rc)) ;

    //  Record the page in the history map, under both requested and actual URLs if they differ
    pDoc->SetMeta(*pMark) ;

    m_mapHist.Insert(pMark->m_urlReq, pMark) ;
    threadLog("Inserted URL %s\n", *pMark->m_urlReq) ;

    if (pMark->m_urlAct != pMark->m_urlReq)
    {
        m_mapHist.Insert(pMark->m_urlAct, pMark) ;
        threadLog("Inserted URL %s\n", *pMark->m_urlAct) ;
    }

    if (!bHist)
        m_vecHist.Add(pMark) ;

    if (pXdoc)
        threadLog("DOWNLOAD SUCCESS XML Page %s. Now have %d (%d) items in history\n\n", *url, m_mapHist.Count(), nlast) ;
    if (pHdoc)
        threadLog("DOWNLOAD SUCCESS Page %s has %d links. Now have %d (%d) items in history\n\n", *url, pHdoc->m_vecLinks.Count(), m_mapHist.Count(), nlast) ;

    threadLog(HC.m_Error) ;
    return pDoc ;
}
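Calling sketch:
The sketch below shows one way the function might be called. It is illustrative only: it assumes the hzWebhost instance has already been configured elsewhere (repository directory, option flags, cookies) and that a hzUrl accepts assignment from a C string; only the Download() call, the hzDocument* return and the NULL check come from the documentation above.

//  Usage sketch (assumptions noted in comments): 'host' is presumed to be an
//  already-configured hzWebhost; assignment of a C string to hzUrl is presumed
//  to be supported. Download() returns 0 on a bad URL, a failed fetch or empty
//  content, so the result must always be checked before use.

void fetchOne (hzWebhost& host, const char* addr)
{
    hzUrl        url ;
    hzDocument*  pDoc ;

    url = addr ;
    pDoc = host.Download(url) ;

    if (!pDoc)
        { threadLog("Could not download %s\n", addr) ; return ; }

    //  pDoc points to a hzDocHtml for HTML pages or a hzDocXml for XML pages,
    //  as decided by DeriveDoctype() inside Download()
}

Reusing the same hzWebhost instance across calls lets m_mapHist serve as the visited-page history, so previously fetched pages are not re-downloaded unless WEBFLG_FORCE is set.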