Fetch the page found at the supplied URL and return it as a document (either XML or HTML). Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if the time to live has expired. If the page is not downloaded then this function will reload it from file. Returns a pointer to a newly allocated document, which the caller must delete after use.

Return type | Function name | Arguments
hzDocument* | hzWebhost::Download | (hzUrl&)

Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp

Function Logic:

0:START 1:unknown 2:items 3:Return 0 4:items items 5:unknown 6:unknown 7:pMark bHist items 8:unknown 9:pHdoc 10:pDoc 11:pMark->m_Doctype==DOCTYPE_XML 12:pXdoc 13:pDoc 14:pHdoc 15:pDoc 16:items 17:unknown 18:unknown 19:unknown 20:pXdoc 21:pDoc items rc 22:pHdoc 23:pDoc items rc 24:items 25:Return pDoc 26:unknown 27:items 28:Return 0 29:rc 30:unknown 31:items 32:Return pDoc 33:S pMark pMark pMark pMark items numbuf m_Repos pMark items rc 34:unknown 35:items items 36:Return 0 37:unknown 38:pMark 39:pMark items 40:unknown 41:items 42:unknown 43:items 44:items items 45:items 46:items 47:unknown 48:items 49:Return 0 50:pMark rc 51:unknown 52:pXdoc 53:pDoc items rc 54:pHdoc 55:pDoc items rc 56:unknown 57:items 58:unknown 59:items 60:items items items 61:unknown 62:items items 63:unknown 64:items 65:unknown 66:items 67:unknown 68:items 69:items 70:Return pDoc

Function body:

hzDocument* hzWebhost::Download (hzUrl& url)
{
    //  Fetch the page found at the supplied URL and return it as a document (either XML or HTML).
    //
    //  Note that if the page has already been downloaded (is in the site's history) then it is only downloaded again if the time to
    //  live has expired. If the page is not downloaded then this function will reload it from file.
    //
    //  Arguments: 1) url  The URL of the file/resource to download
    //
    //  Returns: Pointer to newly allocated document (caller must delete after use), or 0 on failure.

    _hzfunc("hzWebhost::Download") ;

    static uint32_t nlast = 0;      //  Last point reached (for download resumption). NOTE(review): never updated in this function - confirm intent.

    ofstream    os ;                //  To write out page contents
    hzDocument* pDoc = 0;           //  Document downloaded
    hzDocXml*   pXdoc = 0;          //  XML Document downloaded
    hzDocHtml*  pHdoc = 0;          //  HTML Document downloaded
    hzDocMeta*  pMark ;             //  Document meta data
    hzXDate     now ;               //  Date & Time now
    hzString    S ;                 //  Temp string
    HttpRC      hc ;                //  HTTP server return code
    hzEcode     rc ;                //  Return code
    bool        bHist = false ;     //  Set if url is already in history and downloaded again because of being out of date
    char        numbuf [16];        //  Working buffer for the "/NNNN" id prefix (was 8 bytes, which could overflow for large ids)

    /*
    **  Check URL, insert in visited links if not already there
    */

    if (!url)
        { threadLog("No supplied address\n") ; return 0; }
    threadLog("FETCHING PAGE: %s\n", *url) ;
    now.SysDateTime() ;

    if (!(m_Opflags & WEBFLG_FORCE))
    {
        if (m_mapHist.Exists(url))
        {
            //  The requested URL exists in the repository already. We check if it has expired and if not we terminate with OK
            pMark = m_mapHist[url] ;
            bHist = true ;
            threadLog("Page %s is historic\n", *url) ;

            //  Create a document of the right type (XML or HTML). Unrecognized doctypes are treated as HTML.
            if (pMark->m_Doctype == DOCTYPE_HTML)
                pDoc = pHdoc = new hzDocHtml() ;
            else if (pMark->m_Doctype == DOCTYPE_XML)
                pDoc = pXdoc = new hzDocXml() ;
            else
                pDoc = pHdoc = new hzDocHtml() ;
            pDoc->SetMeta(*pMark) ;

            //  Check if expiry is known and if so if it has expired
            if (pMark->m_Expires.IsSet())
            {
                if (pMark->m_Expires < now)
                {
                    //  Load the document allocated above and return it. (Previously this branch allocated a second
                    //  document of the same type here, leaking the first - the re-allocation has been removed.)
                    rc = pDoc->Load(HC.m_Content) ;
                    threadLog("DOWNLOAD PREVIOUS (error=%s)\n\n", Err2Txt(rc)) ;
                    return pDoc ;
                }
            }

            //  At this point either the expiry date is unknown or it is known and has NOT expired. Load from the client's content buffer.
            if (!HC.m_Content.Size())
            {
                threadLog("Case 1 Bloody thing is empty!\n") ;
                //  Free the document allocated above before bailing out (exactly one of these is non-null)
                delete pXdoc ;
                delete pHdoc ;
                return 0;
            }
            rc = pDoc->Load(HC.m_Content) ;
            if (rc != E_OK)
                threadLog("LOAD failed (error=%s)\n\n", Err2Txt(rc)) ;
            return pDoc ;
        }
    }

    //  The requested URL is not in the history. Create the document meta for it and download it.
    S = url.Filename() ;
    pMark = new hzDocMeta() ;
    pMark->m_urlReq = url ;
    pMark->m_urlAct = url ;
    pMark->m_Id = m_mapHist.Count() ;
    snprintf(numbuf, sizeof(numbuf), "/%04d", pMark->m_Id) ;
    pMark->m_Filename = m_Repos + numbuf + S ;

    /*
    **  Get page content and process it into a tree
    */

    threadLog("GETTING PAGE: %s\n", *url) ;
    rc = HC.GetPage(hc, url, pMark->m_Etag) ;
    if (rc != E_OK)
    {
        threadLog("FAILED (error=%s) synopsis\n", Err2Txt(rc)) ;
        threadLog(HC.m_Error) ;
        delete pMark ;      //  Meta not yet in history - free it to avoid a leak
        return 0;
    }

    if (HC.m_Redirect)
        pMark->m_urlAct = HC.m_Redirect ;
    pMark->m_Modified = HC.m_Modified ;
    threadLog("HTTP Return code = %d, cookie (value %s, path %s)\n", (uint32_t) hc, *m_CookieSess, *m_CookiePath) ;

    /*
    **  Write out the page content to a file within the repository (if one is configured)
    */

    if (m_Repos)
    {
        os.open(*pMark->m_Filename) ;
        if (os.fail())
            threadLog("Cannot write out header file %s\n", *pMark->m_Filename) ;
        else
        {
            os << HC.m_Content ;
            os.close() ;
        }
        os.clear() ;
    }

    /*
    **  Add the page but only process pages that are of a known HTML type .htm, .html, .shtml, .xhtml etc
    */

    threadLog("PROCESSING Content: %d bytes\n", HC.m_Content.Size()) ;
    if (!HC.m_Content.Size())
    {
        threadLog("Case 2 Bloody thing is empty!\n") ;
        delete pMark ;      //  Meta not yet in history - free it to avoid a leak
        return 0;
    }

    pMark->m_Doctype = DeriveDoctype(HC.m_Content) ;

    rc = E_NODATA ;
    if (pMark->m_Doctype == DOCTYPE_XML)
    {
        //  XML
        pDoc = pXdoc = new hzDocXml() ;
        pXdoc->Init(url) ;
        rc = pXdoc->Load(HC.m_Content) ;
    }
    else
    {
        //  HTML (default for any non-XML doctype)
        pDoc = pHdoc = new hzDocHtml() ;
        pHdoc->Init(url) ;
        rc = pHdoc->Load(HC.m_Content) ;
        if (rc != E_OK)
            threadLog("Case 2 Bloody thing failed (error=%s)!\n", Err2Txt(rc)) ;
    }

    if (rc != E_OK)
    {
        //  A document that failed to load fully is still returned to the caller; the failure is only logged here
        threadLog("Load page failed error=%s\n", Err2Txt(rc)) ;
        //  delete pDoc ;
        //  return 0 ;
    }
    pDoc->SetMeta(*pMark) ;

    //  Place the URL in the site's history, under both the requested and the actual URL if a redirect occurred
    m_mapHist.Insert(pMark->m_urlReq, pMark) ;
    threadLog("Inserted URL %s\n", *pMark->m_urlReq) ;
    if (pMark->m_urlAct != pMark->m_urlReq)
    {
        m_mapHist.Insert(pMark->m_urlAct, pMark) ;
        threadLog("Inserted URL %s\n", *pMark->m_urlAct) ;
    }
    if (!bHist)
        m_vecHist.Add(pMark) ;

    if (pXdoc)
        threadLog("DOWNLOAD SUCCESS XML Page %s. Now have %d (%d) items in history\n\n", *url, m_mapHist.Count(), nlast) ;
    if (pHdoc)
        threadLog("DOWNLOAD SUCCESS Page %s has %d links. Now have %d (%d) items in history\n\n", *url, pHdoc->m_vecLinks.Count(), m_mapHist.Count(), nlast) ;
    threadLog(HC.m_Error) ;
    return pDoc ;
}