Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, where it is tested to ensure it is an XML document. The RSS feed is assumed to contain only links. These links may be to HTML pages or to other, subordinate RSS feeds. The HTML pages are end points of the process: they are downloaded, and any links they may contain are recorded but not followed. The sub-RSS feeds are then processed by a recursive call to this function.

Return type: hzEcode
Function name: hzWebhost::getRss_r
Arguments: (HttpRC&, hzUrl&, uint32_t)

Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp

Function Logic:

0:START 1:pDoc 2:unknown 3:items 4:Return rc 5:unknown 6:items 7:Return E_TYPE 8:items rc 9:unknown 10:Return hzerr(rc,Could not load feed %s,*feed) 11:rc items 12:unknown 13:Return rc 14:unknown 15:items pN1 title desc page uqid dstr 16:unknown 17:items 18:unknown 19:title 20:unknown 21:desc 22:unknown 23:page 24:unknown 25:uqid 26:unknown 27:dstr 28:items 29:unknown 30:items page 31:unknown 32:items 33:items 34:unknown 35:items 36:pMark pMark pMark pMark 37:unknown 38:items 39:items items 40:unknown 41:page items rc 42:Return rc

Function body:

hzEcode hzWebhost::getRss_r (HttpRC& hRet, hzUrl& feed, uint32_t nLevel)
{
    //  Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document. There it is tested to ensure it is an
    //  XML document. The RSS feed is assumed to contain only links. These links may be to HTML pages or other (sub RSS feeds). The HTML pages
    //  are end points of the process. They are downloaded but any links they may contain are recorded but not followed. The sub-RSS feeds are
    //  then processed by recursive call to this function.
    //  
    //  Arguments: 1) hRet  Set by this operation
    //     2) feed  The RSS URL
    //     3) nLevel RSS Hierarchy
    //  
    //  Returns: E_NODATA If the download failed
    //     E_TYPE  If the downloaded material does not appear to be XML
    //     E_FORMAT If the downloaded material could not be loaded into an XML document
    //     E_ARGUMENT If the RSS tags are not defined
    //     E_NOTFOUND If no tags were found in the RSS
    //     E_OK  If the RSS data was collected

    _hzfunc("hzWebhost::getRss_r") ;

    hzVect<hzXmlNode*>  linx ;      //  Links found in (this) RSS feed page
    hzVect<hzUrl>       todo ;      //  Links found in RSS feed page (additions to this are controlled by the set above)
    hzDocXml        X ;             //  For loading of RSS feed pages and extraction of links
    hzXmlNode*      pN1 ;           //  Nodes (containing <item>)
    hzXmlNode*      pN2 ;           //  Nodes (containing <item> subnodes of title, link, description)
    hzDocMeta*      pMark ;         //  Document meta data
    hzDocument*     pDoc ;          //  Document found at URL (could be XML of HTML)
    hzUrl           page ;          //  Temp link
    hzString        desc ;          //  RSS article description
    hzString        dstr ;          //  RSS article date
    hzString        uqid ;          //  Unique ID of RSS item
    hzString        title ;         //  RSS article title
    uint32_t        nIndex ;        //  Links iterator
    hzEcode         rc = E_OK ;     //  Return code

    //  Fetch the current RSS document. Download() yields the document pointer; a null pointer indicates the
    //  download failed. (Previously this tested rc, which Download() never sets, so failure went undetected.)
    pDoc = Download(feed) ;
    if (!pDoc)
        { threadLog("Could not fetch URL %s\n", *feed) ; return E_NODATA ; }

    //  If not an XML document then it is just a page. Nothing further.
    if (pDoc->Whatami() != DOCTYPE_XML)
        { threadLog("case 1. Fetched feed (%s) is not of doctype XML\n", *feed) ; return E_TYPE ; }
    nLevel++ ;

    //  Load current RSS document into XML document tree
    rc = X.Load(HC.m_Content) ;
    if (rc != E_OK)
        return hzerr(rc, "Could not load feed %s", *feed) ;

    //  The page is an RSS document so select the <item> tags
    rc = X.FindNodes(linx, m_tagItem.m_Slct) ;
    threadLog("Found %d <item> tags in feed %s\n", linx.Count(), *feed) ;
    if (rc != E_OK)
        return rc ;

    //  Extract title, description, link, unique id and date from each <item>
    for (nIndex = 0; nIndex < linx.Count() ; nIndex++)
    {
        threadLog("case 1\n") ;
        pN1 = linx[nIndex] ;
        title = (char*) 0; desc = (char*) 0; page = (char*) 0; uqid = (char*) 0; dstr = (char*) 0;
        for (pN2 = pN1->GetFirstChild() ; pN2 ; pN2 = pN2->Sibling())
        {
            threadLog("case 2\n") ;
            if (pN2->NameEQ(*m_tagTitl.m_Slct)) { title = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagDesc.m_Slct)) { desc = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagLink.m_Slct)) { page = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagUqid.m_Slct)) { uqid = pN2->m_fixContent ; continue ; }
            if (pN2->NameEQ(*m_tagDate.m_Slct)) { dstr = pN2->m_fixContent ; continue ; }
        }
        threadLog("case 3\n") ;

        //  No <link> tag: fall back to the unique id, which may itself be a URL; skip the item if still empty
        if (!page)
            { threadLog("case 1: title=%s; link=null uqid=%s\n", *title, *uqid) ; page = uqid ; }
        if (!page)
            { threadLog("case 2: title=%s; link=null uqid=%s\n", *title, *uqid) ; continue ; }
        threadLog("title=%s; link=%s\n", *title, *page) ;

        if (m_mapHist.Exists(page))
            threadLog("Exists in history, page %s\n", *page) ;
        else
        {
            pMark = new hzDocMeta() ;
            pMark->m_Title = title ;
            pMark->m_Desc = desc ;
            pMark->m_urlReq = page ;
            if (dstr)
                pMark->m_Modified.SetDateTime(*dstr) ;
            //  todo.Insert(page) ;
            todo.Add(page) ;
            //  TODO(review): pMark is allocated and populated but never inserted into m_mapHist (nor freed),
            //  despite the log message below - looks like a leak and a missing history insert; confirm intent.
            threadLog("Adding to history, page %s\n", *page) ;
        }
    }

    //  Fetch all the new links found above by recursive call
    for (nIndex = 0; nIndex < todo.Count() ; nIndex++)
    {
        page = todo[nIndex] ;
        //  pMark = m_mapHist[page] ;
        threadLog("Processing %s\n", *page) ;
        rc = getRss_r(hRet, page, nLevel) ;
    }
    return rc ;
}