Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, which is tested to ensure it really is XML. The RSS feed is assumed to contain only links, which may point either to HTML pages or to further (sub) RSS feeds. The HTML pages are end points of the process: they are downloaded, and any links they contain are recorded but not followed. The sub-RSS feeds are processed by a recursive call to this function.
| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::getRss_r | (HttpRC&, hzUrl&, uint32_t) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
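A minimal, hypothetical call sketch (not part of the documented API): it assumes a hzWebhost instance whose RSS tag selectors (m_tagItem, m_tagTitl, m_tagLink, etc.) have already been configured elsewhere, and that hzUrl can be assigned from a string literal. The seed URL is illustrative.

```cpp
// Hypothetical usage sketch: start the recursive RSS harvest from a seed
// feed at hierarchy level 0. Host configuration is assumed to be done.
HttpRC  hRet ;                                      // HTTP return code, set by the operation
hzUrl   seed = "http://example.com/news/rss.xml" ;  // Illustrative seed feed

hzEcode rc = myWebhost.getRss_r(hRet, seed, 0) ;
if (rc != E_OK)
    threadLog("RSS harvest of %s failed\n", *seed) ;
```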
Function Logic:
Function body:
hzEcode hzWebhost::getRss_r (HttpRC& hRet, hzUrl& feed, uint32_t nLevel)
{
// Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, which is tested to ensure it really is
// XML. The RSS feed is assumed to contain only links, which may point either to HTML pages or to further (sub) RSS feeds. The HTML pages
// are end points of the process: they are downloaded, and any links they contain are recorded but not followed. The sub-RSS feeds are
// processed by a recursive call to this function.
//
// Arguments: 1) hRet Set by this operation
// 2) feed The RSS URL
// 3) nLevel RSS Hierarchy
//
// Returns: E_NODATA If the download failed
// E_TYPE If the downloaded material does not appear to be XML
// E_FORMAT If the downloaded material could not be loaded into an XML document
// E_ARGUMENT If the RSS tags are not defined
// E_NOTFOUND If no tags were found in the RSS
// E_OK If the RSS data was collected
_hzfunc("hzWebhost::getRss_r") ;
hzVect<hzXmlNode*> linx ; // Links found in (this) RSS feed page
hzVect<hzUrl> todo ; // Links found in this RSS feed page (additions are gated by the history check below)
hzDocXml X ; // For loading of RSS feed pages and extraction of links
hzXmlNode* pN1 ; // Nodes (containing <item>)
hzXmlNode* pN2 ; // Nodes (containing <item> subnodes of title, link, description)
hzDocMeta* pMark ; // Document meta data
hzDocument* pDoc ; // Document found at URL (could be XML or HTML)
hzUrl page ; // Temp link
hzString desc ; // RSS article description
hzString dstr ; // RSS article date
hzString uqid ; // Unique ID of RSS item
hzString title ; // RSS article title
uint32_t nIndex ; // Links iterator
hzEcode rc = E_OK ; // Return code
// Fetch the current RSS document
pDoc = Download(feed) ;
if (!pDoc)
{ threadLog("Could not fetch URL %s\n", *feed) ; return E_NODATA ; }
// If not an XML document then it is just a page. Nothing further.
if (pDoc->Whatami() != DOCTYPE_XML)
{ threadLog("case 1. Fetched feed (%s) is not of doctype XML\n", *feed) ; return E_TYPE ; }
nLevel++ ;
// Load current RSS document into XML document tree
rc = X.Load(HC.m_Content) ;
if (rc != E_OK)
return hzerr(rc, "Could not load feed %s", *feed) ;
// The page is an RSS document so select the <item> tags
rc = X.FindNodes(linx, m_tagItem.m_Slct) ;
threadLog("Found %d <item> tags in feed %s\n", linx.Count(), *feed) ;
if (rc != E_OK)
return rc ;
for (nIndex = 0; nIndex < linx.Count() ; nIndex++)
{
threadLog("case 1\n") ;
pN1 = linx[nIndex] ;
title = (char*) 0; desc = (char*) 0; page = (char*) 0; uqid = (char*) 0; dstr = (char*) 0;
for (pN2 = pN1->GetFirstChild() ; pN2 ; pN2 = pN2->Sibling())
{
threadLog("case 2\n") ;
if (pN2->NameEQ(*m_tagTitl.m_Slct)) { title = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDesc.m_Slct)) { desc = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagLink.m_Slct)) { page = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagUqid.m_Slct)) { uqid = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDate.m_Slct)) { dstr = pN2->m_fixContent ; continue ; }
}
threadLog("case 3\n") ;
if (!page)
{ threadLog("case 1: title=%s; link=null uqid=%s\n", *title, *uqid) ; page = uqid ; }
if (!page)
{ threadLog("case 2: title=%s; link=null uqid=%s\n", *title, *uqid) ; continue ; }
threadLog("title=%s; link=%s\n", *title, *page) ;
if (m_mapHist.Exists(page))
threadLog("Exists in history, page %s\n", *page) ;
else
{
pMark = new hzDocMeta() ;
pMark->m_Title = title ;
pMark->m_Desc = desc ;
pMark->m_urlReq = page ;
if (dstr)
pMark->m_Modified.SetDateTime(*dstr) ;
// todo.Insert(page) ;
todo.Add(page) ;
threadLog("Adding to history, page %s\n", *page) ;
}
}
// Fetch all the new links found above by recursive call
for (nIndex = 0; nIndex < todo.Count() ; nIndex++)
{
page = todo[nIndex] ;
// pMark = m_mapHist[page] ;
threadLog("Processing %s\n", *page) ;
rc = getRss_r(hRet, page, nLevel) ;
}
return rc ;
}
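For reference, this is the shape of the RSS item the parsing loop above extracts its fields from. The element names shown (title, link, description, guid, pubDate) are the conventional RSS 2.0 names and are illustrative only; the names actually matched come from the m_tagTitl, m_tagDesc, m_tagLink, m_tagUqid and m_tagDate selectors.

```cpp
// Illustrative RSS <item> fragment, held as a C++ raw string. Field mapping
// (assuming conventional RSS 2.0 element names):
//   title -> title, description -> desc, link -> page, guid -> uqid, pubDate -> dstr
const char* sampleItem = R"(
	<item>
		<title>Example article</title>
		<link>http://example.com/articles/1.html</link>
		<description>Short summary of the example article</description>
		<guid>http://example.com/articles/1.html</guid>
		<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
	</item>
)" ;
```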