Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, which is tested to ensure it really is XML. The RSS feed is assumed to contain only links, which may point either to HTML pages or to further (sub) RSS feeds. The HTML pages are end points of the process: they are downloaded, and any links they contain are recorded but not followed. The sub-RSS feeds are processed by a recursive call to this function.
| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::getRss_r | (HttpRC&, hzUrl&, uint32_t) |
Declared in file: hzHttpClient.h
Defined in file: hzHttpClient.cpp
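A minimal, hypothetical call sketch (not part of the documented API): it assumes a hzWebhost instance whose RSS tag selectors (m_tagItem, m_tagTitl, m_tagLink, etc.) have already been configured elsewhere, and that hzUrl can be assigned from a string literal. The seed URL is illustrative.

```cpp
// Hypothetical usage sketch: start the recursive RSS harvest from a seed
// feed at hierarchy level 0. Host configuration is assumed to be done.
HttpRC  hRet ;                                      // HTTP return code, set by the operation
hzUrl   seed = "http://example.com/news/rss.xml" ;  // Illustrative seed feed

hzEcode rc = myWebhost.getRss_r(hRet, seed, 0) ;
if (rc != E_OK)
    threadLog("RSS harvest of %s failed\n", *seed) ;
```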
Function Logic:
Function body:
hzEcode hzWebhost::getRss_r (HttpRC& hRet, hzUrl& feed, uint32_t nLevel)
{
// Recursive fetch of RSS documents. The supplied URL is downloaded and loaded into an XML document, which is tested to ensure it really is
// XML. The RSS feed is assumed to contain only links, which may point either to HTML pages or to further (sub) RSS feeds. The HTML pages
// are end points of the process: they are downloaded, and any links they contain are recorded but not followed. The sub-RSS feeds are
// processed by a recursive call to this function.
//
// Arguments: 1) hRet Set by this operation
// 2) feed The RSS URL
// 3) nLevel RSS Hierarchy
//
// Returns: E_NODATA If the download failed
// E_TYPE If the downloaded material does not appear to be XML
// E_FORMAT If the downloaded material could not be loaded into an XML document
// E_ARGUMENT If the RSS tags are not defined
// E_NOTFOUND If no tags were found in the RSS
// E_OK If the RSS data was collected
_hzfunc("hzWebhost::getRss_r") ;
hzVect<hzXmlNode*> linx ; // Links found in (this) RSS feed page
hzVect<hzUrl> todo ; // Links found in this RSS feed page (additions are gated by the history check below)
hzDocXml X ; // For loading of RSS feed pages and extraction of links
hzXmlNode* pN1 ; // Nodes (containing <item>)
hzXmlNode* pN2 ; // Nodes (containing <item> subnodes of title, link, description)
hzDocMeta* pMark ; // Document meta data
hzDocument* pDoc ; // Document found at URL (could be XML or HTML)
hzUrl page ; // Temp link
hzString desc ; // RSS article description
hzString dstr ; // RSS article date
hzString uqid ; // Unique ID of RSS item
hzString title ; // RSS article title
uint32_t nIndex ; // Links iterator
hzEcode rc = E_OK ; // Return code
// Fetch the current RSS document
pDoc = Download(feed) ;
if (!pDoc)
{ threadLog("Could not fetch URL %s\n", *feed) ; return E_NODATA ; }
// If not an XML document then it is just a page. Nothing further.
if (pDoc->Whatami() != DOCTYPE_XML)
{ threadLog("case 1. Fetched feed (%s) is not of doctype XML\n", *feed) ; return E_TYPE ; }
nLevel++ ;
// Load current RSS document into XML document tree
rc = X.Load(HC.m_Content) ;
if (rc != E_OK)
return hzerr(rc, "Could not load feed %s", *feed) ;
// The page is an RSS document so select the <item> tags
rc = X.FindNodes(linx, m_tagItem.m_Slct) ;
threadLog("Found %d <item> tags in feed %s\n", linx.Count(), *feed) ;
if (rc != E_OK)
return rc ;
for (nIndex = 0; nIndex < linx.Count() ; nIndex++)
{
threadLog("case 1\n") ;
pN1 = linx[nIndex] ;
title = (char*) 0; desc = (char*) 0; page = (char*) 0; uqid = (char*) 0; dstr = (char*) 0;
for (pN2 = pN1->GetFirstChild() ; pN2 ; pN2 = pN2->Sibling())
{
threadLog("case 2\n") ;
if (pN2->NameEQ(*m_tagTitl.m_Slct)) { title = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDesc.m_Slct)) { desc = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagLink.m_Slct)) { page = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagUqid.m_Slct)) { uqid = pN2->m_fixContent ; continue ; }
if (pN2->NameEQ(*m_tagDate.m_Slct)) { dstr = pN2->m_fixContent ; continue ; }
}
threadLog("case 3\n") ;
if (!page)
{ threadLog("case 1: title=%s; link=null uqid=%s\n", *title, *uqid) ; page = uqid ; }
if (!page)
{ threadLog("case 2: title=%s; link=null uqid=%s\n", *title, *uqid) ; continue ; }
threadLog("title=%s; link=%s\n", *title, *page) ;
if (m_mapHist.Exists(page))
threadLog("Exists in history, page %s\n", *page) ;
else
{
pMark = new hzDocMeta() ;
pMark->m_Title = title ;
pMark->m_Desc = desc ;
pMark->m_urlReq = page ;
if (dstr)
pMark->m_Modified.SetDateTime(*dstr) ;
// todo.Insert(page) ;
todo.Add(page) ;
threadLog("Adding to history, page %s\n", *page) ;
}
}
// Fetch all the new links found above by recursive call
for (nIndex = 0; nIndex < todo.Count() ; nIndex++)
{
page = todo[nIndex] ;
// pMark = m_mapHist[page] ;
threadLog("Processing %s\n", *page) ;
rc = getRss_r(hRet, page, nLevel) ;
}
return rc ;
}
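For reference, this is the shape of the RSS item the parsing loop above extracts its fields from. The element names shown (title, link, description, guid, pubDate) are the conventional RSS 2.0 names and are illustrative only; the names actually matched come from the m_tagTitl, m_tagDesc, m_tagLink, m_tagUqid and m_tagDate selectors.

```cpp
// Illustrative RSS <item> fragment, held as a C++ raw string. Field mapping
// (assuming conventional RSS 2.0 element names):
//   title -> title, description -> desc, link -> page, guid -> uqid, pubDate -> dstr
const char* sampleItem = R"(
	<item>
		<title>Example article</title>
		<link>http://example.com/articles/1.html</link>
		<description>Short summary of the example article</description>
		<guid>http://example.com/articles/1.html</guid>
		<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
	</item>
)" ;
```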