In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created and existing pages can be modified on an ad-hoc basis. The RSS feeds allow greater ease when syncing an external website to the local machine. By periodically reading one or more RSS feeds one can obtain a set of links which can generally be taken as the set of pages deemed 'current' by the website. By comparing these links to a history file of already fetched links, new pages can be added to a repository as they appear on the site. The RSS feeds are just XML files containing links. This function will obtain all the RSS feeds from the site, garner all the links from them and then download any pages from the links that are not already in the site history. The feeds themselves are not saved as these will be fetched again. Arguments: None
| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | hzWebhost::GetRSS | (void) |
Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp
Function Logic:
Function body:
hzEcode hzWebhost::GetRSS (void)
{
	// In general a website can be thought of as a source of 'rolling' news updates in which old pages are deleted, new pages created
	// and existing pages can be modified on an ad-hoc basis. The RSS feeds allow greater ease when syncing an external website to the
	// local machine. By periodically reading one or more RSS feeds one can obtain a set of links which can generally be taken as the
	// set of pages deemed 'current' by the website. By comparing these links to a history file of already fetched links, new pages
	// can be added to a repository as they appear on the site. The RSS feeds are just XML files containing links.
	//
	// This function will obtain all the RSS feeds from the site, garner all the links from them and then download any pages from the
	// links that are not already in the site history. The feeds themselves are not saved as these will be fetched again.
	//
	// A failure on one feed does not abort the run: the remaining feeds are still processed and the visit status file is still
	// written, but the first feed error encountered is returned in preference to the save-status result so callers are not left
	// believing the sync was complete.
	//
	// Arguments:	None
	//
	// Returns:	E_NOINIT	If the repository for the webhost has not previously been defined
	//		E_OPENFAIL	If the visit status file could not be opened
	//		E_NODATA	If the download failed
	//		E_TYPE		If the downloaded material does not appear to be XML
	//		E_FORMAT	If the downloaded material could not be loaded into an XML document
	//		E_ARGUMENT	If the RSS tags are not defined
	//		E_NOTFOUND	If no tags were found in the RSS
	//		E_OK		If the RSS data was collected

	_hzfunc("hzWebhost::GetRSS") ;

	hzList<hzUrl>::Iter	fi ;			// RSS feeds iterator
	hzUrl		feed ;				// Temp link
	HttpRC		hRet ;				// HTML return code
	hzEcode		rc = E_OK ;			// Return code
	hzEcode		errFeed = E_OK ;		// First feed failure encountered, if any

	threadLog("Called\n") ;

	// Login first: some sites require a session cookie before they will serve the feeds
	rc = Login() ;
	if (rc != E_OK)
		{ threadLog("Login failed\n") ; return rc ; }

	// Without at least one feed URL there is nothing to sync
	if (!m_Feeds.Count())
		{ threadLog("Website has no starting point (URL) for an RSS feed.\n") ; return E_NOINIT ; }

	// If XML selectors for RSS feed are not initialized, apply the standard RSS 2.0 element names here
	if (!m_tagItem.m_Slct)	{ m_tagItem.m_Filt = (char*) 0; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
	if (!m_tagUqid.m_Slct)	{ m_tagUqid.m_Filt = (char*) 0; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
	if (!m_tagLink.m_Slct)	{ m_tagLink.m_Filt = (char*) 0; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
	if (!m_tagDesc.m_Slct)	{ m_tagDesc.m_Filt = (char*) 0; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
	if (!m_tagDate.m_Slct)	{ m_tagDate.m_Filt = (char*) 0; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }

	/*
	**	Fetch all the feed XML documents from the RSS source(s)
	*/

	for (fi = m_Feeds ; fi.Valid() ; fi++)
	{
		feed = fi.Element() ;

		// Get the feed. On failure, remember the first error but carry on with the remaining feeds
		rc = getRss_r(hRet, feed, 0) ;
		if (rc != E_OK)
		{
			threadLog("Feed fetch/processing failed\n") ;
			if (errFeed == E_OK)
				errFeed = rc ;
		}

		threadLog("Processed items\n") ;
	}

	// Write out visit status file regardless, so any pages successfully fetched are recorded
	rc = _savestatus() ;

	// Report the first feed failure in preference to the save-status result
	return errFeed != E_OK ? errFeed : rc ;
}