Run the series of hzWebCMD directives to sync key pages from a website to a repository. Arguments: None.

Return Type: hzEcode
Function name: hzWebhost::Sync
Arguments: (void)

Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp

Function Logic:

0:START 1:items 2:unknown 3:items 4:Return E_NOINIT 5:unknown 6:items 7:Return E_NOINIT 8:rc 9:unknown 10:items 11:Return rc 12:unknown 13:unknown 14:pDoc wc 15:wc.m_Cmd 16:WEBCMD_LOAD_PAGE 17:unknown 18:items rc 19:items pDoc 20:unknown 21:items rc 22:items 23:unknown 24:pHdoc 25:unknown 26:unknown 27:pForm items 28:WEBCMD_LOAD_LIST 29:items 30:unknown 31:items rc 32:unknown 33:items rc 34:pgl 35:unknown 36:url pDoc 37:unknown 38:items rc 39:items 40:items 41:WEBCMD_SLCT_PAGE 42:items 43:unknown 44:rc items 45:unknown 46:rc items 47:unknown 48:rc items 49:unknown 50:rc items 51:unknown 52:unknown 53:pDoc 54:pDoc 55:unknown 56:rc items 57:pgl pgl 58:unknown 59:items 60:pHdoc 61:unknown 62:pElem items 63:unknown 64:items 65:items 66:rc 67:unknown 68:pElem items 69:unknown 70:anam aval items 71:unknown 72:url items 73:items 74:items items 75:WEBCMD_SLCT_LIST 76:items 77:unknown 78:items rc 79:unknown 80:rc items 81:unknown 82:rc items 83:unknown 84:pgl2 pgl2 pgl 85:unknown 86:rc items 87:unknown 88:url 89:unknown 90:pDoc 91:pDoc 92:unknown 93:rc items 94:unknown 95:pHdoc rc 96:unknown 97:pElem items 98:unknown 99:anam aval items 100:unknown 101:url items 102:items 103:items items 104:WEBCMD_RGET 105:items items pDoc 106:unknown 107:items 108:unknown 109:items 110:pHdoc items 111:pDoc 112:unknown 113:url 114:unknown 115:items 116:items 117:unknown 118:items items 119:unknown 120:url items 121:unknown 122:items 123:unknown 124:items 125:unknown 126:items 127:items pDoc 128:unknown 129:items 130:unknown 131:pHdoc items 132:unknown 133:url 134:unknown 135:items 136:pDoc 137:WEBCMD_POST 138:items pForm 139:unknown 140:items 141:rc 142:unknown 143:items 144:Return rc 145:unknown 146:items 147:Return rc 148:unknown 149:url / m_Repos S S items 150:unknown 151:items 152:Return E_WRITEFAIL 153:items items items items 154:WEBCMD_RSS 155:items 156:unknown 157:m_tagItem m_tagItem m_tagItem 158:unknown 159:m_tagUqid m_tagUqid m_tagUqid 160:unknown 161:m_tagLink m_tagLink m_tagLink 
162:unknown 163:m_tagDesc m_tagDesc m_tagDesc 164:unknown 165:m_tagDate m_tagDate m_tagDate 166:rc items 167:rc 168:unknown 169:pgl pgl 170:unknown 171:pDoc pDoc 172:Return rc

Function body:

hzEcode hzWebhost::Sync (void)
{
    //  Run the series of hzWebCMD directives to sync key pages from a website to a repository.
    //
    //  Each command in m_Commands is executed in order. Execution resumes from m_Sofar (set by a
    //  previously saved manifest), and stops on the first command that fails or on an HTTP error.
    //
    //  Arguments: None
    //
    //  Returns: E_NOINIT If no repository, no domain or no homepage has been specified
    //     E_NOTFOUND If the login page was not located
    //     E_WRITEFAIL If the login form received was not written to the repository
    //     E_OPENFAIL If the visit status file could not be opened
    //     E_OK  If the scrape operation was successful

    _hzfunc("hzWebhost::Sync") ;

    hzMapS  <hzUrl,hzDocument*> cur ;       //  Currently loaded documents
    hzMapS  <hzString,hzString> fvals ;     //  Form values to be submitted
    hzVect  <hzHtmElem*>        elems ;     //  Elements selected by the web selector command
    hzList  <hzWebCMD>::Iter    ci ;        //  Iterator for web commands
    hzList  <hzPair>::Iter      pi ;        //  Iterator for form data
    hzList  <hzUrl>::Iter       si ;        //  Iterator for pagelist
    hzList  <hzHtmForm*>::Iter  fi ;        //  Iterator for forms
    hzSet   <hzUrl>     set_ctrl ;          //  Initial links from processing config params
    hzVect  <hzUrl>     pglinks ;           //  Links encountered within a given page
    hzVect  <hzUrl>     allinks ;           //  Links encountered within all pages (aggregated)
    hzVect  <hzString>  hdrs ;              //  Extra headers, needed for submit form
    hzList  <hzPair>    flist ;             //  Filtered list of form values
    ofstream        os ;                    //  For writing form response
    _pageList*      pgl = 0;                //  Primary pagelist instance
    _pageList*      pgl2 = 0;               //  Secondary pagelist instance
    hzWebCMD        wc ;                    //  Current web command
    hzDocument*     pDoc ;                  //  Downloaded document
    hzDocHtml*      pHdoc ;                 //  Set if downloaded document is a HTML page.
    hzHtmElem*      pElem ;                 //  HTML element (tag) lifted from page
    hzHtmForm*      pForm ;                 //  Form found in page
    hzPair          P ;                     //  Name value pair
    hzXDate         now ;                   //  Date/time now (for checking if pages have expired)
    hzAttrset       ai ;                    //  HTML element attribute iterator
    hzString        anam ;                  //  Attribute name
    hzString        aval ;                  //  Attribute value
    hzString        S ;                     //  Temp string
    hzUrl           url ;                   //  Temp link
    uint32_t        nStart ;                //  Links iterator
    uint32_t        nLimit ;                //  Links iterator
    uint32_t        nCount ;                //  Links iterator
    uint32_t        n ;                     //  Aggregation iterator
    HttpRC          hRet = HTTPMSG_OK ;     //  HTML return code
    hzEcode         rc ;                    //  Return code

    threadLog("Called hzWebhost::Sync\n") ;

    //  Check if repository and list of commands is set up
    if (!m_Repos)
        { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
    if (!m_Commands.Count())
        { threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }

    //  Read in any existing manifest file
    rc = _loadstatus() ;
    if (rc != E_OK)
        { threadLog("Error on loading status - aborting\n") ; return rc ; }

    //  If resuming execution, start where we left off (skip the first m_Sofar commands)
    for (n = 0,ci = m_Commands ; n < m_Sofar ; n++, ci++) ;

    //  Execute commands in order; stop on the first error or HTTP failure
    for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
    {
        pDoc = 0;
        wc = ci.Element() ;

        switch  (wc.m_Cmd)
        {
        case WEBCMD_LOAD_PAGE:  //  Get a page (no conditions)
            if (!wc.m_Url)
                { threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }

            threadLog("Doing WEBCMD_LOAD_PAGE\n") ;
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }

            //  Track the document so it can be reused by later commands and freed at the end
            cur.Insert(wc.m_Url, pDoc) ;

            if (pDoc->Whatami() == DOCTYPE_HTML)
            {
                pHdoc = (hzDocHtml*) pDoc ;
                if (pHdoc->m_Forms.Count())
                {
                    //  Add the forms to the m_Forms map in the hzWebhost instance
                    for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
                    {
                        pForm = fi.Element() ;
                        m_Forms.Insert(pForm->name, pForm) ;
                    }
                }
            }
            break ;

        case WEBCMD_LOAD_LIST:  //  Get a list of pages (list supplied in command)
            threadLog("Doing WEBCMD_LOAD_LIST\n") ;
            if (!wc.m_Inputs)
                { threadLog(" - Invalid loadList command - no list of links named\n") ; rc  = E_NOTFOUND ; break ; }
            if (!m_Pagelists.Exists(wc.m_Inputs))
                { threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc  = E_NOTFOUND ; break ; }

            pgl = m_Pagelists[wc.m_Inputs] ;
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                pDoc = Download(url) ;

                //  NOTE(review): pages fetched here are not inserted into 'cur' and are not freed
                //  in the cleanup loop below - confirm Download() ownership semantics
                if (!pDoc)
                    { threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
                else
                    threadLog(" - Fetched page %s\n", *url) ;
            }
            threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
            break ;

        case WEBCMD_SLCT_PAGE:  //  Select links from a page
            threadLog("Doing WEBCMD_SLCT_PAGE\n") ;

            //  Exactly one of URL/Inputs must be given, plus an output name and a selection criterion
            if (wc.m_Url && wc.m_Inputs)    { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
            if (!wc.m_Url && !wc.m_Inputs)  { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
            if (!wc.m_Output)               { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)   { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globing criteria\n") ; }

            if (rc != E_OK)
                break ;

            //  Reuse an already-loaded document where possible
            if (cur.Exists(wc.m_Url))
                pDoc = cur[wc.m_Url] ;
            else
                pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }

            pgl = new _pageList() ;
            pgl->name = wc.m_Output ;

            if (pDoc->Whatami() != DOCTYPE_HTML)
                threadLog("Not a HTML document\n") ;
            else
            {
                pHdoc = (hzDocHtml*) pDoc ;

                //  Diagnostic dump of every tag in the page
                for (n = 0; n < pHdoc->m_vecTags.Count() ; n++)
                {
                    pElem = pHdoc->m_vecTags[n] ;

                    threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        threadLog(" %s=%s", ai.Name(), ai.Value()) ;
                    }
                    threadLog(" />\n") ;
                }

                rc = pHdoc->FindElements(elems, wc.m_Slct) ;

                //  Harvest href attributes from the selected elements into the new pagelist
                for (n = 0; n < elems.Count() ; n++)
                {
                    pElem = elems[n] ;

                    //  FIX: format string had two %s specifiers but only one argument (undefined behavior)
                    threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;

                        threadLog(" %s=%s", *anam, *aval) ;
                        if (anam == "href")
                        {
                            url = aval ;
                            pgl->links.Add(url) ;
                        }
                    }
                    threadLog(" />\n") ;
                }
            }

            threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
            m_Pagelists.Insert(pgl->name, pgl) ;
            break ;

        case WEBCMD_SLCT_LIST:  //  Select links from a set of pages (supplied as a set of links)
            threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ;

            if (!wc.m_Inputs)
                { threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }
            if (!wc.m_Output)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; }

            if (rc != E_OK)
                break ;

            //  FIX: look up the source pagelist before allocating the output list so the
            //  error exit below does not leak the allocation
            pgl = m_Pagelists[wc.m_Inputs] ;
            if (!pgl)
                { rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }

            pgl2 = new _pageList() ;
            pgl2->name = wc.m_Output ;

            //  Visit each page in the source list and harvest matching links into pgl2
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;

                if (cur.Exists(url))
                    pDoc = cur[url] ;
                else
                    pDoc = Download(url) ;
                if (!pDoc)
                    { rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }

                if (pDoc->Whatami() == DOCTYPE_HTML)
                {
                    pHdoc = (hzDocHtml*) pDoc ;

                    rc = pHdoc->FindElements(elems, wc.m_Slct) ;

                    for (n = 0; n < elems.Count() ; n++)
                    {
                        pElem = elems[n] ;

                        //  FIX: format string had two %s specifiers but only one argument (undefined behavior)
                        threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                        for (ai = pElem ; ai.Valid() ; ai.Advance())
                        {
                            anam = ai.Name() ; aval = ai.Value() ;

                            threadLog(" %s=%s", *anam, *aval) ;
                            if (anam == "href")
                            {
                                url = aval ;
                                pgl2->links.Add(url) ;
                            }
                        }
                        threadLog(" />\n") ;
                    }
                }
            }

            threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
            m_Pagelists.Insert(pgl2->name, pgl2) ;
            break ;

        case WEBCMD_RGET:   //  Get a root page
            threadLog("Doing WEBCMD_RGET\n") ;
            threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;

            //  Get root page first
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *wc.m_Url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                }
                delete pDoc ;
            }

            //  Now aggregate the vector of links from the page to a vector of all links from all pages. Use a set to avoid repeats.
            for (n = 0; n < pglinks.Count() ; n++)
            {
                url = pglinks[n] ;
                if (!set_ctrl.Exists(url))
                    allinks.Add(url) ;
            }

            //  Starting at the site root and for each page, grab all links and go to each link in turn. The outer loop
            //  re-runs whenever the inner pass has appended new links to 'allinks' (nCount ends below the new count).
            threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;

            for (nStart = 0; nStart < allinks.Count() ; nStart = nCount)
            {
                now.SysDateTime() ;
                pglinks.Clear() ;

                for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
                {
                    url = allinks[nCount] ;
                    threadLog("Cosidering link %s - ", *url.Whole()) ;

                    //  Skip pages already visited, the logout page, and anything outside the configured domains
                    if (m_mapHist.Exists(url))              { threadLog("historic\n") ; continue ; }
                    if (url == m_Authexit)                  { threadLog("exit-page\n") ; continue ; }
                    if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }

                    //  Page not yet visited so we visit it, put it in list of pages visited and get the links. Some of these links may add to
                    //  the list of links.
                    threadLog("Fetching\n") ;
                    pDoc = Download(url) ;
                    if (!pDoc)
                        threadLog("case 2. Could not fetch page %s\n", *url) ;
                    else
                    {
                        if (pDoc->Whatami() == DOCTYPE_HTML)
                        {
                            pHdoc = (hzDocHtml*) pDoc ;
                            pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;

                            //  Re-aggregate the all-links vector
                            for (n = 0; n < pglinks.Count() ; n++)
                            {
                                url = pglinks[n] ;
                                if (!set_ctrl.Exists(url))
                                    allinks.Add(url) ;
                            }
                        }
                        delete pDoc ;
                    }
                }
            }
            break ;

        case WEBCMD_POST:   //  Post a form. The form should have been previously downloaded and will be looked for by name
            threadLog("Doing WEBCMD_POST\n") ;

            pForm = m_Forms[wc.m_Output] ;
            if (!pForm)
                threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;

            //  Take the command's formdata and use it to populate the form's set of fields
            /*
            **  for (pi = pForm->fields ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **     for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **  
            **     for (n = 0 ; n < fvals.Count() ; n++)
            **     {
            **      P.name = fvals.GetKey(n) ;
            **      P.value = fvals.GetObj(n) ;
            **      flist.Add(P) ;
            **     }
            **                */

            rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
            if (rc != E_OK)
                { threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
            //  NOTE(review): rc is E_OK at this point, so a bad HTTP response returns E_OK to the
            //  caller despite the error log - confirm whether a failure code was intended here
            if (hRet != HTTPMSG_OK)
                { threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ; }

            //  Write out the login response
            if (m_Repos)
            {
                url = wc.m_Url ;
                S = m_Repos + "/" + url.Filename() ;
                S += ".response" ;

                os.open(*S) ;
                if (os.fail())
                    { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
                os << HC.m_Header ;
                os << "\r\n\r\n" ;
                os << HC.m_Content ;
                os.close() ;
            }
            break ;

        case WEBCMD_RSS:    //  Get an RSS feed
            threadLog("Doing WEBCMD_RSS\n") ;

            //  If XML selectors for RSS feed are not initialized, set them to the standard RSS 2.0 node names here
            if (!m_tagItem.m_Slct)  { m_tagItem.m_Filt = (char*) 0; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
            if (!m_tagUqid.m_Slct)  { m_tagUqid.m_Filt = (char*) 0; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
            if (!m_tagLink.m_Slct)  { m_tagLink.m_Filt = (char*) 0; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
            if (!m_tagDesc.m_Slct)  { m_tagDesc.m_Filt = (char*) 0; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
            if (!m_tagDate.m_Slct)  { m_tagDate.m_Filt = (char*) 0; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }

            //  Get the feed (recursive fetch)
            rc = getRss_r(hRet, wc.m_Url, 0);
            threadLog("Processed items\n") ;
            break ;
        }
    }

    //  Write out manifest file
    rc = _savestatus() ;

    //  Clear pagelists and documents accumulated during the run
    for (n = 0; n < m_Pagelists.Count() ; n++)
    {
        pgl = m_Pagelists.GetObj(n) ;
        delete pgl ;
    }

    for (n = 0; n < cur.Count() ; n++)
    {
        pDoc = cur.GetObj(n) ;
        delete pDoc ;
    }

    return rc ;
}