Return Type | Function name | Arguments
hzEcode hzWebhost::Sync(void)

Declared in file: hzHttpClient.h
Defined in file : hzHttpClient.cpp

Function Logic:

0:START 1:!m_Repos 2:Return E_NOINIT 3:!m_Commands.Count() 4:Return E_NOINIT 5:hzWebhost::_loadstatus rc 6:rc!=E_OK 7:Return rc 8:n 9:rc==E_OK&&hRet==HTTPMSG_OK&&ci.Valid(); 10:pDoc hzList::Iter::Element wc 11:wc.m_Cmd 12:WEBCMD_LOAD_PAGE 13:!wc.m_Url 14:rc 15:hzWebhost::Download pDoc 16:!pDoc 17:rc 18:hzMapS::Insert 19:pDoc->Whatami()==DOCTYPE_HTML 20:pHdoc hzList::Count 21:pHdoc->m_Forms.Count() 22:fi.Valid(); 23:hzList::Iter::Element pForm hzMapS::Insert 24:WEBCMD_LOAD_LIST 25:!wc.m_Inputs 26:rc 27:!m_Pagelists.Exists(wc.m_Inputs) 28:rc 29:pgl si 30:si.Valid(); 31:hzList::Iter::Element url hzWebhost::Download pDoc 32:!pDoc 33:rc 34:WEBCMD_SLCT_PAGE 35:wc.m_Url&&wc.m_Inputs 36:rc 37:!wc.m_Url&&!wc.m_Inputs 38:rc 39:!wc.m_Output 40:rc 41:!wc.m_Slct&&!wc.m_Crit 42:rc 43:rc!=E_OK 44:cur.Exists(wc.m_Url) 45:pDoc 46:hzWebhost::Download pDoc 47:!pDoc 48:rc 49:pgl pgl 50:pDoc->Whatami()!=DOCTYPE_HTML 51:pHdoc n 52:nm_vecTags.Count(); 53:pElem hzHtmElem::Name ai 54:ai.Valid(); 55:hzAttrset::Name hzAttrset::Value 56:hzDocHtml::FindElements rc n 57:n 58:pElem hzHtmElem::Name ai 59:ai.Valid(); 60:hzAttrset::Name anam hzAttrset::Value aval 61:anam==href 62:url hzList::Add 63:hzList::Count hzMapS::Insert 64:WEBCMD_SLCT_LIST 65:!wc.m_Inputs 66:rc 67:!wc.m_Output 68:rc 69:!wc.m_Slct&&!wc.m_Crit 70:rc 71:rc!=E_OK 72:pgl2 pgl2 pgl 73:!pgl 74:rc 75:si.Valid(); 76:hzList::Iter::Element url hzMapS::Exists 77:cur.Exists(url) 78:pDoc 79:hzWebhost::Download pDoc 80:!pDoc 81:rc 82:pDoc->Whatami()==DOCTYPE_HTML 83:pHdoc hzDocHtml::FindElements rc n 84:n 85:pElem hzHtmElem::Name ai 86:ai.Valid(); 87:hzAttrset::Name anam hzAttrset::Value aval 88:anam==href 89:url hzList::Add 90:hzList::Count hzMapS::Insert 91:WEBCMD_RGET 92:hzWebhost::Download pDoc 93:!pDoc 94:pDoc->Whatami()!=DOCTYPE_HTML 95:pHdoc hzDocHtml::ExtractLinksBasic 96:pDoc n 97:n 98:url hzSet::Exists 99:!set_ctrl.Exists(url) 100:hzVect::Add 101:hzVect::Count hzVect::Count 102:nStart 103:hzXDate::SysDateTime hzVect::Clear nStart 
nCount 104:nCount 105:url hzUrl::Whole hzMapS::Exists 106:m_mapHist.Exists(url) 107:url==m_Authexit 108:hzUrl::Domain hzSet::Exists 109:!m_Domains.Exists(url.Domain()) 110:hzWebhost::Download pDoc 111:!pDoc 112:pDoc->Whatami()==DOCTYPE_HTML 113:pHdoc hzDocHtml::ExtractLinksBasic n 114:n 115:url hzSet::Exists 116:!set_ctrl.Exists(url) 117:hzVect::Add 118:pDoc 119:WEBCMD_POST 120:pForm 121:!pForm 122:hzHttpClient::PostForm rc 123:rc!=E_OK 124:Return rc 125:hRet!=HTTPMSG_OK 126:Return rc 127:m_Repos 128:url hzUrl::Filename S S ofstream::open ofstream::fail 129:os.fail() 130:Return E_WRITEFAIL 131:items items items close 132:WEBCMD_RSS 133:!m_tagItem.m_Slct 134:m_tagItem m_tagItem m_tagItem 135:!m_tagUqid.m_Slct 136:m_tagUqid m_tagUqid m_tagUqid 137:!m_tagLink.m_Slct 138:m_tagLink m_tagLink m_tagLink 139:!m_tagDesc.m_Slct 140:m_tagDesc m_tagDesc m_tagDesc 141:!m_tagDate.m_Slct 142:m_tagDate m_tagDate m_tagDate 143:hzWebhost::getRss_r rc 144:hzWebhost::_savestatus rc n 145:n 146:hzMapS::GetObj pgl pgl n 147:n 148:hzMapS::GetObj pDoc pDoc 149:Return rc

Function body:

//  Purpose:    Run the website synchronization sequence. Steps through the command list (m_Commands),
//              resuming from m_Sofar, and executes each command in turn: page loads, link selection,
//              recursive gets, form posts and RSS fetches. Status is loaded before and saved after
//              the run. All documents downloaded into the 'cur' map are freed before returning.
//
//  Arguments:  None
//
//  Returns:    E_NOINIT    If no repository or no commands are configured, or a command is malformed
//              E_NOTFOUND  If a required page or named page-list could not be obtained
//              E_CORRUPT   If a named input page-list is missing from m_Pagelists
//              E_WRITEFAIL If a form-post response could not be written to the repository
//              E_OK        If the synchronization ran to completion

hzEcode hzWebhost::Sync (void)
{
    _hzfunc("hzWebhost::Sync") ;

    hzMapS  <hzUrl,hzDocument*> cur ;       //  Documents downloaded this run (owned here, freed on exit)
    hzMapS  <hzString,hzString> fvals ;     //  Aggregated form name/value pairs (only used by commented-out POST code)
    hzVect  <hzHtmElem*>        elems ;     //  Elements matching a selection criteria
    hzList  <hzWebCMD>::Iter    ci ;        //  Command iterator
    hzList  <hzPair>::Iter      pi ;        //  Form field iterator (only used by commented-out POST code)
    hzList  <hzUrl>::Iter       si ;        //  Page-list link iterator
    hzList  <hzHtmForm*>::Iter  fi ;        //  Form iterator
    hzSet   <hzUrl>     set_ctrl ;          //  NOTE(review): never populated, so the Exists() tests below are always false - confirm intent
    hzVect  <hzUrl>     pglinks ;           //  Links extracted from the current page
    hzVect  <hzUrl>     allinks ;           //  Accumulated links for the recursive get
    hzVect  <hzString>  hdrs ;              //  Extra headers for form post
    hzList  <hzPair>    flist ;             //  Assembled form values (only used by commented-out POST code)
    ofstream        os ;                    //  For writing post responses to the repository
    _pageList*      pgl = 0 ;               //  Source page-list
    _pageList*      pgl2 = 0 ;              //  Output page-list
    hzWebCMD        wc ;                    //  Current command
    hzDocument*     pDoc ;                  //  Current document
    hzDocHtml*      pHdoc ;                 //  Current document as HTML
    hzHtmElem*      pElem ;                 //  Current HTML element
    hzHtmForm*      pForm ;                 //  Current HTML form
    hzPair          P ;                     //  Name/value pair
    hzXDate         now ;                   //  Timestamp taken per recursive-get round
    hzAttrset       ai ;                    //  Attribute iterator
    hzString        anam ;                  //  Attribute name
    hzString        aval ;                  //  Attribute value
    hzString        S ;                     //  Repository filepath
    hzUrl           url ;                   //  Current URL
    uint32_t        nStart ;                //  Start position of current recursive-get round
    uint32_t        nLimit ;                //  Snapshot of allinks size for current round
    uint32_t        nCount ;                //  Position within current round
    uint32_t        n ;                     //  General iterator
    HttpRC          hRet = HTTPMSG_OK ;     //  HTTP return code
    hzEcode         rc ;                    //  Return code

    threadLog("Called hzWebhost::Sync\n") ;

    if (!m_Repos)
        { threadLog("Website is not properly initialized (no repository)\n") ; return E_NOINIT ; }
    if (!m_Commands.Count())
        { threadLog("Website is not properly initialized (no commands)\n") ; return E_NOINIT ; }

    rc = _loadstatus() ;
    if (rc != E_OK)
        { threadLog("Error on loading status - aborting\n") ; return rc ; }

    //  Skip the commands already completed in a previous run
    for (n = 0, ci = m_Commands ; n < m_Sofar ; n++, ci++) ;

    //  Process remaining commands, stopping on the first error
    for (; rc == E_OK && hRet == HTTPMSG_OK && ci.Valid() ; ci++)
    {
        pDoc = 0 ;
        wc = ci.Element() ;

        switch  (wc.m_Cmd)
        {
        case WEBCMD_LOAD_PAGE:  //  Get a page (no conditions)
            if (!wc.m_Url)
                { threadLog("Invalid loadPage command - no URL\n") ; rc = E_NOINIT ; break ; }
            threadLog("Doing WEBCMD_LOAD_PAGE\n") ;
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                { threadLog("case 1. Could not fetch page %s\n", *wc.m_Url) ; rc = E_NOTFOUND ; break ; }
            cur.Insert(wc.m_Url, pDoc) ;
            if (pDoc->Whatami() == DOCTYPE_HTML)
            {
                //  Register any forms found in the page so a later WEBCMD_POST can find them by name
                pHdoc = (hzDocHtml*) pDoc ;
                if (pHdoc->m_Forms.Count())
                {
                    for (fi = pHdoc->m_Forms ; fi.Valid() ; fi++)
                    {
                        pForm = fi.Element() ;
                        m_Forms.Insert(pForm->name, pForm) ;
                    }
                }
            }
            break ;

        case WEBCMD_LOAD_LIST:  //  Get a list of pages (list supplied in command)
            threadLog("Doing WEBCMD_LOAD_LIST\n") ;
            if (!wc.m_Inputs)
                { threadLog(" - Invalid loadList command - no list of links named\n") ; rc  = E_NOTFOUND ; break ; }
            if (!m_Pagelists.Exists(wc.m_Inputs))
                { threadLog(" - No such list of links as %s\n", *wc.m_Inputs) ; rc  = E_NOTFOUND ; break ; }
            pgl = m_Pagelists[wc.m_Inputs] ;
            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                pDoc = Download(url) ;
                if (!pDoc)
                    { threadLog(" - case 3. Could not fetch page %s\n", *url) ; rc = E_NOTFOUND ; }
                else
                {
                    threadLog(" - Fetched page %s\n", *url) ;
                    //  Retain the document in 'cur' so it is freed in the cleanup at the end of this
                    //  function (Download() hands ownership to the caller - see the deletes in WEBCMD_RGET)
                    cur.Insert(url, pDoc) ;
                }
            }
            threadLog("Ending WEBCMD_LOAD_LIST (%s)\n", *wc.m_Inputs) ;
            break ;

        case WEBCMD_SLCT_PAGE:  //  Select links from a page
            threadLog("Doing WEBCMD_SLCT_PAGE\n") ;
            if (wc.m_Url && wc.m_Inputs)    { rc = E_NOINIT ; threadLog("Invalid request. Both a URL and an Input set specified\n") ; }
            if (!wc.m_Url && !wc.m_Inputs)  { rc = E_NOINIT ; threadLog("Invalid request. No URL or Input set specified\n") ; }
            if (!wc.m_Output)               { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)   { rc = E_NOINIT ; threadLog("Invalid linkSlct command - no node selection or globing criteria\n") ; }
            if (rc != E_OK)
                break ;

            //  NOTE(review): only the m_Url form is acted on below; a command supplying m_Inputs
            //  (permitted by the validation above) falls through to a lookup/download on an empty URL - confirm
            if (cur.Exists(wc.m_Url))
                pDoc = cur[wc.m_Url] ;
            else
            {
                pDoc = Download(wc.m_Url) ;
                if (pDoc)
                    cur.Insert(wc.m_Url, pDoc) ;    //  retain for reuse and cleanup (was previously leaked)
            }
            if (!pDoc)
                { rc = E_NOTFOUND ; threadLog("case 2. Could not fetch page %s\n", *wc.m_Url) ; break ; }

            pgl = new _pageList() ;
            pgl->name = wc.m_Output ;
            if (pDoc->Whatami() != DOCTYPE_HTML)
                threadLog("Not a HTML document\n") ;
            else
            {
                pHdoc = (hzDocHtml*) pDoc ;

                //  Log every tag in the page (diagnostic)
                for (n = 0; n < pHdoc->m_vecTags.Count() ; n++)
                {
                    pElem = pHdoc->m_vecTags[n] ;
                    threadLog("VEC TAG %d <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        threadLog(" %s=%s", ai.Name(), ai.Value()) ;
                    }
                    threadLog(" />\n") ;
                }

                //  Select matching elements and harvest their href attributes into the output list
                rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                for (n = 0; n < elems.Count() ; n++)
                {
                    pElem = elems[n] ;
                    //  Fix: format string had two conversions but only one argument (element index was missing)
                    threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                    for (ai = pElem ; ai.Valid() ; ai.Advance())
                    {
                        anam = ai.Name() ; aval = ai.Value() ;
                        threadLog(" %s=%s", *anam, *aval) ;
                        if (anam == "href")
                        {
                            url = aval ;
                            pgl->links.Add(url) ;
                        }
                    }
                    threadLog(" />\n") ;
                }
            }
            threadLog("Inserting pagelist %s of %d items\n", *pgl->name, pgl->links.Count()) ;
            m_Pagelists.Insert(pgl->name, pgl) ;
            break ;

        case WEBCMD_SLCT_LIST:  //  Select links from a set of pages (supplied as a set of links)
            threadLog("Doing WEBCMD_SLCT_LIST (%s)\n", *wc.m_Url) ;
            if (!wc.m_Inputs)
                { threadLog("Invalid slctList command - no source list of links\n") ; rc = E_NOINIT ; break ; }
            if (!wc.m_Output)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no name for output list\n") ; }
            if (!wc.m_Slct && !wc.m_Crit)
                { rc = E_NOINIT ; threadLog("Invalid slctList command - no node selection or globing criteria\n") ; }
            if (rc != E_OK)
                break ;

            //  Validate the source list BEFORE allocating the output list (the old order leaked pgl2 on the break below)
            pgl = m_Pagelists[wc.m_Inputs] ;
            if (!pgl)
                { rc = E_CORRUPT ; threadLog("Pagelist of %s not found\n", *wc.m_Inputs) ; break ; }
            pgl2 = new _pageList() ;
            pgl2->name = wc.m_Output ;

            for (si = pgl->links ; si.Valid() ; si++)
            {
                url = si.Element() ;
                if (cur.Exists(url))
                    pDoc = cur[url] ;
                else
                {
                    pDoc = Download(url) ;
                    if (pDoc)
                        cur.Insert(url, pDoc) ;     //  retain for reuse and cleanup (was previously leaked)
                }
                if (!pDoc)
                    { rc = E_NOTFOUND ; threadLog("case 2.2 Could not fetch page %s\n", *url) ; break ; }

                if (pDoc->Whatami() == DOCTYPE_HTML)
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    rc = pHdoc->FindElements(elems, wc.m_Slct) ;
                    for (n = 0; n < elems.Count() ; n++)
                    {
                        pElem = elems[n] ;
                        //  Fix: format string had two conversions but only one argument (element index was missing)
                        threadLog("%d. GOT <%s ", n, *pElem->Name()) ;
                        for (ai = pElem ; ai.Valid() ; ai.Advance())
                        {
                            anam = ai.Name() ; aval = ai.Value() ;
                            threadLog(" %s=%s", *anam, *aval) ;
                            if (anam == "href")
                            {
                                url = aval ;
                                pgl2->links.Add(url) ;
                            }
                        }
                        threadLog(" />\n") ;
                    }
                }
            }
            threadLog("Case 2. Inserting pagelist %s of %d items\n", *pgl2->name, pgl2->links.Count()) ;
            m_Pagelists.Insert(pgl2->name, pgl2) ;
            break ;

        case WEBCMD_RGET:   //  Get a root page
            threadLog("Doing WEBCMD_RGET\n") ;
            threadLog("Page=%s Crit=%s\n", *wc.m_Url, *wc.m_Crit) ;
            pDoc = Download(wc.m_Url) ;
            if (!pDoc)
                threadLog("case 4. Could not fetch page %s\n", *wc.m_Url) ;
            else
            {
                if (pDoc->Whatami() != DOCTYPE_HTML)
                    threadLog("Page %s not HTML\n", *wc.m_Url) ;
                else
                {
                    pHdoc = (hzDocHtml*) pDoc ;
                    pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                }
                delete pDoc ;
            }

            //  Seed the master link list with in-criteria links from the root page
            for (n = 0; n < pglinks.Count() ; n++)
            {
                url = pglinks[n] ;
                if (!set_ctrl.Exists(url))
                    allinks.Add(url) ;
            }
            threadLog("STAGE TWO Have %d links in history, %d links in 'all-links'\n", m_vecHist.Count(), allinks.Count()) ;

            //  Breadth-first expansion: each round processes the links appended by the previous round.
            //  nLimit snapshots the size so links added mid-round are deferred to the next round.
            for (nStart = 0; nStart < allinks.Count() ; nStart = nCount)
            {
                now.SysDateTime() ;
                pglinks.Clear() ;
                for (nCount = nStart, nLimit = allinks.Count() ; nCount < nLimit ; nCount++)
                {
                    url = allinks[nCount] ;
                    threadLog("Considering link %s - ", *url.Whole()) ;
                    if (m_mapHist.Exists(url))              { threadLog("historic\n") ; continue ; }
                    if (url == m_Authexit)                  { threadLog("exit-page\n") ; continue ; }
                    if (!m_Domains.Exists(url.Domain()))    { threadLog("URL %s outside domain\n", *url) ; continue ; }

                    threadLog("Fetching\n") ;
                    pDoc = Download(url) ;
                    if (!pDoc)
                        threadLog("case 2. Could not fetch page %s\n", *url) ;
                    else
                    {
                        if (pDoc->Whatami() == DOCTYPE_HTML)
                        {
                            pHdoc = (hzDocHtml*) pDoc ;
                            pHdoc->ExtractLinksBasic(pglinks, m_Domains, wc.m_Crit) ;
                            for (n = 0; n < pglinks.Count() ; n++)
                            {
                                url = pglinks[n] ;
                                if (!set_ctrl.Exists(url))
                                    allinks.Add(url) ;
                            }
                        }
                        delete pDoc ;
                    }
                }
            }
            break ;

        case WEBCMD_POST:   //  Post a form. The form should have been previously downloaded and will be looked for by name
            threadLog("Doing WEBCMD_POST\n") ;
            //  NOTE(review): lookup key is m_Output - verify this should not be m_Inputs. The form is
            //  only warned about if absent; the post proceeds regardless using wc.m_Formdata.
            pForm = m_Forms[wc.m_Output] ;
            if (!pForm)
                threadLog("Warning: No such form as [%s]\n", *wc.m_Output) ;
            /*
            **  for (pi = pForm->fields ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **     for (pi = wc.m_Formdata ; pi.Valid() ; pi++)
            **      { P = pi.Element() ; fvals.Insert(P.name, P.value) ; }
            **  
            **     for (n = 0 ; n < fvals.Count() ; n++)
            **     {
            **      P.name = fvals.GetKey(n) ;
            **      P.value = fvals.GetObj(n) ;
            **      flist.Add(P) ;
            **     }
            **                */
            rc = HC.PostForm(hRet, wc.m_Url, hdrs, wc.m_Formdata) ;
            if (rc != E_OK)
                { threadLog("Could not post form to %s\n", *wc.m_Url) ; return rc ; }
            if (hRet != HTTPMSG_OK)
                //  NOTE(review): rc is E_OK at this point so the caller sees success despite the bad HTTP response - confirm intended
                { threadLog("Invalid response to post form (to %s)\n", *wc.m_Url) ; return rc ; }

            if (m_Repos)
            {
                //  Record the raw response (header + content) in the repository as <filename>.response
                url = wc.m_Url ;
                S = m_Repos + "/" + url.Filename() ;
                S += ".response" ;
                os.open(*S) ;
                if (os.fail())
                    { threadLog("Cannot write out header file %s\n", *S) ; return E_WRITEFAIL ; }
                os << HC.m_Header ;
                os << "\r\n\r\n" ;
                os << HC.m_Content ;
                os.close() ;
            }
            break ;

        case WEBCMD_RSS:    //  Get an RSS feed
            threadLog("Doing WEBCMD_RSS\n") ;
            //  Apply standard RSS 2.0 tag names for any selectors not explicitly configured
            if (!m_tagItem.m_Slct)  { m_tagItem.m_Filt = (char*) 0; m_tagItem.m_Info = "node" ; m_tagItem.m_Slct = "item" ; }
            if (!m_tagUqid.m_Slct)  { m_tagUqid.m_Filt = (char*) 0; m_tagUqid.m_Info = "node" ; m_tagUqid.m_Slct = "guid" ; }
            if (!m_tagLink.m_Slct)  { m_tagLink.m_Filt = (char*) 0; m_tagLink.m_Info = "node" ; m_tagLink.m_Slct = "link" ; }
            if (!m_tagDesc.m_Slct)  { m_tagDesc.m_Filt = (char*) 0; m_tagDesc.m_Info = "node" ; m_tagDesc.m_Slct = "description" ; }
            if (!m_tagDate.m_Slct)  { m_tagDate.m_Filt = (char*) 0; m_tagDate.m_Info = "node" ; m_tagDate.m_Slct = "pubDate" ; }
            rc = getRss_r(hRet, wc.m_Url, 0);
            threadLog("Processed items\n") ;
            break ;
        }
    }

    rc = _savestatus() ;

    //  Free the page-lists and all documents downloaded during this run
    for (n = 0; n < m_Pagelists.Count() ; n++)
    {
        pgl = m_Pagelists.GetObj(n) ;
        delete pgl ;
    }
    for (n = 0; n < cur.Count() ; n++)
    {
        pDoc = cur.GetObj(n) ;
        delete pDoc ;
    }
    return rc ;
}