Return Type    Function name       Arguments
hzEcode        TokenizeFreetext    (hzVect<hzToken>&, hzChain&)

Declared and defined in file: hzTokens.cpp

Function Logic:

0:START 1:hzVect::Clear hzChain::Size 2:!C.Size() 3:Return E_NODATA 4:ci hzChain::Iter::Line 5:!ci.eof(); 6:!IsAlphanum(*ci) 7:items 8:hzChain::Clear nWeird nAlpha nDigit nSymb nPunct hzChain::Iter::Line nLine 9:!ci.eof()&&IsAlphanum(*ci); 10:IsPunct(*ci) 11:ci[1]<=(char)32 12:items 13:items hzChain::AddByte items 14:IsSymb(*ci) 15:items hzChain::AddByte items 16:IsDigit(*ci) 17:items hzChain::AddByte items 18:IsAlpha(*ci) 19:items hzChain::AddByte items 20:AtUnicodeSeq(ucVal,ucLen,ci) 21:ucVal>255 22:items items 23:hzChain::AddByte ci 24:hzChain::AddByte hzChain::Size 25:!raw.Size() 26:nAlpha 27:!nDigit&&!nPunct&&!nSymb&&!nWeird 28:word hzString::ToLower hzToken::Init hzVect::Add 29:Return E_OK

Function body:

hzEcode TokenizeFreetext (hzVect<hzToken>& toks, hzChain& C)
{
    //  Break free text held in a chain into word tokens.
    //
    //  The chain is scanned for runs of characters for which IsAlphanum() is true. Each run is gathered into a scratch chain while counting the
    //  alphabetic, digit, punctuation, symbol and out-of-range (value > 255) Unicode characters it contains. A run is emitted as a token only if
    //  it holds at least one alphabetic character and none of the other kinds. Emitted tokens are lower-cased, carry the iterator's line number
    //  as it was at the start of the run, and are typed TOKEN_ALPHANUM.
    //
    //  Arguments:  1)  toks    Vector of tokens to populate. Always cleared on entry.
    //              2)  C       Chain holding the text to be tokenized.
    //
    //  Returns:    E_NODATA    If the supplied chain is empty
    //              E_OK        Otherwise (even if no tokens were found)

    _hzfunc("TokenizeFreetext") ;

    hzChain     raw ;       //  Characters of the run currently being gathered
    chIter      ci ;        //  Input chain iterator
    hzToken     T ;         //  Token staging area
    hzString    word ;      //  Lower-cased text of an accepted run
    uint32_t    ucVal ;     //  Value of a Unicode sequence (set by AtUnicodeSeq)
    uint32_t    nPunct ;    //  Punctuation characters in the current run
    uint32_t    nSymb ;     //  Symbol characters in the current run
    uint32_t    nDigit ;    //  Digits in the current run
    uint32_t    nAlpha ;    //  Alphabetic characters in the current run
    uint32_t    nWeird ;    //  Unicode values above 255 in the current run
    uint32_t    ucLen ;     //  Length of a Unicode sequence (set by AtUnicodeSeq)
    uint32_t    nLine ;     //  Line number at the start of the current run

    toks.Clear() ;

    if (!C.Size())
        return E_NODATA ;

    ci = C ;
    ci.Line(1);

    for (; !ci.eof() ;)
    {
        //  Skip anything that cannot start a run
        if (!IsAlphanum(*ci))
            { ci++ ; continue ; }

        raw.Clear() ;
        nPunct = nSymb = nDigit = nAlpha = nWeird = 0;
        nLine = ci.Line() ;

        //  Gather the run
        for (; !ci.eof() && IsAlphanum(*ci) ;)
        {
            if (IsPunct(*ci))
            {
                //  Punctuation followed by a space or control char ends the run and is not part of it
                if (ci[1] <= CHAR_SPACE)
                    { ci++ ; break ; }

                nPunct++ ;
                raw.AddByte(*ci) ;
                ci++ ;
                continue ;
            }

            if (IsSymb(*ci))    { nSymb++ ;     raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsDigit(*ci))   { nDigit++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsAlpha(*ci))   { nAlpha++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }

            if (AtUnicodeSeq(ucVal, ucLen, ci))
            {
                if (ucVal > 255)
                {
                    //  NOTE(review): only one byte is stepped over here whereas the branch below steps over ucLen. Confirm which is intended.
                    nWeird++ ;
                    ci++ ;
                }
                else
                    { raw.AddByte(ucVal & 0xff);ci+=ucLen ; }
                continue ;
            }

            //  Unclassified character: keep it, and advance so the loop always makes progress
            raw.AddByte(*ci) ;
            ci++ ;
        }

        if (!raw.Size())
            continue ;

        //  Only purely alphabetic runs become tokens
        if (nAlpha)
        {
            if (!nDigit && !nPunct && !nSymb && !nWeird)
            {
                word = raw ;
                word.ToLower() ;
                T.Init(word, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }
        }
    }

    return E_OK ;
}