Tokenize into words and numbers only, ignoring all punctuation. This is suitable for indexation of documents although it is not suitable for querying a document index as it does not produce boolean expressions of words.

Return Type | Function name | Arguments
hzEcode | TokenizeWords | (hzVect<hzToken>&, hzChain&)

Declared and defined in file: hzTokens.cpp

Function Logic:

0:START 1:items 2:unknown 3:Return E_NODATA 4:unknown 5:unknown 6:items 7:unknown 8:items items nLine 9:unknown 10:unknown 11:items 12:items 13:unknown 14:S items items 15:unknown 16:unknown 17:unknown 18:ci 19:unknown 20:items items nLine items 21:unknown 22:items 23:unknown 24:S items items 25:items 26:Return E_OK

Function body:

hzEcode TokenizeWords (hzVect<hzToken>& toks, hzChain& C)
{
   //  Category: Text Processing
   //  
   //  Tokenize into words and numbers only, ignoring all punctuation. This is suitable for indexation of documents although it is not
   //  suitable for querying a document index as it does not produce boolean expressions of words.
   //  
   //  Three kinds of input are recognised while scanning the chain:
   //     - A double-quoted run: everything between the quotes (spaces and punctuation included) becomes a single token. If the closing
   //       quote is missing, the remainder of the input becomes that token.
   //     - A C-style block comment: skipped entirely, producing no tokens.
   //     - A run of alphanumeric characters: becomes a single token.
   //  Every other character is skipped. All tokens are initialised with type TOKEN_ALPHANUM and the line number at which they start.
   //  
   //  Arguments: 1) toks The vector of tokens found in the input (cleared on entry, even if the chain is empty)
   //     2) C  The input chain
   //  
   //  Returns: E_NODATA If the supplied chain is empty
   //     E_OK  If the supplied chain is tokenized
   _hzfunc("TokenizeWords") ;
   chIter      ci ;        //  For iteration of input
   hzChain     word ;      //  For building token
   hzToken     T ;         //  Token
   hzString    S ;         //  Temp string
   uint32_t    nLine ;     //  For assigning line numbers to tokens
   toks.Clear() ;
   if (!C.Size())
       return E_NODATA ;
   for (ci = C ; !ci.eof() ;)
   {
       //  Skip whitespace and any other character at or below the space character.
       //  NOTE(review): if *ci yields a signed char, bytes above 0x7F would also satisfy this test - confirm against chIter.
       if (*ci <= CHAR_SPACE)
           { ci++ ; continue ; }
       //  Double-quoted run: collect everything up to (but excluding) the closing quote as one token
       if (*ci == CHAR_DQUOTE)
       {
           S.Clear() ;
           word.Clear() ;
           nLine = ci.Line() ;     //  Token is attributed to the line of the opening quote
           for (ci++ ; !ci.eof() ; ci++)
           {
               if (*ci == CHAR_DQUOTE)
                   { ci++ ; break ; }      //  Step past the closing quote before resuming the main scan
               word.AddByte(*ci) ;
           }
           //  An empty pair of quotes produces no token
           if (word.Size())
           {
               S = word ;
               T.Init(S, nLine, TOKEN_ALPHANUM) ;
               toks.Add(T) ;
           }
           continue ;
       }
       //  Block comment: advance past the terminator, or to end of input if the comment is never closed
       if (ci == "/*")
       {
           for (ci += 2; !ci.eof() ; ci++)
           {
               if (ci == "*/")
                   { ci += 2; break ; }
           }
           continue ;
       }
       //  Word or number: gather the maximal run of alphanumeric characters
       if (IsAlphanum(*ci))
       {
           S.Clear() ;
           word.Clear() ;
           nLine = ci.Line() ;
           word.AddByte(*ci) ;
           for (ci++ ; !ci.eof() && IsAlphanum(*ci) ; ci++)
               word.AddByte(*ci) ;
           if (word.Size())
           {
               S = word ;
               T.Init(S, nLine, TOKEN_ALPHANUM) ;
               toks.Add(T) ;
           }
           continue ;
       }
       //  Anything else (punctuation etc) is ignored
       ci++ ;
   }
   return E_OK ;
}