Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function, ignores all punctuation between words, although it does process punctuation within words. Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw' words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived words. Examples: 1) $4million -> dollars 4 million; 2) bee-keeper -> bee-keeper beekeeper bee keeper.

Return Type	Function name	Arguments
hzEcode	TokenizeFreetext	(hzVect<hzToken>&, hzChain&)

Declared and defined in file: hzTokens.cpp

Function Logic:

Function body:

hzEcode TokenizeFreetext (hzVect<hzToken>& toks, hzChain& C)
{
    //  Category: Text Processing
    //
    //  Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function,
    //  ignores all punctuation between words although it is intended to process punctuation within words.
    //
    //  The input is scanned for 'raw' words. Leading non-alphanumeric chars are skipped, then chars are gathered into the raw word
    //  while a count of each character type (punctuation, symbol, digit, alpha, wide unicode) is kept. These counts are intended to
    //  drive the derivation of further words from the raw word.
    //
    //  Intended examples (once derivation is implemented):-
    //  1) $4million -> dollars 4 million
    //  2) bee-keeper -> bee-keeper beekeeper bee keeper.
    //
    //  Current state: Derivation is NOT yet implemented. Only raw words consisting purely of alpha chars are emitted (lower-cased,
    //  as TOKEN_ALPHANUM). Raw words containing digits, punctuation, symbols or wide unicode chars are currently dropped.
    //
    //  Arguments: 1) toks The vector of tokens found in the input (cleared on entry)
    //     2) C  The input chain
    //
    //  Returns: E_NODATA If the supplied chain is empty
    //     E_OK  If the supplied chain is tokenized

    _hzfunc("TokenizeFreetext") ;

    hzChain     raw ;       //  For building raw word
    chIter      ci ;        //  For iteration of input
    hzToken     T ;         //  Token
    hzString    word ;      //  Word (from raw)
    uint32_t    ucVal ;     //  Value of unicode char
    uint32_t    nPunct ;    //  Count of punctuation chars in raw word
    uint32_t    nSymb ;     //  Count of symbol chars in raw word
    uint32_t    nDigit ;    //  Count of digits in raw word
    uint32_t    nAlpha ;    //  Count of alphas in raw word
    uint32_t    nWeird ;    //  Count of unicode chars with a value above 255 in raw word
    uint32_t    ucLen ;     //  Length of unicode char
    uint32_t    nLine ;     //  Line number (at start of raw sequence)

    toks.Clear() ;

    if (!C.Size())
        return E_NODATA ;

    ci = C ;
    ci.Line(1) ;    //  Seed the iterator's line counter (value is read back via ci.Line() for each token)

    for (; !ci.eof() ;)
    {
        //  Ignore leading spaces and other non alphanumerics
        if (!IsAlphanum(*ci))
            { ci++ ; continue ; }

        //  Now we only have to deal with chars that are part of the raw word
        raw.Clear() ;
        nPunct = nSymb = nDigit = nAlpha = nWeird = 0 ;
        nLine = ci.Line() ;

        //  NOTE(review): The loop condition admits only chars for which IsAlphanum() is true. Unless IsAlphanum() accepts more than
        //  letters and digits, the punctuation, symbol and unicode branches below cannot be reached - confirm the intended condition.
        for (; !ci.eof() && IsAlphanum(*ci) ;)
        {
            if (IsPunct(*ci))
            {
                //  Punctuation directly followed by whitespace/control char is trailing: consume it and end the raw word
                if (ci[1] <= CHAR_SPACE)
                    { ci++ ; break ; }

                nPunct++ ;
                raw.AddByte(*ci) ;
                ci++ ;
                continue ;
            }

            if (IsSymb(*ci))    { nSymb++ ;     raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsDigit(*ci))   { nDigit++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsAlpha(*ci))   { nAlpha++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }

            //  Do we have a unicode sequence?
            if (AtUnicodeSeq(ucVal, ucLen, ci))
            {
                //  Either way the whole sequence is consumed so the iterator is never left part way through it
                if (ucVal > 255)
                    nWeird++ ;                      //  Cannot be held in a single byte: count it but do not store it
                else
                    raw.AddByte(ucVal & 0xff) ;     //  Fits in one byte: store the decoded value

                ci += ucLen ;
                continue ;
            }

            //  Anything else is taken as-is. The iterator must advance here or the loop would never terminate.
            raw.AddByte(*ci) ;
            ci++ ;
        }

        //  Now we have a raw word, do we add it to the token list as-is or do we spawn derivatives?
        if (!raw.Size())
            continue ;

        if (nAlpha)
        {
            if (!nDigit && !nPunct && !nSymb && !nWeird)
            {
                //  Purely alphabetic word: index in lower case
                word = raw ;
                word.ToLower() ;
                T.Init(word, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }

            //  Now must fill in what happens when we have digits, hyphens and what have you.
        }
    }

    return E_OK ;
}