Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function, ignores all punctuation between words, although it does process punctuation within words. Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw' words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived words. Examples: 1) $4million -> dollars 4 million; 2) bee-keeper -> bee-keeper beekeeper bee keeper.

Return Type | Function name | Arguments
hzEcode | TokenizeFreetext | (hzVect<hzToken>&, hzChain&)

Declared and defined in file: hzTokens.cpp

Function Logic:

0:START 1:items 2:unknown 3:Return E_NODATA 4:ci items 5:unknown 6:unknown 7:items 8:items nWeird nAlpha nDigit nSymb nPunct nLine 9:unknown 10:unknown 11:unknown 12:items 13:items items items 14:unknown 15:items items items 16:unknown 17:items items items 18:unknown 19:items items items 20:unknown 21:unknown 22:items items 23:items ci 24:items 25:unknown 26:unknown 27:unknown 28:word items items items 29:Return E_OK

Function body:

hzEcode TokenizeFreetext (hzVect<hzToken>& toks, hzChain& C)
{
    //  Category: Text Processing
    //
    //  Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function,
    //  ignores all punctuation between words although it does process punctuation within words.
    //
    //  Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw'
    //  words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps
    //  drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived word.
    //
    //  Examples:-
    //  1) $4million -> dollars 4 million
    //  2) bee-keeper -> bee-keeper beekeeper bee keeper.
    //
    //  Current state: word derivation is not yet implemented. Only raw words made up purely of alphabetic chars are emitted (lower
    //  cased, as TOKEN_ALPHANUM). Raw words containing digits, punctuation, symbols or dropped unicode chars produce no token.
    //
    //  Arguments: 1) toks The vector of tokens found in the input (cleared on entry)
    //     2) C  The input chain
    //
    //  Returns: E_NODATA If the supplied chain is empty
    //     E_OK  If the supplied chain is tokenized

    _hzfunc("TokenizeFreetext") ;

    hzChain     raw ;       //  For building raw word
    chIter      ci ;        //  For iteration of input
    hzToken     T ;         //  Token
    hzString    word ;      //  Word (from raw)
    uint32_t    ucVal ;     //  Value of unicode char
    uint32_t    nPunct ;    //  Count of punctuation chars in raw word
    uint32_t    nSymb ;     //  Count of symbol chars in raw word
    uint32_t    nDigit ;    //  Count of digits in raw word
    uint32_t    nAlpha ;    //  Count of alphas in raw word
    uint32_t    nWeird ;    //  Count of unicode chars above 255 (these are not copied into the raw word)
    uint32_t    ucLen ;     //  Length of unicode char
    uint32_t    nLine ;     //  Line number (at start of raw sequence)

    toks.Clear() ;

    if (!C.Size())
        return E_NODATA ;

    ci = C ;
    ci.Line(1);

    for (; !ci.eof() ;)
    {
        //  Ignore leading spaces and other non alphanumerics
        if (!IsAlphanum(*ci))
            { ci++ ; continue ; }

        //  Now we only have to deal with chars that are part of the raw word
        raw.Clear() ;
        nPunct = nSymb = nDigit = nAlpha = nWeird = 0;
        nLine = ci.Line() ;

        //  NOTE(review): the raw word is described as being split on whitespace only, yet this loop stops at the first char for
        //  which IsAlphanum() is false. Unless IsAlphanum() accepts punctuation/symbol/unicode lead chars, the branches handling
        //  those below cannot be reached. Left as-is because widening the condition would change which tokens are emitted while
        //  the derivation step is still unimplemented - confirm intent before changing.
        for (; !ci.eof() && IsAlphanum(*ci) ;)
        {
            if (IsPunct(*ci))
            {
                //  Punctuation directly followed by whitespace (or a control char) is trailing: drop it and end the raw word
                if (ci[1] <= CHAR_SPACE)
                    { ci++ ; break ; }

                nPunct++ ;
                raw.AddByte(*ci) ;
                ci++ ;
                continue ;
            }

            if (IsSymb(*ci))    { nSymb++ ;     raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsDigit(*ci))   { nDigit++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }
            if (IsAlpha(*ci))   { nAlpha++ ;    raw.AddByte(*ci) ;  ci++ ;  continue ; }

            //  Do we have a unicode sequence?
            if (AtUnicodeSeq(ucVal, ucLen, ci))
            {
                //  Values that fit in a byte are stored as that byte, anything larger is only counted. Either way the whole
                //  sequence is consumed so that its remaining chars are not re-read as separate input.
                if (ucVal > 255)
                    nWeird++ ;
                else
                    raw.AddByte(ucVal & 0xff) ;

                ci += ucLen ;
                continue ;
            }

            //  Unclassified char: keep it, and advance so the loop is guaranteed to make progress
            raw.AddByte(*ci) ;
            ci++ ;
        }

        //  Now we have a raw word, do we add it to the token list as-is or do we spawn derivatives?
        if (!raw.Size())
            continue ;

        if (nAlpha)
        {
            if (!nDigit && !nPunct && !nSymb && !nWeird)
            {
                //  Purely alphabetic: emit the lower-cased word with the line number on which it started
                word = raw ;
                word.ToLower() ;
                T.Init(word, nLine, TOKEN_ALPHANUM) ;
                toks.Add(T) ;
            }

            //  Now must fill in what happens when we have digits, hyphens and what have you.
        }
    }

    return E_OK ;
}