Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function, ignores all punctuation between words, although it does process punctuation within words. Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw' words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived words. Examples: 1) $4million -> dollars 4 million; 2) bee-keeper -> bee-keeper beekeeper bee keeper.
| Return Type | Function name | Arguments |
|---|---|---|
| hzEcode | TokenizeFreetext | (hzVect<hzToken>&, hzChain&) |
Declared and defined in file: hzTokens.cpp
Function Logic:
Function body:
hzEcode	TokenizeFreetext	(hzVect<hzToken>& toks, hzChain& C)
{
	//	Category:	Text Processing
	//
	//	Tokenize into words and numbers in accordance with the rules for freetext indexation. This, like the TokenizeWords() function,
	//	ignores all punctuation between words although it does process punctuation within words.
	//
	//	Words are first split on the basis of whitespace only (although trailing punctuation chars are ignored). This produces 'raw'
	//	words from which other words may be derived. As the raw words are built up, a count of each character type is kept. This helps
	//	drive the word derivation process. The output then consists of the raw word (if it survives) followed by any derived word.
	//
	//	Examples:-
	//		1)	$4million	-> dollars 4 million
	//		2)	bee-keeper	-> bee-keeper beekeeper bee keeper.
	//
	//	Current state: only raw words made up purely of alphabetic chars are emitted (lower-cased, as TOKEN_ALPHANUM). Raw words that
	//	contain digits, punctuation, symbols or dropped unicode chars are discarded, as the derivation step is not yet implemented.
	//
	//	Arguments:	1)	toks	The vector of tokens found in the input (cleared on entry)
	//				2)	C		The input chain
	//
	//	Returns:	E_NODATA	If the supplied chain is empty
	//				E_OK		If the supplied chain is tokenized

	_hzfunc("TokenizeFreetext") ;

	hzChain		raw ;		//	For building raw word
	chIter		ci ;		//	For iteration of input
	chIter		ri ;		//	For iteration of raw word (not yet used)
	hzToken		T ;			//	Token
	hzString	word ;		//	Word (from raw)
	uint32_t	ucVal ;		//	Value of unicode char
	uint32_t	nPunct ;	//	Count of punctuation chars in raw word
	uint32_t	nSymb ;		//	Count of symbol chars in raw word
	uint32_t	nDigit ;	//	Count of digits in raw word
	uint32_t	nAlpha ;	//	Count of alphas in raw word
	uint32_t	nWeird ;	//	Count of unicode chars (value above 255) dropped from raw word
	uint32_t	ucLen ;		//	Length of unicode char
	uint32_t	nLine ;		//	Line number (at start of raw sequence)

	toks.Clear() ;

	if (!C.Size())
		return E_NODATA ;

	ci = C ;
	ci.Line(1);

	for (; !ci.eof() ;)
	{
		//	Ignore leading spaces and other non alphanumerics
		if (!IsAlphanum(*ci))
			{ ci++ ; continue ; }

		//	Now we only have to deal with chars that are part of the raw word
		raw.Clear() ;
		nPunct = nSymb = nDigit = nAlpha = nWeird = 0;
		nLine = ci.Line() ;

		//	NOTE(review): the loop condition requires IsAlphanum(*ci). If IsAlphanum() accepts only letters and digits, the punctuation,
		//	symbol and unicode branches below can never be entered and the raw word ends at the first such char - confirm intent.
		for (; !ci.eof() && IsAlphanum(*ci) ;)
		{
			if (IsPunct(*ci))
			{
				//	Punctuation followed by a space or control char is trailing: consume it and end the raw word
				if (ci[1] <= CHAR_SPACE)
					{ ci++ ; break ; }

				//	Otherwise the punctuation is within the word, so keep it
				nPunct++ ;
				raw.AddByte(*ci) ;
				ci++ ;
				continue ;
			}

			if (IsSymb(*ci))	{ nSymb++ ;		raw.AddByte(*ci) ; ci++ ; continue ; }
			if (IsDigit(*ci))	{ nDigit++ ;	raw.AddByte(*ci) ; ci++ ; continue ; }
			if (IsAlpha(*ci))	{ nAlpha++ ;	raw.AddByte(*ci) ; ci++ ; continue ; }

			//	Do we have a unicode sequence?
			if (AtUnicodeSeq(ucVal, ucLen, ci))
			{
				//	Step over the whole sequence in both cases so that its remaining bytes are not re-examined as separate chars
				if (ucVal > 255)
					{ nWeird++ ; ci += ucLen ; }
				else
					{ raw.AddByte(ucVal & 0xff) ; ci += ucLen ; }
				continue ;
			}

			//	Anything else is copied verbatim. The iterator must advance here, otherwise this loop would never terminate.
			raw.AddByte(*ci) ;
			ci++ ;
		}

		//	Now we have a raw word, do we add it to the token list as-is or do we spawn derivatives?
		if (!raw.Size())
			continue ;

		if (nAlpha)
		{
			if (!nDigit && !nPunct && !nSymb && !nWeird)
			{
				//	Purely alphabetic word: emit it in lower case
				word = raw ;
				word.ToLower() ;

				T.Init(word, nLine, TOKEN_ALPHANUM) ;
				toks.Add(T) ;
			}

			//	Now must fill in what happens when we have digits, hyphens and what have you.
		}
	}

	return E_OK ;
}