// Marker feature structures, one per token category. Each is created once here and
// shared: the loop below stores the matching one as the tokenTypeFeature value of every
// token annotation it classifies.
FeatureStructure wordFS = cas.createFS(wordType);
FeatureStructure sepFS = cas.createFS(separatorType);
FeatureStructure eosFS = cas.createFS(eosType);
// The tokenizer runs over the document text held by the CAS.
String text = cas.getDocumentText();
TextStringTokenizer tokenizer = new TextStringTokenizer(text);
// Character classes: "/-*&@" are separators, "," is added to the whitespace
// characters, and ".!?" are end-of-sentence characters.
tokenizer.setSeparators("/-*&@");
tokenizer.addWhitespaceChars(",");
tokenizer.setEndOfSentenceChars(".!?");
// NOTE(review): presumably this makes the tokenizer skip whitespace tokens instead of
// returning them -- confirm against TextStringTokenizer.
tokenizer.setShowWhitespace(false);
// Type code of the token currently being processed; assigned on each loop iteration.
int tokenTypeCode;
// Per-category tallies, incremented by the loop below.
int wordCounter = 0;
int sepCounter = 0;
int endOfSentenceCounter = 0;
// Annotation for the token currently being processed; assigned on each loop iteration.
AnnotationFS tokenAnnot;
// Walk the tokenizer until it reports no more valid tokens. Each token becomes an
// annotation of tokenType covering the tokenizer's start/end offsets; its
// tokenTypeFeature is set to the shared marker for its category, the matching counter
// is bumped, and the annotation is added to the CAS index repository.
while (tokenizer.isValid()) {
  tokenAnnot = cas.createAnnotation(tokenType, tokenizer.getTokenStart(),
      tokenizer.getTokenEnd());
  tokenTypeCode = tokenizer.getTokenType();
  switch (tokenTypeCode) {
    case TextStringTokenizer.EOS: {
      ++endOfSentenceCounter;
      tokenAnnot.setFeatureValue(tokenTypeFeature, eosFS);
      break;
    }
    case TextStringTokenizer.SEP: {
      ++sepCounter;
      tokenAnnot.setFeatureValue(tokenTypeFeature, sepFS);
      break;
    }
    case TextStringTokenizer.WSP: {
      // Whitespace tokens are neither counted nor given a type marker; the annotation
      // is still indexed below.
      break;
    }
    case TextStringTokenizer.WCH: {
      ++wordCounter;
      tokenAnnot.setFeatureValue(tokenTypeFeature, wordFS);
      break;
    }
    default: {
      // An unknown type code means the tokenizer and this switch are out of sync.
      // Report the offending code and token offsets instead of a bare Exception with no
      // diagnostic data. IllegalStateException is still caught by catch (Exception).
      throw new IllegalStateException("Unexpected token type code " + tokenTypeCode
          + " for token at offsets " + tokenizer.getTokenStart() + "-"
          + tokenizer.getTokenEnd());
    }
  }
  cas.getIndexRepository().addFS(tokenAnnot);
  // Advance only after the current token has been indexed.
  tokenizer.setToNext();
}
// time = System.currentTimeMillis() - time;
// System.out.println("Number of words: " + wordCounter);
// int allTokens = wordCounter + sepCounter + endOfSentenceCounter;