this.token = token;
this.index = index;
this.inChunk = chunk;
this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
PosTag selectedPosTag = null;
boolean matchedPosTag = false; //matched any of the POS annotations
//(1) check if this Token should be linked against the Vocabulary (isProcessable)
boolean upperCase = index > 0 && //not a sentence start
token.getEnd() > token.getStart() && //not an empty token
Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
if(tpc.isLinkUpperCaseTokens() && upperCase){
isProcessable = true;
} else { //else use POS tag & token length
for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
// check the three possible matches
// 1. the LexicalCategory matches
// 2. the Pos matches
// 3. the String tag matches
PosTag posTag = posAnnotation.value();
if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
(!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
tpc.getLinkedPosTags().contains(posTag.getTag())){
if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
selectedPosTag = posTag;
isProcessable = true;
matchedPosTag = true;
break;
} // else probability to low for inclusion
} else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
selectedPosTag = posTag; //also rejected PosTags are selected
matchedPosTag = true;
isProcessable = false;
break;
} // else probability to low for exclusion
}
if(!matchedPosTag) { //not matched against a POS Tag ...
// ... fall back to the token length
isProcessable = token.getSpan().length() >= elc.getMinSearchTokenLength();
}
}
//(2) check if this token should be considered to match labels of suggestions
if(isProcessable){ //processable tokens are also matchable
isMatchable = true;
} else if(tpc.isMatchUpperCaseTokens() && upperCase){
//match upper case tokens regardless of POS and length
isMatchable = true;
} else { //check POS and length to see if token is matchable
matchedPosTag = false; //reset to false!
for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
PosTag posTag = posAnnotation.value();
if(posTag.isMapped()){
if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
posTag.getCategories())){
if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
//override selectedPosTag if present
selectedPosTag = posTag; //mark the matchable as selected PosTag
isMatchable = true;
matchedPosTag = true;
break;
} // else probability to low for inclusion
} else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
if(selectedPosTag == null){ //do not override existing values
selectedPosTag = posTag; //also rejected PosTags are selected
}
isMatchable = false;
matchedPosTag = true;
break;
} // else probability to low for exclusion
} //else not matched ... search next one
}
if(!matchedPosTag){ //not matched against POS tag ...
//fall back to the token length
isMatchable = token.getSpan().length() >= elc.getMinSearchTokenLength();
}
}
//(3) check for morpho analyses
if(selectedPosTag == null){ //token is not processable or matchable
//we need to set the selectedPosTag to the first POS annotation
Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
if(posAnnotation != null) {
selectedPosTag = posAnnotation.value();
}
}
List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
} else { //select the correct morpho annotation based on the POS tag
MorphoFeatures mf = null;
selectMorphoFeature :
for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
for(PosTag posTag : morphoAnnotation.value().getPosList()){
if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
mf = morphoAnnotation.value();
break selectMorphoFeature; //stop after finding the first one
}
}
}