Source Code of org.apache.ctakes.dictionary.lookup.algorithms.FirstTokenPermutationImpl

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.dictionary.lookup.algorithms;


import org.apache.ctakes.dictionary.lookup.DictionaryEngine;
import org.apache.ctakes.dictionary.lookup.MetaDataHit;
import org.apache.ctakes.dictionary.lookup.phrasebuilder.PhraseBuilder;
import org.apache.ctakes.dictionary.lookup.vo.LookupAnnotation;
import org.apache.ctakes.dictionary.lookup.vo.LookupHit;
import org.apache.ctakes.dictionary.lookup.vo.LookupToken;
import org.apache.log4j.Logger;


import java.util.*;




/**
 * <b>OVERVIEW: </b> Each LookupToken is fed into a "first token" Dictionary. A
 * hit indicates an anchor and the window around this anchor is based on
 * context. This hit also contains all the presentations from the Dictionary
 * where the "first token" is contained.
 * <p/>
 * The window is determined by finding the largest overlapping context window
 * annotation. Permutations of LookupTokens found within this window are used to
 * match against the presentations found earlier. If context window annotations
 * are not provided, a fixed window is used based on the specified max
 * permutation level.
 * <p/>
 * <b>OPTIONAL CONTEXT: </b> context window annotations
 *
 * @author Mayo Clinic
 */
public class FirstTokenPermutationImpl implements LookupAlgorithm {
   // LOG4J logger based on class name
   final private Logger iv_logger = Logger.getLogger( getClass().getName() );


   /**
    * Key value for context map. Value is expected to be a List of
    * LookupAnnotation objects in sorted order.
    */
   public static final String CTX_KEY_WINDOW_ANNOTATIONS = "WINDOW_ANNOTATIONS";


   /**
    * Key value for LookupToken attribute. Value is expected to be either TRUE
    * or FALSE. This indicates whether to use this token for a "first token"
    * lookup or not. This is optional.
    */
   public static final String LT_KEY_USE_FOR_LOOKUP = "USE_FOR_LOOKUP";


   final private DictionaryEngine iv_firstTokenDictEngine;
   final private PhraseBuilder iv_phrBuilder;


   final private int iv_maxPermutationLevel;
   // key = level Integer, value = Permutation list
   final private Map<Integer, List<List<Integer>>> iv_permCacheMap;


   private String[] iv_textMetaFieldNames;


   /**
    * Constructor
    *
    * @param firstTokenDictEngine Dictionary that is indexed against first tokens.
    * @param phraseBuilder        Builds phrases to match against Dictionary.
    * @param textMetaFieldNames   MetaFieldNames used to extract presentations.
    * @param maxPermutationLevel  Max permutation Level allowed.
    */
   public FirstTokenPermutationImpl( final DictionaryEngine firstTokenDictEngine,
                                     final PhraseBuilder phraseBuilder,
                                     final String textMetaFieldNames[],
                                     final int maxPermutationLevel ) {
      iv_firstTokenDictEngine = firstTokenDictEngine;
      iv_phrBuilder = phraseBuilder;
      iv_textMetaFieldNames = textMetaFieldNames;


      iv_maxPermutationLevel = maxPermutationLevel;
      iv_permCacheMap = new HashMap<>( maxPermutationLevel );
      for ( int i = 0; i <= maxPermutationLevel; i++ ) {
         final List<List<Integer>> permList = PermutationUtil.getPermutationList( i );
         iv_permCacheMap.put( i, permList );
      }
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public Collection<LookupHit> lookup( final List<LookupToken> lookupTokenList,
                                        final Map<String, List<LookupAnnotation>> contextMap ) throws Exception {
      // setup optional window context data
      final List<LookupAnnotation> windowAnnotations = getWindowAnnotations( contextMap );
      final boolean useWindowAnnots = !windowAnnotations.isEmpty();
      // map of all the annotation start indices as keys and the annotations with those indices as values
      final Map<Integer, List<LookupAnnotation>> wStartOffsetMap = getMultipleStartOffsetMap( windowAnnotations );
      // map of all the annotation end indices as keys and the annotations with those indices as values
      final Map<Integer, List<LookupAnnotation>> wEndOffsetMap = getMultipleEndOffsetMap( windowAnnotations );
      // map of all lookupTokens and their index within the lookupTokenList.  Faster for fetching than List.indexOf(..)
      final Map<LookupToken, Integer> ltListIndexMap = getListIndexMap( lookupTokenList );
      // map of all the token start indices as keys and the tokens with those indices as values
      final Map<Integer, List<LookupToken>> ltStartOffsetMap = getMultipleStartOffsetMap( lookupTokenList );
      // map of all the token end indices as keys and the tokens with those indices as values
      final Map<Integer, List<LookupToken>> ltEndOffsetMap = getMultipleEndOffsetMap( lookupTokenList );


      final List<LookupHit> lookupHits = new ArrayList<>();
      for ( int currentIndex = 0; currentIndex < lookupTokenList.size(); currentIndex++ ) {
         final LookupToken lookupToken = lookupTokenList.get( currentIndex );
         final String useForLookupString = lookupToken.getStringAttribute( LT_KEY_USE_FOR_LOOKUP );
         final boolean useForLookup = Boolean.valueOf( useForLookupString );
         if ( !useForLookup ) {
            continue;
         }
         final Collection<MetaDataHit> firstTokenHits = getFirstTokenHits( lookupToken );
         if ( firstTokenHits == null || firstTokenHits.isEmpty() ) {
            continue;
         }
         int wEndOffset = -1;
         if ( useWindowAnnots ) {
            // get the largest overlapping window annotation
            final LookupAnnotation windowAnnotation = getLargestWindowAnnotation( currentIndex, lookupToken,
                                                                                 ltStartOffsetMap, ltEndOffsetMap,
                                                                                 ltListIndexMap,
                                                                                 wStartOffsetMap, wEndOffsetMap );
            if ( windowAnnotation != null ) {
               wEndOffset = windowAnnotation.getEndOffset();
            }
         }
         if ( wEndOffset == -1 ) {
            iv_logger.debug( "Window size set to max perm level." );
            wEndOffset = getFixedWindowEndOffset( currentIndex, lookupToken, lookupTokenList );
         }
         final List<LookupToken> endLookupTokenList = getLookupTokenList( wEndOffset, ltEndOffsetMap, false );
         if ( endLookupTokenList.isEmpty() ) {
            iv_logger.debug( "Invalid window:" + currentIndex + "," + wEndOffset );
            continue;
         }
         final LookupToken endLookupToken = endLookupTokenList.get( endLookupTokenList.size() - 1 );
         final int startTokenIndex = currentIndex;
         final int endTokenIndex = ltListIndexMap.get( endLookupToken );
         // list of LookupToken objects bound by the window
         final List<LookupToken> wLookupTokenList = lookupTokenList.subList( startTokenIndex, endTokenIndex + 1 );
         // use permutation algorithm to find any hits inside the window
         // Note: currentIndex - startTokenIndex is always = 0. What was the intention?  12-26-2012 SPF
         final Collection<LookupHit> lhCol = getLookupHits( firstTokenHits, wLookupTokenList,
                                                            currentIndex - startTokenIndex );
         lookupHits.addAll( lhCol );
      }
      return lookupHits;
   }


   private Map<String,Set<MetaDataHit>> getNamedMetaDataHits( final Collection<MetaDataHit> firstTokenHits ) {
      final Map<String,Set<MetaDataHit>> namedMetaDataHits = new HashMap<>();
      for ( MetaDataHit firstTokenHit : firstTokenHits ) {
         for ( String name : iv_textMetaFieldNames ) {
            String text = firstTokenHit.getMetaFieldValue( name );
            if ( text != null ) {
               text = text.toLowerCase();
               Set<MetaDataHit> mdhSet = namedMetaDataHits.get( text );
               if ( mdhSet == null ) {
                  mdhSet = new HashSet<>();
               }
               mdhSet.add( firstTokenHit );
               namedMetaDataHits.put( text, mdhSet );
            } else {
               if ( iv_logger.isDebugEnabled() ) {
                  iv_logger.debug( "MetaField " + name + " contains no data." );
               }
            }
         }
      }
      return namedMetaDataHits;
   }


   private Collection<LookupHit> getLookupHits( final Collection<MetaDataHit> firstTokenHits,
                                                final List<LookupToken> wLookupTokenList,
                                                final int firstTokenIndex ) throws Exception {
      if ( wLookupTokenList.size() - 1 > iv_maxPermutationLevel ) {
         iv_logger.debug( "Beyond permutation cache size." );
         return Collections.emptyList();
      }
      final Map<String,Set<MetaDataHit>> namedMetaDataHits = getNamedMetaDataHits( firstTokenHits );


      final List<LookupHit> lookupHits = new ArrayList<>();
      final LookupToken firstWordLookupToken = wLookupTokenList.get( firstTokenIndex );
      final int firstWordStartOffset = firstWordLookupToken.getStartOffset();
      final int firstWordEndOffset = firstWordLookupToken.getEndOffset();
      final List<LookupToken> singleTokenList = Arrays.asList( firstWordLookupToken );
      final String[] firstWordPhrases = iv_phrBuilder.getPhrases( singleTokenList );
      for ( int i=0; i<firstWordPhrases.length; i++ ) {
         // perform toLowerCase() here instead of in the iterations below 2-21-13 spf
         firstWordPhrases[i] = firstWordPhrases[i].toLowerCase();
      }
      int permutationIndex = wLookupTokenList.size();
      if ( firstTokenIndex < wLookupTokenList.size() && permutationIndex > 0 ) {
         permutationIndex--;
      }
      final List<List<Integer>> permutationList = iv_permCacheMap.get( permutationIndex );
      for ( List<Integer> permutations : permutationList ) {
         // Moved sort and offset calculation from inner (per MetaDataHit) iteration 2-21-2013 spf
         Collections.sort( permutations );
         int startOffset = firstWordStartOffset;
         int endOffset = firstWordEndOffset;
         if ( !permutations.isEmpty() ) {
            int firstIdx = permutations.get( 0 );
            if ( firstIdx <= firstTokenIndex ) {
               firstIdx--;
            }
            final LookupToken firstToken = wLookupTokenList.get( firstIdx );
            if ( firstToken.getStartOffset() < firstWordStartOffset ) {
               startOffset = firstToken.getStartOffset();
            }
            int lastIdx = permutations.get( permutations.size() - 1 );
            if ( lastIdx <= firstTokenIndex ) {
               lastIdx--;
            }
            final LookupToken lastToken = wLookupTokenList.get( lastIdx );
            if ( lastToken.getEndOffset() > firstWordEndOffset ) {
               endOffset = lastToken.getEndOffset();
            }
         }
         // convert permutation idx back into LookupTokens
         final List<LookupToken> tempLookupTokens = new ArrayList<>();
         for ( Integer idx : permutations ) {
            if ( idx <= firstTokenIndex ) {
               idx--;
            }
            final LookupToken lookupToken = wLookupTokenList.get( idx );
            tempLookupTokens.add( lookupToken );
         }
         final String[] lookupTokenPhrases = iv_phrBuilder.getPhrases( tempLookupTokens );
         for ( String lookupTokenPhrase : lookupTokenPhrases ) {
            // perform toLowerCase() here instead of repeating in each inner loop
            lookupTokenPhrase = lookupTokenPhrase.toLowerCase();
            for ( String firstWordPhrase : firstWordPhrases ) {
               // perform toLowerCase() here so it isn't done for the whole concatenated string
//               firstWordPhrase = firstWordPhrase.toLowerCase();
               final StringBuilder phraseSB = new StringBuilder();
               phraseSB.append( firstWordPhrase ).append( ' ' ).append( lookupTokenPhrase );
               final String fullPhrase = phraseSB.toString().trim();
               final Set<MetaDataHit> mdhSet = namedMetaDataHits.get( fullPhrase );
               if ( mdhSet == null ) {
                  continue;
               }
               for ( MetaDataHit mdh : mdhSet ) {
                  final LookupHit lh = new LookupHit( mdh, startOffset, endOffset );
                  lookupHits.add( lh );
               }
            }
         }
      }
      return lookupHits;
   }


   /**
    * Extracts the list of LookupAnnotation objects representing noun phrases
    * from the context map.
    *
    * @param contextMap Map where key=Impl specific String object and value=List of
    *                   LookupAnnotation objects
    * @return list of window annotations or empty list if null
    */
   private List<LookupAnnotation> getWindowAnnotations( final Map<String, List<LookupAnnotation>> contextMap ) {
      final List<LookupAnnotation> list = contextMap.get( CTX_KEY_WINDOW_ANNOTATIONS );
      if ( list == null || list.isEmpty() ) {
         iv_logger.debug( "No context window annotations." );
         return Collections.emptyList();
      }
      return list;
   }


   /**
    * Determines the number of ListTokens are contained within the specified
    * start and end offsets;
    *
    * @param ltStartOffsetMap -
    * @param ltEndOffsetMap   -
    * @param ltListIndexMap   -
    * @param startOffset      -
    * @param endOffset        -
    * @return                 -
    */
   private int getNumberOfListTokens( final Map<Integer, List<LookupToken>> ltStartOffsetMap,
                                      final Map<Integer, List<LookupToken>> ltEndOffsetMap,
                                      final Map<LookupToken, Integer> ltListIndexMap,
                                      final int startOffset, final int endOffset ) {
      final List<LookupToken> startLookupTokenList = getLookupTokenList( startOffset, ltStartOffsetMap, true );
      final List<LookupToken> endLookupTokenList = getLookupTokenList( endOffset, ltEndOffsetMap, false );


      if ( startLookupTokenList.isEmpty() || endLookupTokenList.isEmpty() ) {
         iv_logger.debug( "Invalid window:" + startOffset + "," + endOffset );
         return -1;
      }
      final LookupToken startLookupToken = startLookupTokenList.get( 0 );
      final Integer startIdx = ltListIndexMap.get( startLookupToken );


      final LookupToken endLookupToken = endLookupTokenList.get( endLookupTokenList.size() - 1 );
      final Integer endIdx = ltListIndexMap.get( endLookupToken );


      return endIdx - startIdx + 1;
   }


   /**
    * Attempts to get a list of LookupToken objects at the specified offset. If
    * there are none, this method attempts to try nearby offsets based on the
    * traversal direction.
    *
    * @param offset -
    * @param ltOffsetMap -
    * @param traverseRight -
    * @return list of lookup tokens in window, never null
    */
   private static List<LookupToken> getLookupTokenList( final int offset,
                                                 final Map<Integer, List<LookupToken>> ltOffsetMap,
                                                 final boolean traverseRight ) {
      // first attempt the original offset, which will be the case most of the time
      List<LookupToken> lookupTokenList = ltOffsetMap.get( offset );
      if ( lookupTokenList != null ) {
         return lookupTokenList;
      }
      // otherwise traverse some nearby offsets and attempt to find a token
      // TODO hardcoded max offset window is 10 char
      final int offsetWindow = 10;
      if ( traverseRight ) {
         final int max = offset + offsetWindow;
         for ( int i = offset; i <= max; i++ ) {
            lookupTokenList = ltOffsetMap.get( i );
            if ( lookupTokenList != null ) {
               return lookupTokenList;
            }
         }
      } else {
         final int min = offset - offsetWindow;
         for ( int i = offset; i >= min; i-- ) {
            lookupTokenList = ltOffsetMap.get( i );
            if ( lookupTokenList != null ) {
               return lookupTokenList;
            }
         }
      }
      // no tokens in window - return an empty list, not null
      return Collections.emptyList();
   }


   /**
    * Determines the largest overlapping window annotation for the specified
    * LookupToken.
    */
   private LookupAnnotation getLargestWindowAnnotation( final int tokenIdx, final LookupToken lt,
                                                        final Map<Integer, List<LookupToken>> ltStartOffsetMap,
                                                        final Map<Integer, List<LookupToken>> ltEndOffsetMap,
                                                        final Map<LookupToken, Integer> ltListIndexMap,
                                                        final Map<Integer, List<LookupAnnotation>> wStartOffsetMap,
                                                        final Map<Integer, List<LookupAnnotation>> wEndOffsetMap ) {
      final Set<LookupAnnotation> startCandidateSet = new HashSet<>();
      final Set<LookupAnnotation> endCandidateSet = new HashSet<>();


      for ( Map.Entry<Integer, List<LookupAnnotation>> entry : wStartOffsetMap.entrySet() ) {
         final Integer startOffset = entry.getKey();
         if ( startOffset <= lt.getStartOffset() ) {
            startCandidateSet.addAll( entry.getValue() );
         }
      }
      for ( Map.Entry<Integer, List<LookupAnnotation>> entry : wEndOffsetMap.entrySet() ) {
         final Integer endOffset = entry.getKey();
         if ( endOffset >= lt.getEndOffset() ) {
            endCandidateSet.addAll( entry.getValue() );
         }
      }
      // union to get window annotations that are overlapping with LookupToken
      startCandidateSet.retainAll( endCandidateSet );


      // find largest overlapping window annotation
      LookupAnnotation largestWindowAnnot = null;
      for ( LookupAnnotation tempLookupAnnot : startCandidateSet ) {
         if ( largestWindowAnnot == null || tempLookupAnnot.getLength() > largestWindowAnnot.getLength() ) {
            // now see if we can handle the size of this window (permutation wise)
            final int ltCount = getNumberOfListTokens( ltStartOffsetMap, ltEndOffsetMap, ltListIndexMap,
                                                       tempLookupAnnot.getStartOffset(),
                                                       tempLookupAnnot.getEndOffset() );


            if ( ltCount <= iv_maxPermutationLevel && ltCount > 0 ) {
               largestWindowAnnot = tempLookupAnnot;
            } else if ( iv_logger.isDebugEnabled() ) {
               iv_logger.debug( "Window size of " + ltCount
                                + " exceeds the max permutation level of " + iv_maxPermutationLevel + "." );
            }
         }
      }
      return largestWindowAnnot;
   }


   private int getFixedWindowEndOffset( final int tokenIdx, final LookupToken lt, final List<LookupToken> ltList ) {
      // This iterates to the last index, then returns the last valid offset.
      // If we were performing max() this might be understandable ...
      //      int fixedEndOffset = 0;
      //      for ( int i = tokenIdx; (i < tokenIdx + iv_maxPermutationLevel)
      //            && (i < ltList.size()); i++ ) {
      //         LookupToken tempLookupToken = (LookupToken) ltList.get( i );
      //         if ( tempLookupToken != null ) {
      //            fixedEndOffset = tempLookupToken.getEndOffset();
      //         }
      //      }
      //      return fixedEndOffset;


      // Go backward and return the first valid end offset ...
      final int count = Math.min( tokenIdx + iv_maxPermutationLevel, ltList.size() );
      if ( count <= 0 ) {
         return 0;
      }
      for ( int i = count - 1; i >= 0; i-- ) {
         final LookupToken tempLookupToken = ltList.get( i );
         if ( tempLookupToken != null ) {
            return tempLookupToken.getEndOffset();
         }
      }
      return 0;
   }


   /**
    * Creates a map that binds an object from a list to its index position.
    *
    * @param list -
    * @return -
    */
   static private Map<LookupToken, Integer> getListIndexMap( final List<LookupToken> list ) {
      final Map<LookupToken, Integer> m = new HashMap<>( list.size() );
      for ( int i = 0; i < list.size(); i++ ) {
         m.put( list.get( i ), i );
      }
      return m;
   }


   /**
    * Creates a map that uses the start offset to index the LookupAnnotation objects.
    *
    * @param lookupAnnotList -
    * @return map of integers and lookup annotations
    */
   static private <T extends LookupAnnotation> Map<Integer, T> getSingleStartOffsetMap( final List<T> lookupAnnotList ) {
      final Map<Integer, T> m = new HashMap<Integer, T>();
      for ( T lookupAnnotation : lookupAnnotList ) {
         final Integer key = lookupAnnotation.getStartOffset();
         m.put( key, lookupAnnotation );
      }
      return m;
   }


   /**
    * Creates a map that uses the start offset to index the LookupAnnotation objects.
    *
    * @param lookupAnnotList -
    * @return map of integers and lookup annotation lists
    */
   static private <T extends LookupAnnotation> Map<Integer, List<T>> getMultipleStartOffsetMap( final List<T> lookupAnnotList ) {
      final Map<Integer, List<T>> m = new HashMap<>();
      for ( T lookupAnnotation : lookupAnnotList ) {
         final Integer key = lookupAnnotation.getStartOffset();
         List<T> list = m.get( key );
         if ( list == null ) {
            list = new ArrayList<>();
         }
         list.add( lookupAnnotation );
         m.put( key, list );
      }
      return m;
   }


   /**
    * Creates a map that uses the end offset to index the LookupAnnotation objects.
    *
    * @param lookupAnnotList -
    * @return map of integers and lookup annotations
    */
   static private <T extends LookupAnnotation> Map<Integer, T> getSingleEndOffsetMap( final List<T> lookupAnnotList ) {
      final Map<Integer, T> m = new HashMap<>();
      for ( T lookupAnnotation : lookupAnnotList ) {
         final Integer key = lookupAnnotation.getEndOffset();
         m.put( key, lookupAnnotation );
      }
      return m;
   }


   /**
    * Creates a map that uses the end offset to index the LookupAnnotation objects.
    *
    * @param lookupAnnotList -
    * @return map of integers and lookup annotation lists
    */
   static private <T extends LookupAnnotation> Map<Integer, List<T>> getMultipleEndOffsetMap( final List<T> lookupAnnotList ) {
      final Map<Integer, List<T>> m = new HashMap<>();
      for ( T lookupAnnotation : lookupAnnotList ) {
         final Integer key = lookupAnnotation.getEndOffset();
         List<T> list = m.get( key );
         if ( list == null ) {
            list = new ArrayList<>();
         }
         list.add( lookupAnnotation );
         m.put( key, list );
      }
      return m;
   }




   /**
    * Gets the hits for the specified LookupToken. This uses the first token Dictionary.
    *
    * @param firstLookupToken -
    * @return -
    * @throws Exception
    */
   private Collection<MetaDataHit> getFirstTokenHits( final LookupToken firstLookupToken ) throws Exception {
      final List<LookupToken> singleTokenList = Arrays.asList( firstLookupToken );
      final String[] phrases = iv_phrBuilder.getPhrases( singleTokenList );
      final List<MetaDataHit> metaDataHits = new ArrayList<>();
      for ( String phrase : phrases ) {
         final Collection<MetaDataHit> phraseMetaDataHits = iv_firstTokenDictEngine.metaLookup( phrase );
         if ( !phraseMetaDataHits.isEmpty() ) {
            metaDataHits.addAll( phraseMetaDataHits );
         }
      }
      return metaDataHits;
   }
}
Source Code of org.apache.ctakes.dictionary.lookup.algorithms.FirstTokenPermutationImpl

Related Classes of org.apache.ctakes.dictionary.lookup.algorithms.FirstTokenPermutationImpl