Source Code of org.languagetool.rules.uk.TokenAgreementRule

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;


import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.apache.commons.lang.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.Ukrainian;
import org.languagetool.rules.Category;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.UkrainianTagger;


/**
 * A rule that checks if tokens in the sentence agree on inflection etc
 * 
 * @author Andriy Rysin
 */
public class TokenAgreementRule extends Rule {
  private static final String NO_VIDMINOK_SUBSTR = ":nv";
  private static final String REQUIRE_VIDMINOK_SUBSTR = ":rv_";
  private static final String VIDMINOK_SUBSTR = ":v_";
  private static final Pattern REQUIRE_VIDMINOK_REGEX = Pattern.compile(":r(v_[a-z]+)");
  private static final Pattern VIDMINOK_REGEX = Pattern.compile(":(v_[a-z]+)");


  private final Ukrainian ukrainian = new Ukrainian();


  private final static Set<String> STREETS = new HashSet<String>(Arrays.asList(
      "Штрассе", "Авеню", "Стріт"
      ));


  public TokenAgreementRule(final ResourceBundle messages) throws IOException {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
  }


  @Override
  public final String getId() {
    return "UK_TOKEN_AGREEMENT";
  }


  @Override
  public String getDescription() {
    return "Узгодження слів у реченні";
  }


  public String getShort() {
    return "Узгодження слів у реченні";
  }
  /**
   * Indicates if the rule is case-sensitive. 
   * @return true if the rule is case-sensitive, false otherwise.
   */
  public boolean isCaseSensitive() {
    return false;
  }


  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();    


    AnalyzedTokenReadings reqTokenReadings = null;
    int i = -1;
    for (AnalyzedTokenReadings tokenReadings: tokens) {
      i++;


      String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();


      //TODO: skip conj напр. «бодай»


      if (posTag == null || posTag.contains(IPOSTag.unknown.getText()) || posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
        reqTokenReadings = null;
        continue;
      }


      String token = tokenReadings.getAnalyzedToken(0).getToken();
      if( posTag.contains(REQUIRE_VIDMINOK_SUBSTR) && tokenReadings.getReadingsLength() == 1 ) {
        String prep = token;


        if( prep.equals("за") && reverseSearch(tokens, i, "що") )
          continue;


        if( prep.equalsIgnoreCase("понад") )
          continue;


        if( (prep.equalsIgnoreCase("окрім") || prep.equalsIgnoreCase("крім"))
            && tokens.length > i+1 && tokens[i+1].getAnalyzedToken(0).getToken().equalsIgnoreCase("як") ) {
          reqTokenReadings = null;
          continue;
        }


        reqTokenReadings = tokenReadings;
        continue;
      }


      if( reqTokenReadings == null )
        continue;


      ArrayList<String> posTagsToFind = new ArrayList<String>();


      //      if( tokens.length > i+1 && Character.isUpperCase(tokenReadings.getAnalyzedToken(0).getToken().charAt(0))
      //        && hasRequiredPosTag(Arrays.asList("v_naz"), tokenReadings)
      //        && Character.isUpperCase(tokens[i+1].getAnalyzedToken(0).getToken().charAt(0)) )
      //          continue; // "у Конан Дойла"




      //TODO: for numerics only v_naz
      if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("понад") ) { //&& tokenReadings.getAnalyzedToken(0).getPOSTag().equals(IPOSTag.numr) ) { 
        posTagsToFind.add("v_naz");
      }


      String reqPosTag = reqTokenReadings.getAnalyzedToken(0).getPOSTag();


      Matcher matcher = REQUIRE_VIDMINOK_REGEX.matcher(reqPosTag);
      while( matcher.find() ) {
        posTagsToFind.add(matcher.group(1));
      }


      for(AnalyzedToken readingToken: tokenReadings) {
        if( IPOSTag.numr.match(readingToken.getPOSTag()) ) {
          posTagsToFind.add("v_naz");  // TODO: only if noun is following?
          break;
        }
      }


      //      System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
      if( ! hasRequiredPosTag(posTagsToFind, tokenReadings) ) {
        if( isTokenToSkip(tokenReadings) )
          continue;


//        if( isTokenToIgnore(tokenReadings) ) {
//          reqTokenReadings = null;
//          continue;
//        }


        String prep = reqTokenReadings.getAnalyzedToken(0).getToken();
        if( prep.equalsIgnoreCase("до") ) {
          if( tokenReadings.getAnalyzedToken(0).getToken().compareToIgnoreCase("Я") == 0 ) {  // від А до Я
            reqTokenReadings = null;
            continue;
          }
        }


        if( prep.equalsIgnoreCase("в") || prep.equalsIgnoreCase("у") ) {
          if( hasRequiredPosTag(Arrays.asList("p:v_naz"), tokenReadings) ) {  //TODO: only for subset: президенти/депутати/мери/гості... or by verb піти/йти/балотуватися/записатися...
            reqTokenReadings = null;
            continue;
          }
        }


        // exceptions
        if( tokens.length > i+1 ) {
          if( isCapitalized( token ) 
              && STREETS.contains( tokens[i+1].getAnalyzedToken(0).getToken()) ) {
            reqTokenReadings = null;
            continue;
          }


          if( IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag())
              && (token.equals("мінус") || token.equals("плюс")
                  || token.equals("мінімум") || token.equals("максимум") ) ) {
            reqTokenReadings = null;
            continue;
          }


          if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("через")
              && token.equals("років") 
              && IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag()) ) {
            reqTokenReadings = null;
            continue;
          }


          if( (token.equals("собі") || token.equals("йому"))
              && tokens[i+1].getAnalyzedToken(0).getToken().startsWith("подібн") ) {
            //          reqTokenReadings = null;
            continue;
          }


          if( (token.equals("усім") || token.equals("всім"))
              && tokens[i+1].getAnalyzedToken(0).getToken().startsWith("відом") ) {
            //          reqTokenReadings = null;
            continue;
          }


          if( prep.equalsIgnoreCase("до") && token.equals("схід") 
                && tokens[i+1].getAnalyzedToken(0).getToken().equals("сонця") ) {
            reqTokenReadings = null;
            continue;
          }


          if( tokens[i+1].getAnalyzedToken(0).getToken().equals("«") 
              && tokens[i].getAnalyzedToken(0).getPOSTag().contains(":abbr") ) {
            reqTokenReadings = null;
            continue;
          }


          
          if( tokens.length > i+2 ) {
            if ((token.equals("нікому") || token.equals("ніким") || token.equals("нічим") || token.equals("нічому")) 
                && tokens[i+1].getAnalyzedToken(0).getToken().equals("не")) {
              //          reqTokenReadings = null;
              continue;
            }




          }
        }


        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, reqTokenReadings, posTagsToFind);
        ruleMatches.add(potentialRuleMatch);
      }


      reqTokenReadings = null;
    }


    return toRuleMatchArray(ruleMatches);
  }


  private static boolean isCapitalized(String token) {
    return token.length() > 1 && Character.isUpperCase(token.charAt(0)) && Character.isLowerCase(token.charAt(1));
  }


  private boolean reverseSearch(AnalyzedTokenReadings[] tokens, int pos, String string) {
    for(int i=pos-1; i >= 0 && i > pos-4; i--) {
      if( tokens[i].getAnalyzedToken(0).getToken().equalsIgnoreCase("що") )
        return true;
    }
    return false;
  }


  private boolean isTokenToSkip(AnalyzedTokenReadings tokenReadings) {
    for(AnalyzedToken token: tokenReadings) {
//      System.out.println("    tag: " + token.getPOSTag() + " for " + token.getToken());
      if( IPOSTag.adv.match(token.getPOSTag())
          || IPOSTag.contains(token.getPOSTag(), "adv>")
          ||  IPOSTag.insert.match(token.getPOSTag()) )
        return true;
    }
    return false;
  }


//  private boolean isTokenToIgnore(AnalyzedTokenReadings tokenReadings) {
//    for(AnalyzedToken token: tokenReadings) {
//      if( token.getPOSTag().contains("abbr") )
//        return true;
//    }
//    return false;
//  }


  private boolean hasRequiredPosTag(Collection<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
    boolean vidminokFound = false;  // because POS dictionary is not complete


    for(AnalyzedToken token: tokenReadings) {
      String posTag = token.getPOSTag();


      if( posTag == null || posTag.contains(NO_VIDMINOK_SUBSTR) )
        return true;


      if( posTag.contains(VIDMINOK_SUBSTR) ) {
        vidminokFound = true;


        for(String posTagToFind: posTagsToFind) {
          //          System.out.println("  verifying: " + token + " -> " + posTag + " ~ " + posTagToFind);


          if ( posTag.contains(posTagToFind) )
            return true;
        }
      }
    }


    return ! vidminokFound; //false;
  }


  private RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings, AnalyzedTokenReadings reqTokenReadings, List<String> posTagsToFind) {
    String tokenString = tokenReadings.getToken();


    Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();


    ArrayList<String> suggestions = new ArrayList<String>();
    String oldPosTag = tokenReadings.getAnalyzedToken(0).getPOSTag();
    String requiredPostTagsRegEx = ":(" + StringUtils.join(posTagsToFind,"|") + ")";
    String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegEx);


    //    System.out.println("  creating suggestion for " + tokenReadings + " / " + tokenReadings.getAnalyzedToken(0) +" and tag " + posTag);


    try {
      String[] synthesized = ukrainianSynthesizer.synthesize(tokenReadings.getAnalyzedToken(0), posTag, true);


      //      System.out.println("Synthesized: " + Arrays.asList(synthesized));
      suggestions.addAll( Arrays.asList(synthesized) );
    } catch (IOException e) {
      throw new RuntimeException(e);
    }


    ArrayList<String> reqVidminkyNames = new ArrayList<String>();
    for (String vidm: posTagsToFind) {
      reqVidminkyNames.add(UkrainianTagger.VIDMINKY_MAP.get(vidm));
    }


    ArrayList<String> foundVidminkyNames = new ArrayList<String>();
    for(AnalyzedToken token: tokenReadings) {
      String posTag2 = token.getPOSTag();
      if( posTag2.contains(VIDMINOK_SUBSTR) ) {
        foundVidminkyNames.add(UkrainianTagger.VIDMINKY_MAP.get(posTag2.replaceFirst("^.*"+VIDMINOK_REGEX+".*$", "$1")));
      }
    }


    String msg = MessageFormat.format("Прийменник «{0}» вимагає іншого відмінка: {1}, а знайдено: {2}", 
        reqTokenReadings.getToken(), StringUtils.join(reqVidminkyNames, ", "), StringUtils.join(foundVidminkyNames, ", "));
        
    if( tokenString.equals("їх") ) {
      msg += ". Можливо тут потрібно присвійний займенник «їхній»?";
      try {
        String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
        String[] synthesized = ukrainianSynthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron", "їхній"), newYihPostag, true);
        suggestions.addAll( Arrays.asList(synthesized) );
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
        
    int pos = tokenReadings.getStartPos();


    RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos + tokenString.length(), msg, getShort());


    potentialRuleMatch.setSuggestedReplacements(suggestions);


    return potentialRuleMatch;
  }


  @Override
  public void reset() {
  }


}
Source Code of org.languagetool.rules.uk.TokenAgreementRule

Related Classes of org.languagetool.rules.uk.TokenAgreementRule