Package org.languagetool.rules.uk

Source Code of org.languagetool.rules.uk.TokenAgreementRule

/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Andriy Rysin
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.Ukrainian;
import org.languagetool.rules.Category;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.UkrainianTagger;

/**
* A rule that checks if tokens in the sentence agree on inflection etc
*
* @author Andriy Rysin
*/
public class TokenAgreementRule extends Rule {
  private static final String NO_VIDMINOK_SUBSTR = ":nv";
  private static final String REQUIRE_VIDMINOK_SUBSTR = ":rv_";
  private static final String VIDMINOK_SUBSTR = ":v_";
  private static final Pattern REQUIRE_VIDMINOK_REGEX = Pattern.compile(":r(v_[a-z]+)");
  private static final Pattern VIDMINOK_REGEX = Pattern.compile(":(v_[a-z]+)");

  private final Ukrainian ukrainian = new Ukrainian();

  private final static Set<String> STREETS = new HashSet<String>(Arrays.asList(
      "Штрассе", "Авеню", "Стріт"
      ));

  public TokenAgreementRule(final ResourceBundle messages) throws IOException {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
  }

  @Override
  public final String getId() {
    return "UK_TOKEN_AGREEMENT";
  }

  @Override
  public String getDescription() {
    return "Узгодження слів у реченні";
  }

  public String getShort() {
    return "Узгодження слів у реченні";
  }
  /**
   * Indicates if the rule is case-sensitive.
   * @return true if the rule is case-sensitive, false otherwise.
   */
  public boolean isCaseSensitive() {
    return false;
  }

  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();   

    AnalyzedTokenReadings reqTokenReadings = null;
    int i = -1;
    for (AnalyzedTokenReadings tokenReadings: tokens) {
      i++;

      String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();

      //TODO: skip conj напр. «бодай»

      if (posTag == null || posTag.contains(IPOSTag.unknown.getText()) || posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
        reqTokenReadings = null;
        continue;
      }

      String token = tokenReadings.getAnalyzedToken(0).getToken();
      if( posTag.contains(REQUIRE_VIDMINOK_SUBSTR) && tokenReadings.getReadingsLength() == 1 ) {
        String prep = token;

        if( prep.equals("за") && reverseSearch(tokens, i, "що") )
          continue;

        if( prep.equalsIgnoreCase("понад") )
          continue;

        if( (prep.equalsIgnoreCase("окрім") || prep.equalsIgnoreCase("крім"))
            && tokens.length > i+1 && tokens[i+1].getAnalyzedToken(0).getToken().equalsIgnoreCase("як") ) {
          reqTokenReadings = null;
          continue;
        }

        reqTokenReadings = tokenReadings;
        continue;
      }

      if( reqTokenReadings == null )
        continue;

      ArrayList<String> posTagsToFind = new ArrayList<String>();

      //      if( tokens.length > i+1 && Character.isUpperCase(tokenReadings.getAnalyzedToken(0).getToken().charAt(0))
      //        && hasRequiredPosTag(Arrays.asList("v_naz"), tokenReadings)
      //        && Character.isUpperCase(tokens[i+1].getAnalyzedToken(0).getToken().charAt(0)) )
      //          continue; // "у Конан Дойла"


      //TODO: for numerics only v_naz
      if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("понад") ) { //&& tokenReadings.getAnalyzedToken(0).getPOSTag().equals(IPOSTag.numr) ) {
        posTagsToFind.add("v_naz");
      }

      String reqPosTag = reqTokenReadings.getAnalyzedToken(0).getPOSTag();

      Matcher matcher = REQUIRE_VIDMINOK_REGEX.matcher(reqPosTag);
      while( matcher.find() ) {
        posTagsToFind.add(matcher.group(1));
      }

      for(AnalyzedToken readingToken: tokenReadings) {
        if( IPOSTag.numr.match(readingToken.getPOSTag()) ) {
          posTagsToFind.add("v_naz")// TODO: only if noun is following?
          break;
        }
      }

      //      System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
      if( ! hasRequiredPosTag(posTagsToFind, tokenReadings) ) {
        if( isTokenToSkip(tokenReadings) )
          continue;

//        if( isTokenToIgnore(tokenReadings) ) {
//          reqTokenReadings = null;
//          continue;
//        }

        String prep = reqTokenReadings.getAnalyzedToken(0).getToken();
        if( prep.equalsIgnoreCase("до") ) {
          if( tokenReadings.getAnalyzedToken(0).getToken().compareToIgnoreCase("Я") == 0 ) {  // від А до Я
            reqTokenReadings = null;
            continue;
          }
        }

        if( prep.equalsIgnoreCase("в") || prep.equalsIgnoreCase("у") ) {
          if( hasRequiredPosTag(Arrays.asList("p:v_naz"), tokenReadings) ) {  //TODO: only for subset: президенти/депутати/мери/гості... or by verb піти/йти/балотуватися/записатися...
            reqTokenReadings = null;
            continue;
          }
        }

        // exceptions
        if( tokens.length > i+1 ) {
          if( isCapitalized( token )
              && STREETS.contains( tokens[i+1].getAnalyzedToken(0).getToken()) ) {
            reqTokenReadings = null;
            continue;
          }

          if( IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag())
              && (token.equals("мінус") || token.equals("плюс")
                  || token.equals("мінімум") || token.equals("максимум") ) ) {
            reqTokenReadings = null;
            continue;
          }

          if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("через")
              && token.equals("років")
              && IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag()) ) {
            reqTokenReadings = null;
            continue;
          }

          if( (token.equals("собі") || token.equals("йому"))
              && tokens[i+1].getAnalyzedToken(0).getToken().startsWith("подібн") ) {
            //          reqTokenReadings = null;
            continue;
          }

          if( (token.equals("усім") || token.equals("всім"))
              && tokens[i+1].getAnalyzedToken(0).getToken().startsWith("відом") ) {
            //          reqTokenReadings = null;
            continue;
          }

          if( prep.equalsIgnoreCase("до") && token.equals("схід")
                && tokens[i+1].getAnalyzedToken(0).getToken().equals("сонця") ) {
            reqTokenReadings = null;
            continue;
          }

          if( tokens[i+1].getAnalyzedToken(0).getToken().equals("«")
              && tokens[i].getAnalyzedToken(0).getPOSTag().contains(":abbr") ) {
            reqTokenReadings = null;
            continue;
          }

         
          if( tokens.length > i+2 ) {
            if ((token.equals("нікому") || token.equals("ніким") || token.equals("нічим") || token.equals("нічому"))
                && tokens[i+1].getAnalyzedToken(0).getToken().equals("не")) {
              //          reqTokenReadings = null;
              continue;
            }


          }
        }

        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, reqTokenReadings, posTagsToFind);
        ruleMatches.add(potentialRuleMatch);
      }

      reqTokenReadings = null;
    }

    return toRuleMatchArray(ruleMatches);
  }

  private static boolean isCapitalized(String token) {
    return token.length() > 1 && Character.isUpperCase(token.charAt(0)) && Character.isLowerCase(token.charAt(1));
  }

  private boolean reverseSearch(AnalyzedTokenReadings[] tokens, int pos, String string) {
    for(int i=pos-1; i >= 0 && i > pos-4; i--) {
      if( tokens[i].getAnalyzedToken(0).getToken().equalsIgnoreCase("що") )
        return true;
    }
    return false;
  }

  private boolean isTokenToSkip(AnalyzedTokenReadings tokenReadings) {
    for(AnalyzedToken token: tokenReadings) {
//      System.out.println("    tag: " + token.getPOSTag() + " for " + token.getToken());
      if( IPOSTag.adv.match(token.getPOSTag())
          || IPOSTag.contains(token.getPOSTag(), "adv>")
          ||  IPOSTag.insert.match(token.getPOSTag()) )
        return true;
    }
    return false;
  }

//  private boolean isTokenToIgnore(AnalyzedTokenReadings tokenReadings) {
//    for(AnalyzedToken token: tokenReadings) {
//      if( token.getPOSTag().contains("abbr") )
//        return true;
//    }
//    return false;
//  }

  private boolean hasRequiredPosTag(Collection<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
    boolean vidminokFound = false// because POS dictionary is not complete

    for(AnalyzedToken token: tokenReadings) {
      String posTag = token.getPOSTag();

      if( posTag == null || posTag.contains(NO_VIDMINOK_SUBSTR) )
        return true;

      if( posTag.contains(VIDMINOK_SUBSTR) ) {
        vidminokFound = true;

        for(String posTagToFind: posTagsToFind) {
          //          System.out.println("  verifying: " + token + " -> " + posTag + " ~ " + posTagToFind);

          if ( posTag.contains(posTagToFind) )
            return true;
        }
      }
    }

    return ! vidminokFound; //false;
  }

  private RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings, AnalyzedTokenReadings reqTokenReadings, List<String> posTagsToFind) {
    String tokenString = tokenReadings.getToken();

    Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();

    ArrayList<String> suggestions = new ArrayList<String>();
    String oldPosTag = tokenReadings.getAnalyzedToken(0).getPOSTag();
    String requiredPostTagsRegEx = ":(" + StringUtils.join(posTagsToFind,"|") + ")";
    String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegEx);

    //    System.out.println("  creating suggestion for " + tokenReadings + " / " + tokenReadings.getAnalyzedToken(0) +" and tag " + posTag);

    try {
      String[] synthesized = ukrainianSynthesizer.synthesize(tokenReadings.getAnalyzedToken(0), posTag, true);

      //      System.out.println("Synthesized: " + Arrays.asList(synthesized));
      suggestions.addAll( Arrays.asList(synthesized) );
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    ArrayList<String> reqVidminkyNames = new ArrayList<String>();
    for (String vidm: posTagsToFind) {
      reqVidminkyNames.add(UkrainianTagger.VIDMINKY_MAP.get(vidm));
    }

    ArrayList<String> foundVidminkyNames = new ArrayList<String>();
    for(AnalyzedToken token: tokenReadings) {
      String posTag2 = token.getPOSTag();
      if( posTag2.contains(VIDMINOK_SUBSTR) ) {
        foundVidminkyNames.add(UkrainianTagger.VIDMINKY_MAP.get(posTag2.replaceFirst("^.*"+VIDMINOK_REGEX+".*$", "$1")));
      }
    }

    String msg = MessageFormat.format("Прийменник «{0}» вимагає іншого відмінка: {1}, а знайдено: {2}",
        reqTokenReadings.getToken(), StringUtils.join(reqVidminkyNames, ", "), StringUtils.join(foundVidminkyNames, ", "));
       
    if( tokenString.equals("їх") ) {
      msg += ". Можливо тут потрібно присвійний займенник «їхній»?";
      try {
        String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
        String[] synthesized = ukrainianSynthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron", "їхній"), newYihPostag, true);
        suggestions.addAll( Arrays.asList(synthesized) );
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
       
    int pos = tokenReadings.getStartPos();

    RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos + tokenString.length(), msg, getShort());

    potentialRuleMatch.setSuggestedReplacements(suggestions);

    return potentialRuleMatch;
  }

  @Override
  public void reset() {
  }

}
TOP

Related Classes of org.languagetool.rules.uk.TokenAgreementRule

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.