Package org.languagetool

Examples of org.languagetool.AnalyzedTokenReadings
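
AnalyzedTokenReadings bundles one or more AnalyzedToken readings (surface form, POS tag, lemma) for a single position in an analyzed sentence, and the snippets below show how different LanguageTool modules query it. As a minimal, self-contained sketch of the API those snippets rely on (the token, tag, and lemma values here are made up for illustration, not taken from a real tagger):

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

public class AnalyzedTokenReadingsSketch {
  public static void main(String[] args) {
    // One hypothetical reading: surface form, POS tag, lemma (values are illustrative):
    AnalyzedTokenReadings readings =
        new AnalyzedTokenReadings(new AnalyzedToken("Häuser", "SUB:NOM:PLU:NEU", "Haus"), 0);

    System.out.println(readings.getToken());                      // "Häuser"
    System.out.println(readings.getReadingsLength());             // 1
    System.out.println(readings.hasPartialPosTag("SUB"));         // true (substring match on the POS tag)
    System.out.println(readings.getAnalyzedToken(0).getLemma());  // "Haus"
    System.out.println(readings.getStartPos());                   // 0
  }
}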


    // ignore "im Allgemeinen gilt" but not "im Allgemeinen Fall":
    return "im".equalsIgnoreCase(prevToken) && "Allgemeinen".equals(token) && !hasNounReading(nextReadings);
  }

  private boolean isAdjectiveAsNoun(int i, AnalyzedTokenReadings[] tokens) {
    AnalyzedTokenReadings prevToken = i > 0 ? tokens[i-1] : null;
    boolean isPrevDeterminer = prevToken != null && (prevToken.hasPartialPosTag("ART") || prevToken.hasPartialPosTag("PRP"));
    if (!isPrevDeterminer) {
      return false;
    }
    AnalyzedTokenReadings nextReadings = i < tokens.length-1 ? tokens[i+1] : null;
    for (AnalyzedToken reading : tokens[i].getReadings()) {
      String posTag = reading.getPOSTag();
      // ignore "die Ausgewählten" but not "die Ausgewählten Leute":
      if (posTag != null && posTag.contains(":ADJ") && !hasNounReading(nextReadings)) {
        return true;
      }
    }
    return false;
  }
View Full Code Here
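
The hasNounReading(...) helper called in the snippets above is not part of the excerpt. A minimal sketch of what such a check could look like, assuming the German tagset marks noun readings with a "SUB" tag (the actual implementation may do more, e.g. also consider lowercase variants):

  // Hypothetical stand-in for hasNounReading(...); not shown in the excerpt above.
  private boolean hasNounReading(AnalyzedTokenReadings readings) {
    // Null-safe: callers pass null when there is no next token.
    return readings != null && readings.hasPartialPosTag("SUB");
  }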


  private boolean isLanguage(int i, AnalyzedTokenReadings[] tokens) {
    String token = tokens[i].getToken();
    boolean maybeLanguage = languages.contains(token) ||
                            languages.contains(token.replaceFirst("e$", "")) ||  // e.g. "ins Japanische übersetzt" ("translated into Japanese")
                            languages.contains(token.replaceFirst("en$", ""));   // e.g. "im Japanischen" ("in Japanese")
    AnalyzedTokenReadings prevToken = i > 0 ? tokens[i-1] : null;
    AnalyzedTokenReadings nextReadings = i < tokens.length-1 ? tokens[i+1] : null;
    return maybeLanguage && ((nextReadings != null && !hasNounReading(nextReadings)) ||
                             (prevToken != null && prevToken.getToken().equals("auf")));
  }
View Full Code Here
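
The languages collection used above is also outside the excerpt. It presumably holds language names in their base form, so that stripping a trailing "e" or "en" maps inflected forms ("Japanische", "Japanischen") back onto an entry. A sketch, assuming a plain java.util.Set with illustrative entries:

  // Hypothetical stand-in for the `languages` field referenced above (entries are illustrative only):
  private static final Set<String> languages = new HashSet<>(Arrays.asList(
      "Deutsch", "Englisch", "Französisch", "Japanisch", "Spanisch"));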

  private boolean isProbablyCity(int i, AnalyzedTokenReadings[] tokens) {
    String token = tokens[i].getToken();
    boolean hasCityPrefix = "Klein".equals(token) || "Groß".equals(token) || "Neu".equals(token);
    if (hasCityPrefix) {
      AnalyzedTokenReadings nextReadings = i < tokens.length-1 ? tokens[i+1] : null;
      return nextReadings != null && (!nextReadings.isTagged() || nextReadings.hasPartialPosTag("EIG"));
    }
    return false;
  }
View Full Code Here
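
All three helpers follow the same pattern: they receive the full token array plus the index of the current token, so they can peek at the previous and next AnalyzedTokenReadings. Presumably they are called from the rule's main token loop, roughly like this (a sketch of the calling pattern, not the actual rule code):

    // Hypothetical calling pattern for the helpers above (sketch only):
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 0; i < tokens.length; i++) {
      if (isAdjectiveAsNoun(i, tokens) || isLanguage(i, tokens) || isProbablyCity(i, tokens)) {
        continue;  // an uppercase adjective, language name, or city name is acceptable here
      }
      // ... otherwise apply the usual capitalization checks ...
    }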

  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();   

    AnalyzedTokenReadings reqTokenReadings = null;
    int i = -1;
    for (AnalyzedTokenReadings tokenReadings: tokens) {
      i++;

      String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();

      //TODO: skip conjunctions, e.g. «бодай»

      if (posTag == null || posTag.contains(IPOSTag.unknown.getText()) || posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
        reqTokenReadings = null;
        continue;
      }

      String token = tokenReadings.getAnalyzedToken(0).getToken();
      if( posTag.contains(REQUIRE_VIDMINOK_SUBSTR) && tokenReadings.getReadingsLength() == 1 ) {
        String prep = token;

        if( prep.equals("за") && reverseSearch(tokens, i, "що") )
          continue;

        if( prep.equalsIgnoreCase("понад") )
          continue;

        if( (prep.equalsIgnoreCase("окрім") || prep.equalsIgnoreCase("крім"))
            && tokens.length > i+1 && tokens[i+1].getAnalyzedToken(0).getToken().equalsIgnoreCase("як") ) {
          reqTokenReadings = null;
          continue;
        }

        reqTokenReadings = tokenReadings;
        continue;
      }

      if( reqTokenReadings == null )
        continue;

      List<String> posTagsToFind = new ArrayList<>();

      //      if( tokens.length > i+1 && Character.isUpperCase(tokenReadings.getAnalyzedToken(0).getToken().charAt(0))
      //        && hasRequiredPosTag(Arrays.asList("v_naz"), tokenReadings)
      //        && Character.isUpperCase(tokens[i+1].getAnalyzedToken(0).getToken().charAt(0)) )
      //          continue; // "у Конан Дойла"


      //TODO: for numerics only v_naz
      if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("понад") ) { //&& tokenReadings.getAnalyzedToken(0).getPOSTag().equals(IPOSTag.numr) ) {
        posTagsToFind.add("v_naz");
      }

      String reqPosTag = reqTokenReadings.getAnalyzedToken(0).getPOSTag();

      Matcher matcher = REQUIRE_VIDMINOK_REGEX.matcher(reqPosTag);
      while( matcher.find() ) {
        posTagsToFind.add(matcher.group(1));
      }

      for(AnalyzedToken readingToken: tokenReadings) {
        if( IPOSTag.numr.match(readingToken.getPOSTag()) ) {
          posTagsToFind.add("v_naz")// TODO: only if noun is following?
          break;
        }
      }

      //      System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
      if( ! hasRequiredPosTag(posTagsToFind, tokenReadings) ) {
        if( isTokenToSkip(tokenReadings) )
          continue;

//        if( isTokenToIgnore(tokenReadings) ) {
//          reqTokenReadings = null;
//          continue;
//        }

        String prep = reqTokenReadings.getAnalyzedToken(0).getToken();
        if( prep.equalsIgnoreCase("до") ) {
          if( tokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("Я") ) {  // "від А до Я" ("from A to Z")
            reqTokenReadings = null;
            continue;
          }
        }

        if( prep.equalsIgnoreCase("в") || prep.equalsIgnoreCase("у") ) {
          if( hasRequiredPosTag(Arrays.asList("p:v_naz"), tokenReadings) ) {  //TODO: only for subset: президенти/депутати/мери/гості... or by verb піти/йти/балотуватися/записатися...
            reqTokenReadings = null;
            continue;
          }
        }

        // exceptions
        if( tokens.length > i+1 ) {
          if( isCapitalized( token )
              && STREETS.contains( tokens[i+1].getAnalyzedToken(0).getToken()) ) {
            reqTokenReadings = null;
            continue;
          }

          if( IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag())
              && (token.equals("мінус") || token.equals("плюс")
                  || token.equals("мінімум") || token.equals("максимум") ) ) {
            reqTokenReadings = null;
            continue;
          }

          if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("через")
              && token.equals("років")
              && IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag()) ) {
            reqTokenReadings = null;
            continue;
          }
View Full Code Here
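
hasRequiredPosTag(...) is used throughout the rule but not included in the excerpt. Conceptually it checks whether any reading of the token carries one of the required case tags (such as "v_naz"). A minimal sketch of that idea, assuming the case name appears as a substring of the POS tag (the real method may be more involved):

  // Hypothetical sketch of the hasRequiredPosTag(...) check used above; not shown in the excerpt.
  private boolean hasRequiredPosTag(List<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
    for (AnalyzedToken reading : tokenReadings) {
      String posTag = reading.getPOSTag();
      if (posTag == null) {
        continue;
      }
      for (String requiredTag : posTagsToFind) {
        if (posTag.contains(requiredTag)) {
          return true;
        }
      }
    }
    return false;
  }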

    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      // a real tagger would need to assign a POS tag
      // in the next line instead of null:
      l.add(new AnalyzedToken(word, null, null));
      tokenReadings.add(new AnalyzedTokenReadings(l, 0));
    }
    return tokenReadings;
  }
View Full Code Here
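
The comment in the loop above notes that a real tagger would assign a POS tag instead of null. A sketch of that idea with a toy lookup table standing in for a real morphological dictionary (tags and lemmas are made up; assumes the usual java.util imports):

    // Toy "dictionary" standing in for a real morphological lookup (illustrative only):
    Map<String, AnalyzedToken> toyDictionary = new HashMap<>();
    toyDictionary.put("houses", new AnalyzedToken("houses", "NNS", "house"));

    for (String word : sentenceTokens) {
      List<AnalyzedToken> l = new ArrayList<>();
      AnalyzedToken known = toyDictionary.get(word);
      l.add(known != null ? known : new AnalyzedToken(word, null, null));  // fall back to an untagged reading
      tokenReadings.add(new AnalyzedTokenReadings(l, 0));
    }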

    return tokenReadings;
  }

  @Override
  public AnalyzedTokenReadings createNullToken(String token, int startPos) {
    return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos);
  }
View Full Code Here

      }
    }

    // Clear the attributes, since we are creating new tokens.
    clearAttributes();
    final AnalyzedTokenReadings tr = tokenIter.next();

    // add POS tag for sentence start.
    if (tr.isSentenceStart()) {
      // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
      // but breaks other cases:
      //termAtt.append("SENT_START");
      typeAtt.setType("pos");
      String posTag = tr.getAnalyzedToken(0).getPOSTag();
      String lemma = tr.getAnalyzedToken(0).getLemma();
      if (toLowerCase) {
        termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
        if (lemma != null) {
          termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
        }
      } else {
        termAtt.append(POS_PREFIX).append(posTag);
        if (lemma != null) {
          termAtt.append(LEMMA_PREFIX).append(lemma);
        }
      }
      return true;
    }

    // Skip over whitespace tokens.
    if (tr.isWhitespace()) {
      return this.incrementToken();
    }

    final AnalyzedToken at = tr.getAnalyzedToken(0);
    offsetAtt.setOffset(tr.getStartPos(), tr.getStartPos() + at.getToken().length());

    for (AnalyzedToken token : tr) {
      if (token.getPOSTag() != null) {
        if (toLowerCase) {
          posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
        } else {
          posStack.push(POS_PREFIX + token.getPOSTag());
        }
      }
      if (token.getLemma() != null) {
        if (toLowerCase) {
          posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
        } else {
          // chances are good this is the same for all loop iterations, store it anyway...
          posStack.push(LEMMA_PREFIX + token.getLemma());
        }
      }
    }

    current = captureState();
    if (toLowerCase) {
      termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
    } else {
      termAtt.append(tr.getAnalyzedToken(0).getToken());
    }

    return true;

  }
View Full Code Here
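
The for (AnalyzedToken token : tr) loop above works because AnalyzedTokenReadings is iterable over its individual readings. The same pattern in isolation (assuming the usual java.util imports; tokens, tags, and lemmas are made up):

    // Standalone sketch of iterating over the readings of one token (values are illustrative):
    List<AnalyzedToken> readings = Arrays.asList(
        new AnalyzedToken("walks", "VBZ", "walk"),
        new AnalyzedToken("walks", "NNS", "walk"));
    AnalyzedTokenReadings tr = new AnalyzedTokenReadings(readings, 0);
    for (AnalyzedToken reading : tr) {
      System.out.println(reading.getPOSTag() + " -> " + reading.getLemma());
    }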

      if (l.isEmpty()) {
        l.add(new AnalyzedToken(word, null, null));
      }

      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }

    return tokenReadings;
  }
View Full Code Here

    }
  }

  @Override
  public final AnalyzedTokenReadings createNullToken(final String token, final int startPos) {
    return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos);
  }
View Full Code Here

import org.languagetool.rules.patterns.MatchState;

public class MatchTest extends TestCase {

  private AnalyzedTokenReadings getAnalyzedTokenReadings(String token, String posTag, String lemma) {
    return new AnalyzedTokenReadings(new AnalyzedToken(token, posTag, lemma), 0);
  }
View Full Code Here
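
A hypothetical test method using this helper (not part of the original MatchTest; the token, POS tag, and lemma values are made up):

  // Hypothetical usage of the helper above; values are illustrative only.
  public void testHelperBuildsSingleReading() {
    AnalyzedTokenReadings readings = getAnalyzedTokenReadings("Häuser", "SUB:NOM:PLU:NEU", "Haus");
    assertEquals("Häuser", readings.getToken());
    assertEquals(1, readings.getReadingsLength());
    assertEquals("Haus", readings.getAnalyzedToken(0).getLemma());
    assertEquals("SUB:NOM:PLU:NEU", readings.getAnalyzedToken(0).getPOSTag());
  }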
