Package morfologik.stemming

Examples of morfologik.stemming.IStemmer


    return tokenReadings;
  }

  @Override
  public List<AnalyzedToken> additionalTags(String word) {
    final IStemmer dictLookup;
    try {
      dictLookup = new DictionaryLookup(getDictionary());
    } catch (IOException e) {
      throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
    }
    List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
    //Any well-formed adverb with suffix -ment is tagged as an adverb (RG)
    //Adjectiu femení singular o participi femení singular + -ment
    if (word.endsWith("ment")){
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
      for (AnalyzedToken taggerToken : taggerTokens ) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = ADJ_PART_FS.matcher(posTag);
          if (m.matches()) {
            additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
            return additionalTaggedTokens;
          }
        }
      }
    }
    //Any well-formed verb with prefixes is tagged as a verb copying the original tags
    Matcher matcher=PREFIXES_FOR_VERBS.matcher(word);
    if (matcher.matches()) {
      final String possibleVerb = matcher.group(2).toLowerCase();
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
      for (AnalyzedToken taggerToken : taggerTokens ) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = VERB.matcher(posTag);
          if (m.matches()) {
            String lemma=matcher.group(1).toLowerCase().concat(taggerToken.getLemma());
            additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
          }
        }
      }
      return additionalTaggedTokens;
    }
    // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
    /*if (word.startsWith("ex")) {
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleNoun,dictLookup.lookup(possibleNoun));
      for (AnalyzedToken taggerToken : taggerTokens) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = NOUN.matcher(posTag);
          if (m.matches()) {
            String lemma = "ex".concat(taggerToken.getLemma());
            additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
          }
        }
      }
      return additionalTaggedTokens;
    }*/
    // Interpret deprecated characters of "ela geminada"
    // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
    // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
    if (word.contains("\u0140") || word.contains("\u013f")) {
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleWord = lowerWord.replaceAll("\u0140", "l·");
      List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(word,dictLookup.lookup(possibleWord));
      return taggerTokens;
    }
    return null;
  }
View Full Code Here


      p = Pattern.compile("N.*|A.*|V.P.*|PX.");
    } else {
      p = Pattern.compile(posTag);
    }
    final List<String> results = new ArrayList<>();
    final IStemmer synthesizer = createStemmer();
   
    for (final String tag : possibleTags) {
      final Matcher m = p.matcher(tag);
      if (m.matches()) {
        if (addDt) {
View Full Code Here

  public final String[] synthesize(final AnalyzedToken token,
      final String posTag) throws IOException {
    if (posTag == null) {
      return null;
    }
    final IStemmer synthesizer = new DictionaryLookup(getDictionary());
    boolean isNegated = false;
    if (token.getPOSTag() != null) {
      isNegated = posTag.indexOf(NEGATION_TAG) > 0
          || token.getPOSTag().indexOf(NEGATION_TAG) > 0
          && !(posTag.indexOf(COMP_TAG) > 0) && !(posTag.indexOf(SUP_TAG) > 0);
View Full Code Here

    if (posTagRegExp) {
      if (possibleTags == null) {
        possibleTags = SynthesizerTools.loadWords(JLanguageTool.getDataBroker().
            getFromResourceDirAsStream(TAGS_FILE_NAME));
      }
      final IStemmer synthesizer = new DictionaryLookup(getDictionary());
      final List<String> results = new ArrayList<>();

      boolean isNegated = false;
      if (token.getPOSTag() != null) {
        isNegated = posTag.indexOf(NEGATION_TAG) > 0
View Full Code Here

        }
        return ret;       
    }
   
    private IStemmer loadDictionary() throws IOException {
        IStemmer dictLookup = new DictionaryLookup(Dictionary.read(dictFile));
        return dictLookup;
    }
View Full Code Here

    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;   
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(getDictionary());

    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(plLocale);
      taggerTokens = asAnalyzedTokenList(word, morfologik.lookup(word));
      lowerTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(lowerWord));      
      final boolean isLowercase = word.equals(lowerWord);

      //normal case
      addTokens(taggerTokens, l);

      if (!isLowercase) {
        //lowercase
        addTokens(lowerTaggerTokens, l);
      }

      //uppercase
      if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
        if (isLowercase) {
          upperTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(StringTools
              .uppercaseFirstChar(word)));
          if (!upperTaggerTokens.isEmpty()) {
            addTokens(upperTaggerTokens, l);
          } else {
            l.add(new AnalyzedToken(word, null, null));
View Full Code Here

    this.manualSynthesizer = manualSynthesizer;
  }

  @Override
  protected IStemmer createStemmer() {
    return new IStemmer() { // null synthesiser
      @Override
      public List<WordData> lookup(CharSequence word) {
        return new ArrayList<>();
      }
    };
View Full Code Here

TOP

Related Classes of morfologik.stemming.IStemmer

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.