Source Code of org.languagetool.dev.blogs.BlogFetcher

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.blogs;


import com.fasterxml.jackson.databind.ObjectMapper;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import de.abelssoft.tools.FileTools;
import org.languagetool.tools.StringTools;


import java.io.*;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Downloads blog content with the help of the readability.com API.
 * @since 2.7
 */
class BlogFetcher {


  private static final String READABILITY_API_KEY_FILE = "/home/dnaber/.readability-parser.txt";
  // e.g. <link rel="alternate" type="application/rss+xml" href="http://www.mimikama.at/feed/" />:
  private static final Pattern linkPattern = Pattern.compile("<link[^>]+?type=\"application/rss\\+xml\"[^>]+?/>", Pattern.DOTALL);
  private static final Pattern linkHrefPattern = Pattern.compile("href=[\"'](.*?)[\"']");


  private final String secretReadabilityToken;


  BlogFetcher(String secretReadabilityToken) {
    this.secretReadabilityToken = secretReadabilityToken;
  }


  private List<String> getBlogContent(String url) throws IOException {
    List<String> result = new ArrayList<>();
    String content = getContent(new URL(url));
    Matcher matcher = linkPattern.matcher(content);
    if (matcher.find()) {
      String linkContent = matcher.group();
      Matcher hrefMatcher = linkHrefPattern.matcher(linkContent);
      if (hrefMatcher.find()) {
        String feedUrl = hrefMatcher.group(1);
        SyndFeedInput input = new SyndFeedInput();
        try {
          SyndFeed feed = input.build(new XmlReader(new URL(feedUrl)));
          List<String> contentList = getContent(feed.getEntries());
          result.addAll(contentList);
          return result;
        } catch (Exception e) {
          throw new RuntimeException("Could not get feed data from " + feedUrl, e);
        }
      } else {
        System.err.println("No 'href' found for feed: " + url);
      }
    }
    System.err.println("No '<link>' found for feed: " + url);
    return result;
  }


  private List<String> getContent(List entries) throws IOException {
    List<String> result = new ArrayList<>();
    for (Object entry : entries) {
      SyndEntryImpl syndEntry = (SyndEntryImpl) entry;
      System.out.println("  Getting " + syndEntry.getUri());
      String json = getPageContent(syndEntry.getUri());
      ObjectMapper mapper = new ObjectMapper();
      Map map = mapper.readValue(json, Map.class);
      //System.out.println("json: " + json);
      //System.out.println("map: " + o);
      //System.out.println("content: " + o.get("content"));
      result.add(map.get("content").toString());
    }
    return result;
  }


  private String getContent(URL pageUrl) throws IOException {
    try (InputStream inputStream = pageUrl.openStream()) {
      return StringTools.streamToString(inputStream, "utf-8");
    }
  }


  private String getPageContent(String pageUrl) throws IOException {
    if (!pageUrl.startsWith("http")) {
      throw new IllegalArgumentException("Invalid feed URL: " + pageUrl);
    }
    URL url = new URL("https://www.readability.com/api/content/v1/parser?url=" + pageUrl + "&token=" + secretReadabilityToken);
    return getContent(url);
  }


  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.err.println("Usage: " + BlogFetcher.class.getSimpleName() + " <urlListFile> <outputDir>");
      System.exit(1);
    }
    String secret = FileTools.loadFile(new FileInputStream(READABILITY_API_KEY_FILE), "utf-8").trim();
    BlogFetcher fetcher = new BlogFetcher(secret);
    File outputDir = new File(args[1]);
    if (!outputDir.exists() || !outputDir.isDirectory()) {
      System.err.println("Output directory does not exist or is not a directory: " + outputDir);
      System.exit(1);
    }
    try (Scanner scanner = new Scanner(new File(args[0]))) {
      while (scanner.hasNextLine()) {
        String url = scanner.nextLine();
        try {
          File output = new File(outputDir, new URL(url).getHost());
          System.out.println("Working on " + url + ", writing result to " + output);
          List<String> blogContentList = fetcher.getBlogContent(url);
          try (FileWriter writer = new FileWriter(output)) {
            for (String content : blogContentList) {
              writer.write(content);
              writer.write("\n");
            }
          }
        } catch (Exception e) {
          //noinspection CallToPrintStackTrace
          e.printStackTrace();
        }
      }
    }
  }


}
Source Code of org.languagetool.dev.blogs.BlogFetcher

Related Classes of org.languagetool.dev.blogs.BlogFetcher