Package org.archive.modules.writer

Source Code of org.archive.modules.writer.Kw3WriterProcessor

/* Created on 2006-okt-03
*
* Copyright (C) 2006 National Library of Sweden.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

package org.archive.modules.writer;

import static org.archive.modules.writer.Kw3Constants.ARCHIVE_TIME_KEY;
import static org.archive.modules.writer.Kw3Constants.COLLECTION_KEY;
import static org.archive.modules.writer.Kw3Constants.CONTENT_LENGTH_KEY;
import static org.archive.modules.writer.Kw3Constants.CONTENT_MD5_KEY;
import static org.archive.modules.writer.Kw3Constants.HARVESTER_KEY;
import static org.archive.modules.writer.Kw3Constants.HEADER_LENGTH_KEY;
import static org.archive.modules.writer.Kw3Constants.HEADER_MD5_KEY;
import static org.archive.modules.writer.Kw3Constants.IP_ADDRESS_KEY;
import static org.archive.modules.writer.Kw3Constants.STATUS_CODE_KEY;
import static org.archive.modules.writer.Kw3Constants.URL_KEY;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.io.ReplayInputStream;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.spring.ConfigPath;
import org.archive.util.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;

/**
* Processor module that writes the results of successful fetches to
* files on disk. These files are MIME-files of the type used by the
* Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/].
* Each URI gets written to its own file and has a path consisting of:
* <ul>
<li> A dir named with the first two chars of the website's md5. </li>
<li> A dir named after the website. </li>
<li> 'current' - a dir indicating that this is the directory being written
*                   to by the ongoing crawl. </li>
<li> A file on the format <md5 of url>.<fetchtime in seconds> </li>
* </ul>
* Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
*
* The MIME-file itself consists of three parts:
* <ul>
<li> 1. ArchiveInfo - Metadata about the file and its content. </li>
<li> 2. Header - The HTTP response header. </li>
<li> 3. Content - The HTTP response content, plus content-type. </li>
* </ul>
*
* @author oskar
*/
public class Kw3WriterProcessor extends Processor {


  @SuppressWarnings("unused")
  private static final long serialVersionUID = 3L;
 
  private static String COLON = ":";
  private static String WS = " ";
  private static String LF = "\n";
 
  /**
   * Logger.
   */
  private static final Logger logger =
      Logger.getLogger(Kw3WriterProcessor.class.getName());


  /**
   * Top-level directory for archive files.
   */
  protected ConfigPath path = new ConfigPath("Kw3Writer subdirectory","arcs");
  public ConfigPath getPath() {
      return this.path;
  }
  public void setPath(ConfigPath s) {
      this.path = s;
  }
 
  /**
   * Max size for each file.
   */
  protected long maxFileSizeBytes = 100000000L;
  public long getMaxFileSizeBytes() {
      return maxFileSizeBytes;
  }
  public void setMaxFileSizeBytes(long maxFileSizeBytes) {
      this.maxFileSizeBytes = maxFileSizeBytes;
  }
 
  /**
   * Should permissions be changed for the newly created dirs.
   */
  protected boolean chmod = false;
  public boolean getChmod() {
      return chmod;
  }
  public void setChmod(boolean chmod) {
      this.chmod = chmod;
  }

  /**
   * What should the permissions be set to. Given as three octal digits, as to
   * the UNIX 'chmod' command. Ex. 777 for all permissions to everyone.
   */
  protected String chmodValue = "777";
  public String getChmodValue() {
      return this.chmodValue;
  }
  public void setChmodValue(String s) {
      this.chmodValue = s;
  }


  /**
   * Max size for each file.Key for the maximum ARC bytes to write attribute.
   */
  public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";
 
 
  /**
   * Name of collection.
   */
  protected String collection = "kw3";
  public String getCollection() {
      return this.collection;
  }
  public void setCollection(String s) {
      this.collection = s;
  }

  /**
   * Name of the harvester that is used for the web harvesting.
   */
  protected String harvester = "heritrix";
  public String getHarvester() {
      return this.harvester;
  }
  public void setHarvester(String s) {
      this.harvester = s;
  }
 
  /**
   * The server cache to use.
   */
  protected ServerCache serverCache;
  public ServerCache getServerCache() {
      return this.serverCache;
  }
  @Autowired
  public void setServerCache(ServerCache serverCache) {
      this.serverCache = serverCache;
  }

  private static String BOUNDARY_START = "KulturArw3_";
 
  /**
   * Constructor.
   */
  public Kw3WriterProcessor() {
  }
 
  protected boolean shouldProcess(CrawlURI curi) {
      // Only successful fetches are written.
      if (!isSuccess(curi)) {
          return false;
      }
     
      // Only http and https schemes are supported.
      String scheme = curi.getUURI().getScheme().toLowerCase();
      if (!"http".equalsIgnoreCase(scheme)
              && !"https".equalsIgnoreCase(scheme)) {
          return false;               
      }
     
      return true;
  }


  protected void innerProcess(CrawlURI curi) {     
      // Write the MIME-file
      try {
          writeMimeFile(curi);
      } catch (IOException e) {
          logger.log(Level.WARNING, "i/o error", e);
      }     
  }
 
  /**
   * The actual writing of the Kulturarw3 MIME-file.
   *
   * The MIME-file consists of three parts:
   * 1. ArchiveInfo - Metadata about the file and its content.
   * 2. Header - The HTTP response header.
   * 3. Content - The HTTP response content, plus content-type.
   *
   * For more on this format, see '?'.
   */
  protected void writeMimeFile(CrawlURI curi) throws IOException {
      ReplayInputStream ris = null;
      OutputStream out = null;
               
      try {
          String boundary = BOUNDARY_START + stringToMD5(curi.toString());
          ris = curi.getRecorder().getRecordedInput().
              getReplayInputStream();
          out = initOutputStream(curi);
         
          // Part 1: Archive info
          writeArchiveInfoPart(boundary, curi, ris, out);

          // Part 2: Header info + HTTP header
          writeHeaderPart(boundary, ris, out);

          // Part 3: Content info + HTTP content
          writeContentPart(boundary, curi, ris, out);

          // And finally the terminator string
          String terminator = "\n--" + boundary + "--\n";
          out.write(terminator.getBytes());
      } finally {
          if (ris != null)
              ris.close();
          if (out != null)
              out.close();
      }
  }
 
  /**
   * Get the OutputStream for the file to write to.
   *
   * It has a path consisting of:
   * 1. A dir named with the first two chars of the website's md5.
   * 2. A dir named after the website.
   * 3. 'current' - a dir indicating that this is the directory being written
   *                to by the ongoing crawl.
   * 4. A file on the format <md5 of url>.<fetchtime in seconds>
   *
   * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'           
   */
  protected OutputStream initOutputStream(CrawlURI curi) throws IOException {
      String uri = curi.toString();
      int port = curi.getUURI().getPort();
      String host = (port == 80 || port <= 0) ?
              curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port;
      long fetchTime = curi.getFetchBeginTime() / 1000;
            
      String md5 = stringToMD5(host);
      File dir = new File(getPath().getFile(), md5.substring(0, 2) + "/" + host +
              "/current");
      if (!dir.exists()) {
          FileUtils.ensureWriteableDirectory(dir);
          if (this.chmod)
              chmods(dir, getPath().getFile());
      }
      md5 = stringToMD5(uri);
      File arcFile = new File(dir, md5 + "." + fetchTime);
      return new FastBufferedOutputStream(new FileOutputStream(arcFile));      
  }
 
  protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
          ReplayInputStream ris, OutputStream out)
          throws IOException {
      // Get things we need to write in this part
      String uri = curi.toString();
      String ip = getHostAddress(curi);
      long headerLength = ris.getHeaderSize();
      long contentLength = ris.getContentSize();
      long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds
      int statusCode = curi.getFetchStatus();
      String headerMd5 = null;
      Object contentMd5 = null;      
     
      // Get headerMd5
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      ris.readHeaderTo(baos);
      headerMd5 = stringToMD5(baos.toString());             
     
      // Get contentMd5
      contentMd5 = curi.getContentDigest();
      if (contentMd5 != null)
          contentMd5 = getHexString((byte[]) contentMd5);
     
      StringBuffer buffer = new StringBuffer();
      buffer.append("MIME-version: 1.1" + LF);
      buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF);
      buffer.append("HTTP-Part: ArchiveInfo" + LF);
      buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF);
      buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF);
      buffer.append(URL_KEY + COLON + WS + uri + LF);
      buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
      buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF);
      buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
      buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF);
      buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
      buffer.append(ARCHIVE_TIME_KEY + COLON + WS+ archiveTime + LF);
      buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF);      
      out.write(buffer.toString().getBytes());      
  }
 
  protected void writeHeaderPart(String boundary, ReplayInputStream ris,
          OutputStream out)
          throws IOException {
      StringBuffer buffer = new StringBuffer();
      buffer.append("--" + boundary + LF);
      buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF);
      buffer.append("HTTP-Part: Header" + LF + LF );
      out.write(buffer.toString().getBytes());
      ris.readHeaderTo(out);      
  }
 
  protected void writeContentPart(String boundary, CrawlURI curi,
          ReplayInputStream ris, OutputStream out)
          throws IOException {
      // Get things we need to write in this part
      String uri = curi.toString();
      String contentType = curi.getContentType();
      long contentLength = ris.getContentSize();     
      // Only write content if there is some
      if (contentLength == 0)   return;
            
      StringBuffer buffer = new StringBuffer();
      buffer.append("--" + boundary + LF);
      buffer.append("Content-Type: " + contentType + LF);
      buffer.append("HTTP-Part: Content" + LF + LF);
      out.write(buffer.toString().getBytes());
     
      if (contentLength > getMaxFileSizeBytes()) {
          ris.readContentTo(out, getMaxFileSizeBytes());
          logger.info(" Truncated url: " + uri + ", Size: " + contentLength +
                  ", Content-type: " + contentType);
      } else {
          ris.readContentTo(out);
      }
  }

  // --- Private helper functions --- //
  /*
   * Get a MD5 checksum based on a String.
   */
  private String stringToMD5(String str) {
      try {
          byte b[] = str.getBytes();
          MessageDigest md = MessageDigest.getInstance("MD5");
          md.update(b);
          byte[] digest = md.digest();
          return getHexString(digest);
      } catch (NoSuchAlgorithmException e) {
          logger.log(Level.WARNING, "md5 error", e);
      }
      return null;
  }

  /*
   * Fast convert a byte array to a hex string with possible leading zero.
   */
  private String getHexString(byte[] b) {
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < b.length; i++) {
          String tmp = Integer.toHexString(b[i] & 0xff);
          if (tmp.length() < 2)
              sb.append("0" + tmp);
          else
              sb.append(tmp);
      }
      return sb.toString();
  }

  /*
   * Chmods for all newly created directories.
   */
  private void chmods(File dir, File arcsDir) {
      String topdir = arcsDir.getAbsolutePath();
      chmod(dir, this.chmodValue);
      File parent = dir.getParentFile();
      while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) {
          chmod(parent, this.chmodValue);
          parent = parent.getParentFile();
      }
     
  }

  /*
   * Chmod for a specific file or directory.
   */
  private void chmod(File file, String permissions) {
      Process proc = null;
      try {
          proc = Runtime.getRuntime().exec("chmod " + permissions + " " +
                  file.getAbsolutePath());
          proc.waitFor();
          proc.getInputStream().close();
          proc.getOutputStream().close();
          proc.getErrorStream().close();
      } catch (IOException e) {
          logger.log(Level.WARNING, "chmod failed", e);
      } catch (InterruptedException e) {
          logger.log(Level.WARNING, "chmod failed", e);
      }
  }

  private String getHostAddress(CrawlURI curi) {
      CrawlHost h = serverCache.getHostFor(curi.getUURI());
      if (h == null) {
          throw new NullPointerException("Crawlhost is null for " + curi + " " +
                  curi.getVia());
      }
      InetAddress a = h.getIP();
      if (a == null) {
          throw new NullPointerException("Address is null for " + curi + " " +
             curi.getVia() + ". Address " +
                 ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ?
                     "was never looked up." :
                     (System.currentTimeMillis() - h.getIpFetched()) + " ms ago."));
      }
      return h.getIP().getHostAddress();
  }
}
TOP

Related Classes of org.archive.modules.writer.Kw3WriterProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.