Package org.archive.io

Source Code of org.archive.io.CrawlerJournal

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.io;

import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.mg4j.util.MutableString;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.TextUtils;

/**
* Utility class for a crawler journal/log that is compressed and
* rotates by serial number at checkpoints.
*
* @author gojomo
*/
public class CrawlerJournal implements Closeable {
    private static final Logger LOGGER = Logger.getLogger(
            CrawlerJournal.class.getName());
   
    /** prefix for error lines*/
    public static final String LOG_ERROR = "E ";
    /** prefix for timestamp lines */
    public static final String LOG_TIMESTAMP = "T ";

    /**
     * Stream on which we record frontier events.
     */
    protected Writer out = null;
   
    /** line count */
    protected long lines = 0;
    /** number of lines between timestamps */
    protected int timestamp_interval = 0; // 0 means no timestamps
   
    /**
     * File we're writing journal to.
     * Keep a reference in case we want to rotate it off.
     */
    protected File gzipFile = null;
   
    /**
     * Create a new crawler journal at the given location
     *
     * @param path Directory to make thejournal in.
     * @param filename Name to use for journal file.
     * @throws IOException
     */
    public CrawlerJournal(String path, String filename)
    throws IOException {
        this.gzipFile = new File(path, filename);
        this.out = initialize(gzipFile);
    }
   
    /**
     * Create a new crawler journal at the given location
     *
     * @param file path at which to make journal
     * @throws IOException
     */
    public CrawlerJournal(File file) throws IOException {
        this.gzipFile = file;
        this.out = initialize(gzipFile);
    }
   
    protected Writer initialize(final File f) throws FileNotFoundException, IOException {
        FileUtils.moveAsideIfExists(f);
        return new OutputStreamWriter(new GZIPOutputStream(
            new FastBufferedOutputStream(new FileOutputStream(f),32*1024)));
    }

    /**
     * Write a line
     *
     * @param string String
     */
    public synchronized void writeLine(String... strs) {
        try {
            for(String s : strs) {
                this.out.write(s);
            }
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(
                Level.SEVERE,
                "problem writing journal line: "+StringUtils.join(strs),
                e);
        }
    }

    /**
     * Write a line.
     *
     * @param mstring MutableString to write
     */
    public synchronized void writeLine(MutableString mstring) {
        if (this.out == null) {
            return;
        }
        try {
            mstring.write(out);
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE,"problem writing journal line: "+mstring, e);
        }
    }

    /**
     * Count and note a line
     *
     * @throws IOException
     */
    protected void noteLine() throws IOException {
        lines++;
        considerTimestamp();
    }

    /**
     * Write a timestamp line if appropriate
     *
     * @throws IOException
     */
    protected void considerTimestamp() throws IOException {
        if(timestamp_interval > 0 && lines % timestamp_interval == 0) {
            out.write(LOG_TIMESTAMP);
            out.write(ArchiveUtils.getLog14Date());
            out.write("\n");
        }
    }

    /**
     * Flush and close the underlying IO objects.
     */
    public void close() {
        if (this.out == null) {
            return;
        }
        try {
            this.out.flush();
            this.out.close();
            this.out = null;
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE,"problem closing journal", e);
        }
    }

    /**
     * Note a serious error vioa a special log line
     *
     * @param err
     */
    public synchronized void seriousError(String err) {
        writeLine(LOG_ERROR+ArchiveUtils.getLog14Date()+" "+err+"\n");
    }

    /**
     * Handle a checkpoint by rotating the current log to a checkpoint-named
     * file and starting a new log.
     *
     * @param checkpointDir
     * @throws IOException
     */
    public synchronized void rotateForCheckpoint(Checkpoint checkpointInProgress) {
        if (this.out == null || !this.gzipFile.exists()) {
            return;
        }
        close();

        File newName = new File(this.gzipFile.getParentFile(),
                this.gzipFile.getName() + "." + checkpointInProgress.getName());
        try {
            FileUtils.moveAsideIfExists(newName);
            if (checkpointInProgress.getForgetAllButLatest()) {
                // merge any earlier checkpointed files into new checkpoint
                // file, taking advantage of the legality of concatenating gzips
               
                File[] oldCheckpointeds = this.gzipFile.getParentFile().listFiles(new FilenameFilter() {
                    @Override
                    public boolean accept(File dir, String name) {
                        String regex = "^" + Pattern.quote(gzipFile.getName()) + "\\.cp\\d{5}-\\d{14}$";
                        return TextUtils.matches(regex, name);
                    }
                });
                Arrays.sort(oldCheckpointeds);
               
                for (int i = 1; i < oldCheckpointeds.length; i++) {
                    FileUtils.appendTo(oldCheckpointeds[0], oldCheckpointeds[i]);
                    oldCheckpointeds[i].delete();
                }
               
                if (oldCheckpointeds.length > 0) {
                    FileUtils.appendTo(oldCheckpointeds[0], this.gzipFile);
                    this.gzipFile.delete();
                    oldCheckpointeds[0].renameTo(newName);
                } else {
                    this.gzipFile.renameTo(newName);
                }
            } else {
                this.gzipFile.renameTo(newName);
            }
           
            // Open new gzip file.
            this.out = initialize(this.gzipFile);
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE,"Problem rotating recovery journal", ioe);
        }
    }
}
TOP

Related Classes of org.archive.io.CrawlerJournal

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.