Package org.archive.modules.recrawl

Source Code of org.archive.modules.recrawl.PersistProcessor

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.modules.recrawl;


import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.SerializationUtils;
import org.archive.bdb.BdbModule;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.OneLineSimpleLogger;
import org.archive.util.SURT;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.archive.util.iterator.LineReadingIterator;
import org.json.JSONObject;

import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredIterator;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentConfig;

/**
* Superclass for Processors which utilize BDB-JE for URI state
* (including most notably history) persistence.
*
* @author gojomo
*/
public abstract class PersistProcessor extends AbstractPersistProcessor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger logger =
        Logger.getLogger(PersistProcessor.class.getName());

    /** name of history Database */
    public static final String URI_HISTORY_DBNAME = "uri_history";
   
    public static final BdbModule.BdbConfig HISTORY_DB_CONFIG;
    static {
        BdbModule.BdbConfig dbConfig = new BdbModule.BdbConfig();
        dbConfig.setTransactional(false);
        dbConfig.setAllowCreate(true);
        dbConfig.setDeferredWrite(true);
        HISTORY_DB_CONFIG = dbConfig;
    }

    public PersistProcessor() {
    }
   
    /**
     * Return a preferred String key for persisting the given CrawlURI's
     * AList state.
     *
     * @param curi CrawlURI
     * @return String key
     */
    public static String persistKeyFor(CrawlURI curi) {
        // use a case-sensitive SURT for uniqueness and sorting benefits
        return persistKeyFor(curi.getUURI().toString());
    }

    public static String persistKeyFor(String uri) {
        // use a case-sensitive SURT for uniqueness and sorting benefits
        return SURT.fromURI(uri,true);
    }

    /**
     * Copies entries from an existing environment db to a new one. If
     * historyMap is not provided, only logs the entries that would have been
     * copied.
     *
     * @param sourceDir existing environment database directory
     * @param historyMap new environment db (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     */
    private static int copyPersistEnv(File sourceDir, StoredSortedMap<String,Map> historyMap)
    throws DatabaseException {
        int count = 0;

        // open the source env history DB, copying entries to target env
        EnhancedEnvironment sourceEnv = setupCopyEnvironment(sourceDir, true);
        StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
        DatabaseConfig historyDbConfig = HISTORY_DB_CONFIG.toDatabaseConfig();
        historyDbConfig.setReadOnly(true);
        Database sourceHistoryDB = sourceEnv.openDatabase(
                null, URI_HISTORY_DBNAME, historyDbConfig);
        StoredSortedMap<String,Map> sourceHistoryMap = new StoredSortedMap<String,Map>(sourceHistoryDB,
                new StringBinding(), new SerialBinding<Map>(sourceClassCatalog,
                        Map.class), true);

        Iterator<Entry<String,Map>> iter = sourceHistoryMap.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String,Map> item = iter.next();
            if (logger.isLoggable(Level.FINE)) {
                logger.fine(item.getKey() + " " + new JSONObject(item.getValue()));
            }
           
            if (historyMap != null) {
                historyMap.put(item.getKey(), item.getValue());
            }
            count++;
        }
        StoredIterator.close(iter);
        sourceHistoryDB.close();
        sourceEnv.close();
       
        return count;
    }

    /**
     * Populates an environment db from a persist log. If historyMap is
     * not provided, only logs the entries that would have been populated.
     *
     * @param persistLogReader
     *            persist log
     * @param historyMap
     *            new environment db (or null for a dry run)
     * @return number of records
     * @throws UnsupportedEncodingException
     * @throws DatabaseException
     */
    private static int populatePersistEnvFromLog(BufferedReader persistLogReader, StoredSortedMap<String,Map> historyMap)
    throws UnsupportedEncodingException, DatabaseException {
        int count = 0;

        Iterator<String> iter = new LineReadingIterator(persistLogReader);
        while (iter.hasNext()) {
            String line = iter.next();
            if (line.length() == 0) {
                continue;
            }
            String[] splits = line.split(" ");
            if (splits.length != 2) {
                logger.severe("bad line has " + splits.length + " fields (should be 2): " + line);
                continue;
            }

            Map alist;
            try {
                alist = (Map) SerializationUtils.deserialize(Base64.decodeBase64(splits[1].getBytes("UTF-8")));
            } catch (Exception e) {
                logger.severe("caught exception " + e + " deserializing line: " + line);
                continue;
            }

            if (logger.isLoggable(Level.FINE)) {
                logger.fine(splits[0] + " " + ArchiveUtils.prettyString(alist));
            }

            if (historyMap != null) try {
                historyMap.put(splits[0], alist);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "caught exception after loading " + count +
                        " urls from the persist log (perhaps crawl was stopped by user?)", e);
                IOUtils.closeQuietly(persistLogReader);

                // seems to finish most cleanly when we return rather than throw something
                return count;
            }

            count++;
        }
        IOUtils.closeQuietly(persistLogReader);
       
        return count;
    }

    /**
     * Populates a new environment db from an old environment db or a persist
     * log. If path to new environment is not provided, only logs the entries
     * that would have been populated.
     *
     * @param sourcePath
     *            source of old entries: can be a path to an existing
     *            environment db, or a URL or path to a persist log
     * @param envFile
     *            path to new environment db (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     * @throws IOException
     */
    public static int populatePersistEnv(String sourcePath, File envFile)
        throws IOException {
        int count = 0;
        StoredSortedMap<String,Map> historyMap = null;
        EnhancedEnvironment targetEnv = null;
        StoredClassCatalog classCatalog = null;
        Database historyDB = null;

        if (envFile != null) {
            // set up target environment
            FileUtils.ensureWriteableDirectory(envFile);
            targetEnv = setupCopyEnvironment(envFile);
            classCatalog = targetEnv.getClassCatalog();
            historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME,
                    HISTORY_DB_CONFIG.toDatabaseConfig());
            historyMap = new StoredSortedMap<String,Map>(historyDB,
                    new StringBinding(), new SerialBinding<Map>(classCatalog,
                        Map.class), true);
        }

        try {
            count = copyPersistSourceToHistoryMap(new File(sourcePath), historyMap);
        } finally {
            // in finally block so that we unlock the target env even if we
            // failed to populate it
            if (envFile != null) {
                logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);
                historyDB.sync();
                historyDB.close();
                targetEnv.close();
            } else {
                logger.info(count + " records found in " + sourcePath);
            }
        }

        return count;
    }

    /**
     * Populates a given StoredSortedMap (history map) from an old
     * environment db or a persist log. If a map is not provided, only
     * logs the entries that would have been populated.
     *
     * @param sourceFile
     *            source of old entries: can be a path to an existing
     *            environment db or persist log
     * @param historyMap
     *            map to populate (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     * @throws IOException
     */
    public static int copyPersistSourceToHistoryMap(File sourceFile,
            StoredSortedMap<String, Map> historyMap) throws DatabaseException,
            IOException {
        // delegate depending on the source
        if (sourceFile.isDirectory()) {
            return copyPersistEnv(sourceFile, historyMap);
        } else {
            BufferedReader persistLogReader = ArchiveUtils.getBufferedReader(sourceFile);
            return populatePersistEnvFromLog(persistLogReader, historyMap);
        }
    }

    /**
     * Populates a given StoredSortedMap (history map) from an old persist log.
     * If a map is not provided, only logs the entries that would have been
     * populated.
     *
     * @param sourceUrl
     *            url of source persist log
     * @param historyMap
     *            map to populate (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     * @throws IOException
     */
    public static int copyPersistSourceToHistoryMap(URL sourceUrl,
            StoredSortedMap<String, Map> historyMap) throws DatabaseException,
            IOException {
        BufferedReader persistLogReader = ArchiveUtils
                .getBufferedReader(sourceUrl);
        return populatePersistEnvFromLog(persistLogReader, historyMap);
    }
   
    /**
     * Utility main for importing a log into a BDB-JE environment or moving a
     * database between environments (2 arguments), or simply dumping a log
     * to stderr in a more readable format (1 argument).
     *
     * @param args command-line arguments
     * @throws DatabaseException
     * @throws IOException
     */
    public static void main(String[] args) throws DatabaseException, IOException {
        Handler handler = new ConsoleHandler();
        handler.setLevel(Level.ALL);
        handler.setFormatter(new OneLineSimpleLogger());
        logger.addHandler(handler);
        logger.setUseParentHandlers(false);

        if (args.length == 2) {
            logger.setLevel(Level.INFO);
            populatePersistEnv(args[0], new File(args[1]));
        } else if (args.length == 1) {
            logger.setLevel(Level.FINE);
            populatePersistEnv(args[0], null);
        } else {
            System.out.println("Arguments: ");
            System.out.println("    source [target]");
            System.out.println(
                "...where source is either a txtser log file or BDB env dir");
            System.out.println(
                "and target, if present, is a BDB env dir. ");
            return;
        }
    }

    public static EnhancedEnvironment setupCopyEnvironment(File env) throws DatabaseException {
        return setupCopyEnvironment(env, false);
    }
   
    public static EnhancedEnvironment setupCopyEnvironment(File env, boolean readOnly) throws DatabaseException {
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        envConfig.setReadOnly(readOnly);
        try {
            return new EnhancedEnvironment(env, envConfig);
        } catch (IllegalArgumentException iae) {
            throw new IllegalArgumentException("problem with specified environment "+env+"; is it already open?", iae);
        }
    }
}
TOP

Related Classes of org.archive.modules.recrawl.PersistProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.