Package net.yacy.repository

Source Code of net.yacy.repository.Blacklist

// Blacklist.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2011-07-04 01:55:55 +0200 (Mo, 04. Jul 2011) $
// $LastChangedRevision: 7827 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
package net.yacy.repository;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.SetTools;

public class Blacklist {

    // Names of the supported blacklist types. Each type gets its own host/path
    // maps and its own URL-hash cache when a Blacklist is constructed.
    public static final String BLACKLIST_DHT = "dht";
    public static final String BLACKLIST_CRAWLER = "crawler";
    public static final String BLACKLIST_PROXY = "proxy";
    public static final String BLACKLIST_SEARCH = "search";
    public static final String BLACKLIST_SURFTIPS = "surftips";
    public static final String BLACKLIST_NEWS = "news";
    // Regular expression for file names ending in ".black"; passed to
    // FileUtils.getDirListing by defaultBlacklist(File).
    public final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$";

    public static enum BlacklistError {

        NO_ERROR(0),
        TWO_WILDCARDS_IN_HOST(1),
        SUBDOMAIN_XOR_WILDCARD(2),
        PATH_REGEX(3),
        WILDCARD_BEGIN_OR_END(4),
        HOST_WRONG_CHARS(5),
        DOUBLE_OCCURANCE(6),
        HOST_REGEX(7);
        final int errorCode;

        BlacklistError(final int errorCode) {
            this.errorCode = errorCode;
        }

        public int getInt() {
            return errorCode;
        }

        public long getLong() {
            return (long) errorCode;
        }
    }
    // All known blacklist types; getBlacklistMap and getCacheUrlHashsSet reject
    // any type that is not contained in this set.
    protected static final Set<String> BLACKLIST_TYPES = new HashSet<String>(Arrays.asList(new String[]{
                Blacklist.BLACKLIST_CRAWLER,
                Blacklist.BLACKLIST_PROXY,
                Blacklist.BLACKLIST_DHT,
                Blacklist.BLACKLIST_SEARCH,
                Blacklist.BLACKLIST_SURFTIPS,
                Blacklist.BLACKLIST_NEWS
            }));
    // The same six type names as one comma-separated string.
    public static final String BLACKLIST_TYPES_STRING = "proxy,crawler,dht,search,surftips,news";
    // Directory that contains the blacklist files; assigned by setRootPath.
    private File blacklistRootPath = null;
    // blacklist type -> hashes of URLs that isListed(String, DigestURI) found to be listed
    private final Map<String, HandleSet> cachedUrlHashs;
    // blacklist type -> (host -> path patterns) for hosts accepted by isMatchable;
    // mapped url is http://host/path; path does not start with '/' here
    private final Map<String, Map<String, List<String>>> hostpaths_matchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
    // blacklist type -> (host -> path patterns) for hosts rejected by isMatchable;
    // isListed treats these host keys as regular expressions
    private final Map<String, Map<String, List<String>>> hostpaths_notmatchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here

    public Blacklist(final File rootPath) {

        this.setRootPath(rootPath);

        // prepare the data structure
        this.hostpaths_matchable = new HashMap<String, Map<String, List<String>>>();
        this.hostpaths_notmatchable = new HashMap<String, Map<String, List<String>>>();
        this.cachedUrlHashs = new HashMap<String, HandleSet>();

        for (final String blacklistType : BLACKLIST_TYPES) {
            this.hostpaths_matchable.put(blacklistType, new HashMap<String, List<String>>());
            this.hostpaths_notmatchable.put(blacklistType, new HashMap<String, List<String>>());
            this.cachedUrlHashs.put(blacklistType, new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0));
        }
    }

    public final void setRootPath(final File rootPath) {
        if (rootPath == null) {
            throw new NullPointerException("The blacklist root path must not be null.");
        }
        if (!rootPath.isDirectory()) {
            throw new IllegalArgumentException("The blacklist root path is not a directory.");
        }
        if (!rootPath.canRead()) {
            throw new IllegalArgumentException("The blacklist root path is not readable.");
        }

        this.blacklistRootPath = rootPath;
    }

    protected Map<String, List<String>> getBlacklistMap(final String blacklistType, final boolean matchable) {
        if (blacklistType == null) {
            throw new IllegalArgumentException("Blacklist type not set.");
        }
        if (!BLACKLIST_TYPES.contains(blacklistType)) {
            throw new IllegalArgumentException("Unknown blacklist type: " + blacklistType + ".");
        }

        return (matchable) ? this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType);
    }

    protected HandleSet getCacheUrlHashsSet(final String blacklistType) {
        if (blacklistType == null) {
            throw new IllegalArgumentException("Blacklist type not set.");
        }
        if (!BLACKLIST_TYPES.contains(blacklistType)) {
            throw new IllegalArgumentException("Unknown backlist type.");
        }

        return this.cachedUrlHashs.get(blacklistType);
    }

    public void clear() {
        for (final Map<String, List<String>> entry : this.hostpaths_matchable.values()) {
            entry.clear();
        }
        for (final Map<String, List<String>> entry : this.hostpaths_notmatchable.values()) {
            entry.clear();
        }
        for (final HandleSet entry : this.cachedUrlHashs.values()) {
            entry.clear();
        }
    }

    public int size() {
        int size = 0;
        for (final String entry : this.hostpaths_matchable.keySet()) {
            for (final List<String> ientry : this.hostpaths_matchable.get(entry).values()) {
                size += ientry.size();
            }
        }
        for (final String entry : this.hostpaths_notmatchable.keySet()) {
            for (final List<String> ientry : this.hostpaths_notmatchable.get(entry).values()) {
                size += ientry.size();
            }
        }
        return size;
    }

    public void loadList(final BlacklistFile[] blFiles, final String sep) {
        for (final BlacklistFile blf : blFiles) {
            loadList(blf.getType(), blf.getFileName(), sep);
        }
    }

    private void loadList(final BlacklistFile blFile, final String sep) {
        final Map<String, List<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(), true);
        final Map<String, List<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(), false);
        Set<Map.Entry<String, List<String>>> loadedBlacklist;
        Map.Entry<String, List<String>> loadedEntry;
        List<String> paths;
        List<String> loadedPaths;

        final Set<String> fileNames = blFile.getFileNamesUnified();
        for (final String fileName : fileNames) {
            // make sure all requested blacklist files exist
            final File file = new File(this.blacklistRootPath, fileName);
            try {
                file.createNewFile();
            } catch (final IOException e) { /* */ }

            // join all blacklists from files into one internal blacklist map
            loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet();
            for (final Iterator<Map.Entry<String, List<String>>> mi = loadedBlacklist.iterator(); mi.hasNext();) {
                loadedEntry = mi.next();
                loadedPaths = loadedEntry.getValue();

                // create new entry if host mask unknown, otherwise merge
                // existing one with path patterns from blacklist file
                paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey());
                if (paths == null) {
                    if (isMatchable(loadedEntry.getKey())) {
                        blacklistMapMatch.put(loadedEntry.getKey(), loadedPaths);
                    } else {
                        blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPaths);
                    }
                } else {
                    // TODO check for duplicates? (refactor List -> Set)
                    paths.addAll(loadedPaths);
                }
            }
        }
    }

    public void loadList(final String blacklistType, final String fileNames, final String sep) {
        // method for not breaking older plasmaURLPattern interface
        final BlacklistFile blFile = new BlacklistFile(fileNames, blacklistType);

        loadList(blFile, sep);
    }

    public void removeAll(final String blacklistType, final String host) {
        getBlacklistMap(blacklistType, true).remove(host);
        getBlacklistMap(blacklistType, false).remove(host);
    }

    public void remove(final String blacklistType, final String host, final String path) {

        final Map<String, List<String>> blacklistMap = getBlacklistMap(blacklistType, true);
        List<String> hostList = blacklistMap.get(host);
        if (hostList != null) {
            hostList.remove(path);
            if (hostList.isEmpty()) {
                blacklistMap.remove(host);
            }
        }
       
        final Map<String, List<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType, false);
        hostList = blacklistMapNotMatch.get(host);
        if (hostList != null) {
            hostList.remove(path);
            if (hostList.isEmpty()) {
                blacklistMapNotMatch.remove(host);
            }
        }
    }

    public void add(final String blacklistType, final String host, final String path) {
        if (host == null) {
            throw new IllegalArgumentException("host may not be null");
        }
        if (path == null) {
            throw new IllegalArgumentException("path may not be null");
        }

        final String p = (path.length() > 0 && path.charAt(0) == '/') ? path.substring(1) : path;

        final Map<String, List<String>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));

        // avoid PatternSyntaxException e
        final String h =
                ((!isMatchable(host) && host.length() > 0 && host.charAt(0) == '*') ? "." + host : host).toLowerCase();

        List<String> hostList;
        if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
            blacklistMap.put(h, (hostList = new ArrayList<String>()));
        }

        hostList.add(p);
    }

    public int blacklistCacheSize() {
        int size = 0;
        final Iterator<String> iter = this.cachedUrlHashs.keySet().iterator();
        while (iter.hasNext()) {
            size += this.cachedUrlHashs.get(iter.next()).size();
        }
        return size;
    }

    public boolean hashInBlacklistedCache(final String blacklistType, final byte[] urlHash) {
        return getCacheUrlHashsSet(blacklistType).has(urlHash);
    }

    public boolean contains(final String blacklistType, final String host, final String path) {
        boolean ret = false;

        if (blacklistType != null && host != null && path != null) {
            final Map<String, List<String>> blacklistMap =
                    getBlacklistMap(blacklistType, isMatchable(host));

            // avoid PatternSyntaxException e
            final String h =
                    ((!isMatchable(host) && host.length() > 0 && host.charAt(0) == '*') ? "." + host : host).toLowerCase();

            List<String> hostList = blacklistMap.get(h);
            if (hostList != null) {
                ret = hostList.contains(path);
            }
        }
        return ret;
    }

    public boolean isListed(final String blacklistType, final DigestURI url) {
        if (url == null) {
            throw new IllegalArgumentException("url may not be null");
        }

        if (url.getHost() == null) {
            return false;
        }
        final HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
        if (!urlHashCache.has(url.hash())) {
            final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
            if (temp) {
                try {
                    urlHashCache.put(url.hash());
                } catch (RowSpaceExceededException e) {
                    Log.logException(e);
                }
            }
            return temp;
        }
        return true;
    }

    public static boolean isMatchable(final String host) {

        return (
                (Pattern.matches("^[a-z0-9.-]*$", host))            // simple Domain (yacy.net or www.yacy.net)
                || (Pattern.matches("^\\*\\.[a-z0-9-.]*$", host))   // start with *. (not .* and * must follow a dot)
                || (Pattern.matches("^[a-z0-9-.]*\\.\\*$", host))   // ends with .* (not *. and before * must be a dot)
                );
    }

    public String getEngineInfo() {
        return "Default YaCy Blacklist Engine";
    }

    public boolean isListed(final String blacklistType, final String hostlow, final String path) {
        if (hostlow == null) {
            throw new IllegalArgumentException("hostlow may not be null");
        }
        if (path == null) {
            throw new IllegalArgumentException("path may not be null");
        }

        // getting the proper blacklist
        final Map<String, List<String>> blacklistMapMatched = getBlacklistMap(blacklistType, true);

        final String p = (path.length() > 0 && path.charAt(0) == '/') ? path.substring(1) : path;

        List<String> app;
        boolean matched = false;
        String pp = ""; // path-pattern

        // try to match complete domain
        if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
            for (int i = app.size() - 1; !matched && i > -1; i--) {
                pp = app.get(i);
                if (pp.indexOf("?*") > 0) {
                    // prevent "Dangling meta character '*'" exception
                    Log.logWarning("Blacklist", "ignored blacklist path to prevent 'Dangling meta character' exception: " + pp);
                    continue;
                }
                matched |= (("*".equals(pp)) || (p.matches(pp)));
            }
        }
        // first try to match the domain with wildcard '*'
        // [TL] While "." are found within the string
        int index = 0;
        while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
            if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) {
                for (int i = app.size() - 1; !matched && i > -1; i--) {
                    pp = app.get(i);
                    matched |= (("*".equals(pp)) || (p.matches(pp)));
                }
            }
            if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) {
                for (int i = app.size() - 1; !matched && i > -1; i--) {
                    pp = app.get(i);
                    matched |= (("*".equals(pp)) || (p.matches(pp)));
                }
            }
        }
        index = hostlow.length();
        while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
            if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
                for (int i = app.size() - 1; !matched && i > -1; i--) {
                    pp = app.get(i);
                    matched |= (("*".equals(pp)) || (p.matches(pp)));
                }
            }
            if ((app = blacklistMapMatched.get(hostlow.substring(index + 1, hostlow.length()))) != null) {
                for (int i = app.size() - 1; !matched && i > -1; i--) {
                    pp = app.get(i);
                    matched |= (("*".equals(pp)) || (p.matches(pp)));
                }
            }
        }


        // loop over all Regexentrys
        if (!matched) {
            final Map<String, List<String>> blacklistMapNotMatched = getBlacklistMap(blacklistType, false);
            String key;
            for (final Entry<String, List<String>> entry : blacklistMapNotMatched.entrySet()) {
                key = entry.getKey();
                try {
                    if (Pattern.matches(key, hostlow)) {
                        app = entry.getValue();
                        for (int i = 0; i < app.size(); i++) {
                            if (Pattern.matches(app.get(i), p)) {
                                return true;
                            }
                        }
                    }
                } catch (final PatternSyntaxException e) {
                    //System.out.println(e.toString());
                }
            }
        }
        return matched;
    }

    public BlacklistError checkError(final String element, final Map<String, String> properties) {

        final boolean allowRegex = (properties != null) && properties.get("allowRegex").equalsIgnoreCase("true");
        int slashPos;
        final String host, path;

        if ((slashPos = element.indexOf('/')) == -1) {
            host = element;
            path = ".*";
        } else {
            host = element.substring(0, slashPos);
            path = element.substring(slashPos + 1);
        }

        if (!allowRegex || !RegexHelper.isValidRegex(host)) {
            final int i = host.indexOf('*');

            // check whether host begins illegally
            if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
                if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
                    return BlacklistError.SUBDOMAIN_XOR_WILDCARD;
                }
                return BlacklistError.HOST_WRONG_CHARS;
            }

            // in host-part only full sub-domains may be wildcards
            if (host.length() > 0 && i > -1) {
                if (!(i == 0 || i == host.length() - 1)) {
                    return BlacklistError.WILDCARD_BEGIN_OR_END;
                }

                if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
                    return BlacklistError.SUBDOMAIN_XOR_WILDCARD;
                }
            }

            // check for double-occurences of "*" in host
            if (host.indexOf("*", i + 1) > -1) {
                return BlacklistError.TWO_WILDCARDS_IN_HOST;
            }
        } else if (allowRegex && !RegexHelper.isValidRegex(host)) {
            return BlacklistError.HOST_REGEX;
        }

        // check for errors on regex-compiling path
        if (!RegexHelper.isValidRegex(path) && !"*".equals(path)) {
            return BlacklistError.PATH_REGEX;
        }

        return BlacklistError.NO_ERROR;
    }

    public static String defaultBlacklist(final File listsPath) {
        final List<String> dirlist = FileUtils.getDirListing(listsPath, Blacklist.BLACKLIST_FILENAME_FILTER);
        if (dirlist.isEmpty()) {
            return null;
        }
        return dirlist.get(0);
    }

    /**
     * Checks if a blacklist file contains a certain entry.
     * @param blacklistToUse The blacklist.
     * @param newEntry The Entry.
     * @return True if file contains entry, else false.
     */
    public static boolean blacklistFileContains(final File listsPath, final String blacklistToUse, final String newEntry) {
        final Set<String> blacklist = new HashSet<String>(FileUtils.getListArray(new File(listsPath, blacklistToUse)));
        return blacklist != null && blacklist.contains(newEntry);
    }
}
TOP

Related Classes of net.yacy.repository.Blacklist

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.