866 lines
34 KiB
Java
866 lines
34 KiB
Java
// Blacklist.java
|
|
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 11.07.2005 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
package net.yacy.repository;
|
|
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.FileOutputStream;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.ObjectInputStream;
|
|
import java.io.ObjectOutputStream;
|
|
import java.io.PrintWriter;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.Map;
|
|
import java.util.Map.Entry;
|
|
import java.util.Set;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
import java.util.concurrent.ConcurrentMap;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.PatternSyntaxException;
|
|
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
|
import net.yacy.cora.document.id.Punycode;
|
|
import net.yacy.cora.document.id.Punycode.PunycodeException;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.data.ListManager;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
import net.yacy.kelondro.util.SetTools;
|
|
import net.yacy.search.Switchboard;
|
|
import net.yacy.search.SwitchboardConstants;
|
|
|
|
public class Blacklist {
|
|
|
|
/** Logger shared by all instances of this class; it is named after the simple class name. */
private final static ConcurrentLog log = new ConcurrentLog(Blacklist.class.getSimpleName());
|
|
|
|
/**
 * The kinds of blacklist handled by this class. Each kind owns its own
 * host/path maps and its own URL hash cache.
 */
public enum BlacklistType {
    DHT,
    CRAWLER,
    PROXY,
    SEARCH,
    SURFTIPS,
    NEWS;

    /**
     * @return the constant name converted to lower case, independently of the default locale
     */
    @Override
    public final String toString () {
        final String constantName = name();
        return constantName.toLowerCase(Locale.ROOT);
    }
}
|
|
|
|
/** Regular expression selecting blacklist files by name: any name ending with ".black". */
public static final String BLACKLIST_FILENAME_FILTER = "^.*\\.black$";
|
|
|
|
public static enum BlacklistError {
|
|
|
|
NO_ERROR(0),
|
|
TWO_WILDCARDS_IN_HOST(1),
|
|
SUBDOMAIN_XOR_WILDCARD(2),
|
|
PATH_REGEX(3),
|
|
WILDCARD_BEGIN_OR_END(4),
|
|
HOST_WRONG_CHARS(5),
|
|
DOUBLE_OCCURANCE(6),
|
|
HOST_REGEX(7);
|
|
final int errorCode;
|
|
|
|
BlacklistError(final int errorCode) {
|
|
this.errorCode = errorCode;
|
|
}
|
|
|
|
public int getInt() {
|
|
return this.errorCode;
|
|
}
|
|
|
|
public long getLong() {
|
|
return this.errorCode;
|
|
}
|
|
}
|
|
|
|
/** Directory holding the blacklist files; assigned by setRootPath(), which rejects null. */
private File blacklistRootPath = null;
|
|
/** Per blacklist type: hashes of URLs that isListed() already found to be blacklisted. */
private final ConcurrentMap<BlacklistType, HandleSet> cachedUrlHashs;
|
|
/** Per blacklist type: path patterns of the hosts accepted by isMatchable() (plain or single-wildcard hosts). */
private final ConcurrentMap<BlacklistType, Map<String, Set<Pattern>>> hostpaths_matchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
|
|
/** Per blacklist type: path patterns of the hosts rejected by isMatchable(); isListed() evaluates these host keys as regular expressions. */
private final ConcurrentMap<BlacklistType, Map<String, Set<Pattern>>> hostpaths_notmatchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
|
|
|
|
public Blacklist(final File rootPath) {
|
|
|
|
setRootPath(rootPath);
|
|
|
|
// prepare the data structure
|
|
this.hostpaths_matchable = new ConcurrentHashMap<BlacklistType, Map<String, Set<Pattern>>>();
|
|
this.hostpaths_notmatchable = new ConcurrentHashMap<BlacklistType, Map<String, Set<Pattern>>>();
|
|
this.cachedUrlHashs = new ConcurrentHashMap<BlacklistType, HandleSet>();
|
|
|
|
for (final BlacklistType blacklistType : BlacklistType.values()) {
|
|
this.hostpaths_matchable.put(blacklistType, new ConcurrentHashMap<String, Set<Pattern>>());
|
|
this.hostpaths_notmatchable.put(blacklistType, new ConcurrentHashMap<String, Set<Pattern>>());
|
|
loadDHTCache(blacklistType);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Close (shutdown) this "sub-system", add more here for shutdown.
|
|
*/
|
|
public final synchronized void close() {
|
|
log.fine("Shutting down blacklists ...");
|
|
|
|
// Save cache
|
|
for (final BlacklistType blacklistType : BlacklistType.values()) {
|
|
saveDHTCache(blacklistType);
|
|
}
|
|
|
|
log.fine("All blacklists has been shutdown.");
|
|
}
|
|
|
|
private final void setRootPath(final File rootPath) {
|
|
if (rootPath == null) {
|
|
throw new NullPointerException("The blacklist root path must not be null.");
|
|
}
|
|
if (!rootPath.isDirectory()) {
|
|
throw new IllegalArgumentException("The blacklist root path is not a directory.");
|
|
}
|
|
if (!rootPath.canRead()) {
|
|
throw new IllegalArgumentException("The blacklist root path is not readable.");
|
|
}
|
|
|
|
this.blacklistRootPath = rootPath;
|
|
}
|
|
|
|
protected final Map<String, Set<Pattern>> getBlacklistMap(final BlacklistType blacklistType, final boolean matchable) {
|
|
return (matchable) ? this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType);
|
|
}
|
|
|
|
protected final HandleSet getCacheUrlHashsSet(final BlacklistType blacklistType) {
|
|
return this.cachedUrlHashs.get(blacklistType);
|
|
}
|
|
|
|
public final File getRootPath() {
|
|
return blacklistRootPath;
|
|
}
|
|
|
|
public final void clear() {
|
|
for (final Map<String, Set<Pattern>> entry : this.hostpaths_matchable.values()) {
|
|
entry.clear();
|
|
}
|
|
for (final Map<String, Set<Pattern>> entry : this.hostpaths_notmatchable.values()) {
|
|
entry.clear();
|
|
}
|
|
for (final HandleSet entry : this.cachedUrlHashs.values()) {
|
|
entry.clear();
|
|
}
|
|
}
|
|
|
|
public final int size() {
|
|
int size = 0;
|
|
for (final BlacklistType entry : this.hostpaths_matchable.keySet()) {
|
|
for (final Set<Pattern> ientry : this.hostpaths_matchable.get(entry).values()) {
|
|
size += ientry.size();
|
|
}
|
|
}
|
|
for (final BlacklistType entry : this.hostpaths_notmatchable.keySet()) {
|
|
for (final Set<Pattern> ientry : this.hostpaths_notmatchable.get(entry).values()) {
|
|
size += ientry.size();
|
|
}
|
|
}
|
|
return size;
|
|
}
|
|
|
|
public final void loadList(final BlacklistFile[] blFiles, final String sep) {
|
|
for (final BlacklistFile blf : blFiles) {
|
|
loadList(blf, sep);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* create a blacklist from file, entries separated by 'sep'
|
|
* duplicate entries are removed
|
|
* @param blFile
|
|
* @param sep
|
|
*/
|
|
private void loadList(final BlacklistFile blFile, final String sep) {
|
|
|
|
final Map<String, Set<Pattern>> blacklistMapMatch = getBlacklistMap(blFile.getType(), true);
|
|
final Map<String, Set<Pattern>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(), false);
|
|
Set<Map.Entry<String, List<String>>> loadedBlacklist;
|
|
Map.Entry<String, List<String>> loadedEntry;
|
|
Set<Pattern> paths;
|
|
List<String> loadedPaths;
|
|
Set<Pattern> loadedPathsPattern;
|
|
|
|
final Set<String> fileNames = blFile.getFileNamesUnified();
|
|
for (final String fileName : fileNames) {
|
|
// make sure all requested blacklist files exist
|
|
final File file = new File(this.blacklistRootPath, fileName);
|
|
try {
|
|
file.createNewFile();
|
|
} catch (final IOException e) { /* */ }
|
|
|
|
// join all blacklists from files into one internal blacklist map
|
|
loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet();
|
|
for (final Iterator<Map.Entry<String, List<String>>> mi = loadedBlacklist.iterator(); mi.hasNext();) {
|
|
loadedEntry = mi.next();
|
|
loadedPaths = loadedEntry.getValue();
|
|
loadedPathsPattern = new HashSet<Pattern>();
|
|
for (String a: loadedPaths) {
|
|
if (a.equals("*")) {
|
|
loadedPathsPattern.add(Pattern.compile(".*", Pattern.CASE_INSENSITIVE));
|
|
continue;
|
|
}
|
|
if (a.indexOf("?*", 0) > 0) {
|
|
// prevent "Dangling meta character '*'" exception
|
|
log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a);
|
|
continue;
|
|
}
|
|
/* We ensure now that any necessary percent-encoding is applied, as the blacklist file may have been manually edited.
|
|
* (when using the web interface, encoding should already have been applied in the add() function) */
|
|
final String normalizedPattern = MultiProtocolURL.escapePathPattern(a);
|
|
loadedPathsPattern.add(Pattern.compile(normalizedPattern, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
|
|
}
|
|
|
|
// create new entry if host mask unknown, otherwise merge
|
|
// existing one with path patterns from blacklist file
|
|
paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey());
|
|
if (paths == null) {
|
|
if (isMatchable(loadedEntry.getKey())) {
|
|
blacklistMapMatch.put(loadedEntry.getKey(), loadedPathsPattern);
|
|
} else {
|
|
blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPathsPattern);
|
|
}
|
|
} else {
|
|
paths.addAll(new HashSet<Pattern>(loadedPathsPattern));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public final void loadList(final BlacklistType blacklistType, final String fileNames, final String sep) {
|
|
// method for not breaking older plasmaURLPattern interface
|
|
final BlacklistFile blFile = new BlacklistFile(fileNames, blacklistType);
|
|
loadList(blFile, sep);
|
|
}
|
|
|
|
/**
|
|
* remove the host/path from internal blacklist maps for given blacklistType
|
|
* !! and removes the entry from source blacklist file !!
|
|
* @param blacklistType
|
|
* @param blacklistToUse
|
|
* @param host
|
|
* @param path
|
|
*/
|
|
public final void remove(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) {
|
|
|
|
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, true);
|
|
removePatternFromMap(host, path, blacklistMap);
|
|
|
|
final Map<String, Set<Pattern>> blacklistMapNotMatch = getBlacklistMap(blacklistType, false);
|
|
removePatternFromMap(host, path, blacklistMapNotMatch);
|
|
|
|
//TODO: check if delete from blacklist is desired, on reload entry will not be available in any blacklist
|
|
// even if remove (above) from internal maps (at runtime) is only done for given blacklistType
|
|
// load blacklist data from file
|
|
final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse));
|
|
|
|
/* delete the old entry from file, in any normalized or not normalized possible combinations */
|
|
final Set<String> entriesToDelete = new HashSet<>();
|
|
final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(path);
|
|
entriesToDelete.add(host + "/" + path);
|
|
entriesToDelete.add(host + "/" + normalizedPathPattern);
|
|
if (!Punycode.isBasic(host)) {
|
|
try {
|
|
final String normalizedHost = MultiProtocolURL.toPunycode(host);
|
|
entriesToDelete.add(normalizedHost + "/" + path);
|
|
entriesToDelete.add(normalizedHost + "/" + normalizedPathPattern);
|
|
} catch (final PunycodeException ignored) {
|
|
/* We continue even if a punycode flavor can not be produced */
|
|
}
|
|
}
|
|
if (list != null) {
|
|
for (final String e : list) {
|
|
if (entriesToDelete.contains(e)) {
|
|
list.remove(e);
|
|
break;
|
|
}
|
|
}
|
|
FileUtils.writeList(new File(ListManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove the (host, pathPattern) entries eventually found in the given
|
|
* blacklist map.
|
|
*
|
|
* @param host the host part of the entry to remove
|
|
* @param pathPattern the path pattern part of the entry to remove
|
|
* @param blacklistMap a blacklist map to update
|
|
*/
|
|
private void removePatternFromMap(final String host, final String pathPattern,
|
|
final Map<String, Set<Pattern>> blacklistMap) {
|
|
final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(pathPattern);
|
|
final Set<String> hosts = new HashSet<>();
|
|
hosts.add(host);
|
|
if (!Punycode.isBasic(host)) {
|
|
try {
|
|
hosts.add(MultiProtocolURL.toPunycode(host));
|
|
} catch (final PunycodeException ignored) {
|
|
/* We continue even if a punycode flavor can not be produced */
|
|
}
|
|
}
|
|
for (final String hostKey : hosts) {
|
|
final Set<Pattern> hostList = blacklistMap.get(hostKey);
|
|
if (hostList != null) {
|
|
// remove pattern from list (by comparing patternstring with path, remove(path)
|
|
// will not match path)
|
|
for (Pattern hp : hostList) {
|
|
String hpxs = hp.pattern();
|
|
if (hpxs.equals(pathPattern) || hpxs.equals(normalizedPathPattern)) {
|
|
hostList.remove(hp);
|
|
break;
|
|
}
|
|
}
|
|
if (hostList.isEmpty()) {
|
|
blacklistMap.remove(host);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Adds entries to a given blacklist internal data and updates the source
|
|
* file
|
|
*
|
|
* @param blacklistType
|
|
* @param blacklistToUse
|
|
* source file
|
|
* @param items
|
|
* blacklist host/path items to add
|
|
* @throws PunycodeException when a entry domain name could not be Punycode encoded
|
|
* @throws PatternSyntaxException when an entry regular expression is not valid
|
|
*/
|
|
public final void add(final BlacklistType blacklistType, final String blacklistToUse,
|
|
final Collection<BlacklistHostAndPath> items) throws PunycodeException, PatternSyntaxException {
|
|
|
|
if (items != null) {
|
|
PrintWriter pw = null;
|
|
try {
|
|
/* Get the content of the blacklist file in memory */
|
|
final Set<String> blacklist = new HashSet<String>(
|
|
FileUtils.getListArray(new File(this.blacklistRootPath, blacklistToUse)));
|
|
/* Open a writer on the file */
|
|
pw = new PrintWriter(new FileWriter(new File(this.blacklistRootPath, blacklistToUse), true));
|
|
|
|
for (BlacklistHostAndPath itemToAdd : items) {
|
|
final String host = itemToAdd.getHost();
|
|
final String path = itemToAdd.getPath();
|
|
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
|
|
final String safePath = MultiProtocolURL.escapePathPattern(path);
|
|
|
|
if (contains(blacklistType, safeHost, safePath)) {
|
|
/* Continue to the next item */
|
|
continue;
|
|
}
|
|
if (safeHost == null) {
|
|
log.warn("host must not be null");
|
|
/* Continue to the next item */
|
|
continue;
|
|
}
|
|
if (path == null) {
|
|
log.warn("path must not be null");
|
|
/* Continue to the next item */
|
|
continue;
|
|
}
|
|
|
|
String p = (!safePath.isEmpty() && safePath.charAt(0) == '/') ? safePath.substring(1) : safePath;
|
|
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
|
|
|
// avoid PatternSyntaxException e
|
|
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
|
|
? "." + safeHost : safeHost).toLowerCase(Locale.ROOT);
|
|
if (!p.isEmpty() && p.charAt(0) == '*') {
|
|
p = "." + p;
|
|
}
|
|
|
|
Set<Pattern> hostList;
|
|
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
|
|
blacklistMap.put(h, (hostList = new HashSet<>()));
|
|
}
|
|
|
|
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
|
|
|
|
hostList.add(pattern);
|
|
|
|
// Append the line to the file.
|
|
final String newEntry = h + "/" + pattern;
|
|
if (!blacklist.contains(newEntry)) {
|
|
pw.println(newEntry);
|
|
blacklist.add(newEntry);
|
|
}
|
|
|
|
}
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
if (pw != null) {
|
|
pw.close();
|
|
if (pw.checkError()) {
|
|
log.warn("could not close stream to " + blacklistToUse + "! ");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Adds entry to a given blacklist internal data and updates the source file
|
|
* @param blacklistType
|
|
* @param blacklistToUse source file
|
|
* @param host
|
|
* @param path
|
|
* @throws PunycodeException when a entry domain name could not be Punycode encoded
|
|
* @throws PatternSyntaxException when an entry regular expression is not valid
|
|
*/
|
|
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host,
|
|
final String path) throws PunycodeException, PatternSyntaxException {
|
|
final Collection<BlacklistHostAndPath> oneItemList = new ArrayList<>();
|
|
oneItemList.add(new BlacklistHostAndPath(host, path));
|
|
this.add(blacklistType, blacklistToUse, oneItemList);
|
|
}
|
|
|
|
/**
|
|
* appends aN entry to the backlist source file and updates internal blacklist maps.
|
|
*
|
|
* @param blacklistSourcefile name of the blacklist file (LISTS/*.black)
|
|
* @param host host or host pattern
|
|
* @param path path or path pattern
|
|
* @throws PunycodeException
|
|
*/
|
|
public final void add (final String blacklistSourcefile, final String host, final String path) throws PunycodeException {
|
|
// TODO: check sourcefile synced with cache.ser files ?
|
|
if (host == null) {
|
|
throw new IllegalArgumentException("host may not be null");
|
|
}
|
|
if (path == null) {
|
|
throw new IllegalArgumentException("path may not be null");
|
|
}
|
|
|
|
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
|
p = MultiProtocolURL.escapePathPattern(p);
|
|
|
|
// avoid PatternSyntaxException e
|
|
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
|
|
|
|
h = Punycode.isBasic(h) ? h : MultiProtocolURL.toPunycode(h);
|
|
|
|
if (!p.isEmpty() && p.charAt(0) == '*') {
|
|
p = "." + p;
|
|
}
|
|
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
|
|
|
|
// update (put) pattern to internal blacklist maps (for which source is active)
|
|
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
|
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklistSourcefile)) {
|
|
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(supportedBlacklistType, isMatchable(host));
|
|
Set<Pattern> hostList;
|
|
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
|
|
blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
|
|
}
|
|
hostList.add(pattern);
|
|
}
|
|
}
|
|
|
|
// Append the line to the file.
|
|
PrintWriter pw = null;
|
|
try {
|
|
final String newEntry = h + "/" + pattern;
|
|
if (!blacklistFileContains(blacklistRootPath, blacklistSourcefile, newEntry)) {
|
|
pw = new PrintWriter(new FileWriter(new File(blacklistRootPath, blacklistSourcefile), true));
|
|
pw.println(newEntry);
|
|
pw.close();
|
|
}
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
if (pw != null) {
|
|
try {
|
|
pw.close();
|
|
} catch (final Exception e) {
|
|
log.warn("could not close stream to "
|
|
+ blacklistSourcefile + "! " + e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public final int blacklistCacheSize() {
|
|
int size = 0;
|
|
final Iterator<BlacklistType> iter = this.cachedUrlHashs.keySet().iterator();
|
|
while (iter.hasNext()) {
|
|
size += this.cachedUrlHashs.get(iter.next()).size();
|
|
}
|
|
return size;
|
|
}
|
|
|
|
public final void clearblacklistCache() {
|
|
final Iterator<BlacklistType> iter = this.cachedUrlHashs.keySet().iterator();
|
|
while (iter.hasNext()) {
|
|
this.cachedUrlHashs.get(iter.next()).clear();
|
|
}
|
|
}
|
|
|
|
public final boolean hashInBlacklistedCache(final BlacklistType blacklistType, final byte[] urlHash) {
|
|
HandleSet s = getCacheUrlHashsSet(blacklistType);
|
|
return s != null && s.has(urlHash);
|
|
}
|
|
|
|
/**
|
|
* Check blacklist to contain given host & path pattern.
|
|
* To check if a url matches a blacklist pattern, use isListed()
|
|
* @param blacklistType
|
|
* @param host
|
|
* @param path
|
|
* @return
|
|
*/
|
|
public final boolean contains(final BlacklistType blacklistType, final String host, final String path) {
|
|
boolean ret = false;
|
|
|
|
if (blacklistType != null && host != null && path != null) {
|
|
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
|
|
|
// avoid PatternSyntaxException e
|
|
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
|
|
|
|
final Set<Pattern> hostList = blacklistMap.get(h);
|
|
if (hostList != null) {
|
|
for (Pattern hp : hostList) {
|
|
String hpxs = hp.pattern();
|
|
if (hpxs.equals(path)) {
|
|
ret = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* Checks whether the given entry is listed in given blacklist type.
|
|
* @param blacklistType The used blacklist
|
|
* @param url Entry to be checked
|
|
* @return Whether the given entry is blacklisted
|
|
*/
|
|
public final boolean isListed(final BlacklistType blacklistType, final DigestURL url) {
|
|
if (url == null) {
|
|
throw new IllegalArgumentException("url may not be null");
|
|
}
|
|
|
|
if (url.getHost() == null) {
|
|
return false;
|
|
}
|
|
HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
|
|
if (urlHashCache == null) {
|
|
urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
|
|
if (isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile())) {
|
|
try {
|
|
urlHashCache.put(url.hash());
|
|
} catch (final SpaceExceededException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
this.cachedUrlHashs.put(blacklistType, urlHashCache);
|
|
}
|
|
}
|
|
if (!urlHashCache.has(url.hash())) {
|
|
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile());
|
|
if (temp) {
|
|
try {
|
|
urlHashCache.put(url.hash());
|
|
} catch (final SpaceExceededException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
return temp;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private static final Pattern m1 = Pattern.compile("^[a-z0-9.-]*$"); // simple Domain (yacy.net or www.yacy.net)
|
|
private static final Pattern m2 = Pattern.compile("^\\*\\.[a-z0-9-.]*$"); // start with *. (not .* and * must follow a dot)
|
|
private static final Pattern m3 = Pattern.compile("^[a-z0-9-.]*\\.\\*$"); // ends with .* (not *. and before * must be a dot)
|
|
public static boolean isMatchable(final String host) {
|
|
return (m1.matcher(host).matches() || m2.matcher(host).matches() || m3.matcher(host).matches());
|
|
}
|
|
|
|
public static String getEngineInfo() {
|
|
return "Default YaCy Blacklist Engine";
|
|
}
|
|
|
|
/**
|
|
* Check if the URL made of the specified host and path is blacklisted. All parameters must not be null.
|
|
* @param blacklistType type of blacklist (DHT, CRAWLER ...)
|
|
* @param hostlow host part
|
|
* @param path path on the host
|
|
* @return true when host/path is blacklisted
|
|
*/
|
|
public final boolean isListed(final BlacklistType blacklistType, final String hostlow, final String path) {
|
|
if (hostlow == null) {
|
|
throw new IllegalArgumentException("hostlow may not be null");
|
|
}
|
|
if (path == null) {
|
|
throw new IllegalArgumentException("path may not be null");
|
|
}
|
|
|
|
// getting the proper blacklists
|
|
final Map<String, Set<Pattern>> blacklistMapMatched = getBlacklistMap(blacklistType, true);
|
|
|
|
final Map<String, Set<Pattern>> blacklistMapNotMatched = getBlacklistMap(blacklistType, false);
|
|
|
|
return Blacklist.isListed(hostlow, path, blacklistMapMatched, blacklistMapNotMatched);
|
|
}
|
|
|
|
/**
|
|
* Check if the URL made of the specified host and path is blacklisted. All parameters must not be null.
|
|
* @param hostlow host part
|
|
* @param path path on the host
|
|
* @param blacklistMapMatched blacklist patterns indexed by matched hosts
|
|
* @param blacklistMapNotMatched blacklist patterns indexed by not matched hosts
|
|
* @return true when host/path is blacklisted
|
|
*/
|
|
protected final static boolean isListed(final String hostlow, final String path,
|
|
final Map<String, Set<Pattern>> blacklistMapMatched,
|
|
final Map<String, Set<Pattern>> blacklistMapNotMatched) {
|
|
long beginTime = 0;
|
|
if(log.isFine()) {
|
|
beginTime = System.nanoTime();
|
|
}
|
|
final String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
|
|
|
Pattern[] app;
|
|
boolean matched = false;
|
|
Pattern pp; // path-pattern
|
|
|
|
// try to match complete domain
|
|
if (!matched && blacklistMapMatched.get(hostlow) != null) {
|
|
app = blacklistMapMatched.get(hostlow).toArray(new Pattern[0]);
|
|
for (int i = app.length - 1; !matched && i > -1; i--) {
|
|
pp = app[i];
|
|
matched |= pp.matcher(p).matches();
|
|
}
|
|
}
|
|
// first try to match the domain with wildcard '*'
|
|
// [TL] While "." are found within the string
|
|
int index = 0;
|
|
while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
|
|
if (blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*") != null) {
|
|
app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*").toArray(new Pattern[0]);
|
|
for (int i = app.length - 1; !matched && i > -1; i--) {
|
|
pp = app[i];
|
|
matched |= pp.matcher(p).matches();
|
|
}
|
|
}
|
|
if (blacklistMapMatched.get(hostlow.substring(0, index)) != null) {
|
|
app = blacklistMapMatched.get(hostlow.substring(0, index)).toArray(new Pattern[0]);
|
|
for (int i = app.length - 1; !matched && i > -1; i--) {
|
|
pp = app[i];
|
|
matched |= pp.matcher(p).matches();
|
|
}
|
|
}
|
|
}
|
|
index = hostlow.length();
|
|
while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
|
|
if (blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length())) != null) {
|
|
app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length())).toArray(new Pattern[0]);
|
|
for (int i = app.length - 1; !matched && i > -1; i--) {
|
|
pp = app[i];
|
|
matched |= pp.matcher(p).matches();
|
|
}
|
|
}
|
|
if (blacklistMapMatched.get(hostlow.substring(index + 1, hostlow.length())) != null) {
|
|
app = blacklistMapMatched.get(hostlow.substring(index + 1, hostlow.length())).toArray(new Pattern[0]);
|
|
for (int i = app.length - 1; !matched && i > -1; i--) {
|
|
pp = app[i];
|
|
matched |= pp.matcher(p).matches();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// loop over all Regex-entries
|
|
if (!matched) {
|
|
String key;
|
|
for (final Entry<String, Set<Pattern>> entry : blacklistMapNotMatched.entrySet()) {
|
|
key = entry.getKey();
|
|
try {
|
|
if (Pattern.matches(key, hostlow)) {
|
|
app = entry.getValue().toArray(new Pattern[0]);
|
|
for (final Pattern ap : app) {
|
|
if (ap.matcher(p).matches()) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
} catch (final PatternSyntaxException e) {
|
|
//System.out.println(e.toString());
|
|
}
|
|
}
|
|
}
|
|
if(log.isFine()) {
|
|
/* Trace URLs spending too much CPU time : set Blacklist.level = FINE in yacy.logging file */
|
|
long timeInSeconds = (System.nanoTime() - beginTime) / 1000000000;
|
|
if(timeInSeconds > 10) {
|
|
log.fine("Long processing : " + timeInSeconds + " seconds. URL : " + hostlow + path);
|
|
}
|
|
}
|
|
return matched;
|
|
}
|
|
|
|
public static BlacklistError checkError(final String element, final Map<String, String> properties) {
|
|
|
|
final boolean allowRegex = (properties != null) && properties.get("allowRegex").equalsIgnoreCase("true");
|
|
int slashPos;
|
|
final String host, path;
|
|
|
|
if ((slashPos = element.indexOf('/')) == -1) {
|
|
host = element;
|
|
path = ".*";
|
|
} else {
|
|
host = element.substring(0, slashPos);
|
|
path = element.substring(slashPos + 1);
|
|
}
|
|
|
|
if (!allowRegex || !RegexHelper.isValidRegex(host)) {
|
|
final int i = host.indexOf('*');
|
|
|
|
// check whether host begins illegally
|
|
if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
|
|
if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
|
|
return BlacklistError.SUBDOMAIN_XOR_WILDCARD;
|
|
}
|
|
return BlacklistError.HOST_WRONG_CHARS;
|
|
}
|
|
|
|
// in host-part only full sub-domains may be wildcards
|
|
if (!host.isEmpty() && i > -1) {
|
|
if (!(i == 0 || i == host.length() - 1)) {
|
|
return BlacklistError.WILDCARD_BEGIN_OR_END;
|
|
}
|
|
|
|
if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
|
|
return BlacklistError.SUBDOMAIN_XOR_WILDCARD;
|
|
}
|
|
}
|
|
|
|
// check for double-occurrences of "*" in host
|
|
if (host.indexOf("*", i + 1) > -1) {
|
|
return BlacklistError.TWO_WILDCARDS_IN_HOST;
|
|
}
|
|
} else if (allowRegex && !RegexHelper.isValidRegex(host)) {
|
|
return BlacklistError.HOST_REGEX;
|
|
}
|
|
|
|
// check for errors on regex-compiling path
|
|
if (!RegexHelper.isValidRegex(path) && !"*".equals(path)) {
|
|
return BlacklistError.PATH_REGEX;
|
|
}
|
|
|
|
return BlacklistError.NO_ERROR;
|
|
}
|
|
|
|
public static String defaultBlacklist(final File listsPath) {
|
|
final List<String> dirlist = FileUtils.getDirListing(listsPath, Blacklist.BLACKLIST_FILENAME_FILTER);
|
|
if (dirlist.isEmpty()) {
|
|
return null;
|
|
}
|
|
return dirlist.get(0);
|
|
}
|
|
|
|
/**
|
|
* Checks if a blacklist file contains a certain entry.
|
|
* @param blacklistToUse The blacklist.
|
|
* @param newEntry The Entry.
|
|
* @return True if file contains entry, else false.
|
|
*/
|
|
public static boolean blacklistFileContains(final File listsPath, final String blacklistToUse, final String newEntry) {
|
|
final Set<String> blacklist = new HashSet<String>(FileUtils.getListArray(new File(listsPath, blacklistToUse)));
|
|
return blacklist != null && blacklist.contains(newEntry);
|
|
}
|
|
|
|
private static File DHTCacheFile(final BlacklistType type) {
|
|
final String BLACKLIST_DHT_CACHEFILE_NAME = SwitchboardConstants.LISTS_PATH_DEFAULT + "/blacklist_" + type.name() + "_Cache.ser";
|
|
return new File(Switchboard.getSwitchboard().dataPath, BLACKLIST_DHT_CACHEFILE_NAME);
|
|
}
|
|
|
|
private final void saveDHTCache(final BlacklistType type) {
|
|
try (
|
|
/* Resources automatically closed by this try-with-resources statement */
|
|
final FileOutputStream fileOutStream =new FileOutputStream(DHTCacheFile(type));
|
|
final ObjectOutputStream out = new ObjectOutputStream(fileOutStream);
|
|
) {
|
|
HandleSet s = getCacheUrlHashsSet(type);
|
|
if (s != null) {
|
|
out.writeObject(getCacheUrlHashsSet(type));
|
|
}
|
|
} catch (final IOException e) {
|
|
/* Catch but trace in log any IO exception occurring in write or automatic closing */
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
|
|
private final void loadDHTCache(final BlacklistType type) {
|
|
File cachefile = DHTCacheFile(type);
|
|
if (cachefile.exists()) {
|
|
FileInputStream fileInStream = null;
|
|
ObjectInputStream in = null;
|
|
try {
|
|
fileInStream = new FileInputStream(cachefile);
|
|
in = new ObjectInputStream(fileInStream);
|
|
RowHandleSet rhs = (RowHandleSet) in.readObject();
|
|
this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0) : rhs);
|
|
return;
|
|
} catch (final Throwable e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
if(in != null) {
|
|
try {
|
|
in.close();
|
|
} catch(IOException ioe) {
|
|
log.warn("Could not close object input stream on file " + cachefile);
|
|
}
|
|
} else if(fileInStream != null){
|
|
/* An error may have been thrown while constructing the ObjectInputStream :
|
|
* by the way the file input stream still has to be closed properly */
|
|
try {
|
|
fileInStream.close();
|
|
} catch(IOException ioe) {
|
|
log.warn("Could not close input stream on file " + cachefile);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
this.cachedUrlHashs.put(type, new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0));
|
|
}
|
|
}
|