155 lines
5.8 KiB
Java
155 lines
5.8 KiB
Java
/*
|
|
* The MIT License (MIT)
|
|
* ------------------
|
|
*
|
|
* Copyright (c) 2012-2014 Philipp Nolte
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
* THE SOFTWARE.
|
|
*/
|
|
|
|
/*
|
|
* This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
|
|
* and inserted into the loklak class hierarchy to be enhanced and extended
|
|
* by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
|
|
* package. It shall be used to create custom search navigation filters.
|
|
* The original copyright notice was copied from the README.md
|
|
* from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
|
|
* The original package domain was de.daslaboratorium.machinelearning.classifier
|
|
*/
|
|
|
|
package net.yacy.cora.bayes;
|
|
|
|
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
|
|
|
|
/**
|
|
* A concrete implementation of the abstract Classifier class. The Bayes
|
|
* classifier implements a naive Bayes approach to classifying a given set of
|
|
* features: classify(feat1,...,featN) = argmax(P(cat)*PROD(P(featI|cat)))
|
|
*
|
|
* @author Philipp Nolte
|
|
*
|
|
* @see <a href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier">Naive Bayes classifier (Wikipedia)</a>
|
|
*
|
|
* @param <T> The feature class.
|
|
* @param <K> The category class.
|
|
*/
|
|
public class BayesClassifier<T, K> extends Classifier<T, K> {
|
|
|
|
/**
|
|
* Calculates the product of all feature probabilities: PROD(P(featI|cat)
|
|
*
|
|
* @param features The set of features to use.
|
|
* @param category The category to test for.
|
|
* @return The product of all feature probabilities.
|
|
*/
|
|
private float featuresProbabilityProduct(Collection<T> features,
|
|
K category) {
|
|
float product = 1.0f;
|
|
for (T feature : features)
|
|
product *= this.featureWeighedAverage(feature, category);
|
|
return product;
|
|
}
|
|
|
|
/**
|
|
* Calculates the probability that the features can be classified as the
|
|
* category given.
|
|
*
|
|
* @param features The set of features to use.
|
|
* @param category The category to test for.
|
|
* @return The probability that the features can be classified as the
|
|
* category.
|
|
*/
|
|
private float categoryProbability(Collection<T> features, K category) {
|
|
return ((float) this.categoryCount(category)
|
|
/ (float) this.getCategoriesTotal())
|
|
* featuresProbabilityProduct(features, category);
|
|
}
|
|
|
|
/**
|
|
* Retrieves a sorted <code>Set</code> of probabilities that the given set
|
|
* of features is classified as the available categories.
|
|
*
|
|
* @param features The set of features to use.
|
|
* @return A sorted <code>Set</code> of category-probability-entries.
|
|
*/
|
|
private SortedSet<Classification<T, K>> categoryProbabilities(
|
|
Collection<T> features) {
|
|
|
|
/*
|
|
* Sort the set according to the possibilities. Because we have to sort
|
|
* by the mapped value and not by the mapped key, we can not use a
|
|
* sorted tree (TreeMap) and we have to use a set-entry approach to
|
|
* achieve the desired functionality. A custom comparator is therefore
|
|
* needed.
|
|
*/
|
|
SortedSet<Classification<T, K>> probabilities =
|
|
new TreeSet<Classification<T, K>>(
|
|
new Comparator<Classification<T, K>>() {
|
|
|
|
@Override
|
|
public int compare(Classification<T, K> o1,
|
|
Classification<T, K> o2) {
|
|
int toReturn = Float.compare(
|
|
o1.getProbability(), o2.getProbability());
|
|
if ((toReturn == 0)
|
|
&& !o1.getCategory().equals(o2.getCategory()))
|
|
toReturn = -1;
|
|
return toReturn;
|
|
}
|
|
});
|
|
|
|
for (K category : this.getCategories())
|
|
probabilities.add(new Classification<T, K>(
|
|
features, category,
|
|
this.categoryProbability(features, category)));
|
|
return probabilities;
|
|
}
|
|
|
|
/**
|
|
* Classifies the given set of features.
|
|
*
|
|
* @return The category the set of features is classified as.
|
|
*/
|
|
@Override
|
|
public Classification<T, K> classify(Collection<T> features) {
|
|
SortedSet<Classification<T, K>> probabilites =
|
|
this.categoryProbabilities(features);
|
|
|
|
if (probabilites.size() > 0) {
|
|
return probabilites.last();
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Classifies the given set of features. and return the full details of the
|
|
* classification.
|
|
*
|
|
* @return The set of categories the set of features is classified as.
|
|
*/
|
|
public Collection<Classification<T, K>> classifyDetailed(
|
|
Collection<T> features) {
|
|
return this.categoryProbabilities(features);
|
|
}
|
|
|
|
}
|