435 lines
16 KiB
Java
435 lines
16 KiB
Java
/*
 * The MIT License (MIT)
 * ------------------
 *
 * Copyright (c) 2012-2014 Philipp Nolte
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
|
||
|
||
/*
 * This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
 * and inserted into the loklak class hierarchy to be enhanced and extended
 * by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
 * package. It shall be used to create custom search navigation filters.
 * The original copyright notice was copied from the README.md
 * from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
 * The original package domain was de.daslaboratorium.machinelearning.classifier
 */
|
||
|
||
package net.yacy.cora.bayes;
|
||
|
||
import java.util.Collection;
|
||
import java.util.LinkedList;
|
||
import java.util.Map;
|
||
import java.util.Queue;
|
||
import java.util.Set;
|
||
import java.util.concurrent.ConcurrentHashMap;
|
||
|
||
/**
|
||
* Abstract base extended by any concrete classifier. It implements the basic
|
||
* functionality for storing categories or features and can be used to calculate
|
||
* basic probabilities – both category and feature probabilities. The classify
|
||
* function has to be implemented by the concrete classifier class.
|
||
*
|
||
* @author Philipp Nolte
|
||
*
|
||
* @param <T> A feature class
|
||
* @param <K> A category class
|
||
*/
|
||
public abstract class Classifier<T, K> {
|
||
|
||
/**
|
||
* Initial capacity of category dictionaries.
|
||
*/
|
||
private static final int INITIAL_CATEGORY_DICTIONARY_CAPACITY = 16;
|
||
|
||
/**
|
||
* Initial capacity of feature dictionaries. It should be quite big, because
|
||
* the features will quickly outnumber the categories.
|
||
*/
|
||
private static final int INITIAL_FEATURE_DICTIONARY_CAPACITY = 32;
|
||
|
||
/**
|
||
* The initial memory capacity or how many classifications are memorized.
|
||
*/
|
||
private int memoryCapacity = 1000;
|
||
|
||
/**
|
||
* A dictionary mapping features to their number of occurrences in each
|
||
* known category.
|
||
*/
|
||
private Map<K, Map<T, Integer>> featureCountPerCategory;
|
||
|
||
/**
|
||
* A dictionary mapping features to their number of occurrences.
|
||
*/
|
||
private Map<T, Integer> totalFeatureCount;
|
||
|
||
/**
|
||
* A dictionary mapping categories to their number of occurrences.
|
||
*/
|
||
private Map<K, Integer> totalCategoryCount;
|
||
|
||
/**
|
||
* The classifier's memory. It will forget old classifications as soon as
|
||
* they become too old.
|
||
*/
|
||
private Queue<Classification<T, K>> memoryQueue;
|
||
|
||
/**
|
||
* Constructs a new classifier without any trained knowledge.
|
||
*/
|
||
public Classifier() {
|
||
this.reset();
|
||
}
|
||
|
||
/**
|
||
* Resets the <i>learned</i> feature and category counts.
|
||
*/
|
||
public void reset() {
|
||
this.featureCountPerCategory =
|
||
new ConcurrentHashMap<K, Map<T,Integer>>(
|
||
Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
|
||
this.totalFeatureCount =
|
||
new ConcurrentHashMap<T, Integer>(
|
||
Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
|
||
this.totalCategoryCount =
|
||
new ConcurrentHashMap<K, Integer>(
|
||
Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
|
||
this.memoryQueue = new LinkedList<Classification<T, K>>();
|
||
}
|
||
|
||
/**
|
||
* Returns a <code>Set</code> of features the classifier knows about.
|
||
*
|
||
* @return The <code>Set</code> of features the classifier knows about.
|
||
*/
|
||
public Set<T> getFeatures() {
|
||
return this.totalFeatureCount.keySet();
|
||
}
|
||
|
||
/**
|
||
* Returns a <code>Set</code> of categories the classifier knows about.
|
||
*
|
||
* @return The <code>Set</code> of categories the classifier knows about.
|
||
*/
|
||
public Set<K> getCategories() {
|
||
return this.totalCategoryCount.keySet();
|
||
}
|
||
|
||
/**
|
||
* Retrieves the total number of categories the classifier knows about.
|
||
*
|
||
* @return The total category count.
|
||
*/
|
||
public int getCategoriesTotal() {
|
||
int toReturn = 0;
|
||
for (Integer c: this.totalCategoryCount.values()) {
|
||
toReturn += c;
|
||
}
|
||
return toReturn;
|
||
}
|
||
|
||
/**
|
||
* Retrieves the memory's capacity.
|
||
*
|
||
* @return The memory's capacity.
|
||
*/
|
||
public int getMemoryCapacity() {
|
||
return memoryCapacity;
|
||
}
|
||
|
||
/**
|
||
* Sets the memory's capacity. If the new value is less than the old
|
||
* value, the memory will be truncated accordingly.
|
||
*
|
||
* @param memoryCapacity The new memory capacity.
|
||
*/
|
||
public void setMemoryCapacity(int memoryCapacity) {
|
||
for (int i = this.memoryCapacity; i > memoryCapacity; i--) {
|
||
this.memoryQueue.poll();
|
||
}
|
||
this.memoryCapacity = memoryCapacity;
|
||
}
|
||
|
||
/**
|
||
* Increments the count of a given feature in the given category. This is
|
||
* equal to telling the classifier, that this feature has occurred in this
|
||
* category.
|
||
*
|
||
* @param feature The feature, which count to increase.
|
||
* @param category The category the feature occurred in.
|
||
*/
|
||
public void incrementFeature(T feature, K category) {
|
||
Map<T, Integer> features =
|
||
this.featureCountPerCategory.get(category);
|
||
if (features == null) {
|
||
this.featureCountPerCategory.put(category,
|
||
new ConcurrentHashMap<T, Integer>(Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY));
|
||
features = this.featureCountPerCategory.get(category);
|
||
}
|
||
Integer count = features.get(feature);
|
||
if (count == null) {
|
||
features.put(feature, 0);
|
||
count = features.get(feature);
|
||
}
|
||
features.put(feature, ++count);
|
||
|
||
Integer totalCount = this.totalFeatureCount.get(feature);
|
||
if (totalCount == null) {
|
||
this.totalFeatureCount.put(feature, 0);
|
||
totalCount = this.totalFeatureCount.get(feature);
|
||
}
|
||
this.totalFeatureCount.put(feature, ++totalCount);
|
||
}
|
||
|
||
/**
|
||
* Increments the count of a given category. This is equal to telling the
|
||
* classifier, that this category has occurred once more.
|
||
*
|
||
* @param category The category, which count to increase.
|
||
*/
|
||
public void incrementCategory(K category) {
|
||
Integer count = this.totalCategoryCount.get(category);
|
||
if (count == null) {
|
||
this.totalCategoryCount.put(category, 0);
|
||
count = this.totalCategoryCount.get(category);
|
||
}
|
||
this.totalCategoryCount.put(category, ++count);
|
||
}
|
||
|
||
/**
|
||
* Decrements the count of a given feature in the given category. This is
|
||
* equal to telling the classifier that this feature was classified once in
|
||
* the category.
|
||
*
|
||
* @param feature The feature to decrement the count for.
|
||
* @param category The category.
|
||
*/
|
||
public void decrementFeature(T feature, K category) {
|
||
Map<T, Integer> features =
|
||
this.featureCountPerCategory.get(category);
|
||
if (features == null) {
|
||
return;
|
||
}
|
||
Integer count = features.get(feature);
|
||
if (count == null) {
|
||
return;
|
||
}
|
||
if (count.intValue() == 1) {
|
||
features.remove(feature);
|
||
if (features.size() == 0) {
|
||
this.featureCountPerCategory.remove(category);
|
||
}
|
||
} else {
|
||
features.put(feature, --count);
|
||
}
|
||
|
||
Integer totalCount = this.totalFeatureCount.get(feature);
|
||
if (totalCount == null) {
|
||
return;
|
||
}
|
||
if (totalCount.intValue() == 1) {
|
||
this.totalFeatureCount.remove(feature);
|
||
} else {
|
||
this.totalFeatureCount.put(feature, --totalCount);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Decrements the count of a given category. This is equal to telling the
|
||
* classifier, that this category has occurred once less.
|
||
*
|
||
* @param category The category, which count to increase.
|
||
*/
|
||
public void decrementCategory(K category) {
|
||
Integer count = this.totalCategoryCount.get(category);
|
||
if (count == null) {
|
||
return;
|
||
}
|
||
if (count.intValue() == 1) {
|
||
this.totalCategoryCount.remove(category);
|
||
} else {
|
||
this.totalCategoryCount.put(category, --count);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Retrieves the number of occurrences of the given feature in the given
|
||
* category.
|
||
*
|
||
* @param feature The feature, which count to retrieve.
|
||
* @param category The category, which the feature occurred in.
|
||
* @return The number of occurrences of the feature in the category.
|
||
*/
|
||
public int featureCount(T feature, K category) {
|
||
Map<T, Integer> features =
|
||
this.featureCountPerCategory.get(category);
|
||
if (features == null)
|
||
return 0;
|
||
Integer count = features.get(feature);
|
||
return (count == null) ? 0 : count.intValue();
|
||
}
|
||
|
||
/**
|
||
* Retrieves the number of occurrences of the given category.
|
||
*
|
||
* @param category The category, which count should be retrieved.
|
||
* @return The number of occurrences.
|
||
*/
|
||
public int categoryCount(K category) {
|
||
Integer count = this.totalCategoryCount.get(category);
|
||
return (count == null) ? 0 : count.intValue();
|
||
}
|
||
|
||
public float featureProbability(T feature, K category) {
|
||
if (this.categoryCount(category) == 0)
|
||
return 0;
|
||
return (float) this.featureCount(feature, category)
|
||
/ (float) this.categoryCount(category);
|
||
}
|
||
|
||
/**
|
||
* Retrieves the weighed average <code>P(feature|category)</code> with
|
||
* overall weight of <code>1.0</code> and an assumed probability of
|
||
* <code>0.5</code>. The probability defaults to the overall feature
|
||
* probability.
|
||
*
|
||
* @see de.daslaboratorium.machinelearning.classifier.Classifier#featureProbability(Object, Object)
|
||
* @see de.daslaboratorium.machinelearning.classifier.Classifier#featureWeighedAverage(Object, Object, IFeatureProbability, float, float)
|
||
*
|
||
* @param feature The feature, which probability to calculate.
|
||
* @param category The category.
|
||
* @return The weighed average probability.
|
||
*/
|
||
public float featureWeighedAverage(T feature, K category) {
|
||
return this.featureWeighedAverage(feature, category, null, 1.0f, 0.5f);
|
||
}
|
||
|
||
/**
|
||
* Retrieves the weighed average <code>P(feature|category)</code> with
|
||
* overall weight of <code>1.0</code>, an assumed probability of
|
||
* <code>0.5</code> and the given object to use for probability calculation.
|
||
*
|
||
* @see de.daslaboratorium.machinelearning.classifier.Classifier#featureWeighedAverage(Object, Object, IFeatureProbability, float, float)
|
||
*
|
||
* @param feature The feature, which probability to calculate.
|
||
* @param category The category.
|
||
* @param calculator The calculating object.
|
||
* @return The weighed average probability.
|
||
*/
|
||
public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator) {
|
||
return this.featureWeighedAverage(feature, category,
|
||
calculator, 1.0f, 0.5f);
|
||
}
|
||
|
||
/**
|
||
* Retrieves the weighed average <code>P(feature|category)</code> with
|
||
* the given weight and an assumed probability of <code>0.5</code> and the
|
||
* given object to use for probability calculation.
|
||
*
|
||
* @see de.daslaboratorium.machinelearning.classifier.Classifier#featureWeighedAverage(Object, Object, IFeatureProbability, float, float)
|
||
*
|
||
* @param feature The feature, which probability to calculate.
|
||
* @param category The category.
|
||
* @param calculator The calculating object.
|
||
* @param weight The feature weight.
|
||
* @return The weighed average probability.
|
||
*/
|
||
public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight) {
|
||
return this.featureWeighedAverage(feature, category,
|
||
calculator, weight, 0.5f);
|
||
}
|
||
|
||
/**
|
||
* Retrieves the weighed average <code>P(feature|category)</code> with
|
||
* the given weight, the given assumed probability and the given object to
|
||
* use for probability calculation.
|
||
*
|
||
* @param feature The feature, which probability to calculate.
|
||
* @param category The category.
|
||
* @param calculator The calculating object.
|
||
* @param weight The feature weight.
|
||
* @param assumedProbability The assumed probability.
|
||
* @return The weighed average probability.
|
||
*/
|
||
public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight, float assumedProbability) {
|
||
|
||
/*
|
||
* use the given calculating object or the default method to calculate
|
||
* the probability that the given feature occurred in the given
|
||
* category.
|
||
*/
|
||
final float basicProbability =
|
||
(calculator == null)
|
||
? this.featureProbability(feature, category)
|
||
: calculator.featureProbability(feature, category);
|
||
|
||
Integer totals = this.totalFeatureCount.get(feature);
|
||
if (totals == null)
|
||
totals = 0;
|
||
return (weight * assumedProbability + totals * basicProbability)
|
||
/ (weight + totals);
|
||
}
|
||
|
||
/**
|
||
* Train the classifier by telling it that the given features resulted in
|
||
* the given category.
|
||
*
|
||
* @param category The category the features belong to.
|
||
* @param features The features that resulted in the given category.
|
||
*/
|
||
public void learn(K category, Collection<T> features) {
|
||
this.learn(new Classification<T, K>(features, category));
|
||
}
|
||
|
||
/**
|
||
* Train the classifier by telling it that the given features resulted in
|
||
* the given category.
|
||
*
|
||
* @param classification The classification to learn.
|
||
*/
|
||
public void learn(Classification<T, K> classification) {
|
||
|
||
for (T feature : classification.getFeatureset())
|
||
this.incrementFeature(feature, classification.getCategory());
|
||
this.incrementCategory(classification.getCategory());
|
||
|
||
this.memoryQueue.offer(classification);
|
||
if (this.memoryQueue.size() > this.memoryCapacity) {
|
||
Classification<T, K> toForget = this.memoryQueue.remove();
|
||
|
||
for (T feature : toForget.getFeatureset())
|
||
this.decrementFeature(feature, toForget.getCategory());
|
||
this.decrementCategory(toForget.getCategory());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* The classify method. It will retrieve the most likely category for the
|
||
* features given and depends on the concrete classifier implementation.
|
||
*
|
||
* @param features The features to classify.
|
||
* @return The category most likely.
|
||
*/
|
||
public abstract Classification<T, K> classify(Collection<T> features);
|
||
|
||
}
|