crawler/source/net/yacy/cora/bayes/Classifier.java
2025-03-26 09:12:37 +09:00

435 lines
16 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* The MIT License (MIT)
* ------------------
*
* Copyright (c) 2012-2014 Philipp Nolte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
* and inserted into the loklak class hierarchy to be enhanced and extended
* by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
* package. It shall be used to create custom search navigation filters.
* The original copyright notice was copied from the README.md
* from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
* The original package domain was de.daslaboratorium.machinelearning.classifier
*/
package net.yacy.cora.bayes;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
* Abstract base extended by any concrete classifier. It implements the basic
* functionality for storing categories or features and can be used to calculate
* basic probabilities both category and feature probabilities. The classify
* function has to be implemented by the concrete classifier class.
*
* @author Philipp Nolte
*
* @param <T> A feature class
* @param <K> A category class
*/
public abstract class Classifier<T, K> {

    /**
     * Initial capacity of category dictionaries.
     */
    private static final int INITIAL_CATEGORY_DICTIONARY_CAPACITY = 16;

    /**
     * Initial capacity of feature dictionaries. It should be quite big, because
     * the features will quickly outnumber the categories.
     */
    private static final int INITIAL_FEATURE_DICTIONARY_CAPACITY = 32;

    /**
     * The memory capacity: how many classifications are memorized before the
     * oldest ones are forgotten again.
     */
    private int memoryCapacity = 1000;

    /**
     * A dictionary mapping each known category to the number of occurrences of
     * each feature within that category.
     */
    private Map<K, Map<T, Integer>> featureCountPerCategory;

    /**
     * A dictionary mapping each feature to its total number of occurrences
     * over all categories.
     */
    private Map<T, Integer> totalFeatureCount;

    /**
     * A dictionary mapping each category to its number of occurrences.
     */
    private Map<K, Integer> totalCategoryCount;

    /**
     * The classifier's memory: a FIFO queue of learned classifications. As soon
     * as it grows beyond {@link #memoryCapacity}, the oldest classifications
     * are forgotten and their counts rolled back.
     */
    private Queue<Classification<T, K>> memoryQueue;

    /**
     * Constructs a new classifier without any trained knowledge.
     */
    public Classifier() {
        this.reset();
    }

    /**
     * Resets the <i>learned</i> feature and category counts and clears the
     * classification memory.
     */
    public void reset() {
        this.featureCountPerCategory =
                new ConcurrentHashMap<K, Map<T, Integer>>(
                        Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.totalFeatureCount =
                new ConcurrentHashMap<T, Integer>(
                        Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
        this.totalCategoryCount =
                new ConcurrentHashMap<K, Integer>(
                        Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.memoryQueue = new LinkedList<Classification<T, K>>();
    }

    /**
     * Returns a <code>Set</code> of features the classifier knows about.
     *
     * @return The <code>Set</code> of features the classifier knows about.
     */
    public Set<T> getFeatures() {
        return this.totalFeatureCount.keySet();
    }

    /**
     * Returns a <code>Set</code> of categories the classifier knows about.
     *
     * @return The <code>Set</code> of categories the classifier knows about.
     */
    public Set<K> getCategories() {
        return this.totalCategoryCount.keySet();
    }

    /**
     * Retrieves the total number of category occurrences the classifier has
     * counted, i.e. the sum of all per-category counts.
     *
     * @return The total category count.
     */
    public int getCategoriesTotal() {
        int total = 0;
        for (Integer count : this.totalCategoryCount.values()) {
            total += count.intValue();
        }
        return total;
    }

    /**
     * Retrieves the memory's capacity.
     *
     * @return The memory's capacity.
     */
    public int getMemoryCapacity() {
        return this.memoryCapacity;
    }

    /**
     * Sets the memory's capacity. If the memory currently holds more
     * classifications than the new capacity allows, the oldest ones are
     * forgotten — and their counts rolled back — until it fits.
     *
     * @param memoryCapacity The new memory capacity.
     */
    public void setMemoryCapacity(int memoryCapacity) {
        // Bug fix: the previous implementation polled (oldCapacity - newCapacity)
        // times regardless of how many classifications were actually memorized,
        // which could discard entries even when the queue already fit the new
        // capacity. It also never rolled back the counts of the forgotten
        // entries, leaving the learned statistics inconsistent with the memory.
        while (this.memoryQueue.size() > memoryCapacity) {
            this.forgetOldest();
        }
        this.memoryCapacity = memoryCapacity;
    }

    /**
     * Increments the count of a given feature in the given category. This is
     * equal to telling the classifier that this feature has occurred in this
     * category.
     *
     * @param feature The feature, which count to increase.
     * @param category The category the feature occurred in.
     */
    public void incrementFeature(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) {
            features = new ConcurrentHashMap<T, Integer>(
                    Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
            this.featureCountPerCategory.put(category, features);
        }
        // NOTE(review): like the original code, these read-modify-write
        // sequences are not atomic; concurrent learners may lose updates.
        Integer count = features.get(feature);
        features.put(feature, (count == null) ? 1 : count.intValue() + 1);
        Integer totalCount = this.totalFeatureCount.get(feature);
        this.totalFeatureCount.put(feature,
                (totalCount == null) ? 1 : totalCount.intValue() + 1);
    }

    /**
     * Increments the count of a given category. This is equal to telling the
     * classifier that this category has occurred once more.
     *
     * @param category The category, which count to increase.
     */
    public void incrementCategory(K category) {
        Integer count = this.totalCategoryCount.get(category);
        this.totalCategoryCount.put(category,
                (count == null) ? 1 : count.intValue() + 1);
    }

    /**
     * Decrements the count of a given feature in the given category. This is
     * equal to telling the classifier that this feature was classified once in
     * the category. Counts that reach zero are removed entirely.
     *
     * @param feature The feature to decrement the count for.
     * @param category The category.
     */
    public void decrementFeature(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) {
            return;
        }
        Integer count = features.get(feature);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            features.remove(feature);
            // Drop the whole per-category dictionary once it is empty.
            if (features.isEmpty()) {
                this.featureCountPerCategory.remove(category);
            }
        } else {
            features.put(feature, count.intValue() - 1);
        }
        Integer totalCount = this.totalFeatureCount.get(feature);
        if (totalCount == null) {
            return;
        }
        if (totalCount.intValue() == 1) {
            this.totalFeatureCount.remove(feature);
        } else {
            this.totalFeatureCount.put(feature, totalCount.intValue() - 1);
        }
    }

    /**
     * Decrements the count of a given category. This is equal to telling the
     * classifier that this category has occurred once less. A count that
     * reaches zero is removed entirely.
     *
     * @param category The category, which count to decrease.
     */
    public void decrementCategory(K category) {
        Integer count = this.totalCategoryCount.get(category);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            this.totalCategoryCount.remove(category);
        } else {
            this.totalCategoryCount.put(category, count.intValue() - 1);
        }
    }

    /**
     * Retrieves the number of occurrences of the given feature in the given
     * category.
     *
     * @param feature The feature, which count to retrieve.
     * @param category The category, which the feature occurred in.
     * @return The number of occurrences of the feature in the category.
     */
    public int featureCount(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) {
            return 0;
        }
        Integer count = features.get(feature);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the number of occurrences of the given category.
     *
     * @param category The category, which count should be retrieved.
     * @return The number of occurrences.
     */
    public int categoryCount(K category) {
        Integer count = this.totalCategoryCount.get(category);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the conditional probability <code>P(feature|category)</code>,
     * i.e. the feature count in the category divided by the category count.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @return The probability, or <code>0</code> for an unknown category.
     */
    public float featureProbability(T feature, K category) {
        final int categoryCount = this.categoryCount(category);
        if (categoryCount == 0) {
            return 0;
        }
        return (float) this.featureCount(feature, category) / (float) categoryCount;
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * overall weight of <code>1.0</code> and an assumed probability of
     * <code>0.5</code>. The probability defaults to the overall feature
     * probability.
     *
     * @see Classifier#featureProbability(Object, Object)
     * @see Classifier#featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category) {
        return this.featureWeighedAverage(feature, category, null, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * overall weight of <code>1.0</code>, an assumed probability of
     * <code>0.5</code> and the given object to use for probability calculation.
     *
     * @see Classifier#featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator) {
        return this.featureWeighedAverage(feature, category, calculator, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * the given weight and an assumed probability of <code>0.5</code> and the
     * given object to use for probability calculation.
     *
     * @see Classifier#featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @param weight The feature weight.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight) {
        return this.featureWeighedAverage(feature, category, calculator, weight, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * the given weight, the given assumed probability and the given object to
     * use for probability calculation.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object, or <code>null</code> to use
     *                   this classifier's own probability calculation.
     * @param weight The feature weight.
     * @param assumedProbability The assumed probability.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight, float assumedProbability) {
        // Use the given calculating object or the default method to calculate
        // the probability that the given feature occurred in the given category.
        final float basicProbability = (calculator == null)
                ? this.featureProbability(feature, category)
                : calculator.featureProbability(feature, category);
        Integer totalsBoxed = this.totalFeatureCount.get(feature);
        final int totals = (totalsBoxed == null) ? 0 : totalsBoxed.intValue();
        return (weight * assumedProbability + totals * basicProbability)
                / (weight + totals);
    }

    /**
     * Train the classifier by telling it that the given features resulted in
     * the given category.
     *
     * @param category The category the features belong to.
     * @param features The features that resulted in the given category.
     */
    public void learn(K category, Collection<T> features) {
        this.learn(new Classification<T, K>(features, category));
    }

    /**
     * Train the classifier by telling it that the given features resulted in
     * the given category. If the memory exceeds its capacity afterwards, the
     * oldest classification is forgotten and its counts rolled back.
     *
     * @param classification The classification to learn.
     */
    public void learn(Classification<T, K> classification) {
        for (T feature : classification.getFeatureset()) {
            this.incrementFeature(feature, classification.getCategory());
        }
        this.incrementCategory(classification.getCategory());
        this.memoryQueue.offer(classification);
        while (this.memoryQueue.size() > this.memoryCapacity) {
            this.forgetOldest();
        }
    }

    /**
     * Removes the oldest classification from the memory queue and rolls back
     * the feature and category counts it contributed.
     */
    private void forgetOldest() {
        final Classification<T, K> toForget = this.memoryQueue.remove();
        for (T feature : toForget.getFeatureset()) {
            this.decrementFeature(feature, toForget.getCategory());
        }
        this.decrementCategory(toForget.getCategory());
    }

    /**
     * The classify method. It will retrieve the most likely category for the
     * features given and depends on the concrete classifier implementation.
     *
     * @param features The features to classify.
     * @return The category most likely.
     */
    public abstract Classification<T, K> classify(Collection<T> features);
}