Skip to content

Commit e09c045

Browse files
committed
Rework the Multinomial Naive Bayes classifier to compute probabilities in log space with Laplace smoothing, and add class balancing (under-/oversampling) to training
1 parent 6262de8 commit e09c045

File tree

2 files changed

+185
-52
lines changed

2 files changed

+185
-52
lines changed

src/Abstracts/NaiveBayesClassifier.cs

+81-7
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,25 @@ public abstract class NaiveBayesClassifier : INaiveBayesClassifier
66
{
77
protected Dictionary<string, Dictionary<string, int>> wordCountsPerLabel;
88
protected Dictionary<string, int> totalWordsPerLabel;
9+
protected readonly int maxSamplesPerClass;
10+
protected readonly bool useUndersampling;
11+
protected readonly Random random;
912

10-
public NaiveBayesClassifier()
13+
/// <summary>
/// Initializes the training dictionaries and the class-balancing settings.
/// </summary>
/// <param name="maxSamplesPerClass">Upper bound on samples kept per class when balancing; must be positive.</param>
/// <param name="useUndersampling">True to undersample majority classes down to the minority size; false to oversample minority classes up to the majority size.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="maxSamplesPerClass"/> is zero or negative.</exception>
public NaiveBayesClassifier(int maxSamplesPerClass = 1000, bool useUndersampling = true)
{
    // A non-positive cap would make BalanceDataset silently discard every
    // training sample (target size <= 0), so fail fast instead.
    if (maxSamplesPerClass <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxSamplesPerClass), maxSamplesPerClass, "Value must be positive.");
    }

    wordCountsPerLabel = new Dictionary<string, Dictionary<string, int>>();
    totalWordsPerLabel = new Dictionary<string, int>();
    this.maxSamplesPerClass = maxSamplesPerClass;
    this.useUndersampling = useUndersampling;
    this.random = new Random();
}
1521

1622
public virtual void Train(IEnumerable<ClassifierModel> trainingData)
1723
{
18-
foreach (var data in trainingData)
24+
var balancedData = BalanceDataset(trainingData.ToList());
25+
26+
foreach (var data in balancedData)
1927
{
20-
// Initialize dictionary for new labels
2128
if (!wordCountsPerLabel.ContainsKey(data.Label))
2229
{
2330
wordCountsPerLabel[data.Label] = new Dictionary<string, int>();
@@ -33,16 +40,83 @@ public virtual void Train(IEnumerable<ClassifierModel> trainingData)
3340
targetDictionary[word] = 0;
3441
targetDictionary[word]++;
3542
}
36-
3743
totalWordsPerLabel[data.Label]++;
3844
}
3945
}
4046

41-
protected IEnumerable<string> Tokenize(string text, char separator = ' ')
47+
/// <summary>
/// Rebalances the training set so every class contributes a comparable number of
/// samples: majority classes are undersampled, or (when oversampling is enabled)
/// minority classes are oversampled, with the per-class size capped by
/// maxSamplesPerClass.
/// </summary>
/// <param name="data">Raw training samples, grouped here by their <c>Label</c>.</param>
/// <returns>A rebalanced sequence of samples; empty when <paramref name="data"/> is empty.</returns>
protected virtual IEnumerable<ClassifierModel> BalanceDataset(List<ClassifierModel> data)
{
    // Min()/Max() below throw InvalidOperationException on an empty sequence,
    // so an empty training set must be handled up front.
    if (data.Count == 0)
    {
        return data;
    }

    // Group samples by label.
    var groupedData = data.GroupBy(x => x.Label)
                          .ToDictionary(g => g.Key, g => g.ToList());

    // Minority and majority class sizes drive the balancing target.
    int minClassSize = groupedData.Values.Min(x => x.Count);
    int maxClassSize = groupedData.Values.Max(x => x.Count);

    // Undersampling shrinks everything toward the smallest class; oversampling
    // grows everything toward the largest. Both are capped by maxSamplesPerClass.
    int targetSize = useUndersampling
        ? Math.Min(minClassSize, maxSamplesPerClass)
        : Math.Min(maxClassSize, maxSamplesPerClass);

    var balancedData = new List<ClassifierModel>();

    foreach (var group in groupedData)
    {
        var samples = group.Value;

        if (samples.Count > targetSize)
        {
            // Too many samples for this class: keep a random subset.
            balancedData.AddRange(UndersampleData(samples, targetSize));
        }
        else if (!useUndersampling && samples.Count < targetSize)
        {
            // Too few samples and oversampling is enabled: duplicate random samples.
            balancedData.AddRange(OversampleData(samples, targetSize));
        }
        else
        {
            // Already at (or acceptably below) the target size.
            balancedData.AddRange(samples);
        }
    }

    return balancedData;
}
90+
91+
/// <summary>
/// Selects a uniformly random subset of <paramref name="targetSize"/> samples by
/// ordering the list with random sort keys and taking the resulting prefix.
/// </summary>
/// <param name="samples">Samples of a single class to shrink.</param>
/// <param name="targetSize">Number of samples to keep.</param>
protected virtual IEnumerable<ClassifierModel> UndersampleData(List<ClassifierModel> samples, int targetSize)
{
    // OrderBy evaluates the key selector once per element, so this is a shuffle.
    var shuffled = samples.OrderBy(_ => random.Next());
    return shuffled.Take(targetSize);
}
95+
96+
/// <summary>
/// Grows the sample list up to <paramref name="targetSize"/> elements by appending
/// randomly chosen duplicates of the existing samples.
/// </summary>
/// <param name="samples">Samples of a single class to grow.</param>
/// <param name="targetSize">Desired number of samples after oversampling.</param>
protected virtual IEnumerable<ClassifierModel> OversampleData(List<ClassifierModel> samples, int targetSize)
{
    var padded = new List<ClassifierModel>(samples);

    // Keep duplicating random originals until the target size is reached.
    for (int i = padded.Count; i < targetSize; i++)
    {
        padded.Add(samples[random.Next(samples.Count)]);
    }

    return padded;
}
108+
109+
/// <summary>
/// Splits <paramref name="text"/> into lower-cased, non-empty tokens.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <param name="separator">Character used to split the text; defaults to a space.</param>
/// <returns>Lower-cased tokens with empty entries removed.</returns>
protected virtual IEnumerable<string> Tokenize(string text, char separator = ' ')
{
    // ToLowerInvariant avoids culture-sensitive casing surprises (e.g. the Turkish
    // dotless i), and RemoveEmptyEntries stops runs of separators from producing
    // empty-string "words" that would otherwise inflate the word counts.
    return text.ToLowerInvariant()
               .Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries);
}
45-
46-
public abstract string Predict(string text);
113+
114+
/// <summary>
/// Counts how many samples carry each label.
/// </summary>
/// <param name="data">Samples to inspect.</param>
/// <returns>A map from each label to the number of samples bearing it.</returns>
public Dictionary<string, int> GetClassDistribution(IEnumerable<ClassifierModel> data)
{
    var distribution = new Dictionary<string, int>();

    foreach (var sample in data)
    {
        distribution.TryGetValue(sample.Label, out int count);
        distribution[sample.Label] = count + 1;
    }

    return distribution;
}
119+
120+
public abstract Dictionary<string, double> Predict(string text);
47121
}
48122
}
+104-45
Original file line numberDiff line numberDiff line change
@@ -1,80 +1,139 @@
11
using MyML.Abstracts;
2-
using MyML.Interfaces;
32

43
namespace MyML
54
{
65
public class MultinomialNaiveBayesClassifier : NaiveBayesClassifier
{
    /// <summary>
    /// Predicts class probabilities for a given text input using Multinomial Naive
    /// Bayes classification. Returns normalized probabilities (as percentages) for
    /// each trained class label.
    /// </summary>
    /// <remarks>
    /// The prediction process:
    ///
    /// 1. Compute posterior probabilities in log space:
    ///    P(class|text) ∝ log(P(class)) + Σ log(P(word|class)),
    ///    with Laplace smoothing applied to the word likelihoods.
    ///
    /// 2. Track the maximum log posterior for the log-sum-exp trick.
    ///
    /// 3. Convert back to normal space with log-sum-exp to avoid underflow:
    ///    exp(logP - maxLogP) / Σ exp(logP - maxLogP).
    ///
    /// 4. Scale the normalized probabilities to percentages (sum to 100).
    /// </remarks>
    /// <param name="text">Input text to classify.</param>
    /// <returns>
    /// Dictionary mapping class labels to predicted probabilities (percentages).
    /// Empty when the classifier has not been trained.
    /// </returns>
    public override Dictionary<string, double> Predict(string text)
    {
        // The vocabulary must be derived from the *trained* word counts. (An earlier
        // revision computed it once in the constructor, before any training had
        // happened, so the smoothing denominator was always missing the vocabulary.)
        int vocabularySize = CalculateVocabularySize();

        // Train increments totalWordsPerLabel once per training sample, so these are
        // document counts; their sum drives the class prior P(class).
        int totalDocumentCount = totalWordsPerLabel.Values.Sum();

        // Materialize once: Tokenize is virtual and the tokens are re-enumerated
        // for every label below.
        List<string> words = Tokenize(text).ToList();

        var logPosteriors = new Dictionary<string, double>();
        double maxLogPosterior = double.MinValue;

        foreach (var label in wordCountsPerLabel.Keys)
        {
            var labelWordCounts = wordCountsPerLabel[label];

            // The multinomial smoothing denominator needs the number of word
            // *occurrences* in the class, not the number of documents, so it is
            // computed from the per-word counts rather than totalWordsPerLabel.
            int wordOccurrencesInClass = labelWordCounts.Values.Sum();

            double logLikelihood = CalculateProbability(
                words,
                labelWordCounts,
                wordOccurrencesInClass,
                vocabularySize
            );

            double logPrior = Math.Log((double)totalWordsPerLabel[label] / totalDocumentCount);
            double logPosterior = logLikelihood + logPrior;

            logPosteriors[label] = logPosterior;
            if (logPosterior > maxLogPosterior)
            {
                maxLogPosterior = logPosterior;
            }
        }

        // Log-sum-exp: shift every log posterior by the maximum before
        // exponentiating so the largest term becomes exp(0) = 1.
        double sumExp = 0.0;
        foreach (double logPosterior in logPosteriors.Values)
        {
            sumExp += Math.Exp(logPosterior - maxLogPosterior);
        }

        var result = new Dictionary<string, double>();
        foreach (var kvp in logPosteriors)
        {
            // Normalize and scale to a percentage.
            result[kvp.Key] = Math.Exp(kvp.Value - maxLogPosterior) / sumExp * 100;
        }

        return result;
    }

    /// <summary>
    /// Calculates the log likelihood of a document belonging to a specific class
    /// using the Multinomial Naive Bayes model with Laplace (add-one) smoothing.
    /// </summary>
    /// <remarks>
    /// 1. Works in log space to prevent numerical underflow.
    /// 2. Applies Laplace smoothing so unseen words contribute a small non-zero
    ///    probability.
    /// 3. Assumes word independence (the naive assumption).
    ///
    /// Each word's smoothed likelihood is:
    /// P(word|class) = (count(word,class) + 1) / (totalWordsInClass + vocabularySize)
    /// </remarks>
    /// <param name="words">Words of the document to classify.</param>
    /// <param name="wordCountsForClass">Per-word occurrence counts for the class.</param>
    /// <param name="totalWordsInClass">Total word occurrences in the class's training data.</param>
    /// <param name="vocabularySize">Number of distinct words across all classes.</param>
    /// <returns>
    /// Log likelihood of the document under the class; higher values indicate a
    /// stronger association.
    /// </returns>
    private double CalculateProbability(
        IEnumerable<string> words,
        Dictionary<string, int> wordCountsForClass,
        int totalWordsInClass,
        int vocabularySize)
    {
        double logProbability = 0.0;

        foreach (var word in words)
        {
            int count = wordCountsForClass.TryGetValue(word, out int c) ? c : 0;

            // Laplace smoothing in log space.
            logProbability += Math.Log((count + 1.0) / (totalWordsInClass + vocabularySize));
        }

        return logProbability;
    }

    /// <summary>
    /// Counts the distinct words observed across all classes during training.
    /// Recomputed at prediction time so it always reflects the current model state.
    /// </summary>
    private int CalculateVocabularySize()
    {
        HashSet<string> uniqueWords = new();

        foreach (var labelCounts in wordCountsPerLabel.Values)
        {
            uniqueWords.UnionWith(labelCounts.Keys);
        }

        return uniqueWords.Count;
    }
}
80139
}

0 commit comments

Comments
 (0)