Source code for spammy.classifier

# -*- coding: utf-8 -*-
# @Author: Tasdik Rahman
# @Date:   2016-03-12
# @Last Modified by:   Tasdik Rahman
# @Last Modified time: 2016-04-15 22:10:29
# @MIT License
# @http://tasdikrahman.me
# @https://github.com/prodicus

"""
Rolling my own implementation of the Naive Bayes algorithm.

This particular implementation handles the case where a category is not
observed in the dataset by simply assigning it a probability of 0!

Smoothing techniques address this zero-probability problem (a minimal
sketch follows the imports below), but let's not delve into that for now.


References
==========
[1] 
  - http://stackoverflow.com/a/5029989/3834059
  - http://stackoverflow.com/q/8419401/3834059
  - http://stackoverflow.com/q/2600790/3834059

"""

from __future__ import division, print_function
from collections import defaultdict
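
# A minimal sketch of the smoothing idea mentioned in the module docstring.
# NOTE: this helper is purely illustrative and is NOT used by spammy itself;
# it shows Laplace (add-one) smoothing, which would give an unseen feature a
# small non-zero probability instead of the flat 0 used below.
def _laplace_smoothed_probability(count, total, vocab_size):
    """e.g. _laplace_smoothed_probability(0, 40, 100) ~= 0.0071, not 0."""
    return (count + 1) / (total + vocab_size)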

class NaiveBayesClassifier(object):

    """Inherits from the 'object' class. Nothing special"""

    def __init__(self):
        """
        Initializes the Naive Bayes object

        :param self: class object
        """
        self.total = 0
        self.label_count = defaultdict(int)
        self.feature_count = defaultdict(int)
        # for the reason behind the lambda here, refer: [1]
        self.feature_label = defaultdict(lambda: defaultdict(int))
        self.classification = defaultdict(int)
    def train(self, featurelist, label):
        """
        Trains the classifier, for god's sake!

        Tries to emulate the API which the NLTK wrapper provides for its
        nltk.NaiveBayesClassifier.train()

        .. note:: `defaultdict` is used because accessing a key that is not
                  present in a `dict` raises a `KeyError`, whereas a
                  `defaultdict` returns a default value for missing keys.
                  For more on `defaultdict`, refer:
                  http://stackoverflow.com/a/5900634/3834059

        :param self: class object
        :param featurelist: the list of features
        :param label: class of the feature
        """
        # set() clears out all the duplicate objects inside 'featurelist'
        featurelist = list(set(featurelist))
        for feature in featurelist:
            self.feature_count[feature] += 1
            self.feature_label[feature][label] += 1
        # increment the label count and the total number of documents seen
        self.label_count[label] += 1
        self.total += 1
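
    # For illustration only (not part of spammy): the defaultdict behaviour
    # the note above relies on, e.g.
    #   d = defaultdict(int)
    #   d['unseen']      # returns 0 instead of raising a KeyError
    #   d['spam'] += 1   # no need to initialise the key first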
    def feature_probability(self, feature, label):
        """
        Calculates the probability of a feature belonging to a particular
        label (i.e. the class 'spam' or 'ham' for us).

        .. note:: for an unseen feature, a fixed probability (say 0.5)
                  could be assigned instead.

        :param self: class object
        :param feature: The feature for which we will be calculating the
                        probability.
        :param label: spam or ham
        :returns: The probability of the feature being in the label.
        :rtype: float
        """
        # nothing but a ternary operator:
        # returns "spam" if label is "ham" and the other way around
        rev_class = "spam" if label == "ham" else "ham"

        # *---------------------------------------------------------------------
        # P ( S | token ) = no_in_spam / no_of_spam <--- NUMERATOR
        #                   _______________________________________________
        # DENOMINATOR ---> no_in_spam / no_of_spam + no_in_ham / no_of_ham
        # ----------------------------------------------------------------------
        feature_count = self.feature_label[feature][label]
        rev_class_count = self.feature_label[feature][rev_class]
        label_count = self.label_count[label]

        probability = 0
        if feature_count and label_count:
            numerator = feature_count / label_count
            denominator = feature_count / label_count + \
                rev_class_count / self.label_count[rev_class]
            probability = numerator / denominator
        return probability
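
    # A worked example with made-up counts (not from the source): if "free"
    # occurs in 30 of 40 spam emails and in 5 of 60 ham emails, then
    #   feature_probability("free", "spam")
    #     = (30/40) / (30/40 + 5/60) = 0.75 / 0.8333... = 0.9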
    def document_probability(self, features, label):
        """
        Finds the document probability by looping over the features and
        calling `feature_probability()` on each one.

        :param self: class object
        :param features: List of features
        :param label: Label whose probability needs to be calculated
        :returns: the probability of the document being in a particular class
        :rtype: float/int
        """
        if not self.total:
            return 0
        probability = 1.00
        features = list(set(features))
        for feature in features:
            # accumulate the probability of each seen feature; unseen
            # features come back as 0 and are skipped
            fp = self.feature_probability(feature, label)
            if fp != 0:
                probability += fp
        return probability
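
    # For illustration (hypothetical values): with feature probabilities of
    # 0.9 and 0.8 for a label, document_probability() returns
    # 1.0 + 0.9 + 0.8 = 2.7; these scores are then compared across labels
    # in classify() below.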
    def classify(self, features):
        """
        The actual interface for the class; this classifies our documents
        when called from the terminal.

        :param self: class object
        :param features: The features of the document passed
        :returns: spam or ham
        :rtype: str
        """
        probability = {}
        for label in self.label_count.keys():
            probability[label] = self.document_probability(features, label)
        self.classification = probability
        if probability:
            # pick the label with the highest document probability
            return sorted(
                probability.items(),
                key=lambda item: item[1],
                reverse=True
            )[0][0]
        else:
            return "classification could not be done!"
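
    # For illustration (hypothetical numbers): given
    #   probability = {'spam': 3.4, 'ham': 1.2}
    # the sorted(..., key=lambda item: item[1], reverse=True) call above
    # yields [('spam', 3.4), ('ham', 1.2)], so classify() returns 'spam'.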
    def __str__(self):
        """
        Overrides the default `__str__` method for better readability

        :param self: class object
        :returns: a human-readable summary of the classifier's state
        """
        result = \
            "No of Features : {feature}" \
            "\nNumber of spam emails : {spam}" \
            "\nNumber of ham emails  : {ham}" \
            "\nTotal number of emails: {total}".format(
                feature=len(self.feature_count),
                spam=self.label_count['spam'],
                ham=self.label_count['ham'],
                total=self.total
            )
        return result
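

if __name__ == '__main__':
    # A minimal usage sketch with hypothetical, hand-written tokens; spammy
    # itself trains this classifier on real email corpora, not on toy data
    # like this.
    classifier = NaiveBayesClassifier()
    classifier.train(['free', 'viagra', 'winner'], 'spam')
    classifier.train(['meeting', 'tomorrow', 'agenda'], 'ham')
    classifier.train(['free', 'offer', 'winner'], 'spam')
    print(classifier)
    print(classifier.classify(['free', 'winner']))      # expected: spam
    print(classifier.classify(['meeting', 'agenda']))   # expected: ham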