Source code for spammy.classifier

# -*- coding: utf-8 -*-
# @Author: Tasdik Rahman
# @Date:   2016-03-12
# @Last Modified by:   Tasdik Rahman
# @Last Modified time: 2016-04-15 22:10:29
# @MIT License
# @http://tasdikrahman.me
# @https://github.com/prodicus

"""
Rolling my own implementation of the Naive Bayes algorithm.

This particular implementation handles the case where a category is not
observed in the dataset by simply assigning it a probability of 0!

Smoothing techniques address this zero-probability problem (a minimal
sketch follows the imports below), but let's not delve into that for now.


References
==========
[1] 
  - http://stackoverflow.com/a/5029989/3834059
  - http://stackoverflow.com/q/8419401/3834059
  - http://stackoverflow.com/q/2600790/3834059

"""

from __future__ import division, print_function
from collections import defaultdict
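
# A minimal sketch of the smoothing idea mentioned in the module docstring.
# NOTE: this helper is purely illustrative and is NOT used by spammy itself;
# it shows Laplace (add-one) smoothing, which would give an unseen feature a
# small non-zero probability instead of the flat 0 used below.
def _laplace_smoothed_probability(count, total, vocab_size):
    """e.g. _laplace_smoothed_probability(0, 40, 100) ~= 0.0071, not 0."""
    return (count + 1) / (total + vocab_size)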

class NaiveBayesClassifier(object):

    """Inherits from the 'object' class. Nothing special"""

    def __init__(self):
        """
        Initializes the Naive Bayes object

        :param self: class object
        """
        self.total = 0
        self.label_count = defaultdict(int)
        self.feature_count = defaultdict(int)
        # for the reason behind the lambda here, refer: [1]
        self.feature_label = defaultdict(lambda: defaultdict(int))
        self.classification = defaultdict(int)
    def train(self, featurelist, label):
        """
        Trains the classifier, for god's sake!

        Tries to emulate the API which the NLTK wrapper provides for its
        nltk.NaiveBayesClassifier.train()

        .. note:: `defaultdict` is used because accessing a key that is not
                  present in a `dict` raises a `KeyError`, whereas a
                  `defaultdict` returns a default value for missing keys.
                  For more on `defaultdict`, refer:
                  http://stackoverflow.com/a/5900634/3834059

        :param self: class object
        :param featurelist: the list of features
        :param label: class of the feature
        """
        # set() clears out all the duplicate objects inside 'featurelist'
        featurelist = list(set(featurelist))
        for feature in featurelist:
            self.feature_count[feature] += 1
            self.feature_label[feature][label] += 1
        # increment the label count and the total number of documents seen
        self.label_count[label] += 1
        self.total += 1
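
    # For illustration only (not part of spammy): the defaultdict behaviour
    # the note above relies on, e.g.
    #   d = defaultdict(int)
    #   d['unseen']      # returns 0 instead of raising a KeyError
    #   d['spam'] += 1   # no need to initialise the key first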
    def feature_probability(self, feature, label):
        """
        Calculates the probability of a feature belonging to a particular
        label (i.e. the class 'spam' or 'ham' for us).

        .. note:: for an unseen feature, a fixed probability (say 0.5)
                  could be assigned instead.

        :param self: class object
        :param feature: The feature for which we will be calculating the
                        probability.
        :param label: spam or ham
        :returns: The probability of the feature being in the label.
        :rtype: float
        """
        # nothing but a ternary operator:
        # returns "spam" if label is "ham" and the other way around
        rev_class = "spam" if label == "ham" else "ham"

        # *---------------------------------------------------------------------
        # P ( S | token ) = no_in_spam / no_of_spam <--- NUMERATOR
        #                   _______________________________________________
        # DENOMINATOR ---> no_in_spam / no_of_spam + no_in_ham / no_of_ham
        # ----------------------------------------------------------------------
        feature_count = self.feature_label[feature][label]
        rev_class_count = self.feature_label[feature][rev_class]
        label_count = self.label_count[label]

        probability = 0
        if feature_count and label_count:
            numerator = feature_count / label_count
            denominator = feature_count / label_count + \
                rev_class_count / self.label_count[rev_class]
            probability = numerator / denominator
        return probability
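
    # A worked example with made-up counts (not from the source): if "free"
    # occurs in 30 of 40 spam emails and in 5 of 60 ham emails, then
    #   feature_probability("free", "spam")
    #     = (30/40) / (30/40 + 5/60) = 0.75 / 0.8333... = 0.9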
    def document_probability(self, features, label):
        """
        Finds the document probability by looping over the features and
        calling `feature_probability()` on each one.

        :param self: class object
        :param features: List of features
        :param label: Label whose probability needs to be calculated
        :returns: the probability of the document being in a particular class
        :rtype: float/int
        """
        if not self.total:
            return 0
        probability = 1.00
        features = list(set(features))
        for feature in features:
            # accumulate the probability of each seen feature; unseen
            # features come back as 0 and are skipped
            fp = self.feature_probability(feature, label)
            if fp != 0:
                probability += fp
        return probability
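
    # For illustration (hypothetical values): with feature probabilities of
    # 0.9 and 0.8 for a label, document_probability() returns
    # 1.0 + 0.9 + 0.8 = 2.7; these scores are then compared across labels
    # in classify() below.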
    def classify(self, features):
        """
        The actual interface for the class; this classifies our documents
        when called from the terminal.

        :param self: class object
        :param features: The features of the document passed
        :returns: spam or ham
        :rtype: str
        """
        probability = {}
        for label in self.label_count.keys():
            probability[label] = self.document_probability(features, label)
        self.classification = probability
        if probability:
            # pick the label with the highest document probability
            return sorted(
                probability.items(),
                key=lambda item: item[1],
                reverse=True
            )[0][0]
        else:
            return "classification could not be done!"
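
    # For illustration (hypothetical numbers): given
    #   probability = {'spam': 3.4, 'ham': 1.2}
    # the sorted(..., key=lambda item: item[1], reverse=True) call above
    # yields [('spam', 3.4), ('ham', 1.2)], so classify() returns 'spam'.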
    def __str__(self):
        """
        Overrides the default `__str__` method for better readability

        :param self: class object
        :returns: a human-readable summary of the classifier's state
        """
        result = \
            "No of Features : {feature}" \
            "\nNumber of spam emails : {spam}" \
            "\nNumber of ham emails  : {ham}" \
            "\nTotal number of emails: {total}".format(
                feature=len(self.feature_count),
                spam=self.label_count['spam'],
                ham=self.label_count['ham'],
                total=self.total
            )
        return result
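

if __name__ == '__main__':
    # A minimal usage sketch with hypothetical, hand-written tokens; spammy
    # itself trains this classifier on real email corpora, not on toy data
    # like this.
    classifier = NaiveBayesClassifier()
    classifier.train(['free', 'viagra', 'winner'], 'spam')
    classifier.train(['meeting', 'tomorrow', 'agenda'], 'ham')
    classifier.train(['free', 'offer', 'winner'], 'spam')
    print(classifier)
    print(classifier.classify(['free', 'winner']))      # expected: spam
    print(classifier.classify(['meeting', 'agenda']))   # expected: ham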