2 Naive Bayes Theorem: Basic Idea

Bayes' theorem can be used for classification tasks in natural language processing (NLP) problems, for example, to compute the probability that a tag applies to a given sentence:

\[\begin{equation} \textbf{P(Tag}|\textbf{Sentence)} = \textbf{P(Tag)} \frac{\textbf{P(Sentence} |\textbf{Tag})}{\textbf{P(Sentence)}} \end{equation}\]

\[\mbox{Posterior}= \mbox{Proposition prior probability}\;\frac{\mbox{Likelihood}}{\mbox{Evidence prior probability}}\]
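As a quick worked example with purely hypothetical numbers, suppose \(P(\mbox{Tag}) = 0.2\), \(P(\mbox{Sentence}|\mbox{Tag}) = 0.05\) and \(P(\mbox{Sentence}) = 0.02\). Then

\[P(\mbox{Tag}|\mbox{Sentence}) = 0.2\times\frac{0.05}{0.02} = 0.5\]

so the tag would apply to that sentence with posterior probability 0.5.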

2.1 Bayes’ Theorem for Naive Bayes Algorithm

The basic idea of how the Naive Bayes algorithm is used for a machine learning classification problem is as follows. Suppose that a business problem has multiple classes, say \(C_1, C_2, \ldots, C_h\). The Naive Bayes algorithm is used to compute the conditional probability that an object with feature vector \((x_1, x_2,\ldots, x_m)\) belongs to a particular class \(C_i\):

\[\displaystyle P(C_i|x_1, x_2,\ldots, x_m)=\frac{P(x_1, x_2,\ldots, x_m|C_i)\cdot P(C_i)}{P(x_1, x_2,\ldots, x_m)}\]
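Before applying the Naive Bayes assumption, note that the class-conditional likelihood in the numerator can always be expanded by the chain rule of probability:

\[P(x_1, x_2,\ldots, x_m|C_i)=\prod_{j=1}^{m}P(x_j|x_{j+1},\ldots, x_m, C_i)\]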

The main assumption of the Naive Bayes algorithm is that the features are mutually independent given the class. Under this assumption, each conditional term \(P(x_j|x_{j+1},\ldots, x_m, C_i)\) in the expansion above reduces to \(P(x_j|C_i)\). Then

\[\displaystyle P(C_i|x_1, x_2,\ldots, x_m)=\left(\prod_{j=1}^{m}P(x_j|C_i)\right)\cdot\frac{P(C_i)}{P(x_1, x_2,\ldots, x_m)}\]

Because \(P(x_1, x_2,\ldots, x_m)\) acts as the same scaling constant for every class, the above expression simplifies to \[\displaystyle P(C_i|x_1, x_2,\ldots, x_m)\propto\left(\prod_{j=1}^{m}P(x_j|C_i)\right)\cdot P(C_i)\] for \(1\leq i\leq h\).
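In practice, an object is then assigned to the class \(C_i\) for which this product is largest. The sketch below is a minimal Python illustration of that rule, assuming a tiny hypothetical categorical data set and simple relative-frequency estimates of \(P(C_i)\) and \(P(x_j|C_i)\); it is not a production implementation (no smoothing, no log probabilities).

```python
from collections import Counter, defaultdict

def train_naive_bayes(X, y):
    """Estimate class priors P(C_i) and per-feature likelihoods P(x_j | C_i)
    from categorical data using relative-frequency counts (no smoothing)."""
    n = len(y)
    priors = {c: count / n for c, count in Counter(y).items()}
    # likelihoods[c][j][v] will hold P(x_j = v | C_i = c)
    likelihoods = defaultdict(lambda: defaultdict(Counter))
    for xs, c in zip(X, y):
        for j, v in enumerate(xs):
            likelihoods[c][j][v] += 1
    for c in likelihoods:
        for j in likelihoods[c]:
            total = sum(likelihoods[c][j].values())
            for v in likelihoods[c][j]:
                likelihoods[c][j][v] /= total
    return priors, likelihoods

def predict(priors, likelihoods, xs):
    """Return the class maximizing P(C_i) * prod_j P(x_j | C_i); the common
    denominator P(x_1, ..., x_m) is dropped, exactly as in the text."""
    best_class, best_score = None, -1.0
    for c, prior in priors.items():
        score = prior
        for j, v in enumerate(xs):
            score *= likelihoods[c][j].get(v, 0.0)
        if score > best_score:
            best_class, best_score = c, score
    return best_class

# Hypothetical toy data: each row is a feature vector (x_1, x_2) with its class label.
X = [("sunny", "hot"), ("sunny", "mild"), ("rainy", "mild"), ("rainy", "hot")]
y = ["no", "yes", "yes", "no"]

priors, likelihoods = train_naive_bayes(X, y)
print(predict(priors, likelihoods, ("sunny", "mild")))  # -> "yes"
```

With many features, multiplying many small probabilities can underflow; summing log-probabilities instead of multiplying raw probabilities is the usual remedy.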