Question 1: Spam Email

spam.main = read.csv("spam.csv", header=T)

Q1a: Removing some varibles

spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))

normalize = function(data) 
{
  return ((data - min(data)) / (max(data) - min(data))) 
}

#spam = as.data.frame(lapply(spam, normalize))

kable(head(spam, n = 5))

day.of.week	time.of.day	size.kb	box	local	name	special	credit	sucker	porn	chain	username	large.text	spam
Thu	0	7	no	no	name	1	no	no	no	no	no	no	no
Thu	0	2	no	no	name	5	no	no	no	no	no	no	yes
Thu	14	3	no	yes	name	2	no	no	no	yes	no	no	no
Thu	3	3	yes	no	name	0	no	no	no	no	no	no	no
Thu	3	4	no	no	name	2	no	no	no	no	no	no	no

Q1b: Spliting Data into Training and Test Data

set.seed(14023498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count) 

spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]

Training Dataset (Head)

kable(head(spam.train, n = 5))

	day.of.week	time.of.day	size.kb	box	local	name	special	credit	sucker	porn	chain	username	large.text	spam
1	Thu	0	7	no	no	name	1	no	no	no	no	no	no	no
3	Thu	14	3	no	yes	name	2	no	no	no	yes	no	no	no
4	Thu	3	3	yes	no	name	0	no	no	no	no	no	no	no
5	Thu	3	4	no	no	name	2	no	no	no	no	no	no	no
6	Thu	4	4	no	no	name	1	no	no	no	no	no	no	no

Test Dataset (Head)

kable(head(spam.test, n = 5))

	day.of.week	time.of.day	size.kb	box	local	digits	name	special	credit	sucker	porn	chain	username	large.text	spam
1551	Tue	10	4	no	no	0	name	0	no	no	no	no	no	yes	yes
2067	Fri	22	23	yes	no	6	name	3	yes	yes	no	no	yes	no	yes
106	Mon	23	3	no	no	1	name	1	no	no	no	no	no	no	no
1533	Wed	9	1	yes	yes	0	name	1	no	no	no	no	no	no	no
913	Wed	7	10	no	no	0	name	3	no	yes	no	no	yes	yes	yes

Q1c: Naive Bayes Classifier

test_data = spam.test
test_data$spam = as.factor(test_data$spam)
test_class = test_data[, 15]
train_data = spam.train
train_data$spam = as.factor(train_data$spam)
train_class = train_data[, 15]

set.seed(9234108)
nb.model = naiveBayes(spam ~., data = train_data)
pred = predict(nb.model, test_data)
error = 1 - sum(pred == test_class) / nrow(test_data)
acc = 1 - error
  
values = as.numeric(pred)-1
  
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)

Accuracy	Error	AUC
0.8986175	0.1013825	0.9023562

nb.pred = as.numeric(pred) - 1

Q1d: Decision Tree

spam.tree = rpart(spam ~., data = spam.train, method = "class")
fancyRpartPlot(spam.tree, sub = "Decision Tree")

pred = predict(spam.tree, spam.test, type = "class")
conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table

##           Reference
## Prediction  no yes
##        no  263  26
##        yes  18 127

acc = conf.matrix$overall[1]
error = 1 - acc
base.error = error

pred.2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)

Accuracy	Error	AUC
0.8986175	0.1013825	0.8830042

tree.pred = as.numeric(pred) - 1
#print(summary(spam.tree))
#spam.tree

Q1e: Random Forest

set.seed(3421)

rf.model = randomForest(train_data[, -15], as.factor(as.numeric(train_class)))
rf.model

## 
## Call:
##  randomForest(x = train_data[, -15], y = as.factor(as.numeric(train_class))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.62%
## Confusion matrix:
##      1   2 class.error
## 1 1132  48  0.04067797
## 2   67 490  0.12028725

pred = predict(rf.model, test_data[, -15])

conf.matrix = confusionMatrix(pred, as.factor(as.numeric(test_class)))
cf.mat.tab = conf.matrix$table
cf.mat.tab

##           Reference
## Prediction   1   2
##          1 268  15
##          2  13 138

acc = conf.matrix$overall[1]
error = 1 - acc
  
values = as.numeric(pred) - 1
  
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)

Accuracy	Error	AUC
0.9354839	0.0645161	0.9278487

Q1h: Bagging - Ensemble

set.seed(90871)

bag = bagging(spam ~., data = spam.train, mfinal = 100)
bag.Pred = predict.bagging(bag, spam.test, newmfinal = 100)
bag.Pred$confusion

##                Observed Class
## Predicted Class  no yes
##             no  261  17
##             yes  20 136

error.bag = bag.Pred$error

The error - using bagging - on the test data is 0.0852535. This shows improvement.
The error - using the base model - on the test data is 0.1013825

Q1i: Boosting - Ensemble

boost = boosting(spam ~., data = spam.train, mfinal = 100)
boost.Pred = predict.boosting(boost, spam.test, newmfinal = 100)
boost.Pred$confusion

##                Observed Class
## Predicted Class  no yes
##             no  265  16
##             yes  16 137

error.boost = boost.Pred$error

The error - using boosting - on the test data is 0.0737327.

Question 2: Fruit

fruit.main = read.csv("fruit.txt", header=T)
names(fruit.main) = c("Type", "Weight", "Height", "Width")
fruit = fruit.main[which(fruit.main$Type < 3), ]

Q2a: 75/25 Spliting & Removing Type Lemon

set.seed(7218)
n.row = nrow(fruit)
sample = sample(1:n.row, round(0.75 * n.row))
fruit.train = fruit[sample, ]
fruit.test = fruit[-sample, ]

cat("Training Data - Fruit (Head)")

## Training Data - Fruit (Head)

kable(head(fruit.train, n = 5), row.names = F)

Type	Weight	Height	Width
1	0	0	0
2	0	0	0
2	1	1	2
2	0	0	0
2	1	2	2

cat("\n", "Testing Data - Fruit (Head)")

## 
##  Testing Data - Fruit (Head)

kable(head(fruit.test, n = 5), row.names = F)

Type	Height	Width
1	1	0
1	1	0
1	1	1
1	1	0
1	1	1

Q2b: Probabilities using Laplace Smoothing - Custom Function

naive.B = function(train, test, class.name) 
{
  if(!is.data.frame(train))
    train = as.data.frame(train())
  if(!is.data.frame(test))
    test = as.data.frame(test)
  
  index = match(class.name, names(train))
  
  train.main = train
  train = as.data.frame(train.main[, -index])
  train.class = as.data.frame(train.main[, index])
  names(train.class) = class.name
  
  test.main = test
  test = as.data.frame(test.main[, -index])
  test.class = as.data.frame(test.main[, index])
  names(test.class) = class.name
  
  #prob of class distribution
  prob.train.class = as.data.frame(table(train.class))
  prob.train.class$prob = prob.train.class$Freq / sum(prob.train.class$Freq)
  n.train.class = nrow(prob.train.class)
  prob.train.class = as.data.frame(t(prob.train.class[, -(1:2)]), row.names = NULL)
  
  n.attr = ncol(train.main)
  attr.prob.tb = list(n.attr)
  v.attr = c(1:n.attr)
  v.attr = v.attr[-index]
  h = 1
  
  for(a in v.attr)
  {
    attr = as.data.frame(train.main[, c(index, a)])
    
    prob.attr = t((table(attr)))
    mat.attr = matrix(nrow = nrow(prob.attr), ncol = ncol(prob.attr))
    
    for(i in 1:ncol(prob.attr))
    {
      mat.attr[,i] = (prob.attr[,i] + 1) / (sum(prob.attr[,i]) + nrow(prob.attr)) #smoothing
    }
    
    attr.prob.tb[[h]] = mat.attr
    h = h+1
  }
  
  #predicting the class label of the testing data
  test.class$pred1 = NA
  test.class$pred2 = NA
  #test.class$pred3 = NA
  test.class$predFinal = NA
  
  for(t in 1:nrow(test)) #loopiong thr each row in the test
  {
    pred.class = rep(NA, n.train.class) #keeping the pred given the class
    
    for(y in 1:n.train.class) #looping thr each class --- Given
    {
      given.prob = prob.train.class[,y]
      
      prob.holder = rep(NA, ncol(test))
      
      for(s in 1:ncol(test)) #conditional prob --- column
      {
        prob.tb = attr.prob.tb[[s]]
        cell = test[t, s] + 1
        prob = prob.tb[cell, y]
        prob.holder[s] = prob
      }
      
      pred = round(prod(prob.holder) * given.prob, 4)
      test.class[t, y+1] = pred
      pred.class[y] = pred
    }
    
    max.pred = max(pred.class)
    index.max = match(max.pred, pred.class)
    
    test.class$predFinal[t] = index.max
  }
  
  pd = test.class[,-(2:3)]
  eval = table(pd[,2], pd[,1])
  cm = confusionMatrix(eval)
  pram.eval = as.data.frame(cm$byClass)
  names(pram.eval) = c("evals")
  sens = pram.eval$evals[1]
  spec = pram.eval$evals[2]
  
  return(list("test.pred" = test.class, "accuracy" = mean(test.class[,1] == test.class[, ncol(test.class)]), 
  "prob.given.class" = attr.prob.tb, "prob.class" = prob.train.class, Sensitivity = sens,
  Specificity = spec))
  
}

result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")

cat("Probability of the class label")

## Probability of the class label

c.p = result$prob.class
names(c.p) = c("prob_apples", "pbob_oranges")
row.names(c.p) = ""
c.p

##  prob_apples pbob_oranges
##    0.4642857    0.5357143

cat("Probability of weight given the class label")

## Probability of weight given the class label

p.w.given.cl = as.data.frame(result$prob.given.class[[1]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1")
p.w.given.cl

##        given_apples given_oranges
## prob_0    0.8666667     0.5882353
## prob_1    0.1333333     0.4117647

cat("Probability of height given the class label")

## Probability of height given the class label

p.w.given.cl = as.data.frame(result$prob.given.class[[2]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl

##        given_apples given_oranges
## prob_0       0.4375     0.5000000
## prob_1       0.5000     0.2777778
## prob_2       0.0625     0.2222222

cat("Probability of width given the class label")

## Probability of width given the class label

p.w.given.cl = as.data.frame(result$prob.given.class[[3]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl

##        given_apples given_oranges
## prob_0        0.500     0.2777778
## prob_1        0.375     0.2777778
## prob_2        0.125     0.4444444

Q2c: Evaluating Test Data using Custom Function

class.label = result$test.pred
names(class.label) = c("Actual_Class", "prob|apples",   "prob|orangles",    "Pred_Class")

datatable(class.label, rownames = F, width = 500)

cat("Accuracy: ", result$accuracy)

## Accuracy:  0.6666667

Q2d: Evaluating Test Data using Built-in Function

test_data = fruit.test
test_data$Type = as.factor(test_data$Type)
test_class = test_data[, 1]
train_data = fruit.train
train_data$Type = as.factor(train_data$Type)
train_class = train_data[, 1]

nb.model = naiveBayes(train_data[,-1], train_data[,1], laplace = 1)

nb.test.fit = predict(nb.model, test_data[,-1])
cat("Predicted class using the built-in function")

## Predicted class using the built-in function

nb.test.fit

## [1] 1 1 1 1 1 1 1 1 2
## Levels: 1 2

e.acc = sum(nb.test.fit==test_data[,1]) / nrow(test_data)
cat("Accuracy: ", e.acc)

## Accuracy:  0.6666667

#t = table(nb.test.fit, test_data[,1])
#confusionMatrix(t)

These are same with the results in (2d) above.

Q2e: Repeating Evaluation Using a 75/25 Split 10 times

set.seed(2310621)
r.eval = data.frame(accuracy = rep(NA,10), sensitivity = rep(NA,10), specificity = rep(NA,10))

for(j in 1:10)
{
  sample = sample(1:n.row, round(0.75 * n.row))
  fruit.train = fruit[sample, ]
  fruit.test = fruit[-sample, ]
  
  #custom function
  result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")
  r.eval$accuracy[j] = result$accuracy
  r.eval$sensitivity[j] = result$Sensitivity
  r.eval$specificity[j] = result$Specificity
}

kable(r.eval)

accuracy	sensitivity	specificity
0.7777778	1.0000000	0.50
0.6666667	0.7500000	0.60
0.6666667	0.8000000	0.50
0.5555556	1.0000000	0.20
0.5555556	0.4285714	1.00
0.5555556	0.6666667	0.50
0.6666667	0.5000000	1.00
0.6666667	0.6000000	0.75
0.6666667	1.0000000	0.25
0.5555556	0.8000000	0.25

r.eval.mean = data.frame(Mean_accuracy = mean(r.eval$accuracy), 
  Mean_sensitivity = mean(r.eval$sensitivity), Mean_specificity = mean(r.eval$specificity))

kable(r.eval.mean, row.names = F)

Mean_accuracy	Mean_sensitivity	Mean_specificity
0.6333333	0.7545238	0.555

Question 3: Presidential

Q3a Load Addresses

State of the Union Set

#Source File of Addresses
dsrc <- DirSource('C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/')
getwd()

## [1] "C:/Users/2PAC/Documents/R/Data_Mining/project_4"

#Create Corpus of 'Documents' 
pages <- Corpus(dsrc, readerControl=list(language="en", stopwords=FALSE, wordLengths=c(0,Inf)))

Vector with Party Affiliation of Every President (Class Labels)

State of the Union Set

party <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt", header=FALSE)
names(party) = c("Party","President","Year")
#Class Label
C <- c("r","d")
C <- as.vector(C)

Q3b Remove Stop Words From Corpus

#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="",
                      header=FALSE)
names(stopwords) = c("Stop.Words")

#Remove Stop Words 
docs <- tm_map(pages, removeWords, stopwords$Stop.Words)

Q3c Predict Party Affiliations for Speeches

i. Term Document Matrix

State of the Union Set

#Term Document Matrix (Stopwords already removed)
dtm = DocumentTermMatrix(docs, control=list(tolower=TRUE,  removePunctuation=TRUE, removeNumbers=TRUE, 
    stopwords=FALSE, wordLengths=c(1,Inf)))

TD = as.matrix(dtm)
TD = as.data.frame(TD)

#Remove rows containing test set data

# Map Files to Party Data
files <- vector(mode="character", length=0)
for(i in 1:nrow(party)){
  files[i] <- paste("a",i,".txt",sep="")
}
party$File <- files 

# Training and Test Set
party.test <- party[c(175,209,220,228,231),]
party.train <- party[-c(175,209,220,228,231),]

# Only recent speeches with Democrat vs. Republican , Remove all other parties
party.train <- subset(party.train, Party == 'r' | Party == 'd')
party.train <- party.train[order(party.train$File),] #Same order as TD.train

#Term Document of Train Data
TD.train <- TD[which(rownames(TD) %in% party.train$File),] 
TD.train <- as.data.frame(TD.train)
party.vector <- as.vector(party.train$Party)
TD.train$Party <- party.vector

#Term Document of Test Data
TD.test <- TD[which(rownames(TD) %in% party.test$File),]
TD.test <- as.data.frame(TD.test)

ii. The Bernoulli Naive Bayes Model

Train Bernoulli

State of the Union Set

# V <- extract vocabulary
V <- colnames(TD.train[,-length(colnames(TD.train))])

#Return Variables
p.c <- vector(mode="numeric", length=0) #Vector of priors
cp.tc <- matrix(nrow = length(V), ncol = 2) #Vector of conditional probabilities

# N <- count total number of docs
N <- nrow(TD.train)
N <- as.numeric(N)

if(is.data.frame(TD.train)==FALSE){
  TD.train <- as.data.frame(TD.train) # (for ease of calculation)
}

# for each c in C
for(i in 1:length(C)){
  
  # do Nc <- count docs in class
  TD.temp <- TD.train[which(TD.train$Party == C[i]),]
  Nc <- nrow(TD.temp)
  Nc <- as.numeric(Nc)
  names <- row.names(TD.temp)
  
  TD.temp <- as.data.frame(TD.temp) #set as data frame for easy calculation
  TD.temp$Party <- NULL #remove class label column for easy calculation
  TD.temp <- sapply(TD.temp, function(x) as.numeric(as.character(x))) #numeric for calc
  row.names(TD.temp) <- names
  
  # prior[c] <- Nc/N
  prior.c <- Nc/N
  p.c[i] <- prior.c
  
  # for each t in V
  for(j in 1:ncol(TD.temp)){
    Nct <- 0  
    # do Nct <- count docs in class containing t
    
    if(Nc == 1)
    {
      if(TD.temp[j]>=1){
        Nct <- Nct + 1
      }
      else{
        Nct <- Nct + 0
      }
    }
    else{
      for(k in 1:Nc){

        if(TD.temp[k,j]>=1){
          Nct <- Nct + 1
        }
        else{
          Nct <- Nct + 0
        }
    }
    }
    
    #condprob[t][c] <- (Nct + 1)/(Nc + 2)
    condprob.tc <- (Nct+1)/(Nc+2)
    cp.tc[j,i] <- condprob.tc
  }
}
colnames(cp.tc) <- C
row.names(cp.tc) <- V

#return values
cat("Head of Return Values")

## Head of Return Values

print(head(V))

## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"

print(head(p.c))

## [1] 0.494382 0.505618

print(head(cp.tc))

##                        r          d
## accession      0.1111111 0.08695652
## add            0.4777778 0.55434783
## adequate       0.6222222 0.67391304
## administration 0.8222222 0.70652174
## admitted       0.2333333 0.36956522
## adopted        0.5777778 0.52173913

Apply Bernoulli

State of the Union Set

set.seed(191919)
scores.by.doc <- rep(NA, 5)
classifier <- rep(NA, 5)

d <- nrow(TD.test)
d <- as.numeric(d)

for(doc in 1:d){                              
  doc <- as.numeric(doc)
  TD.test.d <- TD.test[doc,]
  TD.test.d <- as.data.frame(TD.test.d)
  
  index <- which(TD.test.d >= 1)
  Vd <- colnames(TD.test.d)[index]
  
  score.c <- rep(NA, 2)

  for(ii in 1:2)
  {
    sm = 1
    score.mul = rep(NA, length(V) + 1)
    score.mul[sm] <- log(p.c[ii])
    
    for(jj in 1:length(V))
    {
      sm = sm + 1
      
      if(V[jj] %in% Vd)
      {
        score.mul[sm] <- log(cp.tc[jj,ii])
      }
      else
      {
        score.mul[sm] <- log((1-cp.tc[jj,ii]))
      }
    }
    
    score.c[ii] = sum(score.mul)
    
  }
  scores.by.doc[doc] <- max(score.c)
  classifier[doc] <- C[which.is.max(score.c)]
}

paste("FINAL RESULT")

## [1] "FINAL RESULT"

f.dt.o = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"), 
  president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Bernoulli

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          d      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

iii. The Multinomial Naive Bayes Model

Train Multinomial

State of the Union Set

TD.train <- as.data.frame(TD.train)  
  
# prior[c] 
prior.c <- rep(NA,2)

# V <- extract vocabulary
V <- colnames(TD.train[,-c(ncol(TD.train))])

# P(t|c)
condprob.tc <-matrix(nrow = length(V), ncol = 2)

# N <- count documents
N <- nrow(TD.train)

# for each c in C
for(c in 1:length(C))
{
  N.c <- sum(TD.train$Party == C[c])
  prior.c[c] <- N.c/N
  
  TD.temporary <- TD.train[which(TD.train$Party == C[c]),] # Current condition
  TD.temporary <- TD.temporary[,-c(ncol(TD.temporary))]
  #Term Document Matrix without class labels, for ease of calculation
  
  # for each t in V
  # count number of tokens of t
  column.sums <- as.data.frame(colSums(TD.temporary)) #Number of tokens for each term
  colnames(column.sums) <- c("num.tokens")
  
  num.text <- sum(column.sums$num.tokens) #length of entire texts
  B <- length(V) # Number of terms in vocabulary
  
  # for each t in V
  for(t in 1:B)
  {
    condprob.tc[t,c] <- (column.sums[t,1]+1)/(num.text + B)
  }
}

colnames(condprob.tc) <- C
rownames(condprob.tc) <- V

#return values
cat("Head of Return Values")

## Head of Return Values

print(head(V))

## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"

print(head(prior.c))

## [1] 0.494382 0.505618

print(head(condprob.tc))

##                           r            d
## accession      3.190514e-05 2.600029e-05
## add            1.769285e-04 2.426694e-04
## adequate       5.771930e-04 5.026723e-04
## administration 1.247201e-03 1.155568e-03
## admitted       1.102178e-04 2.397804e-04
## adopted        5.249846e-04 4.535606e-04

Apply Multinomial

State of the Union Set

set.seed(292929)
scores.by.doc <- rep(NA, nrow(TD.test))
classifier <- rep(NA, nrow(TD.test))

for(doc in 1:nrow(TD.test))
{
  #One Document at a time
  TD.test.doc <- TD.test[doc, ]
  TD.test.doc <- as.data.frame(TD.test.doc)
  #colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
  #TD.test.doc$terms <- rownames(TD.test.doc)
  condprob.tc <- as.data.frame(condprob.tc)
  condprob.tc$terms <- rownames(condprob.tc)
  
  #Tokens from Test Document
  index <- which(TD.test.doc >= 1)
  W <- colnames(TD.test.doc)[index]
  
  #Conditional Probabilities of Tokens Only
  condprob.tc.tokens <- condprob.tc[which(condprob.tc$terms %in% W),]
  
  #Individual Scores
  score.c <- rep(NA, 2)
  
  for(cond in 1:length(C))
  {
    sm = 1
    score.mul = rep(NA, length(W) + 1)
    score.mul[sm] <- log(prior.c[cond])
    
    for(w in 1:length(W))
    {
      sm = sm + 1
      score.mul[sm] <- log((condprob.tc.tokens[w,cond])^(TD.test.doc[1,w]))
    }
    
    score.c[cond] = sum(score.mul)
  }
  
  scores.by.doc[doc] <- max(score.c)
  classifier[doc] <- C[which.is.max(score.c)]
}

f.dt = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"), 
                  president = c("Kennedy","Clinton","Bush","Obama","Trump"))

#print(scores.by.doc)
#print(classifier)

FINAL RESULT for Multinomial

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

Q3d Bonus

#Source File of Addresses
dsrc.bonus <- DirSource("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/")

#Create Corpus of 'Documents' 
pages.bonus  <- Corpus(dsrc.bonus , readerControl=list(language="en", stopwords=FALSE,
                                                       wordLengths=c(0,Inf)))

party.bonus  <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt",
                         header=FALSE)
names(party.bonus ) = c("Party","President","Year")

#Class Label
C.bonus  <- c("r","d")
C.bonus  <- as.vector(C.bonus)

#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="", 
                      header=FALSE)
names(stopwords) = c("Stop.Words")

#Remove Stop Words 
docs.bonus  <- tm_map(pages.bonus, removeWords, stopwords$Stop.Words)

#Term Document Matrix (Stopwords already removed)
dtm.bonus  = DocumentTermMatrix(docs.bonus, control=list(tolower=TRUE, removePunctuation=TRUE,
  removeNumbers=TRUE, stopwords=FALSE, wordLengths=c(1,Inf)))

TD.bonus  = as.matrix(dtm.bonus)
TD.bonus  = as.data.frame(TD.bonus)

#Remove rows containing test set data

# Map Files to Party Data
files.bonus  <- vector(mode="character", length=0)
for(i in 1:nrow(party.bonus)){
  files.bonus [i] <- paste("a",i,".txt",sep="")
}
party.bonus$File <- files.bonus  

# Training and Test Set
party.test.bonus  <- party.bonus[c(175,209,220,228,231),]
party.train.bonus  <- party.bonus[-c(175,209,220,228,231),]

# Only Speeches since Year 1913
party.train.bonus  <- subset(party.train.bonus , Year >= 1913)
party.train.bonus  <- party.train.bonus [order(party.train.bonus$File),] #Same order as TD.train

#Term Document of Train Data
TD.train.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.train.bonus$File),] 
TD.train.bonus <- as.data.frame(TD.train.bonus)
party.vector.bonus <- as.vector(party.train.bonus$Party)
TD.train.bonus$Party <- party.vector.bonus

#Term Document of Test Data
TD.test.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.test.bonus$File),]
TD.test.bonus <- as.data.frame(TD.test.bonus)

Train Bernoulli

# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-length(colnames(TD.train.bonus))])

#Return Variables
p.c.bonus <- vector(mode="numeric", length=0) #Vector of priors
cp.tc.bonus <- matrix(nrow = length(V.bonus), ncol = 2) #Vector of conditional probabilities

# N <- count total number of docs
N.bonus <- nrow(TD.train.bonus)
N.bonus <- as.numeric(N.bonus)

if(is.data.frame(TD.train.bonus)==FALSE){
  TD.train.bonus <- as.data.frame(TD.train.bonus) # (for ease of calculation)
}

# for each c in C
for(i in 1:length(C.bonus)){
  
  # do Nc <- count docs in class
  TD.temp.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[i]),]
  Nc.bonus <- nrow(TD.temp.bonus)
  Nc.bonus <- as.numeric(Nc.bonus)
  names.bonus <- row.names(TD.temp.bonus)
  
  TD.temp.bonus <- as.data.frame(TD.temp.bonus) #set as data frame for easy calculation
  TD.temp.bonus$Party <- NULL #remove class label column for easy calculation
  TD.temp.bonus <- sapply(TD.temp.bonus, function(x) as.numeric(as.character(x))) #numeric for calc
  row.names(TD.temp.bonus) <- names.bonus
  
  # prior[c] <- Nc/N
  prior.c.bonus <- Nc.bonus/N.bonus
  p.c.bonus[i] <- prior.c.bonus
  
  # for each t in V
  for(j in 1:ncol(TD.temp.bonus)){
    Nct.bonus <- 0  
    # do Nct <- count docs in class containing t
    
    if(Nc.bonus == 1)
    {
      if(TD.temp.bonus[j]>=1){
        Nct.bonus <- Nct.bonus + 1
      }
      else{
        Nct.bonus <- Nct.bonus + 0
      }
    }
    else{
      for(k in 1:Nc.bonus){

        if(TD.temp.bonus[k,j]>=1){
          Nct.bonus <- Nct.bonus + 1
        }
        else{
          Nct.bonus <- Nct.bonus + 0
        }
    }
    }
    
    #condprob[t][c] <- (Nct + 1)/(Nc + 2)
    condprob.tc.bonus <- (Nct.bonus+1)/(Nc.bonus+2)
    cp.tc.bonus[j,i] <- condprob.tc.bonus
  }
}
colnames(cp.tc.bonus) <- C.bonus
row.names(cp.tc.bonus) <- V.bonus

#return values
cat("Head of Return Values")

## Head of Return Values

print(head(V.bonus))

## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"

print(head(p.c.bonus))

## [1] 0.4705882 0.5294118

print(head(cp.tc.bonus))

##                   r          d
## accession      0.04 0.01785714
## add            0.36 0.44642857
## adequate       0.52 0.53571429
## administration 0.76 0.55357143
## admitted       0.12 0.08928571
## adopted        0.30 0.23214286

Apply Bernoulli

set.seed(393939)
scores.by.doc.bonus <- rep(NA, 5)
classifier.bonus <- rep(NA, 5)

d.bonus <- nrow(TD.test.bonus)
d.bonus <- as.numeric(d.bonus)

for(doc in 1:d.bonus){                              
  doc <- as.numeric(doc)
  TD.test.d.bonus <- TD.test.bonus[doc,]
  TD.test.d.bonus <- as.data.frame(TD.test.d.bonus)
  
  index.bonus <- which(TD.test.d.bonus >= 1)
  Vd.bonus <- colnames(TD.test.d.bonus)[index.bonus]
  
  score.c.bonus <- rep(NA, 2)

  for(ii in 1:2)
  {
    sm.bonus = 1
    score.mul.bonus = rep(NA, length(V.bonus) + 1)
    score.mul.bonus[sm.bonus] <- log(p.c.bonus[ii])
    
    for(jj in 1:length(V.bonus))
    {
      sm.bonus = sm.bonus + 1
      
      if(V.bonus[jj] %in% Vd.bonus)
      {
        score.mul.bonus[sm.bonus] <- log(cp.tc.bonus[jj,ii])
      }
      else
      {
        score.mul.bonus[sm.bonus] <- log((1-cp.tc.bonus[jj,ii]))
      }
    }
    
    score.c.bonus[ii] = sum(score.mul.bonus)
    
  }
  scores.by.doc.bonus[doc] <- max(score.c.bonus)
  classifier.bonus[doc] <- C[which.is.max(score.c.bonus)]
}

f.dt.o.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"), 
                          president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Bernoulli (Bonus)

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

The results did not change from part c).

Train Multnomial

TD.train.bonus <- as.data.frame(TD.train.bonus)  
  
# prior[c] 
prior.c.bonus <- rep(NA,2)

# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-c(ncol(TD.train.bonus))])

# P(t|c)
condprob.tc.bonus <-matrix(nrow = length(V.bonus), ncol = 2)

# N <- count documents
N.bonus <- nrow(TD.train.bonus)

# for each c in C
for(c in 1:length(C.bonus))
{
  N.c.bonus <- sum(TD.train.bonus$Party == C.bonus[c])
  prior.c.bonus[c] <- N.c.bonus/N.bonus
  
  TD.temporary.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[c]),] # Current condition
  TD.temporary.bonus <- TD.temporary.bonus[,-c(ncol(TD.temporary.bonus))] 
  #Term Document Matrix without class labels, for ease of calculation
  
  # for each t in V
  # count number of tokens of t
  column.sums.bonus <- as.data.frame(colSums(TD.temporary.bonus)) #Number of tokens for each term
  colnames(column.sums.bonus) <- c("num.tokens")
  
  num.text.bonus <- sum(column.sums.bonus$num.tokens) #length of entire texts
  B.bonus <- length(V.bonus) # Number of terms in vocabulary
  
  # for each t in V
  for(t in 1:B.bonus)
  {
    condprob.tc.bonus[t,c] <- (column.sums.bonus[t,1]+1)/(num.text.bonus + B.bonus)
  }
}
colnames(condprob.tc.bonus) <- C.bonus
rownames(condprob.tc.bonus) <- V.bonus

#return values
cat("Head of Return Values")

## Head of Return Values

print(head(V.bonus))

## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"

print(head(prior.c.bonus))

## [1] 0.4705882 0.5294118

print(head(condprob.tc.bonus))

##                           r            d
## accession      1.411313e-05 6.037371e-06
## add            1.764141e-04 2.113080e-04
## adequate       5.221858e-04 6.097745e-04
## administration 1.305465e-03 1.515380e-03
## admitted       4.233939e-05 3.018686e-05
## adopted        2.963757e-04 1.328222e-04

Apply Multinomial

set.seed(494949)
scores.by.doc.bonus <- rep(NA, nrow(TD.test.bonus))
classifier.bonus <- rep(NA, nrow(TD.test.bonus))

for(doc in 1:nrow(TD.test.bonus))
{
  #One Document at a time
  TD.test.doc.bonus <- TD.test.bonus[doc, ]
  TD.test.doc.bonus <- as.data.frame(TD.test.doc.bonus)
  #colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
  #TD.test.doc$terms <- rownames(TD.test.doc)
  condprob.tc.bonus <- as.data.frame(condprob.tc.bonus)
  condprob.tc.bonus$terms <- rownames(condprob.tc.bonus)
  
  #Tokens from Test Document
  index.bonus <- which(TD.test.doc.bonus >= 1)
  W.bonus <- colnames(TD.test.doc.bonus)[index.bonus]
  
  #Conditional Probabilities of Tokens Only
  condprob.tc.tokens.bonus <- condprob.tc.bonus[which(condprob.tc.bonus$terms %in% W.bonus),]
  
  #Individual Scores
  score.c.bonus <- rep(NA, 2)
  
  for(cond in 1:length(C.bonus))
  {
    sm.bonus = 1
    score.mul.bonus = rep(NA, length(W.bonus) + 1)
    score.mul.bonus[sm.bonus] <- log(prior.c.bonus[cond])
    
    for(w in 1:length(W.bonus))
    {
      sm.bonus = sm.bonus + 1
      score.mul.bonus[sm.bonus] <- log((condprob.tc.tokens.bonus[w,cond])^(TD.test.doc.bonus[1,w]))
    }
    
    score.c.bonus[cond] = sum(score.mul.bonus)
  }
  
  scores.by.doc.bonus[doc] <- max(score.c.bonus)
  classifier.bonus[doc] <- C.bonus[which.is.max(score.c.bonus)]
}

f.dt.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"), 
                        president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Multinomial (Bonus)

##   pred_Label actual president
## 1          r      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

The results differ (for JFK) from the set in part c) because the definitions of democrat and republican changed over the years. In the past the ideals of democrats were more republican and the ideals of republicans were more democratic. JFK is the closest data point to the change in party affiliation, so his result are effected the most.

Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-13

Question 1: Spam Email

Q1a: Removing some varibles

Q1b: Spliting Data into Training and Test Data

Training Dataset (Head)

Test Dataset (Head)

Q1c: Naive Bayes Classifier

Q1d: Decision Tree

Q1e: Random Forest

Q1h: Bagging - Ensemble

Q1i: Boosting - Ensemble

Question 2: Fruit

Q2a: 75/25 Spliting & Removing Type Lemon

Q2b: Probabilities using Laplace Smoothing - Custom Function

Q2c: Evaluating Test Data using Custom Function

Q2d: Evaluating Test Data using Built-in Function

Q2e: Repeating Evaluation Using a 75/25 Split 10 times

Question 3: Presidential

Q3a Load Addresses

Q3b Remove Stop Words From Corpus

Q3c Predict Party Affiliations for Speeches

i. Term Document Matrix

ii. The Bernoulli Naive Bayes Model

Train Bernoulli

Apply Bernoulli

iii. The Multinomial Naive Bayes Model

Train Multinomial

Apply Multinomial

Q3d Bonus