Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-13

Question 1: Spam Email

spam.main = read.csv("spam.csv", header=T)


Q1a: Removing some varibles

spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))

normalize = function(data) 
{
  return ((data - min(data)) / (max(data) - min(data))) 
}

#spam = as.data.frame(lapply(spam, normalize))

kable(head(spam, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
Thu 0 7 no no 0 name 1 no no no no no no no
Thu 0 2 no no 0 name 5 no no no no no no yes
Thu 14 3 no yes 0 name 2 no no no yes no no no
Thu 3 3 yes no 0 name 0 no no no no no no no
Thu 3 4 no no 0 name 2 no no no no no no no


Q1b: Spliting Data into Training and Test Data

set.seed(14023498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count) 

spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]


Training Dataset (Head)

kable(head(spam.train, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
1 Thu 0 7 no no 0 name 1 no no no no no no no
3 Thu 14 3 no yes 0 name 2 no no no yes no no no
4 Thu 3 3 yes no 0 name 0 no no no no no no no
5 Thu 3 4 no no 0 name 2 no no no no no no no
6 Thu 4 4 no no 0 name 1 no no no no no no no


Test Dataset (Head)

kable(head(spam.test, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
1551 Tue 10 4 no no 0 name 0 no no no no no yes yes
2067 Fri 22 23 yes no 6 name 3 yes yes no no yes no yes
106 Mon 23 3 no no 1 name 1 no no no no no no no
1533 Wed 9 1 yes yes 0 name 1 no no no no no no no
913 Wed 7 10 no no 0 name 3 no yes no no yes yes yes


Q1c: Naive Bayes Classifier

test_data = spam.test
test_data$spam = as.factor(test_data$spam)
test_class = test_data[, 15]
train_data = spam.train
train_data$spam = as.factor(train_data$spam)
train_class = train_data[, 15]

set.seed(9234108)
nb.model = naiveBayes(spam ~., data = train_data)
pred = predict(nb.model, test_data)
error = 1 - sum(pred == test_class) / nrow(test_data)
acc = 1 - error
  
values = as.numeric(pred)-1
  
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
Accuracy Error AUC
0.8986175 0.1013825 0.9023562
nb.pred = as.numeric(pred) - 1


Q1d: Decision Tree

spam.tree = rpart(spam ~., data = spam.train, method = "class")
fancyRpartPlot(spam.tree, sub = "Decision Tree")

pred = predict(spam.tree, spam.test, type = "class")
conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
##           Reference
## Prediction  no yes
##        no  263  26
##        yes  18 127
acc = conf.matrix$overall[1]
error = 1 - acc
base.error = error

pred.2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
Accuracy Error AUC
0.8986175 0.1013825 0.8830042
tree.pred = as.numeric(pred) - 1
#print(summary(spam.tree))
#spam.tree


Q1e: Random Forest

set.seed(3421)

rf.model = randomForest(train_data[, -15], as.factor(as.numeric(train_class)))
rf.model
## 
## Call:
##  randomForest(x = train_data[, -15], y = as.factor(as.numeric(train_class))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.62%
## Confusion matrix:
##      1   2 class.error
## 1 1132  48  0.04067797
## 2   67 490  0.12028725
pred = predict(rf.model, test_data[, -15])

conf.matrix = confusionMatrix(pred, as.factor(as.numeric(test_class)))
cf.mat.tab = conf.matrix$table
cf.mat.tab
##           Reference
## Prediction   1   2
##          1 268  15
##          2  13 138
acc = conf.matrix$overall[1]
error = 1 - acc
  
values = as.numeric(pred) - 1
  
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
Accuracy Error AUC
0.9354839 0.0645161 0.9278487


Q1h: Bagging - Ensemble

set.seed(90871)

bag = bagging(spam ~., data = spam.train, mfinal = 100)
bag.Pred = predict.bagging(bag, spam.test, newmfinal = 100)
bag.Pred$confusion
##                Observed Class
## Predicted Class  no yes
##             no  261  17
##             yes  20 136
error.bag = bag.Pred$error

The error - using bagging - on the test data is 0.0852535. This shows improvement.
The error - using the base model - on the test data is 0.1013825


Q1i: Boosting - Ensemble

boost = boosting(spam ~., data = spam.train, mfinal = 100)
boost.Pred = predict.boosting(boost, spam.test, newmfinal = 100)
boost.Pred$confusion
##                Observed Class
## Predicted Class  no yes
##             no  265  16
##             yes  16 137
error.boost = boost.Pred$error

The error - using boosting - on the test data is 0.0737327.


Question 2: Fruit

fruit.main = read.csv("fruit.txt", header=T)
names(fruit.main) = c("Type", "Weight", "Height", "Width")
fruit = fruit.main[which(fruit.main$Type < 3), ]


Q2a: 75/25 Spliting & Removing Type Lemon

set.seed(7218)
n.row = nrow(fruit)
sample = sample(1:n.row, round(0.75 * n.row))
fruit.train = fruit[sample, ]
fruit.test = fruit[-sample, ]

cat("Training Data - Fruit (Head)")
## Training Data - Fruit (Head)
kable(head(fruit.train, n = 5), row.names = F)
Type Weight Height Width
1 0 0 0
2 0 0 0
2 1 1 2
2 0 0 0
2 1 2 2
cat("\n", "Testing Data - Fruit (Head)")
## 
##  Testing Data - Fruit (Head)
kable(head(fruit.test, n = 5), row.names = F)
Type Weight Height Width
1 0 1 0
1 0 1 0
1 0 1 1
1 0 1 0
1 0 1 1


Q2b: Probabilities using Laplace Smoothing - Custom Function

naive.B = function(train, test, class.name) 
{
  if(!is.data.frame(train))
    train = as.data.frame(train())
  if(!is.data.frame(test))
    test = as.data.frame(test)
  
  index = match(class.name, names(train))
  
  train.main = train
  train = as.data.frame(train.main[, -index])
  train.class = as.data.frame(train.main[, index])
  names(train.class) = class.name
  
  test.main = test
  test = as.data.frame(test.main[, -index])
  test.class = as.data.frame(test.main[, index])
  names(test.class) = class.name
  
  #prob of class distribution
  prob.train.class = as.data.frame(table(train.class))
  prob.train.class$prob = prob.train.class$Freq / sum(prob.train.class$Freq)
  n.train.class = nrow(prob.train.class)
  prob.train.class = as.data.frame(t(prob.train.class[, -(1:2)]), row.names = NULL)
  
  n.attr = ncol(train.main)
  attr.prob.tb = list(n.attr)
  v.attr = c(1:n.attr)
  v.attr = v.attr[-index]
  h = 1
  
  for(a in v.attr)
  {
    attr = as.data.frame(train.main[, c(index, a)])
    
    prob.attr = t((table(attr)))
    mat.attr = matrix(nrow = nrow(prob.attr), ncol = ncol(prob.attr))
    
    for(i in 1:ncol(prob.attr))
    {
      mat.attr[,i] = (prob.attr[,i] + 1) / (sum(prob.attr[,i]) + nrow(prob.attr)) #smoothing
    }
    
    attr.prob.tb[[h]] = mat.attr
    h = h+1
  }
  
  #predicting the class label of the testing data
  test.class$pred1 = NA
  test.class$pred2 = NA
  #test.class$pred3 = NA
  test.class$predFinal = NA
  
  for(t in 1:nrow(test)) #loopiong thr each row in the test
  {
    pred.class = rep(NA, n.train.class) #keeping the pred given the class
    
    for(y in 1:n.train.class) #looping thr each class --- Given
    {
      given.prob = prob.train.class[,y]
      
      prob.holder = rep(NA, ncol(test))
      
      for(s in 1:ncol(test)) #conditional prob --- column
      {
        prob.tb = attr.prob.tb[[s]]
        cell = test[t, s] + 1
        prob = prob.tb[cell, y]
        prob.holder[s] = prob
      }
      
      pred = round(prod(prob.holder) * given.prob, 4)
      test.class[t, y+1] = pred
      pred.class[y] = pred
    }
    
    max.pred = max(pred.class)
    index.max = match(max.pred, pred.class)
    
    test.class$predFinal[t] = index.max
  }
  
  pd = test.class[,-(2:3)]
  eval = table(pd[,2], pd[,1])
  cm = confusionMatrix(eval)
  pram.eval = as.data.frame(cm$byClass)
  names(pram.eval) = c("evals")
  sens = pram.eval$evals[1]
  spec = pram.eval$evals[2]
  
  return(list("test.pred" = test.class, "accuracy" = mean(test.class[,1] == test.class[, ncol(test.class)]), 
  "prob.given.class" = attr.prob.tb, "prob.class" = prob.train.class, Sensitivity = sens,
  Specificity = spec))
  
}

result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")


cat("Probability of the class label")
## Probability of the class label
c.p = result$prob.class
names(c.p) = c("prob_apples", "pbob_oranges")
row.names(c.p) = ""
c.p
##  prob_apples pbob_oranges
##    0.4642857    0.5357143
cat("Probability of weight given the class label")
## Probability of weight given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[1]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1")
p.w.given.cl
##        given_apples given_oranges
## prob_0    0.8666667     0.5882353
## prob_1    0.1333333     0.4117647
cat("Probability of height given the class label")
## Probability of height given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[2]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl
##        given_apples given_oranges
## prob_0       0.4375     0.5000000
## prob_1       0.5000     0.2777778
## prob_2       0.0625     0.2222222
cat("Probability of width given the class label")
## Probability of width given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[3]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl
##        given_apples given_oranges
## prob_0        0.500     0.2777778
## prob_1        0.375     0.2777778
## prob_2        0.125     0.4444444


Q2c: Evaluating Test Data using Custom Function

class.label = result$test.pred
names(class.label) = c("Actual_Class", "prob|apples",   "prob|orangles",    "Pred_Class")

datatable(class.label, rownames = F, width = 500)
cat("Accuracy: ", result$accuracy)
## Accuracy:  0.6666667


Q2d: Evaluating Test Data using Built-in Function

test_data = fruit.test
test_data$Type = as.factor(test_data$Type)
test_class = test_data[, 1]
train_data = fruit.train
train_data$Type = as.factor(train_data$Type)
train_class = train_data[, 1]

nb.model = naiveBayes(train_data[,-1], train_data[,1], laplace = 1)

nb.test.fit = predict(nb.model, test_data[,-1])
cat("Predicted class using the built-in function")
## Predicted class using the built-in function
nb.test.fit
## [1] 1 1 1 1 1 1 1 1 2
## Levels: 1 2
e.acc = sum(nb.test.fit==test_data[,1]) / nrow(test_data)
cat("Accuracy: ", e.acc)
## Accuracy:  0.6666667
#t = table(nb.test.fit, test_data[,1])
#confusionMatrix(t)

These are same with the results in (2d) above.


Q2e: Repeating Evaluation Using a 75/25 Split 10 times

set.seed(2310621)
r.eval = data.frame(accuracy = rep(NA,10), sensitivity = rep(NA,10), specificity = rep(NA,10))

for(j in 1:10)
{
  sample = sample(1:n.row, round(0.75 * n.row))
  fruit.train = fruit[sample, ]
  fruit.test = fruit[-sample, ]
  
  #custom function
  result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")
  r.eval$accuracy[j] = result$accuracy
  r.eval$sensitivity[j] = result$Sensitivity
  r.eval$specificity[j] = result$Specificity
}

kable(r.eval)
accuracy sensitivity specificity
0.7777778 1.0000000 0.50
0.6666667 0.7500000 0.60
0.6666667 0.8000000 0.50
0.5555556 1.0000000 0.20
0.5555556 0.4285714 1.00
0.5555556 0.6666667 0.50
0.6666667 0.5000000 1.00
0.6666667 0.6000000 0.75
0.6666667 1.0000000 0.25
0.5555556 0.8000000 0.25


r.eval.mean = data.frame(Mean_accuracy = mean(r.eval$accuracy), 
  Mean_sensitivity = mean(r.eval$sensitivity), Mean_specificity = mean(r.eval$specificity))

kable(r.eval.mean, row.names = F)
Mean_accuracy Mean_sensitivity Mean_specificity
0.6333333 0.7545238 0.555


Question 3: Presidential

Q3a Load Addresses

State of the Union Set

#Source File of Addresses
dsrc <- DirSource('C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/')
getwd()
## [1] "C:/Users/2PAC/Documents/R/Data_Mining/project_4"
#Create Corpus of 'Documents' 
pages <- Corpus(dsrc, readerControl=list(language="en", stopwords=FALSE, wordLengths=c(0,Inf)))


Vector with Party Affiliation of Every President (Class Labels)

State of the Union Set

party <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt", header=FALSE)
names(party) = c("Party","President","Year")
#Class Label
C <- c("r","d")
C <- as.vector(C)

Q3b Remove Stop Words From Corpus

#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="",
                      header=FALSE)
names(stopwords) = c("Stop.Words")

#Remove Stop Words 
docs <- tm_map(pages, removeWords, stopwords$Stop.Words)

Q3c Predict Party Affiliations for Speeches

i. Term Document Matrix

State of the Union Set

#Term Document Matrix (Stopwords already removed)
dtm = DocumentTermMatrix(docs, control=list(tolower=TRUE,  removePunctuation=TRUE, removeNumbers=TRUE, 
    stopwords=FALSE, wordLengths=c(1,Inf)))

TD = as.matrix(dtm)
TD = as.data.frame(TD)

#Remove rows containing test set data

# Map Files to Party Data
files <- vector(mode="character", length=0)
for(i in 1:nrow(party)){
  files[i] <- paste("a",i,".txt",sep="")
}
party$File <- files 

# Training and Test Set
party.test <- party[c(175,209,220,228,231),]
party.train <- party[-c(175,209,220,228,231),]

# Only recent speeches with Democrat vs. Republican , Remove all other parties
party.train <- subset(party.train, Party == 'r' | Party == 'd')
party.train <- party.train[order(party.train$File),] #Same order as TD.train

#Term Document of Train Data
TD.train <- TD[which(rownames(TD) %in% party.train$File),] 
TD.train <- as.data.frame(TD.train)
party.vector <- as.vector(party.train$Party)
TD.train$Party <- party.vector

#Term Document of Test Data
TD.test <- TD[which(rownames(TD) %in% party.test$File),]
TD.test <- as.data.frame(TD.test)

ii. The Bernoulli Naive Bayes Model

Train Bernoulli

State of the Union Set

# V <- extract vocabulary
V <- colnames(TD.train[,-length(colnames(TD.train))])

#Return Variables
p.c <- vector(mode="numeric", length=0) #Vector of priors
cp.tc <- matrix(nrow = length(V), ncol = 2) #Vector of conditional probabilities

# N <- count total number of docs
N <- nrow(TD.train)
N <- as.numeric(N)

if(is.data.frame(TD.train)==FALSE){
  TD.train <- as.data.frame(TD.train) # (for ease of calculation)
}

# for each c in C
for(i in 1:length(C)){
  
  # do Nc <- count docs in class
  TD.temp <- TD.train[which(TD.train$Party == C[i]),]
  Nc <- nrow(TD.temp)
  Nc <- as.numeric(Nc)
  names <- row.names(TD.temp)
  
  TD.temp <- as.data.frame(TD.temp) #set as data frame for easy calculation
  TD.temp$Party <- NULL #remove class label column for easy calculation
  TD.temp <- sapply(TD.temp, function(x) as.numeric(as.character(x))) #numeric for calc
  row.names(TD.temp) <- names
  
  # prior[c] <- Nc/N
  prior.c <- Nc/N
  p.c[i] <- prior.c
  
  # for each t in V
  for(j in 1:ncol(TD.temp)){
    Nct <- 0  
    # do Nct <- count docs in class containing t
    
    if(Nc == 1)
    {
      if(TD.temp[j]>=1){
        Nct <- Nct + 1
      }
      else{
        Nct <- Nct + 0
      }
    }
    else{
      for(k in 1:Nc){

        if(TD.temp[k,j]>=1){
          Nct <- Nct + 1
        }
        else{
          Nct <- Nct + 0
        }
    }
    }
    
    #condprob[t][c] <- (Nct + 1)/(Nc + 2)
    condprob.tc <- (Nct+1)/(Nc+2)
    cp.tc[j,i] <- condprob.tc
  }
}
colnames(cp.tc) <- C
row.names(cp.tc) <- V

#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V))
## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"
print(head(p.c))
## [1] 0.494382 0.505618
print(head(cp.tc))
##                        r          d
## accession      0.1111111 0.08695652
## add            0.4777778 0.55434783
## adequate       0.6222222 0.67391304
## administration 0.8222222 0.70652174
## admitted       0.2333333 0.36956522
## adopted        0.5777778 0.52173913

Apply Bernoulli

State of the Union Set

set.seed(191919)
scores.by.doc <- rep(NA, 5)
classifier <- rep(NA, 5)

d <- nrow(TD.test)
d <- as.numeric(d)

for(doc in 1:d){                              
  doc <- as.numeric(doc)
  TD.test.d <- TD.test[doc,]
  TD.test.d <- as.data.frame(TD.test.d)
  
  index <- which(TD.test.d >= 1)
  Vd <- colnames(TD.test.d)[index]
  
  score.c <- rep(NA, 2)

  for(ii in 1:2)
  {
    sm = 1
    score.mul = rep(NA, length(V) + 1)
    score.mul[sm] <- log(p.c[ii])
    
    for(jj in 1:length(V))
    {
      sm = sm + 1
      
      if(V[jj] %in% Vd)
      {
        score.mul[sm] <- log(cp.tc[jj,ii])
      }
      else
      {
        score.mul[sm] <- log((1-cp.tc[jj,ii]))
      }
    }
    
    score.c[ii] = sum(score.mul)
    
  }
  scores.by.doc[doc] <- max(score.c)
  classifier[doc] <- C[which.is.max(score.c)]
}

paste("FINAL RESULT")
## [1] "FINAL RESULT"
f.dt.o = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"), 
  president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Bernoulli

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          d      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

iii. The Multinomial Naive Bayes Model

Train Multinomial

State of the Union Set

TD.train <- as.data.frame(TD.train)  
  
# prior[c] 
prior.c <- rep(NA,2)

# V <- extract vocabulary
V <- colnames(TD.train[,-c(ncol(TD.train))])

# P(t|c)
condprob.tc <-matrix(nrow = length(V), ncol = 2)

# N <- count documents
N <- nrow(TD.train)

# for each c in C
for(c in 1:length(C))
{
  N.c <- sum(TD.train$Party == C[c])
  prior.c[c] <- N.c/N
  
  TD.temporary <- TD.train[which(TD.train$Party == C[c]),] # Current condition
  TD.temporary <- TD.temporary[,-c(ncol(TD.temporary))]
  #Term Document Matrix without class labels, for ease of calculation
  
  # for each t in V
  # count number of tokens of t
  column.sums <- as.data.frame(colSums(TD.temporary)) #Number of tokens for each term
  colnames(column.sums) <- c("num.tokens")
  
  num.text <- sum(column.sums$num.tokens) #length of entire texts
  B <- length(V) # Number of terms in vocabulary
  
  # for each t in V
  for(t in 1:B)
  {
    condprob.tc[t,c] <- (column.sums[t,1]+1)/(num.text + B)
  }
}

colnames(condprob.tc) <- C
rownames(condprob.tc) <- V

#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V))
## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"
print(head(prior.c))
## [1] 0.494382 0.505618
print(head(condprob.tc))
##                           r            d
## accession      3.190514e-05 2.600029e-05
## add            1.769285e-04 2.426694e-04
## adequate       5.771930e-04 5.026723e-04
## administration 1.247201e-03 1.155568e-03
## admitted       1.102178e-04 2.397804e-04
## adopted        5.249846e-04 4.535606e-04

Apply Multinomial

State of the Union Set

set.seed(292929)
scores.by.doc <- rep(NA, nrow(TD.test))
classifier <- rep(NA, nrow(TD.test))

for(doc in 1:nrow(TD.test))
{
  #One Document at a time
  TD.test.doc <- TD.test[doc, ]
  TD.test.doc <- as.data.frame(TD.test.doc)
  #colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
  #TD.test.doc$terms <- rownames(TD.test.doc)
  condprob.tc <- as.data.frame(condprob.tc)
  condprob.tc$terms <- rownames(condprob.tc)
  
  #Tokens from Test Document
  index <- which(TD.test.doc >= 1)
  W <- colnames(TD.test.doc)[index]
  
  #Conditional Probabilities of Tokens Only
  condprob.tc.tokens <- condprob.tc[which(condprob.tc$terms %in% W),]
  
  #Individual Scores
  score.c <- rep(NA, 2)
  
  for(cond in 1:length(C))
  {
    sm = 1
    score.mul = rep(NA, length(W) + 1)
    score.mul[sm] <- log(prior.c[cond])
    
    for(w in 1:length(W))
    {
      sm = sm + 1
      score.mul[sm] <- log((condprob.tc.tokens[w,cond])^(TD.test.doc[1,w]))
    }
    
    score.c[cond] = sum(score.mul)
  }
  
  scores.by.doc[doc] <- max(score.c)
  classifier[doc] <- C[which.is.max(score.c)]
}

f.dt = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"), 
                  president = c("Kennedy","Clinton","Bush","Obama","Trump"))

#print(scores.by.doc)
#print(classifier)

FINAL RESULT for Multinomial

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

Q3d Bonus

#Source File of Addresses
dsrc.bonus <- DirSource("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/")

#Create Corpus of 'Documents' 
pages.bonus  <- Corpus(dsrc.bonus , readerControl=list(language="en", stopwords=FALSE,
                                                       wordLengths=c(0,Inf)))

party.bonus  <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt",
                         header=FALSE)
names(party.bonus ) = c("Party","President","Year")

#Class Label
C.bonus  <- c("r","d")
C.bonus  <- as.vector(C.bonus)

#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="", 
                      header=FALSE)
names(stopwords) = c("Stop.Words")

#Remove Stop Words 
docs.bonus  <- tm_map(pages.bonus, removeWords, stopwords$Stop.Words)

#Term Document Matrix (Stopwords already removed)
dtm.bonus  = DocumentTermMatrix(docs.bonus, control=list(tolower=TRUE, removePunctuation=TRUE,
  removeNumbers=TRUE, stopwords=FALSE, wordLengths=c(1,Inf)))

TD.bonus  = as.matrix(dtm.bonus)
TD.bonus  = as.data.frame(TD.bonus)

#Remove rows containing test set data

# Map Files to Party Data
files.bonus  <- vector(mode="character", length=0)
for(i in 1:nrow(party.bonus)){
  files.bonus [i] <- paste("a",i,".txt",sep="")
}
party.bonus$File <- files.bonus  

# Training and Test Set
party.test.bonus  <- party.bonus[c(175,209,220,228,231),]
party.train.bonus  <- party.bonus[-c(175,209,220,228,231),]

# Only Speeches since Year 1913
party.train.bonus  <- subset(party.train.bonus , Year >= 1913)
party.train.bonus  <- party.train.bonus [order(party.train.bonus$File),] #Same order as TD.train

#Term Document of Train Data
TD.train.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.train.bonus$File),] 
TD.train.bonus <- as.data.frame(TD.train.bonus)
party.vector.bonus <- as.vector(party.train.bonus$Party)
TD.train.bonus$Party <- party.vector.bonus

#Term Document of Test Data
TD.test.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.test.bonus$File),]
TD.test.bonus <- as.data.frame(TD.test.bonus)

Train Bernoulli

# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-length(colnames(TD.train.bonus))])

#Return Variables
p.c.bonus <- vector(mode="numeric", length=0) #Vector of priors
cp.tc.bonus <- matrix(nrow = length(V.bonus), ncol = 2) #Vector of conditional probabilities

# N <- count total number of docs
N.bonus <- nrow(TD.train.bonus)
N.bonus <- as.numeric(N.bonus)

if(is.data.frame(TD.train.bonus)==FALSE){
  TD.train.bonus <- as.data.frame(TD.train.bonus) # (for ease of calculation)
}

# for each c in C
for(i in 1:length(C.bonus)){
  
  # do Nc <- count docs in class
  TD.temp.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[i]),]
  Nc.bonus <- nrow(TD.temp.bonus)
  Nc.bonus <- as.numeric(Nc.bonus)
  names.bonus <- row.names(TD.temp.bonus)
  
  TD.temp.bonus <- as.data.frame(TD.temp.bonus) #set as data frame for easy calculation
  TD.temp.bonus$Party <- NULL #remove class label column for easy calculation
  TD.temp.bonus <- sapply(TD.temp.bonus, function(x) as.numeric(as.character(x))) #numeric for calc
  row.names(TD.temp.bonus) <- names.bonus
  
  # prior[c] <- Nc/N
  prior.c.bonus <- Nc.bonus/N.bonus
  p.c.bonus[i] <- prior.c.bonus
  
  # for each t in V
  for(j in 1:ncol(TD.temp.bonus)){
    Nct.bonus <- 0  
    # do Nct <- count docs in class containing t
    
    if(Nc.bonus == 1)
    {
      if(TD.temp.bonus[j]>=1){
        Nct.bonus <- Nct.bonus + 1
      }
      else{
        Nct.bonus <- Nct.bonus + 0
      }
    }
    else{
      for(k in 1:Nc.bonus){

        if(TD.temp.bonus[k,j]>=1){
          Nct.bonus <- Nct.bonus + 1
        }
        else{
          Nct.bonus <- Nct.bonus + 0
        }
    }
    }
    
    #condprob[t][c] <- (Nct + 1)/(Nc + 2)
    condprob.tc.bonus <- (Nct.bonus+1)/(Nc.bonus+2)
    cp.tc.bonus[j,i] <- condprob.tc.bonus
  }
}
colnames(cp.tc.bonus) <- C.bonus
row.names(cp.tc.bonus) <- V.bonus

#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V.bonus))
## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"
print(head(p.c.bonus))
## [1] 0.4705882 0.5294118
print(head(cp.tc.bonus))
##                   r          d
## accession      0.04 0.01785714
## add            0.36 0.44642857
## adequate       0.52 0.53571429
## administration 0.76 0.55357143
## admitted       0.12 0.08928571
## adopted        0.30 0.23214286

Apply Bernoulli

set.seed(393939)
scores.by.doc.bonus <- rep(NA, 5)
classifier.bonus <- rep(NA, 5)

d.bonus <- nrow(TD.test.bonus)
d.bonus <- as.numeric(d.bonus)

for(doc in 1:d.bonus){                              
  doc <- as.numeric(doc)
  TD.test.d.bonus <- TD.test.bonus[doc,]
  TD.test.d.bonus <- as.data.frame(TD.test.d.bonus)
  
  index.bonus <- which(TD.test.d.bonus >= 1)
  Vd.bonus <- colnames(TD.test.d.bonus)[index.bonus]
  
  score.c.bonus <- rep(NA, 2)

  for(ii in 1:2)
  {
    sm.bonus = 1
    score.mul.bonus = rep(NA, length(V.bonus) + 1)
    score.mul.bonus[sm.bonus] <- log(p.c.bonus[ii])
    
    for(jj in 1:length(V.bonus))
    {
      sm.bonus = sm.bonus + 1
      
      if(V.bonus[jj] %in% Vd.bonus)
      {
        score.mul.bonus[sm.bonus] <- log(cp.tc.bonus[jj,ii])
      }
      else
      {
        score.mul.bonus[sm.bonus] <- log((1-cp.tc.bonus[jj,ii]))
      }
    }
    
    score.c.bonus[ii] = sum(score.mul.bonus)
    
  }
  scores.by.doc.bonus[doc] <- max(score.c.bonus)
  classifier.bonus[doc] <- C[which.is.max(score.c.bonus)]
}

f.dt.o.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"), 
                          president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Bernoulli (Bonus)

##   pred_Label actual president
## 1          d      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

The results did not change from part c).

Train Multnomial

TD.train.bonus <- as.data.frame(TD.train.bonus)  
  
# prior[c] 
prior.c.bonus <- rep(NA,2)

# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-c(ncol(TD.train.bonus))])

# P(t|c)
condprob.tc.bonus <-matrix(nrow = length(V.bonus), ncol = 2)

# N <- count documents
N.bonus <- nrow(TD.train.bonus)

# for each c in C
for(c in 1:length(C.bonus))
{
  N.c.bonus <- sum(TD.train.bonus$Party == C.bonus[c])
  prior.c.bonus[c] <- N.c.bonus/N.bonus
  
  TD.temporary.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[c]),] # Current condition
  TD.temporary.bonus <- TD.temporary.bonus[,-c(ncol(TD.temporary.bonus))] 
  #Term Document Matrix without class labels, for ease of calculation
  
  # for each t in V
  # count number of tokens of t
  column.sums.bonus <- as.data.frame(colSums(TD.temporary.bonus)) #Number of tokens for each term
  colnames(column.sums.bonus) <- c("num.tokens")
  
  num.text.bonus <- sum(column.sums.bonus$num.tokens) #length of entire texts
  B.bonus <- length(V.bonus) # Number of terms in vocabulary
  
  # for each t in V
  for(t in 1:B.bonus)
  {
    condprob.tc.bonus[t,c] <- (column.sums.bonus[t,1]+1)/(num.text.bonus + B.bonus)
  }
}
colnames(condprob.tc.bonus) <- C.bonus
rownames(condprob.tc.bonus) <- V.bonus

#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V.bonus))
## [1] "accession"      "add"            "adequate"       "administration"
## [5] "admitted"       "adopted"
print(head(prior.c.bonus))
## [1] 0.4705882 0.5294118
print(head(condprob.tc.bonus))
##                           r            d
## accession      1.411313e-05 6.037371e-06
## add            1.764141e-04 2.113080e-04
## adequate       5.221858e-04 6.097745e-04
## administration 1.305465e-03 1.515380e-03
## admitted       4.233939e-05 3.018686e-05
## adopted        2.963757e-04 1.328222e-04

Apply Multinomial

set.seed(494949)
scores.by.doc.bonus <- rep(NA, nrow(TD.test.bonus))
classifier.bonus <- rep(NA, nrow(TD.test.bonus))

for(doc in 1:nrow(TD.test.bonus))
{
  #One Document at a time
  TD.test.doc.bonus <- TD.test.bonus[doc, ]
  TD.test.doc.bonus <- as.data.frame(TD.test.doc.bonus)
  #colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
  #TD.test.doc$terms <- rownames(TD.test.doc)
  condprob.tc.bonus <- as.data.frame(condprob.tc.bonus)
  condprob.tc.bonus$terms <- rownames(condprob.tc.bonus)
  
  #Tokens from Test Document
  index.bonus <- which(TD.test.doc.bonus >= 1)
  W.bonus <- colnames(TD.test.doc.bonus)[index.bonus]
  
  #Conditional Probabilities of Tokens Only
  condprob.tc.tokens.bonus <- condprob.tc.bonus[which(condprob.tc.bonus$terms %in% W.bonus),]
  
  #Individual Scores
  score.c.bonus <- rep(NA, 2)
  
  for(cond in 1:length(C.bonus))
  {
    sm.bonus = 1
    score.mul.bonus = rep(NA, length(W.bonus) + 1)
    score.mul.bonus[sm.bonus] <- log(prior.c.bonus[cond])
    
    for(w in 1:length(W.bonus))
    {
      sm.bonus = sm.bonus + 1
      score.mul.bonus[sm.bonus] <- log((condprob.tc.tokens.bonus[w,cond])^(TD.test.doc.bonus[1,w]))
    }
    
    score.c.bonus[cond] = sum(score.mul.bonus)
  }
  
  scores.by.doc.bonus[doc] <- max(score.c.bonus)
  classifier.bonus[doc] <- C.bonus[which.is.max(score.c.bonus)]
}

f.dt.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"), 
                        president = c("Kennedy","Clinton","Bush","Obama","Trump"))

FINAL RESULT: Multinomial (Bonus)

##   pred_Label actual president
## 1          r      d   Kennedy
## 2          d      d   Clinton
## 3          r      r      Bush
## 4          d      d     Obama
## 5          d      r     Trump

The results differ (for JFK) from the set in part c) because the definitions of democrat and republican changed over the years. In the past the ideals of democrats were more republican and the ideals of republicans were more democratic. JFK is the closest data point to the change in party affiliation, so his result are effected the most.