spam.main = read.csv("spam.csv", header=T)
spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))
normalize = function(data)
{
return ((data - min(data)) / (max(data) - min(data)))
}
#spam = as.data.frame(lapply(spam, normalize))
kable(head(spam, n = 5))
| day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Thu | 0 | 7 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
| Thu | 0 | 2 | no | no | 0 | name | 5 | no | no | no | no | no | no | yes |
| Thu | 14 | 3 | no | yes | 0 | name | 2 | no | no | no | yes | no | no | no |
| Thu | 3 | 3 | yes | no | 0 | name | 0 | no | no | no | no | no | no | no |
| Thu | 3 | 4 | no | no | 0 | name | 2 | no | no | no | no | no | no | no |
set.seed(14023498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count)
spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]
kable(head(spam.train, n = 5))
| day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Thu | 0 | 7 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
| 3 | Thu | 14 | 3 | no | yes | 0 | name | 2 | no | no | no | yes | no | no | no |
| 4 | Thu | 3 | 3 | yes | no | 0 | name | 0 | no | no | no | no | no | no | no |
| 5 | Thu | 3 | 4 | no | no | 0 | name | 2 | no | no | no | no | no | no | no |
| 6 | Thu | 4 | 4 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
kable(head(spam.test, n = 5))
| day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1551 | Tue | 10 | 4 | no | no | 0 | name | 0 | no | no | no | no | no | yes | yes |
| 2067 | Fri | 22 | 23 | yes | no | 6 | name | 3 | yes | yes | no | no | yes | no | yes |
| 106 | Mon | 23 | 3 | no | no | 1 | name | 1 | no | no | no | no | no | no | no |
| 1533 | Wed | 9 | 1 | yes | yes | 0 | name | 1 | no | no | no | no | no | no | no |
| 913 | Wed | 7 | 10 | no | no | 0 | name | 3 | no | yes | no | no | yes | yes | yes |
test_data = spam.test
test_data$spam = as.factor(test_data$spam)
test_class = test_data[, 15]
train_data = spam.train
train_data$spam = as.factor(train_data$spam)
train_class = train_data[, 15]
set.seed(9234108)
nb.model = naiveBayes(spam ~., data = train_data)
pred = predict(nb.model, test_data)
error = 1 - sum(pred == test_class) / nrow(test_data)
acc = 1 - error
values = as.numeric(pred)-1
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
| Accuracy | Error | AUC |
|---|---|---|
| 0.8986175 | 0.1013825 | 0.9023562 |
nb.pred = as.numeric(pred) - 1
spam.tree = rpart(spam ~., data = spam.train, method = "class")
fancyRpartPlot(spam.tree, sub = "Decision Tree")
pred = predict(spam.tree, spam.test, type = "class")
conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
## Reference
## Prediction no yes
## no 263 26
## yes 18 127
acc = conf.matrix$overall[1]
error = 1 - acc
base.error = error
pred.2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
| Accuracy | Error | AUC |
|---|---|---|
| 0.8986175 | 0.1013825 | 0.8830042 |
tree.pred = as.numeric(pred) - 1
#print(summary(spam.tree))
#spam.tree
set.seed(3421)
rf.model = randomForest(train_data[, -15], as.factor(as.numeric(train_class)))
rf.model
##
## Call:
## randomForest(x = train_data[, -15], y = as.factor(as.numeric(train_class)))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.62%
## Confusion matrix:
## 1 2 class.error
## 1 1132 48 0.04067797
## 2 67 490 0.12028725
pred = predict(rf.model, test_data[, -15])
conf.matrix = confusionMatrix(pred, as.factor(as.numeric(test_class)))
cf.mat.tab = conf.matrix$table
cf.mat.tab
## Reference
## Prediction 1 2
## 1 268 15
## 2 13 138
acc = conf.matrix$overall[1]
error = 1 - acc
values = as.numeric(pred) - 1
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
auc = as.numeric(auc.te@y.values)
result = data.frame("Accuracy" = acc, "Error" = error, "AUC" = auc)
kable(result, row.names = F)
| Accuracy | Error | AUC |
|---|---|---|
| 0.9354839 | 0.0645161 | 0.9278487 |
set.seed(90871)
bag = bagging(spam ~., data = spam.train, mfinal = 100)
bag.Pred = predict.bagging(bag, spam.test, newmfinal = 100)
bag.Pred$confusion
## Observed Class
## Predicted Class no yes
## no 261 17
## yes 20 136
error.bag = bag.Pred$error
The error - using bagging - on the test data is 0.0852535. This shows improvement.
The error - using the base model - on the test data is 0.1013825
boost = boosting(spam ~., data = spam.train, mfinal = 100)
boost.Pred = predict.boosting(boost, spam.test, newmfinal = 100)
boost.Pred$confusion
## Observed Class
## Predicted Class no yes
## no 265 16
## yes 16 137
error.boost = boost.Pred$error
The error - using boosting - on the test data is 0.0737327.
fruit.main = read.csv("fruit.txt", header=T)
names(fruit.main) = c("Type", "Weight", "Height", "Width")
fruit = fruit.main[which(fruit.main$Type < 3), ]
set.seed(7218)
n.row = nrow(fruit)
sample = sample(1:n.row, round(0.75 * n.row))
fruit.train = fruit[sample, ]
fruit.test = fruit[-sample, ]
cat("Training Data - Fruit (Head)")
## Training Data - Fruit (Head)
kable(head(fruit.train, n = 5), row.names = F)
| Type | Weight | Height | Width |
|---|---|---|---|
| 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 |
| 2 | 1 | 1 | 2 |
| 2 | 0 | 0 | 0 |
| 2 | 1 | 2 | 2 |
cat("\n", "Testing Data - Fruit (Head)")
##
## Testing Data - Fruit (Head)
kable(head(fruit.test, n = 5), row.names = F)
| Type | Weight | Height | Width |
|---|---|---|---|
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 1 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 1 |
naive.B = function(train, test, class.name)
{
if(!is.data.frame(train))
train = as.data.frame(train())
if(!is.data.frame(test))
test = as.data.frame(test)
index = match(class.name, names(train))
train.main = train
train = as.data.frame(train.main[, -index])
train.class = as.data.frame(train.main[, index])
names(train.class) = class.name
test.main = test
test = as.data.frame(test.main[, -index])
test.class = as.data.frame(test.main[, index])
names(test.class) = class.name
#prob of class distribution
prob.train.class = as.data.frame(table(train.class))
prob.train.class$prob = prob.train.class$Freq / sum(prob.train.class$Freq)
n.train.class = nrow(prob.train.class)
prob.train.class = as.data.frame(t(prob.train.class[, -(1:2)]), row.names = NULL)
n.attr = ncol(train.main)
attr.prob.tb = list(n.attr)
v.attr = c(1:n.attr)
v.attr = v.attr[-index]
h = 1
for(a in v.attr)
{
attr = as.data.frame(train.main[, c(index, a)])
prob.attr = t((table(attr)))
mat.attr = matrix(nrow = nrow(prob.attr), ncol = ncol(prob.attr))
for(i in 1:ncol(prob.attr))
{
mat.attr[,i] = (prob.attr[,i] + 1) / (sum(prob.attr[,i]) + nrow(prob.attr)) #smoothing
}
attr.prob.tb[[h]] = mat.attr
h = h+1
}
#predicting the class label of the testing data
test.class$pred1 = NA
test.class$pred2 = NA
#test.class$pred3 = NA
test.class$predFinal = NA
for(t in 1:nrow(test)) #loopiong thr each row in the test
{
pred.class = rep(NA, n.train.class) #keeping the pred given the class
for(y in 1:n.train.class) #looping thr each class --- Given
{
given.prob = prob.train.class[,y]
prob.holder = rep(NA, ncol(test))
for(s in 1:ncol(test)) #conditional prob --- column
{
prob.tb = attr.prob.tb[[s]]
cell = test[t, s] + 1
prob = prob.tb[cell, y]
prob.holder[s] = prob
}
pred = round(prod(prob.holder) * given.prob, 4)
test.class[t, y+1] = pred
pred.class[y] = pred
}
max.pred = max(pred.class)
index.max = match(max.pred, pred.class)
test.class$predFinal[t] = index.max
}
pd = test.class[,-(2:3)]
eval = table(pd[,2], pd[,1])
cm = confusionMatrix(eval)
pram.eval = as.data.frame(cm$byClass)
names(pram.eval) = c("evals")
sens = pram.eval$evals[1]
spec = pram.eval$evals[2]
return(list("test.pred" = test.class, "accuracy" = mean(test.class[,1] == test.class[, ncol(test.class)]),
"prob.given.class" = attr.prob.tb, "prob.class" = prob.train.class, Sensitivity = sens,
Specificity = spec))
}
result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")
cat("Probability of the class label")
## Probability of the class label
c.p = result$prob.class
names(c.p) = c("prob_apples", "pbob_oranges")
row.names(c.p) = ""
c.p
## prob_apples pbob_oranges
## 0.4642857 0.5357143
cat("Probability of weight given the class label")
## Probability of weight given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[1]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1")
p.w.given.cl
## given_apples given_oranges
## prob_0 0.8666667 0.5882353
## prob_1 0.1333333 0.4117647
cat("Probability of height given the class label")
## Probability of height given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[2]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl
## given_apples given_oranges
## prob_0 0.4375 0.5000000
## prob_1 0.5000 0.2777778
## prob_2 0.0625 0.2222222
cat("Probability of width given the class label")
## Probability of width given the class label
p.w.given.cl = as.data.frame(result$prob.given.class[[3]])
names(p.w.given.cl) = c("given_apples", "given_oranges")
row.names(p.w.given.cl) = c("prob_0", "prob_1", "prob_2")
p.w.given.cl
## given_apples given_oranges
## prob_0 0.500 0.2777778
## prob_1 0.375 0.2777778
## prob_2 0.125 0.4444444
class.label = result$test.pred
names(class.label) = c("Actual_Class", "prob|apples", "prob|orangles", "Pred_Class")
datatable(class.label, rownames = F, width = 500)
cat("Accuracy: ", result$accuracy)
## Accuracy: 0.6666667
test_data = fruit.test
test_data$Type = as.factor(test_data$Type)
test_class = test_data[, 1]
train_data = fruit.train
train_data$Type = as.factor(train_data$Type)
train_class = train_data[, 1]
nb.model = naiveBayes(train_data[,-1], train_data[,1], laplace = 1)
nb.test.fit = predict(nb.model, test_data[,-1])
cat("Predicted class using the built-in function")
## Predicted class using the built-in function
nb.test.fit
## [1] 1 1 1 1 1 1 1 1 2
## Levels: 1 2
e.acc = sum(nb.test.fit==test_data[,1]) / nrow(test_data)
cat("Accuracy: ", e.acc)
## Accuracy: 0.6666667
#t = table(nb.test.fit, test_data[,1])
#confusionMatrix(t)
These are same with the results in (2d) above.
set.seed(2310621)
r.eval = data.frame(accuracy = rep(NA,10), sensitivity = rep(NA,10), specificity = rep(NA,10))
for(j in 1:10)
{
sample = sample(1:n.row, round(0.75 * n.row))
fruit.train = fruit[sample, ]
fruit.test = fruit[-sample, ]
#custom function
result = naive.B(train = fruit.train, test = fruit.test, class.name = "Type")
r.eval$accuracy[j] = result$accuracy
r.eval$sensitivity[j] = result$Sensitivity
r.eval$specificity[j] = result$Specificity
}
kable(r.eval)
| accuracy | sensitivity | specificity |
|---|---|---|
| 0.7777778 | 1.0000000 | 0.50 |
| 0.6666667 | 0.7500000 | 0.60 |
| 0.6666667 | 0.8000000 | 0.50 |
| 0.5555556 | 1.0000000 | 0.20 |
| 0.5555556 | 0.4285714 | 1.00 |
| 0.5555556 | 0.6666667 | 0.50 |
| 0.6666667 | 0.5000000 | 1.00 |
| 0.6666667 | 0.6000000 | 0.75 |
| 0.6666667 | 1.0000000 | 0.25 |
| 0.5555556 | 0.8000000 | 0.25 |
r.eval.mean = data.frame(Mean_accuracy = mean(r.eval$accuracy),
Mean_sensitivity = mean(r.eval$sensitivity), Mean_specificity = mean(r.eval$specificity))
kable(r.eval.mean, row.names = F)
| Mean_accuracy | Mean_sensitivity | Mean_specificity |
|---|---|---|
| 0.6333333 | 0.7545238 | 0.555 |
State of the Union Set
#Source File of Addresses
dsrc <- DirSource('C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/')
getwd()
## [1] "C:/Users/2PAC/Documents/R/Data_Mining/project_4"
#Create Corpus of 'Documents'
pages <- Corpus(dsrc, readerControl=list(language="en", stopwords=FALSE, wordLengths=c(0,Inf)))
Vector with Party Affiliation of Every President (Class Labels)
State of the Union Set
party <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt", header=FALSE)
names(party) = c("Party","President","Year")
#Class Label
C <- c("r","d")
C <- as.vector(C)
#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="",
header=FALSE)
names(stopwords) = c("Stop.Words")
#Remove Stop Words
docs <- tm_map(pages, removeWords, stopwords$Stop.Words)
State of the Union Set
#Term Document Matrix (Stopwords already removed)
dtm = DocumentTermMatrix(docs, control=list(tolower=TRUE, removePunctuation=TRUE, removeNumbers=TRUE,
stopwords=FALSE, wordLengths=c(1,Inf)))
TD = as.matrix(dtm)
TD = as.data.frame(TD)
#Remove rows containing test set data
# Map Files to Party Data
files <- vector(mode="character", length=0)
for(i in 1:nrow(party)){
files[i] <- paste("a",i,".txt",sep="")
}
party$File <- files
# Training and Test Set
party.test <- party[c(175,209,220,228,231),]
party.train <- party[-c(175,209,220,228,231),]
# Only recent speeches with Democrat vs. Republican , Remove all other parties
party.train <- subset(party.train, Party == 'r' | Party == 'd')
party.train <- party.train[order(party.train$File),] #Same order as TD.train
#Term Document of Train Data
TD.train <- TD[which(rownames(TD) %in% party.train$File),]
TD.train <- as.data.frame(TD.train)
party.vector <- as.vector(party.train$Party)
TD.train$Party <- party.vector
#Term Document of Test Data
TD.test <- TD[which(rownames(TD) %in% party.test$File),]
TD.test <- as.data.frame(TD.test)
State of the Union Set
# V <- extract vocabulary
V <- colnames(TD.train[,-length(colnames(TD.train))])
#Return Variables
p.c <- vector(mode="numeric", length=0) #Vector of priors
cp.tc <- matrix(nrow = length(V), ncol = 2) #Vector of conditional probabilities
# N <- count total number of docs
N <- nrow(TD.train)
N <- as.numeric(N)
if(is.data.frame(TD.train)==FALSE){
TD.train <- as.data.frame(TD.train) # (for ease of calculation)
}
# for each c in C
for(i in 1:length(C)){
# do Nc <- count docs in class
TD.temp <- TD.train[which(TD.train$Party == C[i]),]
Nc <- nrow(TD.temp)
Nc <- as.numeric(Nc)
names <- row.names(TD.temp)
TD.temp <- as.data.frame(TD.temp) #set as data frame for easy calculation
TD.temp$Party <- NULL #remove class label column for easy calculation
TD.temp <- sapply(TD.temp, function(x) as.numeric(as.character(x))) #numeric for calc
row.names(TD.temp) <- names
# prior[c] <- Nc/N
prior.c <- Nc/N
p.c[i] <- prior.c
# for each t in V
for(j in 1:ncol(TD.temp)){
Nct <- 0
# do Nct <- count docs in class containing t
if(Nc == 1)
{
if(TD.temp[j]>=1){
Nct <- Nct + 1
}
else{
Nct <- Nct + 0
}
}
else{
for(k in 1:Nc){
if(TD.temp[k,j]>=1){
Nct <- Nct + 1
}
else{
Nct <- Nct + 0
}
}
}
#condprob[t][c] <- (Nct + 1)/(Nc + 2)
condprob.tc <- (Nct+1)/(Nc+2)
cp.tc[j,i] <- condprob.tc
}
}
colnames(cp.tc) <- C
row.names(cp.tc) <- V
#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V))
## [1] "accession" "add" "adequate" "administration"
## [5] "admitted" "adopted"
print(head(p.c))
## [1] 0.494382 0.505618
print(head(cp.tc))
## r d
## accession 0.1111111 0.08695652
## add 0.4777778 0.55434783
## adequate 0.6222222 0.67391304
## administration 0.8222222 0.70652174
## admitted 0.2333333 0.36956522
## adopted 0.5777778 0.52173913
State of the Union Set
set.seed(191919)
scores.by.doc <- rep(NA, 5)
classifier <- rep(NA, 5)
d <- nrow(TD.test)
d <- as.numeric(d)
for(doc in 1:d){
doc <- as.numeric(doc)
TD.test.d <- TD.test[doc,]
TD.test.d <- as.data.frame(TD.test.d)
index <- which(TD.test.d >= 1)
Vd <- colnames(TD.test.d)[index]
score.c <- rep(NA, 2)
for(ii in 1:2)
{
sm = 1
score.mul = rep(NA, length(V) + 1)
score.mul[sm] <- log(p.c[ii])
for(jj in 1:length(V))
{
sm = sm + 1
if(V[jj] %in% Vd)
{
score.mul[sm] <- log(cp.tc[jj,ii])
}
else
{
score.mul[sm] <- log((1-cp.tc[jj,ii]))
}
}
score.c[ii] = sum(score.mul)
}
scores.by.doc[doc] <- max(score.c)
classifier[doc] <- C[which.is.max(score.c)]
}
paste("FINAL RESULT")
## [1] "FINAL RESULT"
f.dt.o = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"),
president = c("Kennedy","Clinton","Bush","Obama","Trump"))
FINAL RESULT: Bernoulli
## pred_Label actual president
## 1 d d Kennedy
## 2 d d Clinton
## 3 d r Bush
## 4 d d Obama
## 5 d r Trump
State of the Union Set
TD.train <- as.data.frame(TD.train)
# prior[c]
prior.c <- rep(NA,2)
# V <- extract vocabulary
V <- colnames(TD.train[,-c(ncol(TD.train))])
# P(t|c)
condprob.tc <-matrix(nrow = length(V), ncol = 2)
# N <- count documents
N <- nrow(TD.train)
# for each c in C
for(c in 1:length(C))
{
N.c <- sum(TD.train$Party == C[c])
prior.c[c] <- N.c/N
TD.temporary <- TD.train[which(TD.train$Party == C[c]),] # Current condition
TD.temporary <- TD.temporary[,-c(ncol(TD.temporary))]
#Term Document Matrix without class labels, for ease of calculation
# for each t in V
# count number of tokens of t
column.sums <- as.data.frame(colSums(TD.temporary)) #Number of tokens for each term
colnames(column.sums) <- c("num.tokens")
num.text <- sum(column.sums$num.tokens) #length of entire texts
B <- length(V) # Number of terms in vocabulary
# for each t in V
for(t in 1:B)
{
condprob.tc[t,c] <- (column.sums[t,1]+1)/(num.text + B)
}
}
colnames(condprob.tc) <- C
rownames(condprob.tc) <- V
#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V))
## [1] "accession" "add" "adequate" "administration"
## [5] "admitted" "adopted"
print(head(prior.c))
## [1] 0.494382 0.505618
print(head(condprob.tc))
## r d
## accession 3.190514e-05 2.600029e-05
## add 1.769285e-04 2.426694e-04
## adequate 5.771930e-04 5.026723e-04
## administration 1.247201e-03 1.155568e-03
## admitted 1.102178e-04 2.397804e-04
## adopted 5.249846e-04 4.535606e-04
State of the Union Set
set.seed(292929)
scores.by.doc <- rep(NA, nrow(TD.test))
classifier <- rep(NA, nrow(TD.test))
for(doc in 1:nrow(TD.test))
{
#One Document at a time
TD.test.doc <- TD.test[doc, ]
TD.test.doc <- as.data.frame(TD.test.doc)
#colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
#TD.test.doc$terms <- rownames(TD.test.doc)
condprob.tc <- as.data.frame(condprob.tc)
condprob.tc$terms <- rownames(condprob.tc)
#Tokens from Test Document
index <- which(TD.test.doc >= 1)
W <- colnames(TD.test.doc)[index]
#Conditional Probabilities of Tokens Only
condprob.tc.tokens <- condprob.tc[which(condprob.tc$terms %in% W),]
#Individual Scores
score.c <- rep(NA, 2)
for(cond in 1:length(C))
{
sm = 1
score.mul = rep(NA, length(W) + 1)
score.mul[sm] <- log(prior.c[cond])
for(w in 1:length(W))
{
sm = sm + 1
score.mul[sm] <- log((condprob.tc.tokens[w,cond])^(TD.test.doc[1,w]))
}
score.c[cond] = sum(score.mul)
}
scores.by.doc[doc] <- max(score.c)
classifier[doc] <- C[which.is.max(score.c)]
}
f.dt = data.frame(pred_Label = classifier, actual = c("d", "d", "r", "d", "r"),
president = c("Kennedy","Clinton","Bush","Obama","Trump"))
#print(scores.by.doc)
#print(classifier)
FINAL RESULT for Multinomial
## pred_Label actual president
## 1 d d Kennedy
## 2 d d Clinton
## 3 r r Bush
## 4 d d Obama
## 5 d r Trump
#Source File of Addresses
dsrc.bonus <- DirSource("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/files/")
#Create Corpus of 'Documents'
pages.bonus <- Corpus(dsrc.bonus , readerControl=list(language="en", stopwords=FALSE,
wordLengths=c(0,Inf)))
party.bonus <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/party.txt",
header=FALSE)
names(party.bonus ) = c("Party","President","Year")
#Class Label
C.bonus <- c("r","d")
C.bonus <- as.vector(C.bonus)
#Stop Words List
stopwords <- read.csv("C:/Users/2PAC/Documents/R/Data_Mining/project_4/sotu/sotu/stopwords.txt", sep="",
header=FALSE)
names(stopwords) = c("Stop.Words")
#Remove Stop Words
docs.bonus <- tm_map(pages.bonus, removeWords, stopwords$Stop.Words)
#Term Document Matrix (Stopwords already removed)
dtm.bonus = DocumentTermMatrix(docs.bonus, control=list(tolower=TRUE, removePunctuation=TRUE,
removeNumbers=TRUE, stopwords=FALSE, wordLengths=c(1,Inf)))
TD.bonus = as.matrix(dtm.bonus)
TD.bonus = as.data.frame(TD.bonus)
#Remove rows containing test set data
# Map Files to Party Data
files.bonus <- vector(mode="character", length=0)
for(i in 1:nrow(party.bonus)){
files.bonus [i] <- paste("a",i,".txt",sep="")
}
party.bonus$File <- files.bonus
# Training and Test Set
party.test.bonus <- party.bonus[c(175,209,220,228,231),]
party.train.bonus <- party.bonus[-c(175,209,220,228,231),]
# Only Speeches since Year 1913
party.train.bonus <- subset(party.train.bonus , Year >= 1913)
party.train.bonus <- party.train.bonus [order(party.train.bonus$File),] #Same order as TD.train
#Term Document of Train Data
TD.train.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.train.bonus$File),]
TD.train.bonus <- as.data.frame(TD.train.bonus)
party.vector.bonus <- as.vector(party.train.bonus$Party)
TD.train.bonus$Party <- party.vector.bonus
#Term Document of Test Data
TD.test.bonus <- TD.bonus[which(rownames(TD.bonus) %in% party.test.bonus$File),]
TD.test.bonus <- as.data.frame(TD.test.bonus)
Train Bernoulli
# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-length(colnames(TD.train.bonus))])
#Return Variables
p.c.bonus <- vector(mode="numeric", length=0) #Vector of priors
cp.tc.bonus <- matrix(nrow = length(V.bonus), ncol = 2) #Vector of conditional probabilities
# N <- count total number of docs
N.bonus <- nrow(TD.train.bonus)
N.bonus <- as.numeric(N.bonus)
if(is.data.frame(TD.train.bonus)==FALSE){
TD.train.bonus <- as.data.frame(TD.train.bonus) # (for ease of calculation)
}
# for each c in C
for(i in 1:length(C.bonus)){
# do Nc <- count docs in class
TD.temp.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[i]),]
Nc.bonus <- nrow(TD.temp.bonus)
Nc.bonus <- as.numeric(Nc.bonus)
names.bonus <- row.names(TD.temp.bonus)
TD.temp.bonus <- as.data.frame(TD.temp.bonus) #set as data frame for easy calculation
TD.temp.bonus$Party <- NULL #remove class label column for easy calculation
TD.temp.bonus <- sapply(TD.temp.bonus, function(x) as.numeric(as.character(x))) #numeric for calc
row.names(TD.temp.bonus) <- names.bonus
# prior[c] <- Nc/N
prior.c.bonus <- Nc.bonus/N.bonus
p.c.bonus[i] <- prior.c.bonus
# for each t in V
for(j in 1:ncol(TD.temp.bonus)){
Nct.bonus <- 0
# do Nct <- count docs in class containing t
if(Nc.bonus == 1)
{
if(TD.temp.bonus[j]>=1){
Nct.bonus <- Nct.bonus + 1
}
else{
Nct.bonus <- Nct.bonus + 0
}
}
else{
for(k in 1:Nc.bonus){
if(TD.temp.bonus[k,j]>=1){
Nct.bonus <- Nct.bonus + 1
}
else{
Nct.bonus <- Nct.bonus + 0
}
}
}
#condprob[t][c] <- (Nct + 1)/(Nc + 2)
condprob.tc.bonus <- (Nct.bonus+1)/(Nc.bonus+2)
cp.tc.bonus[j,i] <- condprob.tc.bonus
}
}
colnames(cp.tc.bonus) <- C.bonus
row.names(cp.tc.bonus) <- V.bonus
#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V.bonus))
## [1] "accession" "add" "adequate" "administration"
## [5] "admitted" "adopted"
print(head(p.c.bonus))
## [1] 0.4705882 0.5294118
print(head(cp.tc.bonus))
## r d
## accession 0.04 0.01785714
## add 0.36 0.44642857
## adequate 0.52 0.53571429
## administration 0.76 0.55357143
## admitted 0.12 0.08928571
## adopted 0.30 0.23214286
Apply Bernoulli
set.seed(393939)
scores.by.doc.bonus <- rep(NA, 5)
classifier.bonus <- rep(NA, 5)
d.bonus <- nrow(TD.test.bonus)
d.bonus <- as.numeric(d.bonus)
for(doc in 1:d.bonus){
doc <- as.numeric(doc)
TD.test.d.bonus <- TD.test.bonus[doc,]
TD.test.d.bonus <- as.data.frame(TD.test.d.bonus)
index.bonus <- which(TD.test.d.bonus >= 1)
Vd.bonus <- colnames(TD.test.d.bonus)[index.bonus]
score.c.bonus <- rep(NA, 2)
for(ii in 1:2)
{
sm.bonus = 1
score.mul.bonus = rep(NA, length(V.bonus) + 1)
score.mul.bonus[sm.bonus] <- log(p.c.bonus[ii])
for(jj in 1:length(V.bonus))
{
sm.bonus = sm.bonus + 1
if(V.bonus[jj] %in% Vd.bonus)
{
score.mul.bonus[sm.bonus] <- log(cp.tc.bonus[jj,ii])
}
else
{
score.mul.bonus[sm.bonus] <- log((1-cp.tc.bonus[jj,ii]))
}
}
score.c.bonus[ii] = sum(score.mul.bonus)
}
scores.by.doc.bonus[doc] <- max(score.c.bonus)
classifier.bonus[doc] <- C[which.is.max(score.c.bonus)]
}
f.dt.o.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"),
president = c("Kennedy","Clinton","Bush","Obama","Trump"))
FINAL RESULT: Bernoulli (Bonus)
## pred_Label actual president
## 1 d d Kennedy
## 2 d d Clinton
## 3 r r Bush
## 4 d d Obama
## 5 d r Trump
The results did not change from part c).
Train Multnomial
TD.train.bonus <- as.data.frame(TD.train.bonus)
# prior[c]
prior.c.bonus <- rep(NA,2)
# V <- extract vocabulary
V.bonus <- colnames(TD.train.bonus[,-c(ncol(TD.train.bonus))])
# P(t|c)
condprob.tc.bonus <-matrix(nrow = length(V.bonus), ncol = 2)
# N <- count documents
N.bonus <- nrow(TD.train.bonus)
# for each c in C
for(c in 1:length(C.bonus))
{
N.c.bonus <- sum(TD.train.bonus$Party == C.bonus[c])
prior.c.bonus[c] <- N.c.bonus/N.bonus
TD.temporary.bonus <- TD.train.bonus[which(TD.train.bonus$Party == C.bonus[c]),] # Current condition
TD.temporary.bonus <- TD.temporary.bonus[,-c(ncol(TD.temporary.bonus))]
#Term Document Matrix without class labels, for ease of calculation
# for each t in V
# count number of tokens of t
column.sums.bonus <- as.data.frame(colSums(TD.temporary.bonus)) #Number of tokens for each term
colnames(column.sums.bonus) <- c("num.tokens")
num.text.bonus <- sum(column.sums.bonus$num.tokens) #length of entire texts
B.bonus <- length(V.bonus) # Number of terms in vocabulary
# for each t in V
for(t in 1:B.bonus)
{
condprob.tc.bonus[t,c] <- (column.sums.bonus[t,1]+1)/(num.text.bonus + B.bonus)
}
}
colnames(condprob.tc.bonus) <- C.bonus
rownames(condprob.tc.bonus) <- V.bonus
#return values
cat("Head of Return Values")
## Head of Return Values
print(head(V.bonus))
## [1] "accession" "add" "adequate" "administration"
## [5] "admitted" "adopted"
print(head(prior.c.bonus))
## [1] 0.4705882 0.5294118
print(head(condprob.tc.bonus))
## r d
## accession 1.411313e-05 6.037371e-06
## add 1.764141e-04 2.113080e-04
## adequate 5.221858e-04 6.097745e-04
## administration 1.305465e-03 1.515380e-03
## admitted 4.233939e-05 3.018686e-05
## adopted 2.963757e-04 1.328222e-04
Apply Multinomial
set.seed(494949)
scores.by.doc.bonus <- rep(NA, nrow(TD.test.bonus))
classifier.bonus <- rep(NA, nrow(TD.test.bonus))
for(doc in 1:nrow(TD.test.bonus))
{
#One Document at a time
TD.test.doc.bonus <- TD.test.bonus[doc, ]
TD.test.doc.bonus <- as.data.frame(TD.test.doc.bonus)
#colnames(TD.test.doc)[doc] <- row.names(TD.test)[doc]
#TD.test.doc$terms <- rownames(TD.test.doc)
condprob.tc.bonus <- as.data.frame(condprob.tc.bonus)
condprob.tc.bonus$terms <- rownames(condprob.tc.bonus)
#Tokens from Test Document
index.bonus <- which(TD.test.doc.bonus >= 1)
W.bonus <- colnames(TD.test.doc.bonus)[index.bonus]
#Conditional Probabilities of Tokens Only
condprob.tc.tokens.bonus <- condprob.tc.bonus[which(condprob.tc.bonus$terms %in% W.bonus),]
#Individual Scores
score.c.bonus <- rep(NA, 2)
for(cond in 1:length(C.bonus))
{
sm.bonus = 1
score.mul.bonus = rep(NA, length(W.bonus) + 1)
score.mul.bonus[sm.bonus] <- log(prior.c.bonus[cond])
for(w in 1:length(W.bonus))
{
sm.bonus = sm.bonus + 1
score.mul.bonus[sm.bonus] <- log((condprob.tc.tokens.bonus[w,cond])^(TD.test.doc.bonus[1,w]))
}
score.c.bonus[cond] = sum(score.mul.bonus)
}
scores.by.doc.bonus[doc] <- max(score.c.bonus)
classifier.bonus[doc] <- C.bonus[which.is.max(score.c.bonus)]
}
f.dt.bonus = data.frame(pred_Label = classifier.bonus, actual = c("d", "d", "r", "d", "r"),
president = c("Kennedy","Clinton","Bush","Obama","Trump"))
FINAL RESULT: Multinomial (Bonus)
## pred_Label actual president
## 1 r d Kennedy
## 2 d d Clinton
## 3 r r Bush
## 4 d d Obama
## 5 d r Trump
The results differ (for JFK) from the set in part c) because the definitions of democrat and republican changed over the years. In the past the ideals of democrats were more republican and the ideals of republicans were more democratic. JFK is the closest data point to the change in party affiliation, so his result are effected the most.