Laboratory 2: Naive Bayes Classifiers Part 1

Load the required packages

library("dplyr")
library("ggplot2")
library("stringr")
library("moments")
library("car")
library("vcd")
library(C50)
library(gmodels)
library("grid")
library("vcdExtra")
library("logmult")
library("MASS")
library("rpart")
library("rpart.plot")
library("kernlab")
library("caret")
library("corrplot")
library("rsample")
library(robustbase)
library("naivebayes")

Q1: Import the data into R and make sure the target variable is converted to a categorical variable

creditData <- read.csv("C:/Users/Monica/Desktop/creditData.csv")
str(creditData)                # inspect the variable types
sum(is.na(creditData))         # count missing values
creditData$Creditability <- as.factor(creditData$Creditability)  # target as factor

Process the data and convert the target variable to categorical.
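
As a quick heuristic sketch (not applied here, since the correlation-filtering step below needs the predictors to stay numeric), integer columns with only a few distinct values are likely categorical codes and could be listed like this; the threshold of 5 is an arbitrary assumption:

likely_categorical <- names(creditData)[sapply(creditData, function(x)
  is.numeric(x) && length(unique(x)) <= 5)]   # few distinct values suggests a coded category
likely_categorical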

Split the data into a 75% training portion and a 25% testing portion.

set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]   # shuffle the 1000 rows
credit_train <- credit_rand[1:750, ]              # first 750 rows for training
credit_test <- credit_rand[751:1000, ]            # remaining 250 rows for testing

Training proportion

prop.table(table(credit_train$Creditability))

Testing proportion

prop.table(table(credit_test$Creditability))

###Q2: From the results above, we can see that the class distributions in the training and testing sets are similar to each other, so we can say that the distribution of both classes is preserved across the split.
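
The random shuffle happened to preserve the class balance here; a stratified split guarantees it by construction. A minimal sketch using the rsample package loaded above (the split object and variable names are our own, not part of the original lab):

set.seed(12345)
credit_split <- initial_split(creditData, prop = 0.75, strata = Creditability)  # stratified 75/25 split
strat_train <- training(credit_split)
strat_test <- testing(credit_split)
prop.table(table(strat_train$Creditability))  # class proportions match by construction
prop.table(table(strat_test$Creditability))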

Training a model on the data

naive_model <- naive_bayes(Creditability ~ ., data = credit_train)
naive_model
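
One optional refinement: if some predictor level never co-occurs with one of the classes in the training data, its estimated conditional probability is zero and it vetoes that class outright. The laplace argument of naive_bayes() applies additive smoothing to avoid this; a small sketch (the value 1 is an arbitrary choice, not from the original lab):

# Laplace (add-one) smoothing to avoid zero conditional probabilities
naive_model_laplace <- naive_bayes(Creditability ~ ., data = credit_train, laplace = 1)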

Evaluate the model using a confusion matrix and compute its accuracy

(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))  # confusion matrix
(Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100)  # accuracy in percent
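
Since caret is already loaded, its confusionMatrix() returns the same table along with sensitivity, specificity, and kappa in one call; a quick sketch:

confusionMatrix(predict(naive_model, credit_test), credit_test$Creditability)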

The accuracy here is reasonable, but before settling on this model we can try to improve it by filtering correlated predictors and training again.

Method 1: Drop predictors with pairwise correlation above 0.3

First we scale the credit data (column 1, the Creditability factor, is excluded):

creditDataScaled <- scale(credit_rand[, 2:ncol(credit_rand)], center = TRUE, scale = TRUE)

m <- cor(creditDataScaled)              # correlation matrix of the scaled predictors
(highlycor <- findCorrelation(m, 0.30)) # columns with pairwise correlation above 0.30
filteredData <- credit_rand[, -(highlycor[5] + 1)]  # drop the fifth flagged column (+1 offsets the Creditability column)
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
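
As corrplot is already loaded, the correlation matrix can also be inspected visually before settling on a cutoff; a quick sketch:

corrplot(m, tl.cex = 0.6)  # visualize pairwise correlations among the scaled predictors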

Then we check the class proportions of the training and testing data:

prop.table(table(filteredTraining$Creditability))
prop.table(table(filteredTest$Creditability))
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
conf_nat <- table(filteredTestPred, filteredTest$Creditability)
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

Q3: This gives a new accuracy figure. It is not very different from the previous one.

Part 3: Support Vector Machines

letters <- read.csv("C:/Users/Monica/Desktop/letterdata.csv")
str(letters)
letters_train <- letters[1:18000, ]    # first 18,000 rows for training
letters_test <- letters[18001:20000, ] # remaining 2,000 rows for testing
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")  # linear kernel
letter_classifier

Evaluating Model Performance

letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)

Q4: We also tried the polynomial and RBF kernels; their results were not more accurate than the linear kernel above. Because the full confusion table does not directly show how many predictions match the actual letters, we evaluate agreement this way:

agreement <- letter_predictions == letters_test$letter  # TRUE where the prediction matches the actual letter
table(agreement)
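
To reproduce the kernel comparison mentioned in Q4, the same classifier can be retrained with kernlab's polynomial and RBF kernels; a sketch (the kernel names are kernlab's, and training takes noticeably longer):

poly_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "polydot")  # polynomial kernel
rbf_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")    # Gaussian RBF kernel
mean(predict(poly_classifier, letters_test) == letters_test$letter)  # agreement rate
mean(predict(rbf_classifier, letters_test) == letters_test$letter)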

###Part 4: Online News Popularity

news <- read.csv("C:/Users/Monica/Desktop/Online.csv")
str(news)
sum(is.na(news))

Data Preparation

news2 <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
                  "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
                  "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
                  "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
                  "abs_title_sentiment_polarity", "shares")]
# Label articles with at least 1400 shares as "yes" (popular), the rest as "no"
news2$shares <- factor(ifelse(news2$shares >= 1400, "yes", "no"))
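
A quick sanity check on the recoding:

table(news2$shares)  # counts of "no" vs "yes" articles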

Split the news data and train the model

set.seed(12345)
news_rand <- news2[order(runif(10000)), ]  # shuffle; note this uses only the first 10,000 articles
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
nb_model <- naive_bayes(shares ~ ., data=news_train)
nb_model
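
Because all of these predictors are numeric, naive_bayes() fits a Gaussian density per class for each of them. The fitted class-conditional distributions can be inspected with the package's tables() helper; for example:

tables(nb_model, which = 1)  # class-conditional mean and sd of the first predictor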

Evaluate the model for the news data

news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))

Check the accuracy of the model

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

###Q5: After completing all of the methods, across the analyses we conducted, the second Naive Bayes model achieved higher accuracy than the first one.
