library("dplyr")
library("ggplot2")
library("stringr")
library("moments")
library("car")
library("vcd")
library("C50")
library("gmodels")
library("grid")
library("vcdExtra")
library("logmult")
library("MASS")
library("rpart")
library("rpart.plot")
library("kernlab")
library("caret")
library("corrplot")
library("rsample")
library("robustbase")
library("naivebayes")
creditData <- read.csv("C:/Users/Monica/Desktop/creditData.csv") ### load the German credit data
str(creditData)
sum(is.na(creditData)) ### confirm there are no missing values
creditData$Creditability <- as.factor(creditData$Creditability) ### the target must be a factor for classification
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ] ### shuffle all 1,000 rows
credit_train <- credit_rand[1:750, ] ### 75% for training
credit_test <- credit_rand[751:1000, ] ### 25% for testing
prop.table(table(credit_train$Creditability))
prop.table(table(credit_test$Creditability))
###Q2: The class proportions in the training and test sets are very close to each other, so the random split preserves the class distribution in both the training and the testing data.
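###Optional: a stratified split guarantees the class balance instead of relying on chance. A minimal sketch using the rsample package loaded above (the object names here are illustrative; the 75/25 split mirrors the manual one):
credit_split <- initial_split(creditData, prop = 0.75, strata = Creditability)
credit_train_strat <- training(credit_split)
credit_test_strat <- testing(credit_split)
prop.table(table(credit_train_strat$Creditability))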
naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model
(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
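###Optional: caret's confusionMatrix() (caret is loaded above) reports kappa, sensitivity, and specificity in addition to raw accuracy; a minimal sketch on the same predictions:
confusionMatrix(predict(naive_model, credit_test), credit_test$Creditability)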
creditDataScaled <- scale(credit_rand[, 2:ncol(credit_rand)], center = TRUE, scale = TRUE) ### standardize the predictors; column 1 is the class factor
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, cutoff = 0.30)) ### indices of predictors flagged as highly correlated
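###Optional sanity check: visualize the correlation matrix with corrplot (loaded above) to see which predictors are being flagged:
corrplot(m, method = "circle", tl.cex = 0.6)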
filteredData <- credit_rand[, -(highlycor[5] + 1)] ### drop only the fifth flagged predictor; the +1 offsets for the Creditability column, which was excluded from the correlation matrix
filteredTraining <- filteredData[1:750,]
filteredTest <- filteredData[751:1000,]
prop.table(table(filteredTraining$Creditability))
prop.table(table(filteredTest$Creditability))
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
conf_nat <- table(filteredTestPred, filteredTest$Creditability)
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
letters <- read.csv("C:/Users/Monica/Desktop/letterdata.csv")
str(letters)
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
letter_classifier
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
agreement <- letter_predictions == letters_test$letter
table(agreement)
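###The agreement table converts directly into an accuracy rate:
prop.table(table(agreement))
###Optional: a Gaussian (RBF) kernel often outperforms the linear kernel on this task; a sketch that keeps everything else the same:
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
mean(predict(letter_classifier_rbf, letters_test) == letters_test$letter)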
###Part 4: Online News Popularity
news <- read.csv("C:/Users/Monica/Desktop/Online.csv")
str(news)
sum(is.na(news))
news2 <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")]
news2$popular <- ifelse(news2$shares >= 1400, "yes", "no") ### articles with at least 1,400 shares count as popular
news2$shares <- as.factor(news2$popular) ### replace the raw share counts with the binary label
news2 <- subset(news2, select = -popular)
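###Quick check that the binary target is reasonably balanced before modeling:
prop.table(table(news2$shares))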
set.seed(12345) ### set the seed before shuffling so the split is reproducible
news_rand <- news2[order(runif(10000)), ] ### shuffles (only) the first 10,000 rows of the data
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
nb_model <- naive_bayes(shares ~ ., data=news_train)
nb_model
news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
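###Optional: naive_bayes() fits Gaussian densities to numeric features by default; usekernel = TRUE switches to kernel density estimates, which can fit skewed features like these better. A minimal sketch (the object name is illustrative):
nb_model_kde <- naive_bayes(shares ~ ., data = news_train, usekernel = TRUE)
mean(predict(nb_model_kde, newdata = news_test) == news_test$shares)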
###After completing all the methods: across the analyses we conducted, the second naive Bayes model has higher accuracy than the first one, making the results more accurate.