# Load the credit data set from the home directory and print the type and
# leading values of every column.
creditData <- read.csv(file = "~/creditData.csv")
str(object = creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
# Turn the outcome into a class label (factor) instead of an integer, then
# count the missing cells across the whole data frame.
creditData[["Creditability"]] <- as.factor(creditData[["Creditability"]])
sum(is.na(creditData))
## [1] 0
# Shuffle the rows reproducibly and split them 75% / 25% into training and
# test sets. The sizes are derived from the data instead of the literals
# 1000 / 750, so a file with a different number of rows no longer produces
# NA-filled rows or silently discards observations. For the 1000-row file
# the result is unchanged (750 training rows, 250 test rows).
set.seed(12345)
n_obs <- nrow(creditData)
n_train <- floor(0.75 * n_obs)
credit_rand <- creditData[order(runif(n_obs)), ]
credit_train <- credit_rand[seq_len(n_train), ]
# Logical mask instead of (n_train + 1):n_obs so that an empty remainder
# yields zero rows rather than a descending index sequence.
credit_test <- credit_rand[seq_len(n_obs) > n_train, ]
# Class balance of each partition.
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.256 0.744
library(naivebayes)
# Fit a naive Bayes classifier for Creditability on every other column of
# the training set and print the fitted model.
# NOTE(review): the predictors are still integer columns, so the printed
# tables are mean/sd summaries even for coded categories such as
# Account.Balance -- confirm that treating them as numeric is intended.
naive_model <- naive_bayes(Creditability ~ ., data = credit_train)
print(naive_model)
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## Tables:
##
## Account.Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
##
## Duration.of.Credit..month. 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
##
## Payment.Status.of.Previous.Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
##
## Credit.Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## # ... and 15 more tables
# Cross-tabulate the predicted class (rows) against the true class (columns)
# for the held-out rows.
conf_nat <- table(
  predict(naive_model, credit_test),
  credit_test$Creditability
)
print(conf_nat)
##
## 0 1
## 0 42 35
## 1 22 151
# Accuracy in percent: the diagonal cells hold the correct predictions.
Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100
print(Accuracy)
## [1] 77.2
# Part 2: drop a highly correlated predictor and refit
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Standardise every column except the first (Creditability), compute the
# correlation matrix of those predictors, and ask caret which column
# positions it suggests removing at a 0.30 correlation cutoff.
# NOTE(review): this uses all rows of credit_rand, including the rows that
# are later held out as the test set.
creditDataScaled <- scale(credit_rand[, -1], center = TRUE, scale = TRUE)
m <- cor(creditDataScaled)
highlycor <- findCorrelation(m, cutoff = 0.30)
print(highlycor)
## [1] 5 12 19 15 3
# Remove one flagged predictor, redo the 750 / 250 split, refit the model
# and tabulate its test-set predictions. The +1 maps a position in the
# predictor-only correlation matrix back to a column of credit_rand, whose
# first column is Creditability.
# NOTE(review): only the 4th entry of highlycor is dropped -- confirm that
# removing a single column, rather than every flagged one, is intended.
filteredData <- credit_rand[, -(highlycor[4] + 1)]
filteredTraining <- filteredData[seq_len(750), ]
filteredTest <- filteredData[seq(751, 1000), ]
nb_model <- naive_bayes(Creditability ~ ., data = filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
# Store the confusion matrix for the filtered model (predictions in rows,
# true classes in columns) and report accuracy in percent.
conf_nat <- table(filteredTestPred, filteredTest$Creditability)
print(conf_nat)
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100
print(Accuracy)
## [1] 77.2
# Part 3: naive Bayes on the Online News Popularity data
# Read the Online News Popularity data and keep the 16 predictors used below
# plus the `shares` count. The columns are selected by name from a single
# vector, so the list is written only once and a missing or misspelled
# column stops with an "undefined columns selected" error instead of being
# partially matched by `$`.
news <- read.csv("~/OnlineNewsPopularity.csv")
news_cols <- c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares"
)
newsShort <- news[, news_cols]
# Recode `shares` into the class label: "yes" for articles with at least
# 1400 shares, "no" otherwise.
#
# The label is written straight into `shares`; no helper column is left in
# the data frame. Previously a `popular` column holding the very same label
# stayed in newsShort, so `shares ~ .` used the answer as a predictor
# (target leakage) and the classifier scored a meaningless 100%.
# The comparison is vectorised, so it covers however many rows newsShort
# has instead of assuming exactly 39644.
#
# NOTE: the console output recorded further down in this script (17
# predictor tables, perfect confusion matrix, accuracy 100) was produced
# with the leaked column and must be regenerated.
newsShort$shares <- factor(
  ifelse(newsShort$shares >= 1400, "yes", "no"),
  levels = c("no", "yes")
)
# Shuffle 10000 rows reproducibly, train on 9000 of them, hold out the other
# 1000, then fit and print a naive Bayes model for `shares`.
# NOTE(review): order(runif(10000)) is a permutation of 1:10000, so only the
# first 10000 rows of newsShort can ever be selected; if the data frame has
# more rows, the rest are never used -- confirm this subset is intended.
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
news_train <- news_rand[seq_len(9000), ]
news_test <- news_rand[seq(9001, 10000), ]
nb_model <- naive_bayes(shares ~ ., data = news_train)
print(nb_model)
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = shares ~ ., data = news_train)
##
## A priori probabilities:
##
## no yes
## 0.4308889 0.5691111
##
## Tables:
##
## n_tokens_title no yes
## mean 9.852243 9.697384
## sd 1.934198 1.984855
##
##
## n_tokens_content no yes
## mean 456.2597 507.5463
## sd 356.0001 438.3436
##
##
## n_unique_tokens no yes
## mean 0.5702261 0.5544309
## sd 0.1123952 0.1235964
##
##
## n_non_stop_words no yes
## mean 0.99432697 0.99043342
## sd 0.07511524 0.09734940
##
##
## num_hrefs no yes
## mean 9.142599 10.496486
## sd 8.655057 11.309569
##
## # ... and 12 more tables
# Predict the held-out articles, cross-tabulate predictions (rows) against
# the true labels (columns) and report accuracy in percent.
# NOTE(review): a confusion matrix with no errors at all is a red flag --
# check that no column derived from the label is among the predictors.
news_Pred <- predict(nb_model, newdata = news_test)
conf_nat <- table(news_Pred, news_test$shares)
print(conf_nat)
##
## news_Pred no yes
## no 414 0
## yes 0 586
Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100
print(Accuracy)
## [1] 100
# You can also embed plots, for example:
# Note that the `echo = FALSE` parameter was added to the code chunk to
# prevent printing of the R code that generated the plot.