##Step :1
library(tidyverse)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
creditData <- read_csv("creditData.csv")## reading the data
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
str(creditData) ##getting the structure of the data
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration of Credit (month) : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment Status of Previous Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value Savings/Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length of current employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment per cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex & Marital Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration in Current address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most valuable available asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age (years) : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type of apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No of Credits at this Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No of dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign Worker : int 1 1 1 2 2 2 2 2 1 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 21
## .. ..$ Creditability : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Account Balance : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Duration of Credit (month) : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Payment Status of Previous Credit: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Credit Amount : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Value Savings/Stocks : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Length of current employment : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Instalment per cent : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Sex & Marital Status : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Guarantors : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Duration in Current address : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Most valuable available asset : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Age (years) : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Concurrent Credits : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Type of apartment : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ No of Credits at this Bank : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Occupation : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ No of dependents : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Telephone : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Foreign Worker : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
sum(is.na(creditData)) ## checking if any missing values
## [1] 0
## there are no missing values in the data.
## Following command is to convert the target variable into categorical
creditData$Creditability <- as.factor(creditData$Creditability)
## Randomizing the data before splitting it to training and test
set.seed(12345)
creditData_rand <- creditData[order(runif(1000)),]
## Splitting the data into test(25%) and train (75%)
summary(creditData)
## Creditability Account Balance Duration of Credit (month)
## 0:300 Min. :1.000 Min. : 4.0
## 1:700 1st Qu.:1.000 1st Qu.:12.0
## Median :2.000 Median :18.0
## Mean :2.577 Mean :20.9
## 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :4.000 Max. :72.0
## Payment Status of Previous Credit Purpose Credit Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value Savings/Stocks Length of current employment Instalment per cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex & Marital Status Guarantors Duration in Current address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most valuable available asset Age (years) Concurrent Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type of apartment No of Credits at this Bank Occupation
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :1.000 Median :3.000
## Mean :1.928 Mean :1.407 Mean :2.904
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000 Max. :4.000
## No of dependents Telephone Foreign Worker
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000
## Mean :1.155 Mean :1.404 Mean :1.037
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000 Max. :2.000
summary(creditData_rand)
## Creditability Account Balance Duration of Credit (month)
## 0:300 Min. :1.000 Min. : 4.0
## 1:700 1st Qu.:1.000 1st Qu.:12.0
## Median :2.000 Median :18.0
## Mean :2.577 Mean :20.9
## 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :4.000 Max. :72.0
## Payment Status of Previous Credit Purpose Credit Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value Savings/Stocks Length of current employment Instalment per cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex & Marital Status Guarantors Duration in Current address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most valuable available asset Age (years) Concurrent Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type of apartment No of Credits at this Bank Occupation
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :1.000 Median :3.000
## Mean :1.928 Mean :1.407 Mean :2.904
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000 Max. :4.000
## No of dependents Telephone Foreign Worker
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000
## Mean :1.155 Mean :1.404 Mean :1.037
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000 Max. :2.000
creditData_train <- creditData_rand[1:750, ]
creditData_test <- creditData_rand[751:1000, ]
## Table for the target variable in test and train data
prop.table(table(creditData_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(creditData_test$Creditability))
##
## 0 1
## 0.256 0.744
## Step:2
## Training a model on the Data
library(naivebayes)
naive_model <- naive_bayes(as.character(Creditability) ~ ., data= creditData_train)
naive_model
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = as.character(Creditability) ~ .,
## data = creditData_train)
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## Tables:
##
## Account Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
##
## Duration of Credit (month) 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
##
## Payment Status of Previous Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
##
## Credit Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## # ... and 15 more tables
## Model Evaluation
## Step:3
(conf_nat <- table(predict(naive_model, creditData_test), creditData_test$Creditability))
##
## 0 1
## 0 42 35
## 1 22 151
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
## The output of confusion matrix shows that in 42+151 = 193 cases out of 250 cases give the right prediction which is 77.2 %. The same is shown using accuracy command.
## Part-2
library("caret")
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
creditDataScaled <- scale(creditData_rand[,2:ncol(creditData_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))
## [1] 5 12 19 15 3
## Data Exploration
filteredData <- creditData_rand[, -(highlycor[4]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
## Training the Data
nb_model <- naive_bayes(as.character(Creditability) ~ ., data=filteredTraining)
## Model Evaluation
filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
(conf_nat <- table(filteredTestPred, filteredTest$Creditability))
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
## Both models have the same accuracy.
## Part-3
## Data Preparation
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
## Part-3
## Random the data before splitting
news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)
##Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
## Model design
nb_model <- naive_bayes(shares ~ ., data=news_train)
nb_model
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = shares ~ ., data = news_train)
##
## A priori probabilities:
##
## no yes
## 0.4291111 0.5708889
##
## Tables:
##
## n_tokens_title no yes
## mean 9.820559 9.695991
## sd 1.929249 1.987754
##
##
## n_tokens_content no yes
## mean 452.2315 515.1051
## sd 347.1779 450.0206
##
##
## n_unique_tokens no yes
## mean 0.5702437 0.5542023
## sd 0.1127776 0.1232687
##
##
## n_non_stop_words no yes
## mean 0.99404453 0.99124172
## sd 0.07695147 0.09318398
##
##
## num_hrefs no yes
## mean 9.147851 10.570650
## sd 8.644083 11.540711
##
## # ... and 12 more tables
## Model Evaluation
news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))
##
## news_Pred no yes
## no 430 0
## yes 0 570
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 100
summary(news$shares)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 946 1400 3395 2800 843300
# The model on Online news data gives an acciracy of 100%. This is not usual to get 100% accuracy and it should not be considered as perfect, rather there is something unusual in the dataset causing high accuracy. If we check the variance of the variable "shares" in the original data, we see a lot of variance and then we changed it into a binary variable for classifying.