Machine Learning for Credit Data

##Step :1
library(tidyverse)

## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

creditData <- read_csv("creditData.csv")## reading the data

## Parsed with column specification:
## cols(
##   .default = col_integer()
## )

## See spec(...) for full column specifications.

str(creditData) ##getting the structure of the data

## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration of Credit (month)       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment Status of Previous Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value Savings/Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length of current employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment per cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex & Marital Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration in Current address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most valuable available asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age (years)                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type of apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No of Credits at this Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No of dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 21
##   .. ..$ Creditability                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Account Balance                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Duration of Credit (month)       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Payment Status of Previous Credit: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Purpose                          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Credit Amount                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Value Savings/Stocks             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Length of current employment     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Instalment per cent              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Sex & Marital Status             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Guarantors                       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Duration in Current address      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Most valuable available asset    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Age (years)                      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Concurrent Credits               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Type of apartment                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ No of Credits at this Bank       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Occupation                       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ No of dependents                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Telephone                        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Foreign Worker                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

sum(is.na(creditData)) ## checking if any missing values

## [1] 0

## there are no missing values in the data.
## Following command is to convert the target variable into categorical 
creditData$Creditability <- as.factor(creditData$Creditability) 
## Randomizing the data before splitting it to training and test
set.seed(12345)
creditData_rand <- creditData[order(runif(1000)),]
## Splitting the data into test(25%) and train (75%)
summary(creditData)

##  Creditability Account Balance Duration of Credit (month)
##  0:300         Min.   :1.000   Min.   : 4.0              
##  1:700         1st Qu.:1.000   1st Qu.:12.0              
##                Median :2.000   Median :18.0              
##                Mean   :2.577   Mean   :20.9              
##                3rd Qu.:4.000   3rd Qu.:24.0              
##                Max.   :4.000   Max.   :72.0              
##  Payment Status of Previous Credit    Purpose       Credit Amount  
##  Min.   :0.000                     Min.   : 0.000   Min.   :  250  
##  1st Qu.:2.000                     1st Qu.: 1.000   1st Qu.: 1366  
##  Median :2.000                     Median : 2.000   Median : 2320  
##  Mean   :2.545                     Mean   : 2.828   Mean   : 3271  
##  3rd Qu.:4.000                     3rd Qu.: 3.000   3rd Qu.: 3972  
##  Max.   :4.000                     Max.   :10.000   Max.   :18424  
##  Value Savings/Stocks Length of current employment Instalment per cent
##  Min.   :1.000        Min.   :1.000                Min.   :1.000      
##  1st Qu.:1.000        1st Qu.:3.000                1st Qu.:2.000      
##  Median :1.000        Median :3.000                Median :3.000      
##  Mean   :2.105        Mean   :3.384                Mean   :2.973      
##  3rd Qu.:3.000        3rd Qu.:5.000                3rd Qu.:4.000      
##  Max.   :5.000        Max.   :5.000                Max.   :4.000      
##  Sex & Marital Status   Guarantors    Duration in Current address
##  Min.   :1.000        Min.   :1.000   Min.   :1.000              
##  1st Qu.:2.000        1st Qu.:1.000   1st Qu.:2.000              
##  Median :3.000        Median :1.000   Median :3.000              
##  Mean   :2.682        Mean   :1.145   Mean   :2.845              
##  3rd Qu.:3.000        3rd Qu.:1.000   3rd Qu.:4.000              
##  Max.   :4.000        Max.   :3.000   Max.   :4.000              
##  Most valuable available asset  Age (years)    Concurrent Credits
##  Min.   :1.000                 Min.   :19.00   Min.   :1.000     
##  1st Qu.:1.000                 1st Qu.:27.00   1st Qu.:3.000     
##  Median :2.000                 Median :33.00   Median :3.000     
##  Mean   :2.358                 Mean   :35.54   Mean   :2.675     
##  3rd Qu.:3.000                 3rd Qu.:42.00   3rd Qu.:3.000     
##  Max.   :4.000                 Max.   :75.00   Max.   :3.000     
##  Type of apartment No of Credits at this Bank   Occupation   
##  Min.   :1.000     Min.   :1.000              Min.   :1.000  
##  1st Qu.:2.000     1st Qu.:1.000              1st Qu.:3.000  
##  Median :2.000     Median :1.000              Median :3.000  
##  Mean   :1.928     Mean   :1.407              Mean   :2.904  
##  3rd Qu.:2.000     3rd Qu.:2.000              3rd Qu.:3.000  
##  Max.   :3.000     Max.   :4.000              Max.   :4.000  
##  No of dependents   Telephone     Foreign Worker 
##  Min.   :1.000    Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000    1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000    Median :1.000   Median :1.000  
##  Mean   :1.155    Mean   :1.404   Mean   :1.037  
##  3rd Qu.:1.000    3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :2.000    Max.   :2.000   Max.   :2.000

summary(creditData_rand)

##  Creditability Account Balance Duration of Credit (month)
##  0:300         Min.   :1.000   Min.   : 4.0              
##  1:700         1st Qu.:1.000   1st Qu.:12.0              
##                Median :2.000   Median :18.0              
##                Mean   :2.577   Mean   :20.9              
##                3rd Qu.:4.000   3rd Qu.:24.0              
##                Max.   :4.000   Max.   :72.0              
##  Payment Status of Previous Credit    Purpose       Credit Amount  
##  Min.   :0.000                     Min.   : 0.000   Min.   :  250  
##  1st Qu.:2.000                     1st Qu.: 1.000   1st Qu.: 1366  
##  Median :2.000                     Median : 2.000   Median : 2320  
##  Mean   :2.545                     Mean   : 2.828   Mean   : 3271  
##  3rd Qu.:4.000                     3rd Qu.: 3.000   3rd Qu.: 3972  
##  Max.   :4.000                     Max.   :10.000   Max.   :18424  
##  Value Savings/Stocks Length of current employment Instalment per cent
##  Min.   :1.000        Min.   :1.000                Min.   :1.000      
##  1st Qu.:1.000        1st Qu.:3.000                1st Qu.:2.000      
##  Median :1.000        Median :3.000                Median :3.000      
##  Mean   :2.105        Mean   :3.384                Mean   :2.973      
##  3rd Qu.:3.000        3rd Qu.:5.000                3rd Qu.:4.000      
##  Max.   :5.000        Max.   :5.000                Max.   :4.000      
##  Sex & Marital Status   Guarantors    Duration in Current address
##  Min.   :1.000        Min.   :1.000   Min.   :1.000              
##  1st Qu.:2.000        1st Qu.:1.000   1st Qu.:2.000              
##  Median :3.000        Median :1.000   Median :3.000              
##  Mean   :2.682        Mean   :1.145   Mean   :2.845              
##  3rd Qu.:3.000        3rd Qu.:1.000   3rd Qu.:4.000              
##  Max.   :4.000        Max.   :3.000   Max.   :4.000              
##  Most valuable available asset  Age (years)    Concurrent Credits
##  Min.   :1.000                 Min.   :19.00   Min.   :1.000     
##  1st Qu.:1.000                 1st Qu.:27.00   1st Qu.:3.000     
##  Median :2.000                 Median :33.00   Median :3.000     
##  Mean   :2.358                 Mean   :35.54   Mean   :2.675     
##  3rd Qu.:3.000                 3rd Qu.:42.00   3rd Qu.:3.000     
##  Max.   :4.000                 Max.   :75.00   Max.   :3.000     
##  Type of apartment No of Credits at this Bank   Occupation   
##  Min.   :1.000     Min.   :1.000              Min.   :1.000  
##  1st Qu.:2.000     1st Qu.:1.000              1st Qu.:3.000  
##  Median :2.000     Median :1.000              Median :3.000  
##  Mean   :1.928     Mean   :1.407              Mean   :2.904  
##  3rd Qu.:2.000     3rd Qu.:2.000              3rd Qu.:3.000  
##  Max.   :3.000     Max.   :4.000              Max.   :4.000  
##  No of dependents   Telephone     Foreign Worker 
##  Min.   :1.000    Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000    1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000    Median :1.000   Median :1.000  
##  Mean   :1.155    Mean   :1.404   Mean   :1.037  
##  3rd Qu.:1.000    3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :2.000    Max.   :2.000   Max.   :2.000

creditData_train <- creditData_rand[1:750, ]
creditData_test <- creditData_rand[751:1000, ]
## Table for the target variable in test and train data
prop.table(table(creditData_train$Creditability))

## 
##         0         1 
## 0.3146667 0.6853333

prop.table(table(creditData_test$Creditability))

## 
##     0     1 
## 0.256 0.744

## Step:2
## Training a model on the Data
library(naivebayes)
naive_model <- naive_bayes(as.character(Creditability) ~ ., data= creditData_train)
naive_model

## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = as.character(Creditability) ~ ., 
##     data = creditData_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration of Credit (month)        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment Status of Previous Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##              
## Credit Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## # ... and 15 more tables

## Model Evaluation
## Step:3
(conf_nat <- table(predict(naive_model, creditData_test), creditData_test$Creditability))

##    
##       0   1
##   0  42  35
##   1  22 151

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 77.2

## The output of confusion matrix shows that in 42+151 = 193 cases out of 250 cases give the right prediction which is 77.2 %. The same is shown using accuracy command.

## Part-2
library("caret")

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

creditDataScaled <- scale(creditData_rand[,2:ncol(creditData_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))

## [1]  5 12 19 15  3

## Data Exploration
filteredData <- creditData_rand[, -(highlycor[4]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]

## Training the Data
nb_model <- naive_bayes(as.character(Creditability) ~ ., data=filteredTraining)

## Model Evaluation
filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)

##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152

(conf_nat <- table(filteredTestPred, filteredTest$Creditability))

##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 77.2

## Both models have the same accuracy.

## Part-3

## Data Preparation
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
  
}

newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)

## Part-3 
## Random the data before splitting
news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)

##Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

## Model design
nb_model <- naive_bayes(shares ~ ., data=news_train)
nb_model

## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = shares ~ ., data = news_train)
## 
## A priori probabilities: 
## 
##        no       yes 
## 0.4291111 0.5708889 
## 
## Tables: 
##               
## n_tokens_title       no      yes
##           mean 9.820559 9.695991
##           sd   1.929249 1.987754
## 
##                 
## n_tokens_content       no      yes
##             mean 452.2315 515.1051
##             sd   347.1779 450.0206
## 
##                
## n_unique_tokens        no       yes
##            mean 0.5702437 0.5542023
##            sd   0.1127776 0.1232687
## 
##                 
## n_non_stop_words         no        yes
##             mean 0.99404453 0.99124172
##             sd   0.07695147 0.09318398
## 
##          
## num_hrefs        no       yes
##      mean  9.147851 10.570650
##      sd    8.644083 11.540711
## 
## # ... and 12 more tables

## Model Evaluation
news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))

##          
## news_Pred  no yes
##       no  430   0
##       yes   0 570

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 100

summary(news$shares)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     946    1400    3395    2800  843300

# The model on Online news data gives an acciracy of 100%. This is not usual to get 100% accuracy and it should not be considered as perfect, rather there is something unusual in the dataset causing high accuracy. If we check the variance of the variable "shares" in the original data, we see a lot of variance and then we changed it into a binary variable for classifying.