library(C50)
## Warning: package 'C50' was built under R version 3.5.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.1
## Loading required package: rpart
library(ca)
## Warning: package 'ca' was built under R version 3.5.1
library(class)
## Warning: package 'class' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(rpart)
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.1
library(bnlearn)
## Warning: package 'bnlearn' was built under R version 3.5.1
## 
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
## 
##     sigma
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:bnlearn':
## 
##     impute
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(SnowballC)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.1
library(colorspace)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(naivebayes)
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
## 
## Attaching package: 'psych'
## The following object is masked from 'package:kernlab':
## 
##     alpha
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

PART 1

(1) DATA COLLECTION

credit <- read.csv("C:/Users/charl/Downloads/ANLY 530--Lab 2--creditData.csv")
str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

Note that “Account Balance” is ranked 1, “Duration of Credit…” is #2, and “Credit Amount” is ranked #3. These are the top three variables affecting the credit score function in R.

(2) Exploring the Data: Preparing the Data for the Learning Process.

summary(credit$Creditability)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     1.0     0.7     1.0     1.0
hist(credit$Creditability)

m1 <- glm(Creditability ~., family = "binomial", data = credit)
m1
## 
## Call:  glm(formula = Creditability ~ ., family = "binomial", data = credit)
## 
## Coefficients:
##                       (Intercept)                    Account.Balance  
##                        -3.9940768                          0.5799270  
##        Duration.of.Credit..month.  Payment.Status.of.Previous.Credit  
##                        -0.0245701                          0.3821907  
##                           Purpose                      Credit.Amount  
##                         0.0315277                         -0.0000934  
##              Value.Savings.Stocks       Length.of.current.employment  
##                         0.2391122                          0.1517308  
##               Instalment.per.cent               Sex...Marital.Status  
##                        -0.2983367                          0.2573791  
##                        Guarantors        Duration.in.Current.address  
##                         0.3472739                         -0.0141141  
##     Most.valuable.available.asset                        Age..years.  
##                        -0.1828445                          0.0089167  
##                Concurrent.Credits                  Type.of.apartment  
##                         0.2418915                          0.2930602  
##        No.of.Credits.at.this.Bank                         Occupation  
##                        -0.2435882                          0.0188903  
##                  No.of.dependents                          Telephone  
##                        -0.1707594                          0.2946784  
##                    Foreign.Worker  
##                         1.1583058  
## 
## Degrees of Freedom: 999 Total (i.e. Null);  979 Residual
## Null Deviance:       1222 
## Residual Deviance: 956.6     AIC: 998.6

Exploring the DATA and Randomizing the DATA

set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]

Divide DATA to Train and Test Sets. Classification Analysis.

credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

Train and Test Sets- A good randomization should show Close percentages between Splits. Check the proportioj of Data after randomization:

prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
## 
##     0     1 
## 0.256 0.744

(3) Training the Model

m.rpart1 <- rpart(Creditability ~., data = credit_train)
print(m.rpart1)
## n= 750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 750 161.7387000 0.68533330  
##     2) Account.Balance< 2.5 424 104.6415000 0.55660380  
##       4) Duration.of.Credit..month.>=33 85  19.6941200 0.36470590  
##         8) Value.Savings.Stocks< 4.5 73  15.3698600 0.30136990  
##          16) Age..years.< 29.5 26   1.8461540 0.07692308 *
##          17) Age..years.>=29.5 47  11.4893600 0.42553190  
##            34) Duration.in.Current.address>=3.5 24   3.9583330 0.20833330 *
##            35) Duration.in.Current.address< 3.5 23   5.2173910 0.65217390 *
##         9) Value.Savings.Stocks>=4.5 12   2.2500000 0.75000000 *
##       5) Duration.of.Credit..month.< 33 339  81.0324500 0.60471980  
##        10) Payment.Status.of.Previous.Credit< 1.5 38   8.2105260 0.31578950 *
##        11) Payment.Status.of.Previous.Credit>=1.5 301  69.2491700 0.64119600  
##          22) Credit.Amount>=10975.5 8   0.0000000 0.00000000 *
##          23) Credit.Amount< 10975.5 293  65.8703100 0.65870310  
##            46) Duration.of.Credit..month.>=11.5 233  55.4592300 0.60944210  
##              92) Credit.Amount< 1381.5 74  18.1621600 0.43243240  
##               184) Guarantors< 1.5 63  14.3174600 0.34920630  
##                 368) Telephone< 1.5 52  10.2307700 0.26923080 *
##                 369) Telephone>=1.5 11   2.1818180 0.72727270 *
##               185) Guarantors>=1.5 11   0.9090909 0.90909090 *
##              93) Credit.Amount>=1381.5 159  33.8993700 0.69182390 *
##            47) Duration.of.Credit..month.< 11.5 60   7.6500000 0.85000000 *
##     3) Account.Balance>=2.5 326  40.9325200 0.85276070  
##       6) Purpose>=8.5 32   7.2187500 0.65625000  
##        12) Length.of.current.employment< 2.5 8   0.8750000 0.12500000 *
##        13) Length.of.current.employment>=2.5 24   3.3333330 0.83333330 *
##       7) Purpose< 8.5 294  32.3435400 0.87414970 *

(4) Evaluate the Model

credit_pred <- predict(m.rpart1, credit_test)

(p <- table(credit_pred, credit_test$Creditability))
##                     
## credit_pred            0   1
##   0.0769230769230769   7   2
##   0.125                0   1
##   0.208333333333333    4   2
##   0.269230769230769    5   5
##   0.315789473684211    8   3
##   0.652173913043478    4   2
##   0.691823899371069   18  35
##   0.727272727272727    2   0
##   0.75                 1   2
##   0.833333333333333    1   9
##   0.85                 3  15
##   0.874149659863946   11 109
##   0.909090909090909    0   1

Since we are dealing with a 13 x 3 matrix dimension, we are unable to use the following accuracy model:

Accuracy <- sum(diag(p)) / (sum(p) * 100)

Visualization

The tree below shows that account balance, Duration of Credit, and Credit Amount appear to segment the loans well into different risk categories:

rpart.plot(m.rpart1, digits = 3, type = 1)

Our dichotomy is the following:

summary(credit_test$Creditability)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.744   1.000   1.000

Let’s look at the relation between Account.Balance and Duration.of.Credit..month. Two of the top three factors influencing the Credit Score Function in R:

xtabs(~ credit$Account.Balance + credit$Duration.of.Credit..month, data = credit)
##                       credit$Duration.of.Credit..month
## credit$Account.Balance  4  5  6  7  8  9 10 11 12 13 14 15 16 18 20 21 22
##                      1  0  0 22  0  3  8  4  2 53  1  2 13  1 33  3  9  0
##                      2  0  0 14  3  4 20  3  3 44  2  1 13  1 34  4  6  0
##                      3  1  0  7  0  0  3  6  0 13  0  0  6  0  5  0  2  0
##                      4  5  1 32  2  0 18 15  4 69  1  1 32  0 41  1 13  2
##                       credit$Duration.of.Credit..month
## credit$Account.Balance 24 26 27 28 30 33 36 39 40 42 45 47 48 54 60 72
##                      1 56  0  3  1 11  1 22  1  1  5  2  1 14  0  2  0
##                      2 37  1  5  0 14  0 24  2  0  2  3  0 21  1  6  1
##                      3 11  0  0  0  3  0  4  0  0  2  0  0  0  0  0  0
##                      4 80  0  5  2 12  2 33  2  0  2  0  0 13  1  5  0

Let’s change our class variables to factors:

credit$Duration <- as.factor(credit$Duration.of.Credit..month)
credit$Account <- as.factor(credit$Account.Balance)

credit$Creditability <- as.factor(credit$Creditability)
credit$Payment <- as.factor(credit$Payment.Status.of.Previous.Credit)
credit$Purpose <- as.factor(credit$Purpose)
credit$CreditA <- as.factor(credit$Credit.Amount)
credit$ValueS <- as.factor(credit$Value.Savings.Stocks)
credit$Length <- as.factor(credit$Length.of.current.employment)
credit$Instalment <- as.factor(credit$Instalment.per.cent)
credit$Sex <- as.factor(credit$Sex...Marital.Status)
credit$Guarantors <- as.factor(credit$Guarantors)
credit$Durat <- as.factor(credit$Duration.in.Current.address)
credit$Most <- as.factor(credit$Most.valuable.available.asset)
credit$Age <- as.factor(credit$Age..years.)
credit$Con <- as.factor(credit$Concurrent.Credits)
credit$Type <- as.factor(credit$Type.of.apartment)
credit$No <- as.factor(credit$No.of.Credits.at.this.Bank)
credit$Occupation <- as.factor(credit$Occupation)
credit$Number <- as.factor(credit$No.of.dependents)
credit$Telephone <- as.factor(credit$Telephone)
credit$Foreign <- as.factor(credit$Foreign.Worker)

As an alternative, let’s now create a density plot using Age..years and Account.Balance:

credit %>% ggplot(aes(x=Age..years., fill = credit$Account)) + 
  geom_density(alpha=0.8, color = "blue") + ggtitle("Density Plot")

Notice that the account balance was greatest around age 25 years.

Now, let’s create a density plot using only Creditability:

credit %>% ggplot(aes(x = Creditability, fill = credit$Creditability)) + 
  geom_density(alpha=0.8, color = "blue") + ggtitle("Density Plot")

(4) Model Evaluation

cred_pred <- predict(m.rpart1, credit_test)

head(cred_pred)
##        64       498       698         3       521       901 
## 0.8741497 0.8741497 0.8741497 0.2692308 0.8741497 0.6918239

Let’s re-evaluate part of the model:

CrossTable(credit_test$Creditability, credit_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, prop.t = FALSE, dnn = c("predicted Creditability","actual Creditability"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  250 
## 
##  
##                         | actual Creditability 
## predicted Creditability | 0.0769230769230769 |              0.125 |  0.208333333333333 |  0.269230769230769 |  0.315789473684211 |  0.652173913043478 |  0.691823899371069 |  0.727272727272727 |               0.75 |  0.833333333333333 |               0.85 |  0.874149659863946 |  0.909090909090909 |          Row Total | 
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##                       0 |                  7 |                  0 |                  4 |                  5 |                  8 |                  4 |                 18 |                  2 |                  1 |                  1 |                  3 |                 11 |                  0 |                 64 | 
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##                       1 |                  2 |                  1 |                  2 |                  5 |                  3 |                  2 |                 35 |                  0 |                  2 |                  9 |                 15 |                109 |                  1 |                186 | 
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##            Column Total |                  9 |                  1 |                  6 |                 10 |                 11 |                  6 |                 53 |                  2 |                  3 |                 10 |                 18 |                120 |                  1 |                250 | 
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 
## 

(5) Model Improvement

Data Partition

Let us use the naive_bayes() function on our earlier training data to build the Naive Bayes Classifier:

sum(is.na(credit))
## [1] 0

Report: Above, there were no missing values identified.

convert_counts <- function(credit_train) {credit_train <- ifelse (credit_train > 0, "Yes","No")}

Use the Naive Bayes function on our earlier Training Data

set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]

Below, we use the 75%/25% Split:

credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

Naive Bayes Model

model1 <- naive_bayes(credit$Creditability ~ credit$Account, data = credit_train)
model1
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = credit$Creditability ~ credit$Account, 
##     data = credit_train)
## 
## A priori probabilities: 
## 
##   0   1 
## 0.3 0.7 
## 
## Tables: 
##               
## credit$Account          0          1
##              1 0.45000000 0.19857143
##              2 0.35000000 0.23428571
##              3 0.04666667 0.07000000
##              4 0.15333333 0.49714286
model2 <- naive_bayes(credit$Creditability ~ credit$Duration+credit$Payment+credit$Length, data = credit_train)
model2
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = credit$Creditability ~ credit$Duration + 
##     credit$Payment + credit$Length, data = credit_train)
## 
## A priori probabilities: 
## 
##   0   1 
## 0.3 0.7 
## 
## Tables: 
##                
## credit$Duration           0           1
##              4  0.000000000 0.008571429
##              5  0.000000000 0.001428571
##              6  0.030000000 0.094285714
##              7  0.000000000 0.007142857
##              8  0.003333333 0.008571429
##              9  0.046666667 0.050000000
##              10 0.010000000 0.035714286
##              11 0.000000000 0.012857143
##              12 0.163333333 0.185714286
##              13 0.000000000 0.005714286
##              14 0.003333333 0.004285714
##              15 0.040000000 0.074285714
##              16 0.003333333 0.001428571
##              18 0.140000000 0.101428571
##              20 0.003333333 0.010000000
##              21 0.030000000 0.030000000
##              22 0.000000000 0.002857143
##              24 0.186666667 0.182857143
##              26 0.000000000 0.001428571
##              27 0.016666667 0.011428571
##              28 0.003333333 0.002857143
##              30 0.043333333 0.038571429
##              33 0.003333333 0.002857143
##              36 0.123333333 0.065714286
##              39 0.003333333 0.005714286
##              40 0.003333333 0.000000000
##              42 0.010000000 0.011428571
##              45 0.013333333 0.001428571
##              47 0.000000000 0.001428571
##              48 0.093333333 0.028571429
##              54 0.003333333 0.001428571
##              60 0.020000000 0.010000000
##              72 0.003333333 0.000000000
## 
##               
## credit$Payment          0          1
##              0 0.08333333 0.02142857
##              1 0.09333333 0.03000000
##              2 0.56333333 0.51571429
##              3 0.09333333 0.08571429
##              4 0.16666667 0.34714286
## 
##              
## credit$Length          0          1
##             1 0.07666667 0.05571429
##             2 0.23333333 0.14571429
##             3 0.34666667 0.33571429
##             4 0.13000000 0.19285714
##             5 0.21333333 0.27000000

Plot model:

plot(model1)

plot(model2)

Evaluate the credit model’s performance:

# Predict:

credit_pred <- predict(model1, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
##   Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
## 
##  Cross table of credit predictions
head(cbind(credit_pred, train))
##      credit_pred train
## [1,] 2           ?
# Predict with model1:

credit_pred1 <- predict(model1, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
##   Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
## 
##  Cross table of credit predictions
head(cbind(credit_pred1, train))
##      credit_pred1 train
## [1,] 2            ?
# Predict:

credit_pred2 <- predict(model2, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
##   Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
## 
##  Cross table of credit predictions
head(cbind(credit_pred2, train))
##      credit_pred2 train
## [1,] 2            ?

PART 2

Step 1: Exploring and Preparing the Data:

library(caret)
library(minqa)
## Warning: package 'minqa' was built under R version 3.5.1
library(nloptr)
## Warning: package 'nloptr' was built under R version 3.5.1
## 
## Attaching package: 'nloptr'
## The following objects are masked from 'package:minqa':
## 
##     bobyqa, newuoa
library(MatrixModels)
## Warning: package 'MatrixModels' was built under R version 3.5.1
str(credit)
## 'data.frame':    1000 obs. of  37 variables:
##  $ Creditability                    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...
##  $ Duration                         : Factor w/ 33 levels "4","5","6","7",..: 14 6 9 9 9 7 5 3 14 18 ...
##  $ Account                          : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
##  $ Payment                          : Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
##  $ CreditA                          : Factor w/ 923 levels "250","276","338",..: 117 529 73 399 412 424 605 219 126 653 ...
##  $ ValueS                           : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
##  $ Length                           : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment                       : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex                              : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
##  $ Durat                            : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
##  $ Most                             : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
##  $ Age                              : Factor w/ 53 levels "19","20","21",..: 3 18 5 21 20 30 21 22 47 5 ...
##  $ Con                              : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Type                             : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
##  $ No                               : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
##  $ Number                           : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
##  $ Foreign                          : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...
sum(is.na(credit))
## [1] 0
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]

Let’s compute the correlation matrix:

## creditScaled <- scale(credit_rand[ ,2:ncol(credit_rand)], center = TRUE, scale = TRUE)

## m <- cor(creditScaled)

## highlycor <- findCorrelation(m, 0.30)

Part 3

News Popularity

Import the Data:

news <- read.csv("C:/Users/charl/Downloads/ANLY 530--OnlineNewsPopularity.csv")
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : int  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : int  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : int  219 255 211 531 1072 370 960 989 97 231 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : int  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : int  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : int  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : int  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: int  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Import the Data

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

Pre-Processing the Data

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular

newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]

Let us start the classification analysis:

#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

Check the proportion of Data after randomization:

prop.table(table(news_train$shares))
## 
##        no       yes 
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
## 
##    no   yes 
## 0.414 0.586

Let us Train the Model:

news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Jul 27 21:31:57 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = no: no (3878)
## popular = yes: yes (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class no
##          5122    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.1 secs

Evaluate the Model:

news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##          
## news_pred  no yes
##       no  414   0
##       yes   0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100

Since the accuracy is 100%, we conclude there is something wrong with the model.

Let us work on improving the Naive Bayes Classifier:

We, first, randomize the data:

news_rand <- newsShort[order(runif(10000)),]

Let’s scale the data:

## newsScaled <- scale(news_rand[2:ncol(news_rand)][-1], center = TRUE, scale = TRUE)