library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(vcd)
## Warning: package 'vcd' was built under R version 3.6.3
## Loading required package: grid
library(vcdExtra)
## Warning: package 'vcdExtra' was built under R version 3.6.3
## Loading required package: gnm
## Warning: package 'gnm' was built under R version 3.6.3
library(ca)
## Warning: package 'ca' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(C50)
## Warning: package 'C50' was built under R version 3.6.3
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
# Load the credit data set from a local, machine-specific path.
credit <- read.csv(
  file = "C:/Users/punthakur/Documents/HU - ANALYTICS/530-Machine Learning/credit.csv"
)

# Show the column names and types of the loaded data frame.
str(object = credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...
#Step 2: Exploring the data

# Distribution of the requested credit amounts.
summary(credit[["Credit.Amount"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
# Class balance of the target column (counts of 0 and 1).
table(credit[["Creditability"]])
## 
##   0   1 
## 300 700
# Shuffle the rows reproducibly before splitting, so that the train/test
# partition does not depend on the row order of the CSV file.
set.seed(12345)
n_obs <- nrow(credit)
credit_rand <- credit[order(runif(n_obs)), ]

# Sanity check on the shuffled copy: shuffling only reorders rows, so this
# summary must be identical to the summary of the original `credit` data.
summary(credit_rand$Credit.Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
# The first 900 shuffled rows are used for training, the rest for testing.
n_train <- 900
credit_train <- credit_rand[seq_len(n_train), ]
credit_test <- credit_rand[(n_train + 1):n_obs, ]

# Compare the class proportions of the target in the two partitions.
prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
## 
##    0    1 
## 0.22 0.78
#Step 3: Training a model on the data

library(C50)

# C5.0 needs a factor outcome; Creditability is read from the CSV as integer.
str(credit_train$Creditability)
##  int [1:900] 1 1 1 1 1 1 1 1 1 0 ...
credit_train$Creditability <- factor(credit_train$Creditability)
str(credit_train$Creditability)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
# Build the predictor set by dropping the target column BY NAME.
# The previous call used credit_train[-17], which removes
# No.of.Credits.at.this.Bank (column 17) and leaves Creditability (column 1)
# among the predictors, so the tree simply split on the label itself
# (target leakage) and reported a meaningless 100% accuracy.
# NOTE: the model output recorded below this block was produced by that leaky
# call and must be regenerated after this fix.
credit_predictors <- credit_train[setdiff(names(credit_train), "Creditability")]
credit_model <- C5.0(x = credit_predictors, y = credit_train$Creditability)
credit_model
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$Creditability)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 2 
## 
## Non-standard options: attempt to group attributes
# Detailed report of the fitted C5.0 model: the tree itself, the confusion
# matrix on the training data and the attribute usage.
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$Creditability)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Aug 28 16:49:51 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Creditability = 0: 0 (278)
## Creditability = 1: 1 (622)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     278          (a): class 0
##           622    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Creditability
## 
## 
## Time: 0.0 secs
#Evaluating Model Performance

# Predict the class of every held-out row with the fitted C5.0 tree.
cred_pred <- predict(credit_model, credit_test)

library(gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
# Cross-tabulate actual against predicted classes. Row, column and chi-square
# proportions are switched off, leaving counts and table-total proportions.
CrossTable(
  x = credit_test$Creditability,
  y = cred_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c('Actual Creditability', 'Predicted Creditability')
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        22 |         0 |        22 | 
##                      |     0.220 |     0.000 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |         0 |        78 |        78 | 
##                      |     0.000 |     0.780 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        22 |        78 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
## 
# Confusion matrix: predicted classes in rows, actual classes in columns.
p <- table(cred_pred, credit_test$Creditability)
p
##          
## cred_pred  0  1
##         0 22  0
##         1  0 78
# Accuracy in percent: correctly classified cases (the diagonal) over all cases.
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy
## [1] 100
#Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we designed a perfect model? This is something that needs more discussion. Write a few sentences about an accuracy of 100%.

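#An accuracy of 100% does not mean that we built a perfect model. The tree printed above splits only on Creditability, i.e. the class label itself was among the predictors given to C5.0 (target leakage), so the model simply reads off the answer.
#This is not ordinary overfitting either, because the held-out test set is also classified perfectly. A result this perfect should be treated as a warning sign: the target must be excluded from the predictors before the accuracy says anything about real predictive power.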


# Classification tree on three hand-picked predictors.
DT <- rpart(
  formula = Creditability ~ Account.Balance + Credit.Amount +
    Payment.Status.of.Previous.Credit,
  data = credit_train
)

# Node-by-node report: CP table, variable importance and candidate splits.
summary(DT)
## Call:
## rpart(formula = Creditability ~ Account.Balance + Credit.Amount + 
##     Payment.Status.of.Previous.Credit, data = credit_train)
##   n= 900 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.04496403      0 1.0000000 1.0000000 0.04985992
## 2 0.01618705      3 0.8417266 0.8848921 0.04809394
## 3 0.01079137      5 0.8093525 0.8992806 0.04833488
## 4 0.01000000      9 0.7661871 0.8848921 0.04809394
## 
## Variable importance
##                   Account.Balance                     Credit.Amount 
##                                55                                25 
## Payment.Status.of.Previous.Credit 
##                                20 
## 
## Node number 1: 900 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.3088889  P(node) =1
##     class counts:   278   622
##    probabilities: 0.309 0.691 
##   left son=2 (491 obs) right son=3 (409 obs)
##   Primary splits:
##       Account.Balance                   < 2.5    to the left,  improve=44.34223, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=15.03636, (0 missing)
##       Credit.Amount                     < 3909.5 to the right, improve=12.09960, (0 missing)
##   Surrogate splits:
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.596, adj=0.11, (0 split)
## 
## Node number 2: 491 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.4521385  P(node) =0.5455556
##     class counts:   222   269
##    probabilities: 0.452 0.548 
##   left son=4 (61 obs) right son=5 (430 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=8.901367, (0 missing)
##       Credit.Amount                     < 3998   to the right, improve=8.139155, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.618609, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 15901  to the right, agree=0.88, adj=0.033, (0 split)
## 
## Node number 3: 409 observations
##   predicted class=1  expected loss=0.1369193  P(node) =0.4544444
##     class counts:    56   353
##    probabilities: 0.137 0.863 
## 
## Node number 4: 61 observations
##   predicted class=0  expected loss=0.295082  P(node) =0.06777778
##     class counts:    43    18
##    probabilities: 0.705 0.295 
## 
## Node number 5: 430 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.4162791  P(node) =0.4777778
##     class counts:   179   251
##    probabilities: 0.416 0.584 
##   left son=10 (33 obs) right son=11 (397 obs)
##   Primary splits:
##       Credit.Amount                     < 8015.5 to the right, improve=9.871261, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=3.396525, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.266883, (0 missing)
## 
## Node number 10: 33 observations
##   predicted class=0  expected loss=0.2121212  P(node) =0.03666667
##     class counts:    26     7
##    probabilities: 0.788 0.212 
## 
## Node number 11: 397 observations,    complexity param=0.01618705
##   predicted class=1  expected loss=0.3853904  P(node) =0.4411111
##     class counts:   153   244
##    probabilities: 0.385 0.615 
##   left son=22 (88 obs) right son=23 (309 obs)
##   Primary splits:
##       Credit.Amount                     < 3910   to the right, improve=2.970205, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=2.647846, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=2.611827, (0 missing)
## 
## Node number 22: 88 observations,    complexity param=0.01618705
##   predicted class=0  expected loss=0.5  P(node) =0.09777778
##     class counts:    44    44
##    probabilities: 0.500 0.500 
##   left son=44 (43 obs) right son=45 (45 obs)
##   Primary splits:
##       Account.Balance                   < 1.5    to the left,  improve=1.84186000, (0 missing)
##       Credit.Amount                     < 7413   to the left,  improve=1.29870100, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the right, improve=0.02793651, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 4289   to the left,  agree=0.591, adj=0.163, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.545, adj=0.070, (0 split)
## 
## Node number 23: 309 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.3527508  P(node) =0.3433333
##     class counts:   109   200
##    probabilities: 0.353 0.647 
##   left son=46 (286 obs) right son=47 (23 obs)
##   Primary splits:
##       Credit.Amount                     < 3504   to the left,  improve=6.184240, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=4.016779, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.347176, (0 missing)
## 
## Node number 44: 43 observations
##   predicted class=0  expected loss=0.3953488  P(node) =0.04777778
##     class counts:    26    17
##    probabilities: 0.605 0.395 
## 
## Node number 45: 45 observations
##   predicted class=1  expected loss=0.4  P(node) =0.05
##     class counts:    18    27
##    probabilities: 0.400 0.600 
## 
## Node number 46: 286 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.3811189  P(node) =0.3177778
##     class counts:   109   177
##    probabilities: 0.381 0.619 
##   left son=92 (215 obs) right son=93 (71 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=3.791813, (0 missing)
##       Credit.Amount                     < 624    to the right, improve=2.520480, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.615261, (0 missing)
## 
## Node number 47: 23 observations
##   predicted class=1  expected loss=0  P(node) =0.02555556
##     class counts:     0    23
##    probabilities: 0.000 1.000 
## 
## Node number 92: 215 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.427907  P(node) =0.2388889
##     class counts:    92   123
##    probabilities: 0.428 0.572 
##   left son=184 (107 obs) right son=185 (108 obs)
##   Primary splits:
##       Account.Balance                   < 1.5    to the left,  improve=3.88193900, (0 missing)
##       Credit.Amount                     < 614.5  to the right, improve=2.25536000, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5    to the right, improve=0.08735044, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 1254   to the right, agree=0.549, adj=0.093, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.549, adj=0.093, (0 split)
## 
## Node number 93: 71 observations
##   predicted class=1  expected loss=0.2394366  P(node) =0.07888889
##     class counts:    17    54
##    probabilities: 0.239 0.761 
## 
## Node number 184: 107 observations,    complexity param=0.01079137
##   predicted class=0  expected loss=0.4766355  P(node) =0.1188889
##     class counts:    56    51
##    probabilities: 0.523 0.477 
##   left son=368 (38 obs) right son=369 (69 obs)
##   Primary splits:
##       Credit.Amount < 1348.5 to the left,  improve=2.132987, (0 missing)
## 
## Node number 185: 108 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.12
##     class counts:    36    72
##    probabilities: 0.333 0.667 
## 
## Node number 368: 38 observations
##   predicted class=0  expected loss=0.3421053  P(node) =0.04222222
##     class counts:    25    13
##    probabilities: 0.658 0.342 
## 
## Node number 369: 69 observations
##   predicted class=1  expected loss=0.4492754  P(node) =0.07666667
##     class counts:    31    38
##    probabilities: 0.449 0.551
# Draw the fitted tree. Per the rpart.plot documentation, type = 1 labels all
# nodes (not only the leaves) and extra = 102 shows the correct/total counts
# in each node plus the percentage of observations falling into it.
rpart.plot(DT, type = 1, extra = 102)

#Method#2: Random Forest

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Ensure the outcome is a factor so that randomForest fits a classifier
# (a no-op when the column has already been converted).
credit_train[["Creditability"]] <- as.factor(credit_train[["Creditability"]])
# Random forest using every other column as a predictor.
random_model <- randomForest(Creditability ~ ., data = credit_train)
# Lists the components stored in the fitted forest object.
summary(random_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
# Score the held-out rows with the random forest.
cred_pred <- predict(random_model, credit_test)
# Confusion matrix: predicted classes in rows, actual classes in columns.
p <- table(cred_pred, credit_test$Creditability)
p
##          
## cred_pred  0  1
##         0 12 11
##         1 10 67
# Accuracy in percent: diagonal (correct) cases over all test cases.
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy
## [1] 79
#Q2- What are the three most important features in this model.

# Per-feature importance of the forest (reported below as MeanDecreaseGini);
# larger values indicate more important features.
importance(random_model)
##                                   MeanDecreaseGini
## Account.Balance                          43.178206
## Duration.of.Credit..month.               37.302433
## Payment.Status.of.Previous.Credit        22.765043
## Purpose                                  23.635785
## Credit.Amount                            52.337429
## Value.Savings.Stocks                     19.241822
## Length.of.current.employment             19.885455
## Instalment.per.cent                      16.451002
## Sex...Marital.Status                     13.387221
## Guarantors                                7.883390
## Duration.in.Current.address              15.540894
## Most.valuable.available.asset            17.390452
## Age..years.                              37.106310
## Concurrent.Credits                        8.774178
## Type.of.apartment                         9.631784
## No.of.Credits.at.this.Bank                7.921031
## Occupation                               11.934154
## No.of.dependents                          5.622503
## Telephone                                 7.582114
## Foreign.Worker                            1.863587
#Top 3 features by importance (MeanDecreaseGini): Credit.Amount, Account.Balance and Duration.of.Credit..month.


# Second classification tree on three predictors.
# NOTE(review): this formula is identical to the one used for `DT`, so DT2
# reproduces the same tree. If the intent was to use the three most important
# random-forest features, Payment.Status.of.Previous.Credit would presumably
# be replaced by Duration.of.Credit..month. -- confirm before changing.
DT2 <- rpart(
  formula = Creditability ~ Account.Balance + Credit.Amount +
    Payment.Status.of.Previous.Credit,
  data = credit_train
)

# Node-by-node report: CP table, variable importance and candidate splits.
summary(DT2)
## Call:
## rpart(formula = Creditability ~ Account.Balance + Credit.Amount + 
##     Payment.Status.of.Previous.Credit, data = credit_train)
##   n= 900 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.04496403      0 1.0000000 1.0000000 0.04985992
## 2 0.01618705      3 0.8417266 0.8956835 0.04827520
## 3 0.01079137      5 0.8093525 0.9100719 0.04851168
## 4 0.01000000      9 0.7661871 0.8920863 0.04821516
## 
## Variable importance
##                   Account.Balance                     Credit.Amount 
##                                55                                25 
## Payment.Status.of.Previous.Credit 
##                                20 
## 
## Node number 1: 900 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.3088889  P(node) =1
##     class counts:   278   622
##    probabilities: 0.309 0.691 
##   left son=2 (491 obs) right son=3 (409 obs)
##   Primary splits:
##       Account.Balance                   < 2.5    to the left,  improve=44.34223, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=15.03636, (0 missing)
##       Credit.Amount                     < 3909.5 to the right, improve=12.09960, (0 missing)
##   Surrogate splits:
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.596, adj=0.11, (0 split)
## 
## Node number 2: 491 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.4521385  P(node) =0.5455556
##     class counts:   222   269
##    probabilities: 0.452 0.548 
##   left son=4 (61 obs) right son=5 (430 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=8.901367, (0 missing)
##       Credit.Amount                     < 3998   to the right, improve=8.139155, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.618609, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 15901  to the right, agree=0.88, adj=0.033, (0 split)
## 
## Node number 3: 409 observations
##   predicted class=1  expected loss=0.1369193  P(node) =0.4544444
##     class counts:    56   353
##    probabilities: 0.137 0.863 
## 
## Node number 4: 61 observations
##   predicted class=0  expected loss=0.295082  P(node) =0.06777778
##     class counts:    43    18
##    probabilities: 0.705 0.295 
## 
## Node number 5: 430 observations,    complexity param=0.04496403
##   predicted class=1  expected loss=0.4162791  P(node) =0.4777778
##     class counts:   179   251
##    probabilities: 0.416 0.584 
##   left son=10 (33 obs) right son=11 (397 obs)
##   Primary splits:
##       Credit.Amount                     < 8015.5 to the right, improve=9.871261, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=3.396525, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.266883, (0 missing)
## 
## Node number 10: 33 observations
##   predicted class=0  expected loss=0.2121212  P(node) =0.03666667
##     class counts:    26     7
##    probabilities: 0.788 0.212 
## 
## Node number 11: 397 observations,    complexity param=0.01618705
##   predicted class=1  expected loss=0.3853904  P(node) =0.4411111
##     class counts:   153   244
##    probabilities: 0.385 0.615 
##   left son=22 (88 obs) right son=23 (309 obs)
##   Primary splits:
##       Credit.Amount                     < 3910   to the right, improve=2.970205, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=2.647846, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=2.611827, (0 missing)
## 
## Node number 22: 88 observations,    complexity param=0.01618705
##   predicted class=0  expected loss=0.5  P(node) =0.09777778
##     class counts:    44    44
##    probabilities: 0.500 0.500 
##   left son=44 (43 obs) right son=45 (45 obs)
##   Primary splits:
##       Account.Balance                   < 1.5    to the left,  improve=1.84186000, (0 missing)
##       Credit.Amount                     < 7413   to the left,  improve=1.29870100, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the right, improve=0.02793651, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 4289   to the left,  agree=0.591, adj=0.163, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.545, adj=0.070, (0 split)
## 
## Node number 23: 309 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.3527508  P(node) =0.3433333
##     class counts:   109   200
##    probabilities: 0.353 0.647 
##   left son=46 (286 obs) right son=47 (23 obs)
##   Primary splits:
##       Credit.Amount                     < 3504   to the left,  improve=6.184240, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=4.016779, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.347176, (0 missing)
## 
## Node number 44: 43 observations
##   predicted class=0  expected loss=0.3953488  P(node) =0.04777778
##     class counts:    26    17
##    probabilities: 0.605 0.395 
## 
## Node number 45: 45 observations
##   predicted class=1  expected loss=0.4  P(node) =0.05
##     class counts:    18    27
##    probabilities: 0.400 0.600 
## 
## Node number 46: 286 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.3811189  P(node) =0.3177778
##     class counts:   109   177
##    probabilities: 0.381 0.619 
##   left son=92 (215 obs) right son=93 (71 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=3.791813, (0 missing)
##       Credit.Amount                     < 624    to the right, improve=2.520480, (0 missing)
##       Account.Balance                   < 1.5    to the left,  improve=1.615261, (0 missing)
## 
## Node number 47: 23 observations
##   predicted class=1  expected loss=0  P(node) =0.02555556
##     class counts:     0    23
##    probabilities: 0.000 1.000 
## 
## Node number 92: 215 observations,    complexity param=0.01079137
##   predicted class=1  expected loss=0.427907  P(node) =0.2388889
##     class counts:    92   123
##    probabilities: 0.428 0.572 
##   left son=184 (107 obs) right son=185 (108 obs)
##   Primary splits:
##       Account.Balance                   < 1.5    to the left,  improve=3.88193900, (0 missing)
##       Credit.Amount                     < 614.5  to the right, improve=2.25536000, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5    to the right, improve=0.08735044, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 1254   to the right, agree=0.549, adj=0.093, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.549, adj=0.093, (0 split)
## 
## Node number 93: 71 observations
##   predicted class=1  expected loss=0.2394366  P(node) =0.07888889
##     class counts:    17    54
##    probabilities: 0.239 0.761 
## 
## Node number 184: 107 observations,    complexity param=0.01079137
##   predicted class=0  expected loss=0.4766355  P(node) =0.1188889
##     class counts:    56    51
##    probabilities: 0.523 0.477 
##   left son=368 (38 obs) right son=369 (69 obs)
##   Primary splits:
##       Credit.Amount < 1348.5 to the left,  improve=2.132987, (0 missing)
## 
## Node number 185: 108 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.12
##     class counts:    36    72
##    probabilities: 0.333 0.667 
## 
## Node number 368: 38 observations
##   predicted class=0  expected loss=0.3421053  P(node) =0.04222222
##     class counts:    25    13
##    probabilities: 0.658 0.342 
## 
## Node number 369: 69 observations
##   predicted class=1  expected loss=0.4492754  P(node) =0.07666667
##     class counts:    31    38
##    probabilities: 0.449 0.551
# Draw the second tree with the same settings as the first plot: all nodes
# labelled (type = 1), correct/total counts plus percentage of observations
# in each node (extra = 102).
rpart.plot(DT2, type = 1, extra = 102)

#Now, Change the random seed to 23458 and find the new accuracy of random forest.

# Re-seed the RNG. The train/test partition is not re-drawn here, so only the
# randomness inside the forest changes.
set.seed(23458)

# Ensure the outcome is a factor (a no-op when already converted).
credit_train[["Creditability"]] <- as.factor(credit_train[["Creditability"]])
# Refit the random forest on all predictors under the new seed.
random_model <- randomForest(Creditability ~ ., data = credit_train)
# Lists the components stored in the fitted forest object.
summary(random_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
# Score the held-out rows with the re-seeded forest.
cred_pred <- predict(random_model, credit_test)
# Confusion matrix: predicted classes in rows, actual classes in columns.
p <- table(cred_pred, credit_test$Creditability)
p
##          
## cred_pred  0  1
##         0 12 10
##         1 10 68
# Accuracy in percent: diagonal (correct) cases over all test cases.
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy
## [1] 80
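#Accuracy increases slightly, from 79% to 80% (one more test case is classified correctly). Only the random seed changed, so the difference reflects the randomness of the forest rather than a better model.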
#Method#3: Adding Regression to Trees

# Load the white-wine data set from a local, machine-specific path.
wine <- read.csv(
  file = "C:/Users/punthakur/Documents/HU - ANALYTICS/530-Machine Learning/whitewines.csv"
)
# Show the column names and types of the loaded data frame.
str(object = wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
# Histogram of the quality ratings, to inspect the shape of the target's
# distribution before modelling.
hist(wine$quality)

#The quality scores look approximately normally distributed.


#Step 2: Exploring and Preparing the Data
# Split by row position: the first 3750 rows train the model and all
# remaining rows are held out for testing. The upper bound is taken from the
# data rather than hard-coded, so the split still covers every row if the
# file changes size.
n_wine_train <- 3750
wine_train <- wine[seq_len(n_wine_train), ]
wine_test <- wine[(n_wine_train + 1):nrow(wine), ]

#Step 3: Training a Model on the Data

# Regression tree predicting the numeric quality score from all other columns.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)

# Print the tree: one line per node with split, n, deviance and fitted value.
print(m.rpart)
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
# Default plot of the regression tree with three significant digits.
rpart.plot(x = m.rpart, digits = 3)

# Alternative layout: leaves aligned at the bottom, separate split labels for
# the left and right branches, and node counts with percentages.
rpart.plot(
  x = m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

#Step 4: Evaluating Model Performance


# Predicted quality for every held-out wine.
p.rpart <- predict(m.rpart, newdata = wine_test)

# Range of the predictions ...
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
# ... against the range of the actual scores: the tree never predicts the
# extreme ratings (3 or 9) that occur in the test data.
summary(wine_test[["quality"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
# Linear correlation between predicted and actual quality.
cor(x = p.rpart, y = wine_test[["quality"]])
## [1] 0.5369525
#Q3- What is your interpretation about this amount of RMSE?

# Metrics provides rmse(). Install it on first use, then attach it.
# requireNamespace() only checks availability; the unconditional library()
# call then fails loudly if the package still cannot be loaded, whereas
# require() would merely return FALSE.
if (!requireNamespace("Metrics", quietly = TRUE)) {
  install.packages("Metrics")
}
library(Metrics)
## Loading required package: Metrics
## Warning: package 'Metrics' was built under R version 3.6.3
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:gnm':
## 
##     se
# Root-mean-square error between the actual and the predicted quality scores
# on the held-out wines.
rmse(wine_test$quality, p.rpart)
## [1] 0.7448093
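#The root-mean-square error is about 0.74 quality points, which is fairly high given that half of the test wines score between 5 and 6 (see the quartiles above). The lower the RMSE, the better the model fits the observed data.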
#News Popularity

# Read the news-popularity CSV and inspect the column types.
news <- read.csv(
  file = "C:/Users/punthakur/Documents/HU - ANALYTICS/530-Machine Learning/OnlineNewsPopularity_for_R.csv"
)
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...
#minify instances
# Keep 16 predictor columns plus the `shares` outcome, selected by name.
newsShort <- news[, c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares"
)]
# Fresh 1..n row names, as a newly constructed data frame would have.
rownames(newsShort) <- NULL

str(newsShort)
## 'data.frame':    39644 obs. of  17 variables:
##  $ n_tokens_title              : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content            : num  219 255 211 531 1072 ...
##  $ n_unique_tokens             : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words            : num  1 1 1 1 1 ...
##  $ num_hrefs                   : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_imgs                    : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length        : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ kw_max_max                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...
#Pre-Processing the Data

# Recode `shares` in place as a two-level factor: "yes" for articles with at
# least 1400 shares, "no" otherwise. The comparison is vectorised over the
# whole column, so no row count is hard-coded. No helper column is left behind
# in newsShort: a leftover copy of the label would be picked up as a predictor
# by any later model that only drops `shares`, letting it read the answer
# straight back (label leakage).
newsShort$shares <- factor(
  ifelse(newsShort$shares >= 1400, "yes", "no"),
  levels = c("no", "yes")
)

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(12345)
# order(runif(10000)) is a random permutation of 1..10000, so this keeps the
# first 10,000 rows of newsShort, in shuffled order.
news_rand <- newsShort[order(runif(n = 10000)), ]

#CLASSIFICATION

#Train & Test Data
# 9,000 shuffled rows for training, the remaining 1,000 for testing.
news_train <- news_rand[seq_len(9000), ]
news_test <- news_rand[seq(9001, 10000), ]


# Class balance in each partition.
prop.table(x = table(news_train$shares))
## 
##        no       yes 
## 0.4308889 0.5691111
prop.table(x = table(news_test$shares))
## 
##    no   yes 
## 0.414 0.586
#Train the model
# Choose the predictors by name: exclude `shares` and, if present, the
# label-derived `popular` helper column. Dropping only column 17 by position
# would leave such a copy of the label among the predictors, and the tree
# would simply echo it back instead of learning from the article features.
news_predictors <- setdiff(names(news_train), c("shares", "popular"))
news_model <- C5.0(news_train[news_predictors], news_train$shares)
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Aug 28 16:50:16 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = no: no (3878)
## popular = yes: yes (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class no
##          5122    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.2 secs
#Evaluate the Model

# Predicted class for every test article.
news_pred <- predict(news_model, newdata = news_test)
# Confusion matrix: predictions in rows, actual classes in columns.
p <- table(news_pred, news_test$shares)
p
##          
## news_pred  no yes
##       no  414   0
##       yes   0 586
# Percentage of test cases on the diagonal (correct predictions).
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy
## [1] 100
# Plot the class factor for all of newsShort.
plot(newsShort$shares)

# Class counts in the test partition.
summary(news_test$shares)
##  no yes 
## 414 586
# gmodels is attached for CrossTable().
library(gmodels)
# Cross-tabulate actual against predicted classes, with the chi-square,
# row-proportion and column-proportion entries switched off.
CrossTable(
  x = news_test$shares,
  y = news_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c('actual shares', 'predicted shares')
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1000 
## 
##  
##               | predicted shares 
## actual shares |        no |       yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            no |       414 |         0 |       414 | 
##               |     0.414 |     0.000 |           | 
## --------------|-----------|-----------|-----------|
##           yes |         0 |       586 |       586 | 
##               |     0.000 |     0.586 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |       414 |       586 |      1000 | 
## --------------|-----------|-----------|-----------|
## 
## 
#Decision Tree and Random Forest

# Drop the first two columns of `news`.
news <- news[, -c(1, 2)]

#check for outliers
# Keep only the rows whose n_unique_tokens is not 701.
news <- news[news$n_unique_tokens != 701, ]


#minify instances
# Rebuild newsShort from the filtered `news`: 16 predictor columns plus the
# `shares` outcome, selected by name.
newsShort <- news[, c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares"
)]
# Renumber the rows 1..n, as a newly constructed data frame would be.
rownames(newsShort) <- NULL



#standardize the dataset
# Centre and scale every column of `news` except the last one. The loop has to
# run over 1..(ncol - 1): iterating over the bare value `ncol(news) - 1`
# visits that single column only. scale() returns a one-column matrix, so
# as.numeric() is used to store a plain vector back into the data frame.
# NOTE(review): newsShort is extracted from `news` before this point, so the
# models fitted on newsShort still see the unscaled values - confirm intended.
for (i in seq_len(ncol(news) - 1)) {
  news[, i] <- as.numeric(scale(news[, i], center = TRUE, scale = TRUE))
}


#define popular articles
# Recode `shares` as a 0/1 factor: 1 when an article has more than 1400
# shares, 0 otherwise.
newsShort$shares <- factor(as.numeric(newsShort$shares > 1400))

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(23589)

# Shuffle all rows. The row count is taken from the data instead of being
# hard-coded, so the split stays valid if the filtering upstream keeps a
# different number of rows.
news_rand <- newsShort[order(runif(nrow(newsShort))), ]
# First 4000 shuffled rows train the models; every remaining row is test data.
news_train <- news_rand[seq_len(4000), ]
news_test <- news_rand[-seq_len(4000), ]
# `shares` was already converted to a factor above, so it is not converted
# again here. The response is named by column (`shares ~ .`) rather than as
# `news_train$shares`, so the formula refers only to columns of `data`.
random_modelNews <- randomForest(shares ~ ., data = news_train)


#Model evaluation
# Predicted class of each test article from the random forest.
cred_pridRF <- predict(random_modelNews, newdata = news_test)
# Confusion matrix: predictions in rows, actual classes in columns.
p2 <- table(cred_pridRF, news_test$shares)
p2
##            
## cred_pridRF     0     1
##           0 10595  7695
##           1  7474  9879
#Accuracy
# Percentage of test cases on the diagonal (correct predictions).
Accuracy <- sum(diag(p2)) / sum(p2) * 100
Accuracy
## [1] 57.44185
#importance
# Variable-importance scores of the fitted forest.
importance(x = random_modelNews)
##                              MeanDecreaseGini
## n_tokens_title                      110.84760
## n_tokens_content                    170.83073
## n_unique_tokens                     188.13582
## n_non_stop_words                    167.57842
## num_hrefs                           140.36733
## num_imgs                             94.87513
## num_videos                           59.29693
## average_token_length                195.19758
## num_keywords                        101.41642
## kw_max_max                           50.64025
## global_sentiment_polarity           204.22721
## avg_positive_polarity               191.68324
## title_subjectivity                   79.69770
## title_sentiment_polarity             85.79501
## abs_title_subjectivity               73.58037
## abs_title_sentiment_polarity         69.09077
#Decision Tree

# Select the predictors by name instead of dropping column 17 by position, so
# the label cannot end up among the predictors if the column layout of
# news_train ever changes.
tree_predictors <- setdiff(names(news_train), "shares")
news_model <- C5.0(news_train[tree_predictors], news_train$shares)
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Aug 28 16:50:26 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 4000 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## num_hrefs > 13:
## :...num_keywords <= 5: 0 (125/47)
## :   num_keywords > 5:
## :   :...n_unique_tokens <= 0.3550296: 1 (43/5)
## :       n_unique_tokens > 0.3550296:
## :       :...kw_max_max <= 80400:
## :           :...title_sentiment_polarity > 0.8: 1 (3)
## :           :   title_sentiment_polarity <= 0.8:
## :           :   :...global_sentiment_polarity <= 0.1593333: 1 (23/9)
## :           :       global_sentiment_polarity > 0.1593333: 0 (24/4)
## :           kw_max_max > 80400:
## :           :...n_tokens_title > 11:
## :               :...num_imgs <= 14: 0 (180/85)
## :               :   num_imgs > 14: 1 (42/10)
## :               n_tokens_title <= 11:
## :               :...global_sentiment_polarity > 0.0637982: 1 (438/139)
## :                   global_sentiment_polarity <= 0.0637982:
## :                   :...num_videos > 2: 1 (10/1)
## :                       num_videos <= 2:
## :                       :...n_tokens_content <= 381: 1 (22/5)
## :                           n_tokens_content > 381: 0 (71/21)
## num_hrefs <= 13:
## :...kw_max_max <= 617900:
##     :...n_tokens_title <= 10: 1 (282/105)
##     :   n_tokens_title > 10:
##     :   :...n_tokens_title <= 12: 0 (131/60)
##     :       n_tokens_title > 12: 1 (36/14)
##     kw_max_max > 617900:
##     :...num_imgs > 1: 1 (830/413)
##         num_imgs <= 1:
##         :...n_non_stop_words <= 0.9999999: 1 (99/44)
##             n_non_stop_words > 0.9999999:
##             :...num_videos > 9: 1 (51/19)
##                 num_videos <= 9:
##                 :...kw_max_max > 690400: 0 (1333/535)
##                     kw_max_max <= 690400:
##                     :...num_imgs > 0: 0 (171/72)
##                         num_imgs <= 0:
##                         :...num_keywords > 9: 0 (9/3)
##                             num_keywords <= 9:
##                             :...num_hrefs <= 10: 1 (73/28)
##                                 num_hrefs > 10: 0 (4)
## 
## 
## Evaluation on training data (4000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      22 1619(40.5%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    1221   792    (a): class 0
##     827  1160    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% num_hrefs
##   95.80% kw_max_max
##   69.80% num_imgs
##   43.60% num_videos
##   43.50% n_non_stop_words
##   30.30% n_tokens_title
##   26.67% num_keywords
##   21.40% n_unique_tokens
##   14.70% global_sentiment_polarity
##    2.33% n_tokens_content
##    1.25% title_sentiment_polarity
## 
## 
## Time: 0.1 secs
#Model evaluation
# Predicted class of each test article from the decision tree.
news_pred <- predict(news_model, newdata = news_test)
# Cross-tabulate actual against predicted classes, with the chi-square,
# row-proportion and column-proportion entries switched off.
CrossTable(
  x = news_test$shares,
  y = news_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c('actual shares', 'predicted shares')
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  35643 
## 
##  
##               | predicted shares 
## actual shares |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##             0 |     10344 |      7725 |     18069 | 
##               |     0.290 |     0.217 |           | 
## --------------|-----------|-----------|-----------|
##             1 |      8056 |      9518 |     17574 | 
##               |     0.226 |     0.267 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |     18400 |     17243 |     35643 | 
## --------------|-----------|-----------|-----------|
## 
## 
# Confusion matrix: predictions in rows, actual classes in columns.
p3 <- table(news_pred, news_test$shares)
p3
##          
## news_pred     0     1
##         0 10344  8056
##         1  7725  9518
#Accuracy
# Percentage of test cases on the diagonal (correct predictions).
Accuracy <- sum(diag(p3)) / sum(p3) * 100
Accuracy
## [1] 55.72483