Data Collection

library(readr)
credit <- read.csv("C:/Users/gmutya048/Downloads/credit.csv")

str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

Exploring the Data. Get Info about Loan Amounts Requested

summary(credit$Credit.Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

Change DF to Table

table(credit$Creditability)
## 
##   0   1 
## 300 700

Exploring Data (Part 2). Before diving data into Train and Test sets, Randomize the data

set.seed(12345)
credit_random <- credit[order(runif(1000)), ]

Divide Data to TRain and Test Sets What is the target column? We can use 75% to 90% for trainiing, and the rest for testing.

credit_train <- credit_random[1:900, ]
credit_test <- credit_random[901:1000, ]

Divide Data to Train and Test Sets:

prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
## 
##    0    1 
## 0.22 0.78
library(C50)

credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)

credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)

credit_model
## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 85 
## 
## Non-standard options: attempt to group attributes
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Oct 29 19:12:57 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## :   :...Age..years. > 33: 1 (179/11)
## :   :   Age..years. <= 33:
## :   :   :...Credit.Amount > 6681:
## :   :       :...Length.of.current.employment <= 2: 0 (4)
## :   :       :   Length.of.current.employment > 2:
## :   :       :   :...Payment.Status.of.Previous.Credit <= 3: 1 (4)
## :   :       :       Payment.Status.of.Previous.Credit > 3: 0 (3/1)
## :   :       Credit.Amount <= 6681:
## :   :       :...Occupation > 2:
## :   :           :...Occupation <= 3: 1 (120/12)
## :   :           :   Occupation > 3:
## :   :           :   :...Duration.of.Credit..month. <= 33: 1 (9)
## :   :           :       Duration.of.Credit..month. > 33: 0 (3)
## :   :           Occupation <= 2:
## :   :           :...No.of.Credits.at.this.Bank > 1: 1 (6)
## :   :               No.of.Credits.at.this.Bank <= 1:
## :   :               :...Most.valuable.available.asset > 1: 0 (3)
## :   :                   Most.valuable.available.asset <= 1:
## :   :                   :...Credit.Amount <= 1987: 1 (8/1)
## :   :                       Credit.Amount > 1987: 0 (2)
## :   Concurrent.Credits <= 2:
## :   :...Guarantors > 1: 1 (4)
## :       Guarantors <= 1:
## :       :...Purpose <= 0:
## :           :...Most.valuable.available.asset <= 2: 0 (5)
## :           :   Most.valuable.available.asset > 2:
## :           :   :...No.of.dependents <= 1: 1 (7/1)
## :           :       No.of.dependents > 1: 0 (2)
## :           Purpose > 0:
## :           :...Purpose <= 4: 1 (35/2)
## :               Purpose > 4:
## :               :...Length.of.current.employment <= 2: 0 (4)
## :                   Length.of.current.employment > 2:
## :                   :...No.of.dependents > 1: 0 (3/1)
## :                       No.of.dependents <= 1:
## :                       :...Length.of.current.employment > 3: 1 (4)
## :                           Length.of.current.employment <= 3:
## :                           :...Instalment.per.cent <= 2: 1 (2)
## :                               Instalment.per.cent > 2: 0 (2)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
##     :...Value.Savings.Stocks <= 2: 0 (49/10)
##     :   Value.Savings.Stocks > 2:
##     :   :...Credit.Amount <= 2064: 0 (3)
##     :       Credit.Amount > 2064: 1 (9/1)
##     Payment.Status.of.Previous.Credit > 1:
##     :...Credit.Amount > 7980:
##         :...Value.Savings.Stocks > 4:
##         :   :...Payment.Status.of.Previous.Credit <= 2: 0 (4/1)
##         :   :   Payment.Status.of.Previous.Credit > 2: 1 (3)
##         :   Value.Savings.Stocks <= 4:
##         :   :...Account.Balance > 1: 0 (15)
##         :       Account.Balance <= 1:
##         :       :...Concurrent.Credits <= 2: 0 (2)
##         :           Concurrent.Credits > 2:
##         :           :...Credit.Amount <= 10297: 0 (6)
##         :               Credit.Amount > 10297: 1 (3)
##         Credit.Amount <= 7980:
##         :...Duration.of.Credit..month. <= 11:
##             :...Occupation > 3:
##             :   :...Concurrent.Credits <= 2: 1 (3)
##             :   :   Concurrent.Credits > 2:
##             :   :   :...Payment.Status.of.Previous.Credit <= 2: 1 (4/1)
##             :   :       Payment.Status.of.Previous.Credit > 2: 0 (3)
##             :   Occupation <= 3:
##             :   :...Age..years. > 32: 1 (34)
##             :       Age..years. <= 32:
##             :       :...Most.valuable.available.asset <= 1: 1 (13/1)
##             :           Most.valuable.available.asset > 1:
##             :           :...Instalment.per.cent <= 3: 1 (6/1)
##             :               Instalment.per.cent > 3: 0 (6/1)
##             Duration.of.Credit..month. > 11:
##             :...Duration.of.Credit..month. > 36:
##                 :...Length.of.current.employment <= 1: 1 (3)
##                 :   Length.of.current.employment > 1:
##                 :   :...No.of.dependents > 1: 1 (5/1)
##                 :       No.of.dependents <= 1:
##                 :       :...Duration.in.Current.address <= 1: 1 (4/1)
##                 :           Duration.in.Current.address > 1: 0 (23)
##                 Duration.of.Credit..month. <= 36:
##                 :...Guarantors > 2:
##                     :...Foreign.Worker <= 1: 1 (23/1)
##                     :   Foreign.Worker > 1: 0 (2)
##                     Guarantors <= 2:
##                     :...Credit.Amount <= 1381:
##                         :...Telephone > 1:
##                         :   :...Sex...Marital.Status > 3: 0 (2)
##                         :   :   Sex...Marital.Status <= 3:
##                         :   :   :...Duration.of.Credit..month. <= 16: 1 (7)
##                         :   :       Duration.of.Credit..month. > 16: 0 (3/1)
##                         :   Telephone <= 1:
##                         :   :...Concurrent.Credits <= 2: 0 (9)
##                         :       Concurrent.Credits > 2:
##                         :       :...Account.Balance <= 1: 0 (29/6)
##                         :           Account.Balance > 1: [S1]
##                         Credit.Amount > 1381:
##                         :...Guarantors > 1:
##                             :...Foreign.Worker > 1: 1 (2)
##                             :   Foreign.Worker <= 1:
##                             :   :...Instalment.per.cent > 2: 0 (5)
##                             :       Instalment.per.cent <= 2: [S2]
##                             Guarantors <= 1:
##                             :...Payment.Status.of.Previous.Credit > 3:
##                                 :...Age..years. > 33: 1 (22)
##                                 :   Age..years. <= 33:
##                                 :   :...Purpose > 3: 1 (7)
##                                 :       Purpose <= 3: [S3]
##                                 Payment.Status.of.Previous.Credit <= 3:
##                                 :...Instalment.per.cent <= 2:
##                                     :...No.of.dependents > 1:
##                                     :   :...Purpose <= 0: 1 (2)
##                                     :   :   Purpose > 0: 0 (3)
##                                     :   No.of.dependents <= 1: [S4]
##                                     Instalment.per.cent > 2:
##                                     :...Concurrent.Credits <= 1: 1 (8/1)
##                                         Concurrent.Credits > 1:
##                                         :...Sex...Marital.Status <= 1: 0 (6/1)
##                                             Sex...Marital.Status > 1:
##                                             :...Account.Balance > 1: [S5]
##                                                 Account.Balance <= 1: [S6]
## 
## SubTree [S1]
## 
## Duration.in.Current.address > 3: 1 (8/1)
## Duration.in.Current.address <= 3:
## :...Purpose > 2: 0 (5)
##     Purpose <= 2:
##     :...Type.of.apartment <= 1: 0 (2)
##         Type.of.apartment > 1: 1 (5/1)
## 
## SubTree [S2]
## 
## Duration.in.Current.address <= 2: 1 (2)
## Duration.in.Current.address > 2: 0 (4/1)
## 
## SubTree [S3]
## 
## Duration.of.Credit..month. <= 16: 1 (4)
## Duration.of.Credit..month. > 16:
## :...Length.of.current.employment <= 3: 0 (8)
##     Length.of.current.employment > 3: 1 (6/1)
## 
## SubTree [S4]
## 
## Duration.in.Current.address > 1: 1 (41/6)
## Duration.in.Current.address <= 1:
## :...Value.Savings.Stocks > 3: 0 (2)
##     Value.Savings.Stocks <= 3:
##     :...Length.of.current.employment > 2: 1 (4)
##         Length.of.current.employment <= 2:
##         :...Instalment.per.cent <= 1: 0 (3)
##             Instalment.per.cent > 1: 1 (3/1)
## 
## SubTree [S5]
## 
## Sex...Marital.Status > 3: 0 (2)
## Sex...Marital.Status <= 3:
## :...Length.of.current.employment > 3: 1 (10)
##     Length.of.current.employment <= 3:
##     :...Duration.in.Current.address <= 1: 1 (5)
##         Duration.in.Current.address > 1:
##         :...Length.of.current.employment <= 2: 0 (4)
##             Length.of.current.employment > 2:
##             :...Value.Savings.Stocks <= 1: 0 (3)
##                 Value.Savings.Stocks > 1: 1 (5)
## 
## SubTree [S6]
## 
## Payment.Status.of.Previous.Credit > 2: 0 (3)
## Payment.Status.of.Previous.Credit <= 2:
## :...Purpose <= 0: 0 (7/1)
##     Purpose > 0:
##     :...Most.valuable.available.asset <= 1: 0 (5/1)
##         Most.valuable.available.asset > 1:
##         :...Sex...Marital.Status <= 2: 1 (6)
##             Sex...Marital.Status > 2:
##             :...Length.of.current.employment > 4: 0 (5)
##                 Length.of.current.employment <= 4:
##                 :...Telephone > 1: 1 (3)
##                     Telephone <= 1:
##                     :...Length.of.current.employment <= 2: 0 (2)
##                         Length.of.current.employment > 2:
##                         :...Age..years. <= 28: 1 (4)
##                             Age..years. > 28: 0 (2)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      85   70( 7.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     233    45    (a): class 0
##      25   597    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Account.Balance
##   67.11% Credit.Amount
##   63.11% Concurrent.Credits
##   55.33% Payment.Status.of.Previous.Credit
##   50.33% Age..years.
##   45.44% Duration.of.Credit..month.
##   40.11% Guarantors
##   24.44% Occupation
##   18.33% Instalment.per.cent
##   15.56% Purpose
##   14.22% Length.of.current.employment
##   13.67% Duration.in.Current.address
##   12.67% Value.Savings.Stocks
##   12.22% No.of.dependents
##    9.33% Sex...Marital.Status
##    9.00% Telephone
##    8.78% Most.valuable.available.asset
##    4.22% Foreign.Worker
##    2.11% No.of.Credits.at.this.Bank
##    0.78% Type.of.apartment
## 
## 
## Time: 0.0 secs

Model Eval

Use “predict” to predict the output for a test set

cred_pred <- predict(credit_model, credit_test)

library(gmodels)
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |         8 |        14 |        22 | 
##                      |     0.080 |     0.140 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        17 |        61 |        78 | 
##                      |     0.170 |     0.610 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        25 |        75 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
## 

Model Eval (Part 2)

The table indicates that we had 100 records in our test set

11 defaults were mis-classified, i.e. false negatives or a type II error

16 actual defaults were mis-classified as not defaulting, i.e., false positives or a type I error.

(p <- table(cred_pred, credit_test$Creditability))
##          
## cred_pred  0  1
##         0  8 17
##         1 14 61
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 69
library(ggplot2)
library(rpart)
library(rpart.plot)
DT <- rpart(Creditability ~ . , data= credit_train)
summary(DT)
## Call:
## rpart(formula = Creditability ~ ., data = credit_train)
##   n= 900 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.05575540      0 1.0000000 1.0000000 0.04985992
## 2 0.05395683      3 0.8309353 0.9928058 0.04976005
## 3 0.02038369      4 0.7769784 0.8633094 0.04772122
## 4 0.01438849      7 0.7158273 0.8309353 0.04713603
## 5 0.01000000      8 0.7014388 0.8021583 0.04658871
## 
## Variable importance
##                   Account.Balance Payment.Status.of.Previous.Credit 
##                                33                                13 
##              Value.Savings.Stocks        Duration.of.Credit..month. 
##                                10                                10 
##                     Credit.Amount     Most.valuable.available.asset 
##                                10                                 7 
##                       Age..years.        No.of.Credits.at.this.Bank 
##                                 4                                 3 
##      Length.of.current.employment       Duration.in.Current.address 
##                                 2                                 2 
##                 Type.of.apartment                        Occupation 
##                                 2                                 1 
##                           Purpose               Instalment.per.cent 
##                                 1                                 1 
## 
## Node number 1: 900 observations,    complexity param=0.0557554
##   predicted class=1  expected loss=0.3088889  P(node) =1
##     class counts:   278   622
##    probabilities: 0.309 0.691 
##   left son=2 (491 obs) right son=3 (409 obs)
##   Primary splits:
##       Account.Balance                   < 2.5    to the left,  improve=44.34223, (0 missing)
##       Value.Savings.Stocks              < 2.5    to the left,  improve=15.17035, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=15.03636, (0 missing)
##       Duration.of.Credit..month.        < 34.5   to the right, improve=13.22006, (0 missing)
##       Credit.Amount                     < 3909.5 to the right, improve=12.09960, (0 missing)
##   Surrogate splits:
##       Value.Savings.Stocks              < 2.5    to the left,  agree=0.612, adj=0.147, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5    to the left,  agree=0.596, adj=0.110, (0 split)
##       No.of.Credits.at.this.Bank        < 1.5    to the left,  agree=0.558, adj=0.027, (0 split)
##       Duration.of.Credit..month.        < 10.5   to the right, agree=0.557, adj=0.024, (0 split)
##       Length.of.current.employment      < 4.5    to the left,  agree=0.557, adj=0.024, (0 split)
## 
## Node number 2: 491 observations,    complexity param=0.0557554
##   predicted class=1  expected loss=0.4521385  P(node) =0.5455556
##     class counts:   222   269
##    probabilities: 0.452 0.548 
##   left son=4 (215 obs) right son=5 (276 obs)
##   Primary splits:
##       Duration.of.Credit..month.        < 22.5   to the right, improve=11.007130, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve= 8.901367, (0 missing)
##       Value.Savings.Stocks              < 2.5    to the left,  improve= 8.828724, (0 missing)
##       Credit.Amount                     < 3998   to the right, improve= 8.139155, (0 missing)
##       Most.valuable.available.asset     < 1.5    to the right, improve= 7.854004, (0 missing)
##   Surrogate splits:
##       Credit.Amount                 < 2805.5 to the right, agree=0.752, adj=0.433, (0 split)
##       Most.valuable.available.asset < 2.5    to the right, agree=0.640, adj=0.177, (0 split)
##       Type.of.apartment             < 2.5    to the right, agree=0.605, adj=0.098, (0 split)
##       Purpose                       < 8.5    to the right, agree=0.601, adj=0.088, (0 split)
##       Occupation                    < 3.5    to the right, agree=0.595, adj=0.074, (0 split)
## 
## Node number 3: 409 observations
##   predicted class=1  expected loss=0.1369193  P(node) =0.4544444
##     class counts:    56   353
##    probabilities: 0.137 0.863 
## 
## Node number 4: 215 observations,    complexity param=0.0557554
##   predicted class=0  expected loss=0.427907  P(node) =0.2388889
##     class counts:   123    92
##    probabilities: 0.572 0.428 
##   left son=8 (179 obs) right son=9 (36 obs)
##   Primary splits:
##       Value.Savings.Stocks        < 3.5    to the left,  improve=7.491063, (0 missing)
##       Duration.in.Current.address < 1.5    to the right, improve=2.974818, (0 missing)
##       Instalment.per.cent         < 2.5    to the right, improve=2.560354, (0 missing)
##       Duration.of.Credit..month.  < 31.5   to the right, improve=2.522542, (0 missing)
##       Credit.Amount               < 1381.5 to the left,  improve=2.433560, (0 missing)
##   Surrogate splits:
##       Occupation < 1.5    to the right, agree=0.837, adj=0.028, (0 split)
## 
## Node number 5: 276 observations,    complexity param=0.05395683
##   predicted class=1  expected loss=0.3586957  P(node) =0.3066667
##     class counts:    99   177
##    probabilities: 0.359 0.641 
##   left son=10 (23 obs) right son=11 (253 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5    to the left,  improve=10.962450, (0 missing)
##       Most.valuable.available.asset     < 1.5    to the right, improve= 4.927551, (0 missing)
##       Credit.Amount                     < 7491.5 to the right, improve= 3.568808, (0 missing)
##       Guarantors                        < 2.5    to the left,  improve= 3.132524, (0 missing)
##       Duration.of.Credit..month.        < 11.5   to the right, improve= 3.114577, (0 missing)
## 
## Node number 8: 179 observations
##   predicted class=0  expected loss=0.3687151  P(node) =0.1988889
##     class counts:   113    66
##    probabilities: 0.631 0.369 
## 
## Node number 9: 36 observations
##   predicted class=1  expected loss=0.2777778  P(node) =0.04
##     class counts:    10    26
##    probabilities: 0.278 0.722 
## 
## Node number 10: 23 observations
##   predicted class=0  expected loss=0.173913  P(node) =0.02555556
##     class counts:    19     4
##    probabilities: 0.826 0.174 
## 
## Node number 11: 253 observations,    complexity param=0.02038369
##   predicted class=1  expected loss=0.3162055  P(node) =0.2811111
##     class counts:    80   173
##    probabilities: 0.316 0.684 
##   left son=22 (7 obs) right son=23 (246 obs)
##   Primary splits:
##       Credit.Amount                     < 7491.5 to the right, improve=4.213154, (0 missing)
##       Duration.of.Credit..month.        < 11.5   to the right, improve=3.497348, (0 missing)
##       Most.valuable.available.asset     < 1.5    to the right, improve=2.873204, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5    to the left,  improve=2.342197, (0 missing)
##       Guarantors                        < 2.5    to the left,  improve=2.135887, (0 missing)
## 
## Node number 22: 7 observations
##   predicted class=0  expected loss=0.1428571  P(node) =0.007777778
##     class counts:     6     1
##    probabilities: 0.857 0.143 
## 
## Node number 23: 246 observations,    complexity param=0.02038369
##   predicted class=1  expected loss=0.300813  P(node) =0.2733333
##     class counts:    74   172
##    probabilities: 0.301 0.699 
##   left son=46 (102 obs) right son=47 (144 obs)
##   Primary splits:
##       Credit.Amount                 < 1348.5 to the left,  improve=4.290132, (0 missing)
##       Duration.of.Credit..month.    < 11.5   to the right, improve=3.834379, (0 missing)
##       Most.valuable.available.asset < 2.5    to the right, improve=2.346406, (0 missing)
##       Value.Savings.Stocks          < 2.5    to the left,  improve=2.166893, (0 missing)
##       Guarantors                    < 2.5    to the left,  improve=1.819584, (0 missing)
##   Surrogate splits:
##       Instalment.per.cent        < 3.5    to the right, agree=0.671, adj=0.206, (0 split)
##       Duration.of.Credit..month. < 9.5    to the left,  agree=0.618, adj=0.078, (0 split)
##       Occupation                 < 2.5    to the left,  agree=0.618, adj=0.078, (0 split)
##       Purpose                    < 3.5    to the right, agree=0.606, adj=0.049, (0 split)
##       Age..years.                < 21.5   to the left,  agree=0.606, adj=0.049, (0 split)
## 
## Node number 46: 102 observations,    complexity param=0.02038369
##   predicted class=1  expected loss=0.4117647  P(node) =0.1133333
##     class counts:    42    60
##    probabilities: 0.412 0.588 
##   left son=92 (30 obs) right son=93 (72 obs)
##   Primary splits:
##       Most.valuable.available.asset < 2.5    to the right, improve=7.061765, (0 missing)
##       No.of.Credits.at.this.Bank    < 1.5    to the left,  improve=5.158800, (0 missing)
##       Duration.of.Credit..month.    < 11.5   to the right, improve=4.778929, (0 missing)
##       Guarantors                    < 1.5    to the left,  improve=3.759167, (0 missing)
##       Credit.Amount                 < 624    to the right, improve=3.340892, (0 missing)
##   Surrogate splits:
##       Type.of.apartment          < 2.5    to the right, agree=0.755, adj=0.167, (0 split)
##       Credit.Amount              < 1238   to the right, agree=0.716, adj=0.033, (0 split)
##       No.of.Credits.at.this.Bank < 2.5    to the right, agree=0.716, adj=0.033, (0 split)
##       Occupation                 < 1.5    to the left,  agree=0.716, adj=0.033, (0 split)
##       No.of.dependents           < 1.5    to the right, agree=0.716, adj=0.033, (0 split)
## 
## Node number 47: 144 observations
##   predicted class=1  expected loss=0.2222222  P(node) =0.16
##     class counts:    32   112
##    probabilities: 0.222 0.778 
## 
## Node number 92: 30 observations,    complexity param=0.01438849
##   predicted class=0  expected loss=0.3  P(node) =0.03333333
##     class counts:    21     9
##    probabilities: 0.700 0.300 
##   left son=184 (18 obs) right son=185 (12 obs)
##   Primary splits:
##       Age..years.                       < 36.5   to the left,  improve=5.377778, (0 missing)
##       No.of.Credits.at.this.Bank        < 1.5    to the left,  improve=4.800000, (0 missing)
##       Duration.in.Current.address       < 3.5    to the left,  improve=2.609050, (0 missing)
##       Duration.of.Credit..month.        < 11.5   to the right, improve=2.304545, (0 missing)
##       Payment.Status.of.Previous.Credit < 3      to the left,  improve=1.679365, (0 missing)
##   Surrogate splits:
##       Duration.in.Current.address       < 3.5    to the left,  agree=0.833, adj=0.583, (0 split)
##       No.of.Credits.at.this.Bank        < 1.5    to the left,  agree=0.800, adj=0.500, (0 split)
##       Length.of.current.employment      < 4.5    to the left,  agree=0.767, adj=0.417, (0 split)
##       Duration.of.Credit..month.        < 8.5    to the right, agree=0.700, adj=0.250, (0 split)
##       Payment.Status.of.Previous.Credit < 3      to the left,  agree=0.700, adj=0.250, (0 split)
## 
## Node number 93: 72 observations
##   predicted class=1  expected loss=0.2916667  P(node) =0.08
##     class counts:    21    51
##    probabilities: 0.292 0.708 
## 
## Node number 184: 18 observations
##   predicted class=0  expected loss=0.05555556  P(node) =0.02
##     class counts:    17     1
##    probabilities: 0.944 0.056 
## 
## Node number 185: 12 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.01333333
##     class counts:     4     8
##    probabilities: 0.333 0.667
rpart.plot(DT, type=1, extra = 102)

library(randomForest) 
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##          
## cred_pred  0  1
##         0 12 11
##         1 10 67
library(readr)
whitewines <- read.csv("C:/Users/gmutya048/Downloads/whitewines.csv")
str(whitewines)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
hist(whitewines$quality)

Step 2: Exploring and Preparing the Data

wine_train <- whitewines[1:3750, ]
wine_test <- whitewines[3751:4898, ]

Step 3: Training a Model on the Data

library(rpart)
library(rpart.plot)
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525

News Popularity

Import the data

library(readr)
news<- read.csv("C:/Users/gmutya048/Downloads/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Technique 1

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

Pre-processing

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- subset( newsShort, select = -popular )
set.seed(12345)
news_rand <- newsShort[order(runif(49000)), ]
news_train <- news_rand[1:44000, ]
news_test <- news_rand[44001:49000, ]
prop.table(table(news_train$shares))
## 
##        no       yes 
## 0.4662568 0.5337432

Model training

library(C50)
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Oct 29 19:13:11 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## *** ignoring cases with bad or unknown class
## 
## Read 35622 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## num_imgs > 3:
## :...kw_max_max <= 15000:
## :   :...n_tokens_content <= 1111: no (20/2)
## :   :   n_tokens_content > 1111: yes (7/2)
## :   kw_max_max > 15000:
## :   :...num_keywords <= 5:
## :       :...n_unique_tokens > 0.5483304:
## :       :   :...num_videos > 0:
## :       :   :   :...kw_max_max <= 310800: no (8/3)
## :       :   :   :   kw_max_max > 310800: yes (87/21)
## :       :   :   num_videos <= 0:
## :       :   :   :...average_token_length > 5.22907: yes (13)
## :       :   :       average_token_length <= 5.22907:
## :       :   :       :...global_sentiment_polarity <= 0.04: no (27/8)
## :       :   :           global_sentiment_polarity > 0.04: yes (125/42)
## :       :   n_unique_tokens <= 0.5483304:
## :       :   :...n_tokens_title > 9:
## :       :       :...kw_max_max <= 690400: yes (78/33)
## :       :       :   kw_max_max > 690400: no (545/228)
## :       :       n_tokens_title <= 9:
## :       :       :...num_videos <= 1: yes (267/108)
## :       :           num_videos > 1:
## :       :           :...num_imgs > 7: no (7)
## :       :               num_imgs <= 7:
## :       :               :...num_videos <= 2: yes (7)
## :       :                   num_videos > 2: no (6/1)
## :       num_keywords > 5:
## :       :...num_hrefs > 17:
## :           :...n_tokens_content > 2691: yes (53/2)
## :           :   n_tokens_content <= 2691:
## :           :   :...n_tokens_content <= 342: yes (661/156)
## :           :       n_tokens_content > 342:
## :           :       :...n_tokens_title > 9:
## :           :           :...num_imgs <= 12:
## :           :           :   :...num_videos <= 6: yes (592/189)
## :           :           :   :   num_videos > 6:
## :           :           :   :   :...n_tokens_title <= 11: yes (15/3)
## :           :           :   :       n_tokens_title > 11: no (28/8)
## :           :           :   num_imgs > 12:
## :           :           :   :...kw_max_max > 310800:
## :           :           :       :...n_tokens_title <= 13: yes (716/287)
## :           :           :       :   n_tokens_title > 13: no (61/28)
## :           :           :       kw_max_max <= 310800:
## :           :           :       :...kw_max_max <= 17100: no (8)
## :           :           :           kw_max_max > 17100:
## :           :           :           :...num_hrefs <= 22: yes (8)
## :           :           :               num_hrefs > 22: no (31/12)
## :           :           n_tokens_title <= 9:
## :           :           :...abs_title_sentiment_polarity <= 0.425:
## :           :               :...num_imgs <= 28: yes (645/187)
## :           :               :   num_imgs > 28:
## :           :               :   :...average_token_length > 4.696252: yes (34/6)
## :           :               :       average_token_length <= 4.696252:
## :           :               :       :...kw_max_max <= 617900: yes (7/2)
## :           :               :           kw_max_max > 617900: no (33/6)
## :           :               abs_title_sentiment_polarity > 0.425:
## :           :               :...n_tokens_content > 457: yes (209/31)
## :           :                   n_tokens_content <= 457:
## :           :                   :...n_tokens_title <= 6: yes (6)
## :           :                       n_tokens_title > 6:
## :           :                       :...kw_max_max <= 690400: no (6)
## :           :                           kw_max_max > 690400:
## :           :                           :...num_keywords > 9: no (12/3)
## :           :                               num_keywords <= 9: [S1]
## :           num_hrefs <= 17:
## :           :...title_sentiment_polarity > 0.4: yes (658/198)
## :               title_sentiment_polarity <= 0.4:
## :               :...kw_max_max <= 617900: yes (479/157)
## :                   kw_max_max > 617900:
## :                   :...n_tokens_title > 10:
## :                       :...num_hrefs > 6: yes (1088/444)
## :                       :   num_hrefs <= 6:
## :                       :   :...num_videos <= 0: no (503/235)
## :                       :       num_videos > 0:
## :                       :       :...n_unique_tokens <= 0.3534247: yes (37/3)
## :                       :           n_unique_tokens > 0.3534247:
## :                       :           :...avg_positive_polarity > 0.3814904: yes (127/35)
## :                       :               avg_positive_polarity <= 0.3814904:
## :                       :               :...num_hrefs <= 5: no (142/60)
## :                       :                   num_hrefs > 5: [S2]
## :                       n_tokens_title <= 10:
## :                       :...global_sentiment_polarity > 0.09483681: yes (1135/393)
## :                           global_sentiment_polarity <= 0.09483681:
## :                           :...n_tokens_content > 1575:
## :                               :...title_sentiment_polarity <= -0.1461111: yes (4/1)
## :                               :   title_sentiment_polarity > -0.1461111: no (26/1)
## :                               n_tokens_content <= 1575:
## :                               :...kw_max_max <= 690400:
## :                                   :...n_tokens_title > 7: no (32/9)
## :                                   :   n_tokens_title <= 7: [S3]
## :                                   kw_max_max > 690400:
## :                                   :...num_imgs > 6: yes (520/196)
## :                                       num_imgs <= 6:
## :                                       :...num_hrefs <= 9:
## :                                           :...num_videos > 1: no (6)
## :                                           :   num_videos <= 1: [S4]
## :                                           num_hrefs > 9: [S5]
## num_imgs <= 3:
## :...n_unique_tokens > 0.4304388:
##     :...kw_max_max <= 617900:
##     :   :...n_tokens_content > 1204: yes (37/4)
##     :   :   n_tokens_content <= 1204:
##     :   :   :...kw_max_max > 69100: yes (1747/638)
##     :   :       kw_max_max <= 69100:
##     :   :       :...kw_max_max <= 41600: no (374/170)
##     :   :           kw_max_max > 41600: yes (1411/638)
##     :   kw_max_max > 617900:
##     :   :...global_sentiment_polarity <= 0.08261218:
##     :       :...num_imgs <= 0:
##     :       :   :...num_videos > 25:
##     :       :   :   :...num_hrefs <= 52: no (71/18)
##     :       :   :   :   num_hrefs > 52: yes (4)
##     :       :   :   num_videos <= 25:
##     :       :   :   :...kw_max_max <= 690400: yes (250/107)
##     :       :   :       kw_max_max > 690400:
##     :       :   :       :...num_videos > 15: yes (103/37)
##     :       :   :           num_videos <= 15:
##     :       :   :           :...num_keywords <= 6: no (360/141)
##     :       :   :               num_keywords > 6: yes (596/292)
##     :       :   num_imgs > 0:
##     :       :   :...n_unique_tokens > 0.7682927:
##     :       :       :...title_sentiment_polarity <= -0.3: no (3)
##     :       :       :   title_sentiment_polarity > -0.3: yes (45/13)
##     :       :       n_unique_tokens <= 0.7682927:
##     :       :       :...num_hrefs <= 1:
##     :       :           :...kw_max_max > 690400: no (163/26)
##     :       :           :   kw_max_max <= 690400:
##     :       :           :   :...num_keywords > 6: yes (3)
##     :       :           :       num_keywords <= 6:
##     :       :           :       :...num_keywords <= 5: yes (5/1)
##     :       :           :           num_keywords > 5: no (7)
##     :       :           num_hrefs > 1:
##     :       :           :...num_hrefs > 20:
##     :       :               :...num_keywords <= 6: no (150/64)
##     :       :               :   num_keywords > 6: yes (212/91)
##     :       :               num_hrefs <= 20:
##     :       :               :...average_token_length > 4.747073:
##     :       :                   :...num_hrefs > 4:
##     :       :                   :   :...n_tokens_content <= 164: yes (32/7)
##     :       :                   :   :   n_tokens_content > 164: no (2129/730)
##     :       :                   :   num_hrefs <= 4:
##     :       :                   :   :...kw_max_max <= 690400:
##     :       :                   :       :...n_tokens_title <= 9: no (22/7)
##     :       :                   :       :   n_tokens_title > 9: yes (21/5)
##     :       :                   :       kw_max_max > 690400:
##     :       :                   :       :...num_imgs <= 2: no (435/110)
##     :       :                   :           num_imgs > 2:
##     :       :                   :           :...n_unique_tokens <= 0.4491979: yes (4)
##     :       :                   :               n_unique_tokens > 0.4491979: no (30/10)
##     :       :                   average_token_length <= 4.747073:
##     :       :                   :...num_imgs > 2:
##     :       :                       :...num_hrefs <= 3:
##     :       :                       :   :...n_unique_tokens > 0.6126984: yes (3)
##     :       :                       :   :   n_unique_tokens <= 0.6126984: [S6]
##     :       :                       :   num_hrefs > 3:
##     :       :                       :   :...num_keywords > 6: yes (64/16)
##     :       :                       :       num_keywords <= 6: [S7]
##     :       :                       num_imgs <= 2:
##     :       :                       :...num_keywords <= 5: no (750/284)
##     :       :                           num_keywords > 5:
##     :       :                           :...num_imgs <= 1: no (1317/581)
##     :       :                               num_imgs > 1: [S8]
##     :       global_sentiment_polarity > 0.08261218:
##     :       :...num_hrefs > 15:
##     :           :...n_tokens_title <= 9: yes (582/189)
##     :           :   n_tokens_title > 9:
##     :           :   :...abs_title_subjectivity > 0.09583333: yes (748/329)
##     :           :       abs_title_subjectivity <= 0.09583333:
##     :           :       :...num_imgs <= 0: yes (43/18)
##     :           :           num_imgs > 0:
##     :           :           :...num_keywords <= 5: no (36/6)
##     :           :               num_keywords > 5:
##     :           :               :...num_videos > 0: no (46/11)
##     :           :                   num_videos <= 0:
##     :           :                   :...num_imgs > 1: yes (4)
##     :           :                       num_imgs <= 1: [S9]
##     :           num_hrefs <= 15:
##     :           :...num_imgs > 1:
##     :               :...n_unique_tokens > 0.6486486:
##     :               :   :...global_sentiment_polarity <= 0.3318182: yes (159/35)
##     :               :   :   global_sentiment_polarity > 0.3318182: no (10/2)
##     :               :   n_unique_tokens <= 0.6486486:
##     :               :   :...num_keywords <= 9: yes (1404/654)
##     :               :       num_keywords > 9:
##     :               :       :...num_videos > 1: yes (17/3)
##     :               :           num_videos <= 1:
##     :               :           :...num_videos <= 0:
##     :               :               :...num_hrefs <= 8: yes (74/20)
##     :               :               :   num_hrefs > 8: [S10]
##     :               :               num_videos > 0:
##     :               :               :...avg_positive_polarity <= 0.3091198: yes (19/5)
##     :               :                   avg_positive_polarity > 0.3091198:
##     :               :                   :...num_hrefs > 9: yes (20/1)
##     :               :                       num_hrefs <= 9:
##     :               :                       :...num_hrefs <= 6: no (14/2)
##     :               :                           num_hrefs > 6: yes (22/10)
##     :               num_imgs <= 1:
##     :               :...num_videos > 0:
##     :                   :...kw_max_max <= 690400:
##     :                   :   :...num_imgs <= 0:
##     :                   :   :   :...num_keywords <= 7:
##     :                   :   :   :   :...n_tokens_title <= 7: no (19/8)
##     :                   :   :   :   :   n_tokens_title > 7: yes (212/66)
##     :                   :   :   :   num_keywords > 7:
##     :                   :   :   :   :...n_unique_tokens <= 0.6703297: no (61/24)
##     :                   :   :   :       n_unique_tokens > 0.6703297: yes (22/4)
##     :                   :   :   num_imgs > 0:
##     :                   :   :   :...average_token_length <= 4.39404: yes (29/4)
##     :                   :   :       average_token_length > 4.39404:
##     :                   :   :       :...num_hrefs > 9: yes (44/13)
##     :                   :   :           num_hrefs <= 9:
##     :                   :   :           :...num_videos > 3: yes (12/3)
##     :                   :   :               num_videos <= 3:
##     :                   :   :               :...num_keywords > 6: no (69/17)
##     :                   :   :                   num_keywords <= 6: [S11]
##     :                   :   kw_max_max > 690400:
##     :                   :   :...num_videos > 10: yes (191/71)
##     :                   :       num_videos <= 10:
##     :                   :       :...n_tokens_title <= 9: no (771/370)
##     :                   :           n_tokens_title > 9:
##     :                   :           :...num_hrefs <= 9: yes (1684/755)
##     :                   :               num_hrefs > 9:
##     :                   :               :...num_keywords <= 8:
##     :                   :                   :...num_keywords <= 5: [S12]
##     :                   :                   :   num_keywords > 5: [S13]
##     :                   :                   num_keywords > 8:
##     :                   :                   :...num_imgs <= 0: no (44/19)
##     :                   :                       num_imgs > 0: [S14]
##     :                   num_videos <= 0:
##     :                   :...n_tokens_content > 668: yes (759/324)
##     :                       n_tokens_content <= 668:
##     :                       :...title_sentiment_polarity > -0.008333334:
##     :                           :...num_imgs > 0:
##     :                           :   :...n_tokens_content <= 121: yes (101/36)
##     :                           :   :   n_tokens_content > 121: no (4008/1746)
##     :                           :   num_imgs <= 0:
##     :                           :   :...n_tokens_content > 141:
##     :                           :       :...num_keywords <= 5: no (73/31)
##     :                           :       :   num_keywords > 5: yes (411/158)
##     :                           :       n_tokens_content <= 141:
##     :                           :       :...n_tokens_title > 9:
##     :                           :           :...n_unique_tokens <= 0.8265306: no (104/21)
##     :                           :           :   n_unique_tokens > 0.8265306: yes (8/1)
##     :                           :           n_tokens_title <= 9:
##     :                           :           :...kw_max_max <= 690400: yes (35/10)
##     :                           :               kw_max_max > 690400: [S15]
##     :                           title_sentiment_polarity <= -0.008333334:
##     :                           :...n_unique_tokens > 0.71875:
##     :                               :...n_tokens_title > 12: no (89/3)
##     :                               :   n_tokens_title <= 12:
##     :                               :   :...n_unique_tokens <= 0.7894737: no (69/11)
##     :                               :       n_unique_tokens > 0.7894737: yes (14/5)
##     :                               n_unique_tokens <= 0.71875:
##     :                               :...n_tokens_title > 12: no (89/25)
##     :                                   n_tokens_title <= 12:
##     :                                   :...num_imgs > 0: no (303/124)
##     :                                       num_imgs <= 0: [S16]
##     n_unique_tokens <= 0.4304388:
##     :...num_hrefs > 19: yes (387/93)
##         num_hrefs <= 19:
##         :...num_videos > 8:
##             :...num_keywords <= 8: no (40/10)
##             :   num_keywords > 8:
##             :   :...average_token_length <= 4.819527: yes (8)
##             :       average_token_length > 4.819527: no (2)
##             num_videos <= 8:
##             :...kw_max_max <= 663600: yes (180/46)
##                 kw_max_max > 663600:
##                 :...global_sentiment_polarity > 0.154227: yes (399/112)
##                     global_sentiment_polarity <= 0.154227:
##                     :...average_token_length > 4.729216:
##                         :...num_videos <= 1:
##                         :   :...num_hrefs <= 14: no (166/50)
##                         :   :   num_hrefs > 14:
##                         :   :   :...global_sentiment_polarity <= 0.1335317: no (40/19)
##                         :   :       global_sentiment_polarity > 0.1335317: yes (9)
##                         :   num_videos > 1:
##                         :   :...title_sentiment_polarity <= -0.1402778: no (5)
##                         :       title_sentiment_polarity > -0.1402778:
##                         :       :...avg_positive_polarity <= 0.2935334: no (5/1)
##                         :           avg_positive_polarity > 0.2935334: yes (19/3)
##                         average_token_length <= 4.729216:
##                         :...num_imgs > 2: yes (99/32)
##                             num_imgs <= 2:
##                             :...kw_max_max <= 690400: yes (85/34)
##                                 kw_max_max > 690400:
##                                 :...num_hrefs > 15: no (55/22)
##                                     num_hrefs <= 15:
##                                     :...num_hrefs > 10: yes (138/39)
##                                         num_hrefs <= 10:
##                                         :...n_tokens_content > 1113:
##                                             :...n_unique_tokens > 0.4021739: no (50/12)
##                                             :   n_unique_tokens <= 0.4021739:
##                                             :   :...num_videos > 2: no (4)
##                                             :       num_videos <= 2: [S17]
##                                             n_tokens_content <= 1113:
##                                             :...num_imgs > 1: yes (37/10)
##                                                 num_imgs <= 1:
##                                                 :...num_imgs > 0: [S18]
##                                                     num_imgs <= 0: [S19]
## 
## SubTree [S1]
## 
## n_tokens_content <= 422: yes (15/1)
## n_tokens_content > 422: no (7/2)
## 
## SubTree [S2]
## 
## title_sentiment_polarity <= -0.325: no (4)
## title_sentiment_polarity > -0.325: yes (38/12)
## 
## SubTree [S3]
## 
## global_sentiment_polarity <= 0.02484504: no (4/1)
## global_sentiment_polarity > 0.02484504: yes (7)
## 
## SubTree [S4]
## 
## n_tokens_content <= 436: yes (27/10)
## n_tokens_content > 436: no (66/15)
## 
## SubTree [S5]
## 
## average_token_length > 5.011019: no (5)
## average_token_length <= 5.011019:
## :...num_videos <= 3: yes (52/14)
##     num_videos > 3: no (5/1)
## 
## SubTree [S6]
## 
## global_sentiment_polarity <= 0.06216631: no (19/1)
## global_sentiment_polarity > 0.06216631:
## :...global_sentiment_polarity <= 0.07848307: yes (5)
##     global_sentiment_polarity > 0.07848307: no (3)
## 
## SubTree [S7]
## 
## average_token_length <= 4.407609: yes (6)
## average_token_length > 4.407609:
## :...n_tokens_title <= 14: no (48/19)
##     n_tokens_title > 14: yes (3)
## 
## SubTree [S8]
## 
## n_tokens_content <= 623: yes (216/101)
## n_tokens_content > 623:
## :...avg_positive_polarity <= 0.3597222: no (59/9)
##     avg_positive_polarity > 0.3597222:
##     :...num_keywords > 9: yes (4)
##         num_keywords <= 9:
##         :...global_sentiment_polarity <= 0.07012397: no (18/5)
##             global_sentiment_polarity > 0.07012397: yes (5)
## 
## SubTree [S9]
## 
## abs_title_subjectivity > 0.01136364: no (23/9)
## abs_title_subjectivity <= 0.01136364:
## :...avg_positive_polarity <= 0.2931903: no (2)
##     avg_positive_polarity > 0.2931903: yes (18/3)
## 
## SubTree [S10]
## 
## abs_title_subjectivity <= 0.4125: yes (28/8)
## abs_title_subjectivity > 0.4125: no (28/9)
## 
## SubTree [S11]
## 
## global_sentiment_polarity <= 0.2202445: no (35/16)
## global_sentiment_polarity > 0.2202445: yes (6)
## 
## SubTree [S12]
## 
## global_sentiment_polarity <= 0.2365167: no (43/14)
## global_sentiment_polarity > 0.2365167: yes (4)
## 
## SubTree [S13]
## 
## abs_title_sentiment_polarity <= 0.7: no (152/58)
## abs_title_sentiment_polarity > 0.7: yes (8/3)
## 
## SubTree [S14]
## 
## average_token_length > 4.955523: no (13/2)
## average_token_length <= 4.955523:
## :...num_videos <= 9: yes (71/20)
##     num_videos > 9: no (5/1)
## 
## SubTree [S15]
## 
## n_tokens_content <= 96: no (19/4)
## n_tokens_content > 96:
## :...title_subjectivity <= 0.6625: yes (10)
##     title_subjectivity > 0.6625: no (4/1)
## 
## SubTree [S16]
## 
## abs_title_subjectivity > 0.425: no (5)
## abs_title_subjectivity <= 0.425:
## :...n_tokens_title > 10: yes (23/4)
##     n_tokens_title <= 10:
##     :...n_tokens_title <= 9: yes (12/3)
##         n_tokens_title > 9:
##         :...n_tokens_content <= 168: yes (2)
##             n_tokens_content > 168: no (8)
## 
## SubTree [S17]
## 
## global_sentiment_polarity <= 0.1441667: yes (66/25)
## global_sentiment_polarity > 0.1441667: no (7)
## 
## SubTree [S18]
## 
## n_non_stop_words <= 0: yes (63/19)
## n_non_stop_words > 0:
## :...global_sentiment_polarity <= 0.02890873: no (8/1)
##     global_sentiment_polarity > 0.02890873:
##     :...n_unique_tokens <= 0.3853428: yes (18/1)
##         n_unique_tokens > 0.3853428:
##         :...num_hrefs <= 5: no (27/8)
##             num_hrefs > 5: yes (54/19)
## 
## SubTree [S19]
## 
## n_tokens_title > 14: no (28/10)
## n_tokens_title <= 14:
## :...num_videos > 0: yes (602/253)
##     num_videos <= 0:
##     :...n_tokens_title > 13: yes (5)
##         n_tokens_title <= 13:
##         :...num_keywords <= 7: no (15/1)
##             num_keywords > 7:
##             :...num_keywords <= 9: yes (14/3)
##                 num_keywords > 9:
##                 :...abs_title_subjectivity <= 0.09415584: yes (2)
##                     abs_title_subjectivity > 0.09415584: no (4)
## 
## 
## Evaluation on training data (35622 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##     190 13393(37.6%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    8682  7927    (a): class no
##    5466 13547    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% num_imgs
##   98.77% kw_max_max
##   82.81% num_hrefs
##   78.13% n_unique_tokens
##   67.85% global_sentiment_polarity
##   48.03% n_tokens_content
##   45.46% num_keywords
##   44.31% num_videos
##   37.22% n_tokens_title
##   29.28% title_sentiment_polarity
##   20.81% average_token_length
##    3.18% abs_title_sentiment_polarity
##    2.90% abs_title_subjectivity
##    1.45% avg_positive_polarity
##    0.48% n_non_stop_words
##    0.04% title_subjectivity
## 
## 
## Time: 0.9 secs

Evaluate the Model

news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##          
## news_pred   no  yes
##       no   867  674
##       yes 1014 1467
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 58.03083

Technique 2

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
news_model2<- ksvm(shares ~ ., data=news_train, kernel="vanilladot")
##  Setting default kernel parameters
summary(news_model2)
## Length  Class   Mode 
##      1   ksvm     S4

Evaluate the Model

news_pred2 <- predict(news_model2, news_test)
summary(news_pred2)
##   no  yes 
## 1353 2669

Technique 3

Model training

library(rpart)
m.rpart <- rpart(shares ~ ., data=news_train)
summary(m.rpart)
## Call:
## rpart(formula = shares ~ ., data = news_train)
##   n=35622 (8378 observations deleted due to missingness)
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.03251249      0 1.0000000 1.0000000 0.005668843
## 2 0.01216208      2 0.9349750 0.9471973 0.005642957
## 3 0.01000000      3 0.9228129 0.9380456 0.005637047
## 
## Variable importance
##                  num_imgs global_sentiment_polarity                kw_max_max 
##                        44                        22                        11 
##     avg_positive_polarity                 num_hrefs          n_non_stop_words 
##                         6                         5                         5 
##          n_tokens_content      average_token_length           n_unique_tokens 
##                         4                         2                         2 
## 
## Node number 1: 35622 observations,    complexity param=0.03251249
##   predicted class=yes  expected loss=0.4662568  P(node) =1
##     class counts: 16609 19013
##    probabilities: 0.466 0.534 
##   left son=2 (26313 obs) right son=3 (9309 obs)
##   Primary splits:
##       num_imgs                  < 3.5        to the left,  improve=204.43790, (0 missing)
##       num_hrefs                 < 13.5       to the left,  improve=163.26770, (0 missing)
##       global_sentiment_polarity < 0.09269021 to the left,  improve=130.72360, (0 missing)
##       n_unique_tokens           < 0.4396835  to the right, improve= 90.54954, (0 missing)
##       num_keywords              < 6.5        to the left,  improve= 78.65559, (0 missing)
##   Surrogate splits:
##       num_hrefs                 < 18.5       to the left,  agree=0.768, adj=0.114, (0 split)
##       n_non_stop_words          < 1          to the left,  agree=0.757, adj=0.070, (0 split)
##       n_tokens_content          < 1339.5     to the left,  agree=0.753, adj=0.057, (0 split)
##       global_sentiment_polarity < -0.1855226 to the right, agree=0.740, adj=0.004, (0 split)
##       average_token_length      < 5.626769   to the left,  agree=0.740, adj=0.004, (0 split)
## 
## Node number 2: 26313 observations,    complexity param=0.03251249
##   predicted class=yes  expected loss=0.4981188  P(node) =0.7386727
##     class counts: 13107 13206
##    probabilities: 0.498 0.502 
##   left son=4 (10534 obs) right son=5 (15779 obs)
##   Primary splits:
##       global_sentiment_polarity < 0.09269021 to the left,  improve=99.22453, (0 missing)
##       n_unique_tokens           < 0.4304826  to the right, improve=78.06604, (0 missing)
##       kw_max_max                < 677000     to the right, improve=76.51894, (0 missing)
##       num_hrefs                 < 19.5       to the left,  improve=56.02928, (0 missing)
##       average_token_length      < 4.781043   to the right, improve=54.85230, (0 missing)
##   Surrogate splits:
##       avg_positive_polarity < 0.3012278  to the left,  agree=0.702, adj=0.256, (0 split)
##       n_unique_tokens       < 0.1845133  to the left,  agree=0.630, adj=0.077, (0 split)
##       average_token_length  < 3.659021   to the left,  agree=0.630, adj=0.077, (0 split)
##       n_tokens_content      < 9          to the left,  agree=0.630, adj=0.077, (0 split)
##       n_non_stop_words      < 0.5        to the left,  agree=0.630, adj=0.077, (0 split)
## 
## Node number 3: 9309 observations
##   predicted class=yes  expected loss=0.3761951  P(node) =0.2613273
##     class counts:  3502  5807
##    probabilities: 0.376 0.624 
## 
## Node number 4: 10534 observations,    complexity param=0.01216208
##   predicted class=no   expected loss=0.4487374  P(node) =0.2957161
##     class counts:  5807  4727
##    probabilities: 0.551 0.449 
##   left son=8 (9288 obs) right son=9 (1246 obs)
##   Primary splits:
##       kw_max_max           < 654150     to the right, improve=49.48608, (0 missing)
##       n_non_stop_words     < 1          to the right, improve=48.04450, (0 missing)
##       n_tokens_content     < 161.5      to the right, improve=47.03210, (0 missing)
##       num_imgs             < 0.5        to the right, improve=45.60699, (0 missing)
##       average_token_length < 4.729654   to the right, improve=45.30676, (0 missing)
##   Surrogate splits:
##       n_unique_tokens < 0.9755102  to the left,  agree=0.882, adj=0.002, (0 split)
## 
## Node number 5: 15779 observations
##   predicted class=yes  expected loss=0.4626402  P(node) =0.4429566
##     class counts:  7300  8479
##    probabilities: 0.463 0.537 
## 
## Node number 8: 9288 observations
##   predicted class=no   expected loss=0.4309862  P(node) =0.2607377
##     class counts:  5285  4003
##    probabilities: 0.569 0.431 
## 
## Node number 9: 1246 observations
##   predicted class=yes  expected loss=0.4189406  P(node) =0.03497838
##     class counts:   522   724
##    probabilities: 0.419 0.581

Visualization

library(rpart.plot)
rpart.plot(m.rpart, digits=3, type=1)

Evaluate the Model

p.rpart <- predict(m.rpart, news_test)
summary(p.rpart) 
##        no              yes        
##  Min.   :0.3762   Min.   :0.4310  
##  1st Qu.:0.4626   1st Qu.:0.5374  
##  Median :0.4626   Median :0.5374  
##  Mean   :0.4659   Mean   :0.5341  
##  3rd Qu.:0.4626   3rd Qu.:0.5374  
##  Max.   :0.5690   Max.   :0.6238
summary(news_test$shares) 
##   no  yes NA's 
## 1881 2141  978