Method #1. Tree-based classification

Step 1: Collecting the data

credit <- read.csv("~/lab1/credit.csv")
View(credit)
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

table(credit$Creditability)

## 
##   0   1 
## 300 700

Step 2:Exploring the data

#randomize the data
set.seed(12346)
#Divide randomized data to train and test sets
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

table(credit$Creditability)

## 
##   0   1 
## 300 700

credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.2955556 0.7044444

prop.table(table(credit_test$Creditability))

## 
##    0    1 
## 0.34 0.66

Step 3: Training the model

if (!require("C50")) {
install.packages("C50")
library(C50)
}

## Loading required package: C50

## Warning: package 'C50' was built under R version 3.5.3

credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
summary(credit_model)

## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Dec 13 12:59:56 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Account.Balance > 2:
## :...Account.Balance <= 3:
## :   :...Foreign.Worker > 1: 1 (3)
## :   :   Foreign.Worker <= 1:
## :   :   :...Most.valuable.available.asset <= 1:
## :   :       :...Occupation <= 2: 0 (5)
## :   :       :   Occupation > 2:
## :   :       :   :...Instalment.per.cent <= 3: 1 (7)
## :   :       :       Instalment.per.cent > 3: 0 (4/1)
## :   :       Most.valuable.available.asset > 1:
## :   :       :...No.of.dependents <= 1: 1 (34/4)
## :   :           No.of.dependents > 1:
## :   :           :...Credit.Amount <= 1278: 0 (2)
## :   :               Credit.Amount > 1278: 1 (2)
## :   Account.Balance > 3:
## :   :...Concurrent.Credits <= 2:
## :       :...Purpose <= 0:
## :       :   :...No.of.Credits.at.this.Bank <= 1: 1 (3)
## :       :   :   No.of.Credits.at.this.Bank > 1: 0 (6/1)
## :       :   Purpose > 0:
## :       :   :...Purpose <= 4: 1 (28/2)
## :       :       Purpose > 4:
## :       :       :...Instalment.per.cent <= 1: 1 (2)
## :       :           Instalment.per.cent > 1:
## :       :           :...Length.of.current.employment <= 3: 0 (6/1)
## :       :               Length.of.current.employment > 3: 1 (7/1)
## :       Concurrent.Credits > 2:
## :       :...Length.of.current.employment > 3: 1 (139/5)
## :           Length.of.current.employment <= 3:
## :           :...Credit.Amount <= 4455: 1 (134/11)
## :               Credit.Amount > 4455:
## :               :...No.of.dependents > 1: 1 (5)
## :                   No.of.dependents <= 1:
## :                   :...Purpose <= 1:
## :                       :...Duration.of.Credit..month. <= 42: 1 (9)
## :                       :   Duration.of.Credit..month. > 42: 0 (2)
## :                       Purpose > 1:
## :                       :...Sex...Marital.Status <= 2: 0 (4)
## :                           Sex...Marital.Status > 2:
## :                           :...Occupation <= 3: 1 (2)
## :                               Occupation > 3: 0 (4)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
##     :...Type.of.apartment <= 1: 0 (14/1)
##     :   Type.of.apartment > 1:
##     :   :...Type.of.apartment > 2:
##     :       :...Length.of.current.employment <= 2: 1 (3/1)
##     :       :   Length.of.current.employment > 2: 0 (8)
##     :       Type.of.apartment <= 2:
##     :       :...Occupation > 3: 0 (3)
##     :           Occupation <= 3:
##     :           :...Age..years. > 35: 0 (9/1)
##     :               Age..years. <= 35:
##     :               :...Telephone > 1: 1 (8/1)
##     :                   Telephone <= 1:
##     :                   :...Credit.Amount <= 3556: 0 (8/2)
##     :                       Credit.Amount > 3556: 1 (5)
##     Payment.Status.of.Previous.Credit > 1:
##     :...Duration.of.Credit..month. > 22:
##         :...Value.Savings.Stocks > 3:
##         :   :...Account.Balance > 1: 1 (19/1)
##         :   :   Account.Balance <= 1:
##         :   :   :...No.of.Credits.at.this.Bank > 1: 1 (3)
##         :   :       No.of.Credits.at.this.Bank <= 1:
##         :   :       :...Occupation > 3: 0 (2)
##         :   :           Occupation <= 3:
##         :   :           :...Telephone <= 1: 0 (7/1)
##         :   :               Telephone > 1: 1 (2)
##         :   Value.Savings.Stocks <= 3:
##         :   :...Duration.of.Credit..month. > 42:
##         :       :...Duration.in.Current.address <= 1: 1 (3/1)
##         :       :   Duration.in.Current.address > 1: 0 (25/2)
##         :       Duration.of.Credit..month. <= 42:
##         :       :...Purpose <= 0:
##         :           :...Value.Savings.Stocks > 1: 1 (4/1)
##         :           :   Value.Savings.Stocks <= 1:
##         :           :   :...Instalment.per.cent > 2: 0 (18/1)
##         :           :       Instalment.per.cent <= 2:
##         :           :       :...Age..years. <= 35: 0 (2)
##         :           :           Age..years. > 35: 1 (2)
##         :           Purpose > 0:
##         :           :...Age..years. > 52: 1 (7)
##         :               Age..years. <= 52:
##         :               :...Type.of.apartment > 2:
##         :                   :...Age..years. <= 46: 1 (8)
##         :                   :   Age..years. > 46: 0 (2)
##         :                   Type.of.apartment <= 2:
##         :                   :...Concurrent.Credits <= 1: 0 (8/1)
##         :                       Concurrent.Credits > 1:
##         :                       :...Sex...Marital.Status <= 2: [S1]
##         :                           Sex...Marital.Status > 2:
##         :                           :...No.of.Credits.at.this.Bank > 1:
##         :                               :...Age..years. <= 25: 0 (5/1)
##         :                               :   Age..years. > 25: 1 (12/1)
##         :                               No.of.Credits.at.this.Bank <= 1:
##         :                               :...Guarantors > 2: 1 (2)
##         :                                   Guarantors <= 2:
##         :                                   :...No.of.dependents > 1: 0 (5)
##         :                                       No.of.dependents <= 1:
##         :                                       :...Telephone <= 1: [S2]
##         :                                           Telephone > 1: [S3]
##         Duration.of.Credit..month. <= 22:
##         :...Guarantors > 2:
##             :...Purpose <= 1: 0 (4/1)
##             :   Purpose > 1: 1 (24)
##             Guarantors <= 2:
##             :...Payment.Status.of.Previous.Credit > 3: 1 (67/13)
##                 Payment.Status.of.Previous.Credit <= 3:
##                 :...Value.Savings.Stocks > 2: 1 (44/10)
##                     Value.Savings.Stocks <= 2:
##                     :...Instalment.per.cent <= 3:
##                         :...Occupation <= 2: 1 (16/2)
##                         :   Occupation > 2:
##                         :   :...No.of.Credits.at.this.Bank > 1:
##                         :       :...Account.Balance <= 1: 0 (2)
##                         :       :   Account.Balance > 1:
##                         :       :   :...Type.of.apartment <= 1: 0 (2)
##                         :       :       Type.of.apartment > 1: 1 (3)
##                         :       No.of.Credits.at.this.Bank <= 1:
##                         :       :...Telephone > 1: [S4]
##                         :           Telephone <= 1:
##                         :           :...Credit.Amount <= 1347: 0 (6)
##                         :               Credit.Amount > 1347: [S5]
##                         Instalment.per.cent > 3:
##                         :...No.of.Credits.at.this.Bank > 1:
##                             :...Type.of.apartment <= 1: 0 (3)
##                             :   Type.of.apartment > 1: 1 (5)
##                             No.of.Credits.at.this.Bank <= 1:
##                             :...Age..years. > 35: 0 (11/1)
##                                 Age..years. <= 35:
##                                 :...Length.of.current.employment <= 1: 0 (4)
##                                     Length.of.current.employment > 1:
##                                     :...Credit.Amount > 1755: 1 (7)
##                                         Credit.Amount <= 1755:
##                                         :...Value.Savings.Stocks > 1: 0 (2)
##                                             Value.Savings.Stocks <= 1:
##                                             :...Telephone > 1: 0 (2)
##                                                 Telephone <= 1: [S6]
## 
## SubTree [S1]
## 
## No.of.Credits.at.this.Bank > 1: 0 (4)
## No.of.Credits.at.this.Bank <= 1:
## :...Duration.of.Credit..month. > 33: 1 (2)
##     Duration.of.Credit..month. <= 33:
##     :...Purpose <= 2: 0 (8/1)
##         Purpose > 2:
##         :...Account.Balance <= 1: 0 (5/1)
##             Account.Balance > 1: 1 (3)
## 
## SubTree [S2]
## 
## Age..years. <= 31: 1 (8)
## Age..years. > 31: 0 (4/1)
## 
## SubTree [S3]
## 
## Age..years. <= 29: 0 (5)
## Age..years. > 29: 1 (5/1)
## 
## SubTree [S4]
## 
## Most.valuable.available.asset <= 3: 1 (10/1)
## Most.valuable.available.asset > 3: 0 (5/1)
## 
## SubTree [S5]
## 
## Duration.of.Credit..month. > 16: 0 (4/1)
## Duration.of.Credit..month. <= 16:
## :...Purpose <= 2: 1 (11)
##     Purpose > 2:
##     :...Length.of.current.employment <= 3: 0 (2)
##         Length.of.current.employment > 3: 1 (3)
## 
## SubTree [S6]
## 
## Length.of.current.employment > 4: 1 (3)
## Length.of.current.employment <= 4:
## :...Length.of.current.employment > 3: 0 (2)
##     Length.of.current.employment <= 3:
##     :...Age..years. <= 27: 1 (9/3)
##         Age..years. > 27: 0 (3)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      83   79( 8.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     207    59    (a): class 0
##      20   614    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Account.Balance
##   54.67% Payment.Status.of.Previous.Credit
##   49.44% Duration.of.Credit..month.
##   47.44% Concurrent.Credits
##   40.00% Length.of.current.employment
##   37.67% Value.Savings.Stocks
##   31.44% Guarantors
##   26.22% Purpose
##   25.67% Credit.Amount
##   21.11% No.of.Credits.at.this.Bank
##   18.89% Age..years.
##   18.11% Instalment.per.cent
##   17.44% Type.of.apartment
##   14.44% Occupation
##   12.44% Telephone
##   10.11% No.of.dependents
##    8.67% Sex...Marital.Status
##    7.67% Most.valuable.available.asset
##    6.33% Foreign.Worker
##    3.11% Duration.in.Current.address
## 
## 
## Time: 0.0 secs

Step 4: Evaluating Model Performance

cred_pred <- predict(credit_model, credit_test)

#Model Evaluation Method 1
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}

## Loading required package: gmodels

## Warning: package 'gmodels' was built under R version 3.5.3

CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        19 |        15 |        34 | 
##                      |     0.190 |     0.150 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |         9 |        57 |        66 | 
##                      |     0.090 |     0.570 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        28 |        72 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
##

(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0 19  9
##         1 15 57

#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 76

Q1- If you see the accuracy 0f 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.

Answer: Accuracy means that the data generated using training model data set match the test set, it’s no saying the modal is well designed, because it could be an overmatched prediction model or it could cause an error in the creation of the model. The combination of accuracy and other parameters will give a better support on make a decision about the model.

Alternative solution & Visualize the model

if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}

## Loading required package: rpart

## Warning: package 'rpart' was built under R version 3.5.3

if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}

## Loading required package: rpart.plot

## Warning: package 'rpart.plot' was built under R version 3.5.3

if (!require("ggplot2")) {
install.packages("ggplot2")
library(ggplot2)
}

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.2

DT <- rpart(Creditability ~ . , data= credit_train)
summary(DT)

## Call:
## rpart(formula = Creditability ~ ., data = credit_train)
##   n= 900 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.04323308      0 1.0000000 1.0000000 0.05146151
## 2 0.04135338      3 0.8496241 0.9022556 0.04987406
## 3 0.01629073      4 0.8082707 0.8796992 0.04947003
## 4 0.01315789      7 0.7593985 0.8872180 0.04960634
## 5 0.01127820     14 0.6654135 0.9022556 0.04987406
## 6 0.01000000     15 0.6541353 0.9022556 0.04987406
## 
## Variable importance
##                   Account.Balance        Duration.of.Credit..month. 
##                                30                                17 
##                     Credit.Amount              Value.Savings.Stocks 
##                                11                                10 
## Payment.Status.of.Previous.Credit                           Purpose 
##                                 9                                 5 
##     Most.valuable.available.asset                       Age..years. 
##                                 4                                 3 
##      Length.of.current.employment                        Guarantors 
##                                 3                                 3 
##                Concurrent.Credits                        Occupation 
##                                 2                                 2 
##                 Type.of.apartment 
##                                 1 
## 
## Node number 1: 900 observations,    complexity param=0.04323308
##   predicted class=1  expected loss=0.2955556  P(node) =1
##     class counts:   266   634
##    probabilities: 0.296 0.704 
##   left son=2 (492 obs) right son=3 (408 obs)
##   Primary splits:
##       Account.Balance                   < 2.5     to the left,  improve=40.96088, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=15.92438, (0 missing)
##       Value.Savings.Stocks              < 2.5     to the left,  improve=12.98196, (0 missing)
##       Duration.of.Credit..month.        < 43.5    to the right, improve=12.49926, (0 missing)
##       Credit.Amount                     < 11191   to the right, improve=11.19833, (0 missing)
##   Surrogate splits:
##       Value.Savings.Stocks              < 2.5     to the left,  agree=0.617, adj=0.154, (0 split)
##       Payment.Status.of.Previous.Credit < 3.5     to the left,  agree=0.591, adj=0.098, (0 split)
##       Age..years.                       < 30.5    to the left,  agree=0.557, adj=0.022, (0 split)
##       Duration.of.Credit..month.        < 6.5     to the right, agree=0.554, adj=0.017, (0 split)
##       Length.of.current.employment      < 4.5     to the left,  agree=0.554, adj=0.017, (0 split)
## 
## Node number 2: 492 observations,    complexity param=0.04323308
##   predicted class=1  expected loss=0.4329268  P(node) =0.5466667
##     class counts:   213   279
##    probabilities: 0.433 0.567 
##   left son=4 (213 obs) right son=5 (279 obs)
##   Primary splits:
##       Duration.of.Credit..month.        < 22.5    to the right, improve=11.010300, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve= 8.667244, (0 missing)
##       Most.valuable.available.asset     < 1.5     to the right, improve= 8.534391, (0 missing)
##       Value.Savings.Stocks              < 2.5     to the left,  improve= 7.282673, (0 missing)
##       Credit.Amount                     < 8079    to the right, improve= 7.238941, (0 missing)
##   Surrogate splits:
##       Credit.Amount                 < 2805.5  to the right, agree=0.756, adj=0.437, (0 split)
##       Most.valuable.available.asset < 2.5     to the right, agree=0.652, adj=0.197, (0 split)
##       Type.of.apartment             < 2.5     to the right, agree=0.610, adj=0.099, (0 split)
##       Purpose                       < 8.5     to the right, agree=0.608, adj=0.094, (0 split)
##       Occupation                    < 3.5     to the right, agree=0.598, adj=0.070, (0 split)
## 
## Node number 3: 408 observations
##   predicted class=1  expected loss=0.129902  P(node) =0.4533333
##     class counts:    53   355
##    probabilities: 0.130 0.870 
## 
## Node number 4: 213 observations,    complexity param=0.04323308
##   predicted class=0  expected loss=0.4460094  P(node) =0.2366667
##     class counts:   118    95
##    probabilities: 0.554 0.446 
##   left son=8 (176 obs) right son=9 (37 obs)
##   Primary splits:
##       Value.Savings.Stocks       < 3.5     to the left,  improve=7.209076, (0 missing)
##       Duration.of.Credit..month. < 43.5    to the right, improve=3.897888, (0 missing)
##       Purpose                    < 0.5     to the left,  improve=3.207167, (0 missing)
##       Credit.Amount              < 1376.5  to the left,  improve=3.155186, (0 missing)
##       Instalment.per.cent        < 2.5     to the right, improve=2.469776, (0 missing)
## 
## Node number 5: 279 observations,    complexity param=0.04135338
##   predicted class=1  expected loss=0.3405018  P(node) =0.31
##     class counts:    95   184
##    probabilities: 0.341 0.659 
##   left son=10 (25 obs) right son=11 (254 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=7.909699, (0 missing)
##       Most.valuable.available.asset     < 1.5     to the right, improve=5.732535, (0 missing)
##       Credit.Amount                     < 1285    to the left,  improve=3.493557, (0 missing)
##       Guarantors                        < 2.5     to the left,  improve=3.389748, (0 missing)
##       Duration.of.Credit..month.        < 11.5    to the right, improve=3.193548, (0 missing)
## 
## Node number 8: 176 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.3863636  P(node) =0.1955556
##     class counts:   108    68
##    probabilities: 0.614 0.386 
##   left son=16 (37 obs) right son=17 (139 obs)
##   Primary splits:
##       Duration.of.Credit..month.  < 43.5    to the right, improve=5.913810, (0 missing)
##       Instalment.per.cent         < 2.5     to the right, improve=2.169142, (0 missing)
##       Credit.Amount               < 11788   to the right, improve=2.048485, (0 missing)
##       Purpose                     < 0.5     to the left,  improve=1.923477, (0 missing)
##       Duration.in.Current.address < 1.5     to the right, improve=1.757062, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 13319.5 to the right, agree=0.812, adj=0.108, (0 split)
## 
## Node number 9: 37 observations
##   predicted class=1  expected loss=0.2702703  P(node) =0.04111111
##     class counts:    10    27
##    probabilities: 0.270 0.730 
## 
## Node number 10: 25 observations
##   predicted class=0  expected loss=0.28  P(node) =0.02777778
##     class counts:    18     7
##    probabilities: 0.720 0.280 
## 
## Node number 11: 254 observations,    complexity param=0.01629073
##   predicted class=1  expected loss=0.3031496  P(node) =0.2822222
##     class counts:    77   177
##    probabilities: 0.303 0.697 
##   left son=22 (150 obs) right son=23 (104 obs)
##   Primary splits:
##       Most.valuable.available.asset < 1.5     to the right, improve=3.609063, (0 missing)
##       Duration.of.Credit..month.    < 11.5    to the right, improve=3.068361, (0 missing)
##       Credit.Amount                 < 1285    to the left,  improve=3.034775, (0 missing)
##       Guarantors                    < 2.5     to the left,  improve=2.417995, (0 missing)
##       Age..years.                   < 25.5    to the left,  improve=1.818908, (0 missing)
##   Surrogate splits:
##       Occupation                 < 2.5     to the right, agree=0.661, adj=0.173, (0 split)
##       Guarantors                 < 1.5     to the left,  agree=0.646, adj=0.135, (0 split)
##       Sex...Marital.Status       < 3.5     to the left,  agree=0.618, adj=0.067, (0 split)
##       Duration.of.Credit..month. < 10.5    to the right, agree=0.614, adj=0.058, (0 split)
##       Foreign.Worker             < 1.5     to the left,  agree=0.614, adj=0.058, (0 split)
## 
## Node number 16: 37 observations
##   predicted class=0  expected loss=0.1351351  P(node) =0.04111111
##     class counts:    32     5
##    probabilities: 0.865 0.135 
## 
## Node number 17: 139 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.4532374  P(node) =0.1544444
##     class counts:    76    63
##    probabilities: 0.547 0.453 
##   left son=34 (28 obs) right son=35 (111 obs)
##   Primary splits:
##       Purpose              < 0.5     to the left,  improve=2.896591, (0 missing)
##       Credit.Amount        < 2249    to the left,  improve=2.896591, (0 missing)
##       Guarantors           < 2.5     to the left,  improve=2.591311, (0 missing)
##       Instalment.per.cent  < 2.5     to the right, improve=1.888214, (0 missing)
##       Sex...Marital.Status < 2.5     to the left,  improve=1.527897, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 1368.5  to the left,  agree=0.827, adj=0.143, (0 split)
## 
## Node number 22: 150 observations,    complexity param=0.01629073
##   predicted class=1  expected loss=0.3733333  P(node) =0.1666667
##     class counts:    56    94
##    probabilities: 0.373 0.627 
##   left son=44 (49 obs) right son=45 (101 obs)
##   Primary splits:
##       Credit.Amount                     < 1285.5  to the left,  improve=4.595234, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5     to the left,  improve=2.389231, (0 missing)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  improve=1.698487, (0 missing)
##       Age..years.                       < 25.5    to the left,  improve=1.366154, (0 missing)
##       Duration.of.Credit..month.        < 8.5     to the right, improve=1.153320, (0 missing)
##   Surrogate splits:
##       Duration.of.Credit..month. < 8.5     to the left,  agree=0.707, adj=0.102, (0 split)
##       Purpose                    < 3.5     to the right, agree=0.700, adj=0.082, (0 split)
##       Age..years.                < 20.5    to the left,  agree=0.687, adj=0.041, (0 split)
##       No.of.dependents           < 1.5     to the right, agree=0.680, adj=0.020, (0 split)
## 
## Node number 23: 104 observations
##   predicted class=1  expected loss=0.2019231  P(node) =0.1155556
##     class counts:    21    83
##    probabilities: 0.202 0.798 
## 
## Node number 34: 28 observations
##   predicted class=0  expected loss=0.25  P(node) =0.03111111
##     class counts:    21     7
##    probabilities: 0.750 0.250 
## 
## Node number 35: 111 observations,    complexity param=0.01315789
##   predicted class=1  expected loss=0.4954955  P(node) =0.1233333
##     class counts:    55    56
##    probabilities: 0.495 0.505 
##   left son=70 (102 obs) right son=71 (9 obs)
##   Primary splits:
##       Guarantors           < 2.5     to the left,  improve=2.894188, (0 missing)
##       Purpose              < 1.5     to the right, improve=2.474901, (0 missing)
##       Age..years.          < 52.5    to the left,  improve=1.919258, (0 missing)
##       Concurrent.Credits   < 1.5     to the left,  improve=1.632795, (0 missing)
##       Sex...Marital.Status < 2.5     to the left,  improve=1.424384, (0 missing)
## 
## Node number 44: 49 observations,    complexity param=0.01629073
##   predicted class=0  expected loss=0.4489796  P(node) =0.05444444
##     class counts:    27    22
##    probabilities: 0.551 0.449 
##   left son=88 (37 obs) right son=89 (12 obs)
##   Primary splits:
##       Duration.of.Credit..month.    < 8.5     to the right, improve=4.695348, (0 missing)
##       Telephone                     < 1.5     to the left,  improve=3.629513, (0 missing)
##       Age..years.                   < 36.5    to the left,  improve=3.482993, (0 missing)
##       Most.valuable.available.asset < 2.5     to the right, improve=3.067640, (0 missing)
##       Length.of.current.employment  < 3.5     to the left,  improve=2.803821, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 624     to the right, agree=0.857, adj=0.417, (0 split)
##       Occupation    < 3.5     to the left,  agree=0.796, adj=0.167, (0 split)
## 
## Node number 45: 101 observations,    complexity param=0.0112782
##   predicted class=1  expected loss=0.2871287  P(node) =0.1122222
##     class counts:    29    72
##    probabilities: 0.287 0.713 
##   left son=90 (9 obs) right son=91 (92 obs)
##   Primary splits:
##       Credit.Amount                     < 6414.5  to the right, improve=2.846535, (0 missing)
##       Length.of.current.employment      < 4.5     to the right, improve=2.549066, (0 missing)
##       Age..years.                       < 49      to the right, improve=1.884996, (0 missing)
##       Payment.Status.of.Previous.Credit < 3.5     to the left,  improve=1.604687, (0 missing)
##       Instalment.per.cent               < 3.5     to the left,  improve=1.530175, (0 missing)
##   Surrogate splits:
##       Age..years. < 67.5    to the right, agree=0.921, adj=0.111, (0 split)
## 
## Node number 70: 102 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.4705882  P(node) =0.1133333
##     class counts:    54    48
##    probabilities: 0.529 0.471 
##   left son=140 (85 obs) right son=141 (17 obs)
##   Primary splits:
##       Purpose                           < 1.5     to the right, improve=2.258824, (0 missing)
##       Concurrent.Credits                < 1.5     to the left,  improve=2.258824, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=1.846785, (0 missing)
##       Sex...Marital.Status              < 2.5     to the left,  improve=1.838022, (0 missing)
##       Length.of.current.employment      < 3.5     to the left,  improve=1.411765, (0 missing)
## 
## Node number 71: 9 observations
##   predicted class=1  expected loss=0.1111111  P(node) =0.01
##     class counts:     1     8
##    probabilities: 0.111 0.889 
## 
## Node number 88: 37 observations
##   predicted class=0  expected loss=0.3243243  P(node) =0.04111111
##     class counts:    25    12
##    probabilities: 0.676 0.324 
## 
## Node number 89: 12 observations
##   predicted class=1  expected loss=0.1666667  P(node) =0.01333333
##     class counts:     2    10
##    probabilities: 0.167 0.833 
## 
## Node number 90: 9 observations
##   predicted class=0  expected loss=0.3333333  P(node) =0.01
##     class counts:     6     3
##    probabilities: 0.667 0.333 
## 
## Node number 91: 92 observations
##   predicted class=1  expected loss=0.25  P(node) =0.1022222
##     class counts:    23    69
##    probabilities: 0.250 0.750 
## 
## Node number 140: 85 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.4235294  P(node) =0.09444444
##     class counts:    49    36
##    probabilities: 0.576 0.424 
##   left son=280 (16 obs) right son=281 (69 obs)
##   Primary splits:
##       Concurrent.Credits                < 1.5     to the left,  improve=2.196100, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=1.820168, (0 missing)
##       Telephone                         < 1.5     to the right, improve=1.466993, (0 missing)
##       Age..years.                       < 28.5    to the right, improve=1.191024, (0 missing)
##       Duration.in.Current.address       < 1.5     to the right, improve=1.152941, (0 missing)
##   Surrogate splits:
##       Duration.of.Credit..month. < 39.5    to the right, agree=0.824, adj=0.062, (0 split)
## 
## Node number 141: 17 observations
##   predicted class=1  expected loss=0.2941176  P(node) =0.01888889
##     class counts:     5    12
##    probabilities: 0.294 0.706 
## 
## Node number 280: 16 observations
##   predicted class=0  expected loss=0.1875  P(node) =0.01777778
##     class counts:    13     3
##    probabilities: 0.812 0.188 
## 
## Node number 281: 69 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.4782609  P(node) =0.07666667
##     class counts:    36    33
##    probabilities: 0.522 0.478 
##   left son=562 (44 obs) right son=563 (25 obs)
##   Primary splits:
##       Age..years.                  < 28.5    to the right, improve=2.0511460, (0 missing)
##       Telephone                    < 1.5     to the right, improve=1.6213370, (0 missing)
##       Length.of.current.employment < 3.5     to the left,  improve=1.2326770, (0 missing)
##       Credit.Amount                < 2440.5  to the left,  improve=0.8487091, (0 missing)
##       Sex...Marital.Status         < 2.5     to the left,  improve=0.7317415, (0 missing)
##   Surrogate splits:
##       Value.Savings.Stocks          < 2.5     to the left,  agree=0.696, adj=0.16, (0 split)
##       Most.valuable.available.asset < 1.5     to the right, agree=0.681, adj=0.12, (0 split)
##       Sex...Marital.Status          < 3.5     to the left,  agree=0.667, adj=0.08, (0 split)
##       Type.of.apartment             < 1.5     to the right, agree=0.652, adj=0.04, (0 split)
## 
## Node number 562: 44 observations,    complexity param=0.01315789
##   predicted class=0  expected loss=0.3863636  P(node) =0.04888889
##     class counts:    27    17
##    probabilities: 0.614 0.386 
##   left son=1124 (30 obs) right son=1125 (14 obs)
##   Primary splits:
##       Length.of.current.employment      < 3.5     to the left,  improve=2.7017320, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  improve=2.0774290, (0 missing)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  improve=1.2803030, (0 missing)
##       Sex...Marital.Status              < 2.5     to the left,  improve=0.9350649, (0 missing)
##       Credit.Amount                     < 2168.5  to the left,  improve=0.6096681, (0 missing)
##   Surrogate splits:
##       Age..years. < 52.5    to the left,  agree=0.705, adj=0.071, (0 split)
## 
## Node number 563: 25 observations
##   predicted class=1  expected loss=0.36  P(node) =0.02777778
##     class counts:     9    16
##    probabilities: 0.360 0.640 
## 
## Node number 1124: 30 observations
##   predicted class=0  expected loss=0.2666667  P(node) =0.03333333
##     class counts:    22     8
##    probabilities: 0.733 0.267 
## 
## Node number 1125: 14 observations
##   predicted class=1  expected loss=0.3571429  P(node) =0.01555556
##     class counts:     5     9
##    probabilities: 0.357 0.643

rpart.plot(DT, type=1, extra = 102)

Method #2. Random forest

#Pre-processing
if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}

## Loading required package: randomForest

## Warning: package 'randomForest' was built under R version 3.5.3

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)

##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

#Model Evaluation
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0 18  9
##         1 16 57

#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 75

Q2- What are the three most important features in this model.

Answer:

importance(random_model)

##                                   MeanDecreaseGini
## Account.Balance                          41.274824
## Duration.of.Credit..month.               38.198510
## Payment.Status.of.Previous.Credit        21.554201
## Purpose                                  23.730586
## Credit.Amount                            49.202906
## Value.Savings.Stocks                     17.713488
## Length.of.current.employment             18.957148
## Instalment.per.cent                      15.251498
## Sex...Marital.Status                     13.607525
## Guarantors                                7.544454
## Duration.in.Current.address              15.079679
## Most.valuable.available.asset            16.454145
## Age..years.                              39.686369
## Concurrent.Credits                        8.231060
## Type.of.apartment                         9.910866
## No.of.Credits.at.this.Bank                8.164530
## Occupation                               12.122744
## No.of.dependents                          5.041466
## Telephone                                 7.251499
## Foreign.Worker                            1.676513

Three most imprtant features are Credit Amount at 49.202906, Account Balance at 41.274824, and Age by years at 37.019536.

Now, Change the random seed to 23458 and find the new accuracy of random forest.

Asnwer:

set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))

##           
## cred_pred1  0  1
##          0  8  7
##          1 26 59

(Accuracy <- sum(diag(p1))/sum(p1)*100)

## [1] 67

Now the accuracy is drop from 75 to 67 for this model after add more parameters.

Method #3. Adding regression to trees

#Loading the data
wine <- read.csv("~/lab1/whitewines.csv")
View(wine)
str(wine)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...

#Exploring and Preparing the Data
wine_train <- wine[1:3750, ] 
wine_test <- wine[3751:4898, ]

#Training a Model on the Data
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)

## Call:
## rpart(formula = quality ~ ., data = wine_train)
##   n= 3750 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.15501053      0 1.0000000 1.0005815 0.02446345
## 2 0.05098911      1 0.8449895 0.8461816 0.02334172
## 3 0.02796998      2 0.7940004 0.8024666 0.02270119
## 4 0.01970128      3 0.7660304 0.7777624 0.02148003
## 5 0.01265926      4 0.7463291 0.7612306 0.02075603
## 6 0.01007193      5 0.7336698 0.7551843 0.02070614
## 7 0.01000000      6 0.7235979 0.7514380 0.02055973
## 
## Variable importance
##              alcohol              density     volatile.acidity 
##                   34                   21                   15 
##            chlorides total.sulfur.dioxide  free.sulfur.dioxide 
##                   11                    7                    6 
##       residual.sugar            sulphates          citric.acid 
##                    3                    1                    1 
## 
## Node number 1: 3750 observations,    complexity param=0.1550105
##   mean=5.870933, MSE=0.7854751 
##   left son=2 (2372 obs) right son=3 (1378 obs)
##   Primary splits:
##       alcohol              < 10.85    to the left,  improve=0.15501050, (0 missing)
##       density              < 0.992035 to the right, improve=0.10915940, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.07682258, (0 missing)
##       total.sulfur.dioxide < 158.5    to the right, improve=0.04089663, (0 missing)
##       citric.acid          < 0.235    to the left,  improve=0.03636458, (0 missing)
##   Surrogate splits:
##       density              < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
##       chlorides            < 0.0375   to the right, agree=0.757, adj=0.339, (0 split)
##       total.sulfur.dioxide < 103.5    to the right, agree=0.690, adj=0.155, (0 split)
##       residual.sugar       < 5.375    to the right, agree=0.667, adj=0.094, (0 split)
##       sulphates            < 0.345    to the right, agree=0.647, adj=0.038, (0 split)
## 
## Node number 2: 2372 observations,    complexity param=0.05098911
##   mean=5.604975, MSE=0.5981709 
##   left son=4 (1611 obs) right son=5 (761 obs)
##   Primary splits:
##       volatile.acidity    < 0.2275   to the right, improve=0.10585250, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03390500, (0 missing)
##       citric.acid         < 0.235    to the left,  improve=0.03204075, (0 missing)
##       alcohol             < 10.11667 to the left,  improve=0.03136524, (0 missing)
##       chlorides           < 0.0585   to the right, improve=0.01633599, (0 missing)
##   Surrogate splits:
##       pH                   < 3.485    to the left,  agree=0.694, adj=0.047, (0 split)
##       sulphates            < 0.755    to the left,  agree=0.685, adj=0.020, (0 split)
##       total.sulfur.dioxide < 105.5    to the right, agree=0.683, adj=0.011, (0 split)
##       residual.sugar       < 0.75     to the right, agree=0.681, adj=0.007, (0 split)
##       chlorides            < 0.0285   to the right, agree=0.680, adj=0.003, (0 split)
## 
## Node number 3: 1378 observations,    complexity param=0.02796998
##   mean=6.328737, MSE=0.7765472 
##   left son=6 (84 obs) right son=7 (1294 obs)
##   Primary splits:
##       free.sulfur.dioxide  < 10.5     to the left,  improve=0.07699080, (0 missing)
##       alcohol              < 11.76667 to the left,  improve=0.06210660, (0 missing)
##       total.sulfur.dioxide < 67.5     to the left,  improve=0.04438619, (0 missing)
##       residual.sugar       < 1.375    to the left,  improve=0.02905351, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02613259, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 53.5     to the left,  agree=0.952, adj=0.214, (0 split)
##       volatile.acidity     < 0.875    to the right, agree=0.940, adj=0.024, (0 split)
## 
## Node number 4: 1611 observations,    complexity param=0.01265926
##   mean=5.43203, MSE=0.5098121 
##   left son=8 (688 obs) right son=9 (923 obs)
##   Primary splits:
##       volatile.acidity    < 0.3025   to the right, improve=0.04540111, (0 missing)
##       alcohol             < 10.05    to the left,  improve=0.03874403, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03338886, (0 missing)
##       chlorides           < 0.0495   to the right, improve=0.02574623, (0 missing)
##       citric.acid         < 0.195    to the left,  improve=0.02327981, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.215    to the left,  agree=0.633, adj=0.141, (0 split)
##       free.sulfur.dioxide  < 20.5     to the left,  agree=0.600, adj=0.063, (0 split)
##       chlorides            < 0.0595   to the right, agree=0.593, adj=0.047, (0 split)
##       residual.sugar       < 1.15     to the left,  agree=0.583, adj=0.023, (0 split)
##       total.sulfur.dioxide < 219.25   to the right, agree=0.582, adj=0.022, (0 split)
## 
## Node number 5: 761 observations
##   mean=5.971091, MSE=0.5878633 
## 
## Node number 6: 84 observations
##   mean=5.369048, MSE=1.137613 
## 
## Node number 7: 1294 observations,    complexity param=0.01970128
##   mean=6.391036, MSE=0.6894405 
##   left son=14 (629 obs) right son=15 (665 obs)
##   Primary splits:
##       alcohol              < 11.76667 to the left,  improve=0.06504696, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.02758705, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02750932, (0 missing)
##       pH                   < 3.055    to the left,  improve=0.02307356, (0 missing)
##       total.sulfur.dioxide < 191.5    to the right, improve=0.02186818, (0 missing)
##   Surrogate splits:
##       density              < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
##       volatile.acidity     < 0.2675   to the left,  agree=0.637, adj=0.253, (0 split)
##       chlorides            < 0.0365   to the right, agree=0.630, adj=0.238, (0 split)
##       residual.sugar       < 1.475    to the left,  agree=0.575, adj=0.126, (0 split)
##       total.sulfur.dioxide < 128.5    to the right, agree=0.574, adj=0.124, (0 split)
## 
## Node number 8: 688 observations
##   mean=5.255814, MSE=0.4054895 
## 
## Node number 9: 923 observations
##   mean=5.56338, MSE=0.5471747 
## 
## Node number 14: 629 observations,    complexity param=0.01007193
##   mean=6.173291, MSE=0.6838017 
##   left son=28 (11 obs) right son=29 (618 obs)
##   Primary splits:
##       volatile.acidity     < 0.465    to the right, improve=0.06897561, (0 missing)
##       total.sulfur.dioxide < 200      to the right, improve=0.04223066, (0 missing)
##       residual.sugar       < 0.975    to the left,  improve=0.03061714, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02978501, (0 missing)
##       sulphates            < 0.575    to the left,  improve=0.02165970, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.045    to the left,  agree=0.986, adj=0.182, (0 split)
##       total.sulfur.dioxide < 279.25   to the right, agree=0.986, adj=0.182, (0 split)
## 
## Node number 15: 665 observations
##   mean=6.596992, MSE=0.6075098 
## 
## Node number 28: 11 observations
##   mean=4.545455, MSE=0.9752066 
## 
## Node number 29: 618 observations
##   mean=6.202265, MSE=0.6306098

#Model Visualization
rpart.plot(m.rpart, digits=3, type=1)

#Model evaluation
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597

summary(wine_test$quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000

Q3- What is your interpretation about this amount of RMSE?

Asnwer:

if (!require("Metrics")) {
install.packages("Metrics")
library(Metrics)
}

## Loading required package: Metrics

## Warning: package 'Metrics' was built under R version 3.5.3

rmse(wine_test$quality, p.rpart)

## [1] 0.7448093

The root-mean-square error is 0.745, the lower RMSE value, the more model fit the observed data, and 0.745 is still high.

Method #4. News Popularity

#Import the data
news <- read.csv("~/lab1/OnlineNewsPopularity_for_R.csv")
# View(news)
str(news)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)

## 'data.frame':    39644 obs. of  17 variables:
##  $ n_tokens_title              : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content            : num  219 255 211 531 1072 ...
##  $ n_unique_tokens             : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words            : num  1 1 1 1 1 ...
##  $ num_hrefs                   : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_imgs                    : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length        : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ kw_max_max                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...

#Pre-processing
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "1"} 
     else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)

news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

prop.table(table(news_train$shares))

## 
##         0         1 
## 0.4308889 0.5691111

prop.table(table(news_test$shares))

## 
##     0     1 
## 0.414 0.586

#Model training
library("C50", lib.loc="~/R/win-library/3.5")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Dec 13 13:00:07 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class 0
##          5122    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.1 secs

#Evaluate the model
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))

##          
## news_pred   0   1
##         0 414   0
##         1   0 586

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 100

Q4- Try decision tree and random forest and evaluate the model.

Answer: #Step 1: RANDOM FOREST

news <- read.csv("~/lab1/OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]

#Check for outliers
news=news[!news$n_unique_tokens==701,]

#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

#Standardize the dataset
for(i in ncol(news)-1){ 
  news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}

#Define popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))

set.seed(12346)

news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)

#Model training
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))

##            
## cred_pridRF     0     1
##           0 10014  7264
##           1  8088 10313

#Accuracy
(Accuracy <- sum(diag(p2))/sum(p2)*100)

## [1] 56.97189

#importance
importance(random_modelNews)

##                              MeanDecreaseGini
## n_tokens_title                      110.95451
## n_tokens_content                    172.15879
## n_unique_tokens                     188.13602
## n_non_stop_words                    168.50493
## num_hrefs                           148.12312
## num_imgs                             90.89631
## num_videos                           52.40012
## average_token_length                195.31143
## num_keywords                         93.64084
## kw_max_max                           49.10667
## global_sentiment_polarity           214.07528
## avg_positive_polarity               187.35883
## title_subjectivity                   74.85371
## title_sentiment_polarity             82.44230
## abs_title_subjectivity               73.17472
## abs_title_sentiment_polarity         68.02149

Step 2: Decision Tree

news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Dec 13 13:00:13 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 3964 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## global_sentiment_polarity > 0.1079453:
## :...num_hrefs > 12:
## :   :...n_tokens_title <= 10: 1 (436/135)
## :   :   n_tokens_title > 10:
## :   :   :...abs_title_subjectivity > 0.3416667: 1 (137/47)
## :   :       abs_title_subjectivity <= 0.3416667:
## :   :       :...n_unique_tokens <= 0.3832077: 1 (7)
## :   :           n_unique_tokens > 0.3832077: 0 (123/51)
## :   num_hrefs <= 12:
## :   :...abs_title_subjectivity > 0.1272727: 1 (1135/509)
## :       abs_title_subjectivity <= 0.1272727:
## :       :...num_imgs <= 1:
## :           :...n_unique_tokens > 0.4742152: 0 (255/87)
## :           :   n_unique_tokens <= 0.4742152:
## :           :   :...n_tokens_title <= 12: 1 (22/3)
## :           :       n_tokens_title > 12: 0 (5)
## :           num_imgs > 1:
## :           :...num_imgs <= 6: 1 (47/16)
## :               num_imgs > 6:
## :               :...kw_max_max <= 227300: 1 (6/1)
## :                   kw_max_max > 227300: 0 (44/15)
## global_sentiment_polarity <= 0.1079453:
## :...num_hrefs > 19: 1 (186/79)
##     num_hrefs <= 19:
##     :...n_non_stop_words <= 0.9999999: 1 (112/51)
##         n_non_stop_words > 0.9999999:
##         :...num_keywords <= 6:
##             :...num_hrefs <= 1: 0 (39/4)
##             :   num_hrefs > 1:
##             :   :...title_sentiment_polarity <= 0.1083333: 0 (449/140)
##             :       title_sentiment_polarity > 0.1083333:
##             :       :...num_keywords <= 5: 1 (74/28)
##             :           num_keywords > 5:
##             :           :...average_token_length <= 4.411043: 1 (9/1)
##             :               average_token_length > 4.411043: 0 (48/14)
##             num_keywords > 6:
##             :...num_imgs > 2: 1 (226/102)
##                 num_imgs <= 2:
##                 :...num_imgs > 0:
##                     :...kw_max_max <= 227300: 1 (25/10)
##                     :   kw_max_max > 227300: 0 (437/159)
##                     num_imgs <= 0:
##                     :...num_keywords <= 7: 1 (46/15)
##                         num_keywords > 7:
##                         :...num_videos > 15: 1 (5)
##                             num_videos <= 15:
##                             :...num_hrefs <= 2: 1 (10/2)
##                                 num_hrefs > 2: 0 (81/30)
## 
## 
## Evaluation on training data (3964 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      25 1499(37.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     981   999    (a): class 0
##     500  1484    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% num_hrefs
##  100.00% global_sentiment_polarity
##   44.93% abs_title_subjectivity
##   39.38% n_non_stop_words
##   36.55% num_keywords
##   30.50% num_imgs
##   18.42% n_tokens_title
##   14.63% title_sentiment_polarity
##   12.92% kw_max_max
##   10.39% n_unique_tokens
##    2.42% num_videos
##    1.44% average_token_length
## 
## 
## Time: 0.1 secs

#Model training
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  35679 
## 
##  
##               | predicted shares 
## actual shares |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##             0 |      7692 |     10410 |     18102 | 
##               |     0.216 |     0.292 |           | 
## --------------|-----------|-----------|-----------|
##             1 |      5596 |     11981 |     17577 | 
##               |     0.157 |     0.336 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |     13288 |     22391 |     35679 | 
## --------------|-----------|-----------|-----------|
## 
##

(p3 <- table(news_pred, news_test$shares))

##          
## news_pred     0     1
##         0  7692  5596
##         1 10410 11981

#Accuracy
(Accuracy <- sum(diag(p3))/sum(p3)*100)

## [1] 55.13888

ANLY 530-53- B-2019 Lab #1

Student ID: 235208

2019-12-13