Title: Lab1 Method #1. Tree-based classification # Dataset:
# replace the ? (question marks) into NAs
rawdata <- read.csv(file="F:/homework/530/credit.csv", na.strings='?')
str(rawdata)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(rawdata)
## Creditability Account.Balance Duration.of.Credit..month.
## Min. :0.0 Min. :1.000 Min. : 4.0
## 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:12.0
## Median :1.0 Median :2.000 Median :18.0
## Mean :0.7 Mean :2.577 Mean :20.9
## 3rd Qu.:1.0 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :1.0 Max. :4.000 Max. :72.0
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex...Marital.Status Guarantors Duration.in.Current.address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most.valuable.available.asset Age..years. Concurrent.Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type.of.apartment No.of.Credits.at.this.Bank Occupation
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :1.000 Median :3.000
## Mean :1.928 Mean :1.407 Mean :2.904
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000 Max. :4.000
## No.of.dependents Telephone Foreign.Worker
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000
## Mean :1.155 Mean :1.404 Mean :1.037
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000 Max. :2.000
#check the target variable"Creditability"
table(rawdata$Creditability)
##
## 0 1
## 300 700
#randomize the data
set.seed(12346)
#split randomized data to train and test sets
dateset <- rawdata[order(runif(1000)), ]
summary(dateset$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
train <- dateset[1:900, ]
test <- dateset[901:1000, ]
prop.table(table(train$Creditability))
##
## 0 1
## 0.2955556 0.7044444
if (!require("C50")) {
install.packages("C50")
library(C50)
}
## Loading required package: C50
## Warning: package 'C50' was built under R version 3.6.3
train$Creditability <- as.factor(train$Creditability)
test$Creditability <- as.factor(test$Creditability)
credit_model <- C5.0(x = train[-1], y = train$Creditability)
summary(credit_model)
##
## Call:
## C5.0.default(x = train[-1], y = train$Creditability)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Jun 19 18:29:01 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Account.Balance <= 3:
## : :...Foreign.Worker > 1: 1 (3)
## : : Foreign.Worker <= 1:
## : : :...Most.valuable.available.asset <= 1:
## : : :...Occupation <= 2: 0 (5)
## : : : Occupation > 2:
## : : : :...Instalment.per.cent <= 3: 1 (7)
## : : : Instalment.per.cent > 3: 0 (4/1)
## : : Most.valuable.available.asset > 1:
## : : :...No.of.dependents <= 1: 1 (34/4)
## : : No.of.dependents > 1:
## : : :...Credit.Amount <= 1278: 0 (2)
## : : Credit.Amount > 1278: 1 (2)
## : Account.Balance > 3:
## : :...Concurrent.Credits <= 2:
## : :...Purpose <= 0:
## : : :...No.of.Credits.at.this.Bank <= 1: 1 (3)
## : : : No.of.Credits.at.this.Bank > 1: 0 (6/1)
## : : Purpose > 0:
## : : :...Purpose <= 4: 1 (28/2)
## : : Purpose > 4:
## : : :...Instalment.per.cent <= 1: 1 (2)
## : : Instalment.per.cent > 1:
## : : :...Length.of.current.employment <= 3: 0 (6/1)
## : : Length.of.current.employment > 3: 1 (7/1)
## : Concurrent.Credits > 2:
## : :...Length.of.current.employment > 3: 1 (139/5)
## : Length.of.current.employment <= 3:
## : :...Credit.Amount <= 4455: 1 (134/11)
## : Credit.Amount > 4455:
## : :...No.of.dependents > 1: 1 (5)
## : No.of.dependents <= 1:
## : :...Purpose <= 1:
## : :...Duration.of.Credit..month. <= 42: 1 (9)
## : : Duration.of.Credit..month. > 42: 0 (2)
## : Purpose > 1:
## : :...Sex...Marital.Status <= 2: 0 (4)
## : Sex...Marital.Status > 2:
## : :...Occupation <= 3: 1 (2)
## : Occupation > 3: 0 (4)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Type.of.apartment <= 1: 0 (14/1)
## : Type.of.apartment > 1:
## : :...Type.of.apartment > 2:
## : :...Length.of.current.employment <= 2: 1 (3/1)
## : : Length.of.current.employment > 2: 0 (8)
## : Type.of.apartment <= 2:
## : :...Occupation > 3: 0 (3)
## : Occupation <= 3:
## : :...Age..years. > 35: 0 (9/1)
## : Age..years. <= 35:
## : :...Telephone > 1: 1 (8/1)
## : Telephone <= 1:
## : :...Credit.Amount <= 3556: 0 (8/2)
## : Credit.Amount > 3556: 1 (5)
## Payment.Status.of.Previous.Credit > 1:
## :...Duration.of.Credit..month. > 22:
## :...Value.Savings.Stocks > 3:
## : :...Account.Balance > 1: 1 (19/1)
## : : Account.Balance <= 1:
## : : :...No.of.Credits.at.this.Bank > 1: 1 (3)
## : : No.of.Credits.at.this.Bank <= 1:
## : : :...Occupation > 3: 0 (2)
## : : Occupation <= 3:
## : : :...Telephone <= 1: 0 (7/1)
## : : Telephone > 1: 1 (2)
## : Value.Savings.Stocks <= 3:
## : :...Duration.of.Credit..month. > 42:
## : :...Duration.in.Current.address <= 1: 1 (3/1)
## : : Duration.in.Current.address > 1: 0 (25/2)
## : Duration.of.Credit..month. <= 42:
## : :...Purpose <= 0:
## : :...Value.Savings.Stocks > 1: 1 (4/1)
## : : Value.Savings.Stocks <= 1:
## : : :...Instalment.per.cent > 2: 0 (18/1)
## : : Instalment.per.cent <= 2:
## : : :...Age..years. <= 35: 0 (2)
## : : Age..years. > 35: 1 (2)
## : Purpose > 0:
## : :...Age..years. > 52: 1 (7)
## : Age..years. <= 52:
## : :...Type.of.apartment > 2:
## : :...Age..years. <= 46: 1 (8)
## : : Age..years. > 46: 0 (2)
## : Type.of.apartment <= 2:
## : :...Concurrent.Credits <= 1: 0 (8/1)
## : Concurrent.Credits > 1:
## : :...Sex...Marital.Status <= 2: [S1]
## : Sex...Marital.Status > 2:
## : :...No.of.Credits.at.this.Bank > 1:
## : :...Age..years. <= 25: 0 (5/1)
## : : Age..years. > 25: 1 (12/1)
## : No.of.Credits.at.this.Bank <= 1:
## : :...Guarantors > 2: 1 (2)
## : Guarantors <= 2:
## : :...No.of.dependents > 1: 0 (5)
## : No.of.dependents <= 1:
## : :...Telephone <= 1: [S2]
## : Telephone > 1: [S3]
## Duration.of.Credit..month. <= 22:
## :...Guarantors > 2:
## :...Purpose <= 1: 0 (4/1)
## : Purpose > 1: 1 (24)
## Guarantors <= 2:
## :...Payment.Status.of.Previous.Credit > 3: 1 (67/13)
## Payment.Status.of.Previous.Credit <= 3:
## :...Value.Savings.Stocks > 2: 1 (44/10)
## Value.Savings.Stocks <= 2:
## :...Instalment.per.cent <= 3:
## :...Occupation <= 2: 1 (16/2)
## : Occupation > 2:
## : :...No.of.Credits.at.this.Bank > 1:
## : :...Account.Balance <= 1: 0 (2)
## : : Account.Balance > 1:
## : : :...Type.of.apartment <= 1: 0 (2)
## : : Type.of.apartment > 1: 1 (3)
## : No.of.Credits.at.this.Bank <= 1:
## : :...Telephone > 1: [S4]
## : Telephone <= 1:
## : :...Credit.Amount <= 1347: 0 (6)
## : Credit.Amount > 1347: [S5]
## Instalment.per.cent > 3:
## :...No.of.Credits.at.this.Bank > 1:
## :...Type.of.apartment <= 1: 0 (3)
## : Type.of.apartment > 1: 1 (5)
## No.of.Credits.at.this.Bank <= 1:
## :...Age..years. > 35: 0 (11/1)
## Age..years. <= 35:
## :...Length.of.current.employment <= 1: 0 (4)
## Length.of.current.employment > 1:
## :...Credit.Amount > 1755: 1 (7)
## Credit.Amount <= 1755:
## :...Value.Savings.Stocks > 1: 0 (2)
## Value.Savings.Stocks <= 1:
## :...Telephone > 1: 0 (2)
## Telephone <= 1: [S6]
##
## SubTree [S1]
##
## No.of.Credits.at.this.Bank > 1: 0 (4)
## No.of.Credits.at.this.Bank <= 1:
## :...Duration.of.Credit..month. > 33: 1 (2)
## Duration.of.Credit..month. <= 33:
## :...Purpose <= 2: 0 (8/1)
## Purpose > 2:
## :...Account.Balance <= 1: 0 (5/1)
## Account.Balance > 1: 1 (3)
##
## SubTree [S2]
##
## Age..years. <= 31: 1 (8)
## Age..years. > 31: 0 (4/1)
##
## SubTree [S3]
##
## Age..years. <= 29: 0 (5)
## Age..years. > 29: 1 (5/1)
##
## SubTree [S4]
##
## Most.valuable.available.asset <= 3: 1 (10/1)
## Most.valuable.available.asset > 3: 0 (5/1)
##
## SubTree [S5]
##
## Duration.of.Credit..month. > 16: 0 (4/1)
## Duration.of.Credit..month. <= 16:
## :...Purpose <= 2: 1 (11)
## Purpose > 2:
## :...Length.of.current.employment <= 3: 0 (2)
## Length.of.current.employment > 3: 1 (3)
##
## SubTree [S6]
##
## Length.of.current.employment > 4: 1 (3)
## Length.of.current.employment <= 4:
## :...Length.of.current.employment > 3: 0 (2)
## Length.of.current.employment <= 3:
## :...Age..years. <= 27: 1 (9/3)
## Age..years. > 27: 0 (3)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 83 79( 8.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 207 59 (a): class 0
## 20 614 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 54.67% Payment.Status.of.Previous.Credit
## 49.44% Duration.of.Credit..month.
## 47.44% Concurrent.Credits
## 40.00% Length.of.current.employment
## 37.67% Value.Savings.Stocks
## 31.44% Guarantors
## 26.22% Purpose
## 25.67% Credit.Amount
## 21.11% No.of.Credits.at.this.Bank
## 18.89% Age..years.
## 18.11% Instalment.per.cent
## 17.44% Type.of.apartment
## 14.44% Occupation
## 12.44% Telephone
## 10.11% No.of.dependents
## 8.67% Sex...Marital.Status
## 7.67% Most.valuable.available.asset
## 6.33% Foreign.Worker
## 3.11% Duration.in.Current.address
##
##
## Time: 0.0 secs
cred_pred <- predict(credit_model, test)
#Model Evaluation Method 1
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}
## Loading required package: gmodels
## Warning: package 'gmodels' was built under R version 3.6.2
CrossTable(test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 19 | 15 | 34 |
## | 0.190 | 0.150 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 9 | 57 | 66 |
## | 0.090 | 0.570 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 28 | 72 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
(p <- table(cred_pred, test$Creditability))
##
## cred_pred 0 1
## 0 19 9
## 1 15 57
#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 76
if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}
## Loading required package: rpart
if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 3.6.3
if (!require("ggplot2")) {
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
DT <- rpart(Creditability ~ . , data= train)
summary(DT)
## Call:
## rpart(formula = Creditability ~ ., data = train)
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.04323308 0 1.0000000 1.0000000 0.05146151
## 2 0.04135338 3 0.8496241 0.9812030 0.05117617
## 3 0.01629073 4 0.8082707 0.9398496 0.05051540
## 4 0.01315789 7 0.7593985 0.9661654 0.05094120
## 5 0.01127820 14 0.6654135 0.9774436 0.05111799
## 6 0.01000000 15 0.6541353 0.9736842 0.05105944
##
## Variable importance
## Account.Balance Duration.of.Credit..month.
## 30 17
## Credit.Amount Value.Savings.Stocks
## 11 10
## Payment.Status.of.Previous.Credit Purpose
## 9 5
## Most.valuable.available.asset Age..years.
## 4 3
## Length.of.current.employment Guarantors
## 3 3
## Concurrent.Credits Occupation
## 2 2
## Type.of.apartment
## 1
##
## Node number 1: 900 observations, complexity param=0.04323308
## predicted class=1 expected loss=0.2955556 P(node) =1
## class counts: 266 634
## probabilities: 0.296 0.704
## left son=2 (492 obs) right son=3 (408 obs)
## Primary splits:
## Account.Balance < 2.5 to the left, improve=40.96088, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=15.92438, (0 missing)
## Value.Savings.Stocks < 2.5 to the left, improve=12.98196, (0 missing)
## Duration.of.Credit..month. < 43.5 to the right, improve=12.49926, (0 missing)
## Credit.Amount < 11191 to the right, improve=11.19833, (0 missing)
## Surrogate splits:
## Value.Savings.Stocks < 2.5 to the left, agree=0.617, adj=0.154, (0 split)
## Payment.Status.of.Previous.Credit < 3.5 to the left, agree=0.591, adj=0.098, (0 split)
## Age..years. < 30.5 to the left, agree=0.557, adj=0.022, (0 split)
## Duration.of.Credit..month. < 6.5 to the right, agree=0.554, adj=0.017, (0 split)
## Length.of.current.employment < 4.5 to the left, agree=0.554, adj=0.017, (0 split)
##
## Node number 2: 492 observations, complexity param=0.04323308
## predicted class=1 expected loss=0.4329268 P(node) =0.5466667
## class counts: 213 279
## probabilities: 0.433 0.567
## left son=4 (213 obs) right son=5 (279 obs)
## Primary splits:
## Duration.of.Credit..month. < 22.5 to the right, improve=11.010300, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve= 8.667244, (0 missing)
## Most.valuable.available.asset < 1.5 to the right, improve= 8.534391, (0 missing)
## Value.Savings.Stocks < 2.5 to the left, improve= 7.282673, (0 missing)
## Credit.Amount < 8079 to the right, improve= 7.238941, (0 missing)
## Surrogate splits:
## Credit.Amount < 2805.5 to the right, agree=0.756, adj=0.437, (0 split)
## Most.valuable.available.asset < 2.5 to the right, agree=0.652, adj=0.197, (0 split)
## Type.of.apartment < 2.5 to the right, agree=0.610, adj=0.099, (0 split)
## Purpose < 8.5 to the right, agree=0.608, adj=0.094, (0 split)
## Occupation < 3.5 to the right, agree=0.598, adj=0.070, (0 split)
##
## Node number 3: 408 observations
## predicted class=1 expected loss=0.129902 P(node) =0.4533333
## class counts: 53 355
## probabilities: 0.130 0.870
##
## Node number 4: 213 observations, complexity param=0.04323308
## predicted class=0 expected loss=0.4460094 P(node) =0.2366667
## class counts: 118 95
## probabilities: 0.554 0.446
## left son=8 (176 obs) right son=9 (37 obs)
## Primary splits:
## Value.Savings.Stocks < 3.5 to the left, improve=7.209076, (0 missing)
## Duration.of.Credit..month. < 43.5 to the right, improve=3.897888, (0 missing)
## Purpose < 0.5 to the left, improve=3.207167, (0 missing)
## Credit.Amount < 1376.5 to the left, improve=3.155186, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=2.469776, (0 missing)
##
## Node number 5: 279 observations, complexity param=0.04135338
## predicted class=1 expected loss=0.3405018 P(node) =0.31
## class counts: 95 184
## probabilities: 0.341 0.659
## left son=10 (25 obs) right son=11 (254 obs)
## Primary splits:
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=7.909699, (0 missing)
## Most.valuable.available.asset < 1.5 to the right, improve=5.732535, (0 missing)
## Credit.Amount < 1285 to the left, improve=3.493557, (0 missing)
## Guarantors < 2.5 to the left, improve=3.389748, (0 missing)
## Duration.of.Credit..month. < 11.5 to the right, improve=3.193548, (0 missing)
##
## Node number 8: 176 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.3863636 P(node) =0.1955556
## class counts: 108 68
## probabilities: 0.614 0.386
## left son=16 (37 obs) right son=17 (139 obs)
## Primary splits:
## Duration.of.Credit..month. < 43.5 to the right, improve=5.913810, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=2.169142, (0 missing)
## Credit.Amount < 11788 to the right, improve=2.048485, (0 missing)
## Purpose < 0.5 to the left, improve=1.923477, (0 missing)
## Duration.in.Current.address < 1.5 to the right, improve=1.757062, (0 missing)
## Surrogate splits:
## Credit.Amount < 13319.5 to the right, agree=0.812, adj=0.108, (0 split)
##
## Node number 9: 37 observations
## predicted class=1 expected loss=0.2702703 P(node) =0.04111111
## class counts: 10 27
## probabilities: 0.270 0.730
##
## Node number 10: 25 observations
## predicted class=0 expected loss=0.28 P(node) =0.02777778
## class counts: 18 7
## probabilities: 0.720 0.280
##
## Node number 11: 254 observations, complexity param=0.01629073
## predicted class=1 expected loss=0.3031496 P(node) =0.2822222
## class counts: 77 177
## probabilities: 0.303 0.697
## left son=22 (150 obs) right son=23 (104 obs)
## Primary splits:
## Most.valuable.available.asset < 1.5 to the right, improve=3.609063, (0 missing)
## Duration.of.Credit..month. < 11.5 to the right, improve=3.068361, (0 missing)
## Credit.Amount < 1285 to the left, improve=3.034775, (0 missing)
## Guarantors < 2.5 to the left, improve=2.417995, (0 missing)
## Age..years. < 25.5 to the left, improve=1.818908, (0 missing)
## Surrogate splits:
## Occupation < 2.5 to the right, agree=0.661, adj=0.173, (0 split)
## Guarantors < 1.5 to the left, agree=0.646, adj=0.135, (0 split)
## Sex...Marital.Status < 3.5 to the left, agree=0.618, adj=0.067, (0 split)
## Duration.of.Credit..month. < 10.5 to the right, agree=0.614, adj=0.058, (0 split)
## Foreign.Worker < 1.5 to the left, agree=0.614, adj=0.058, (0 split)
##
## Node number 16: 37 observations
## predicted class=0 expected loss=0.1351351 P(node) =0.04111111
## class counts: 32 5
## probabilities: 0.865 0.135
##
## Node number 17: 139 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4532374 P(node) =0.1544444
## class counts: 76 63
## probabilities: 0.547 0.453
## left son=34 (28 obs) right son=35 (111 obs)
## Primary splits:
## Purpose < 0.5 to the left, improve=2.896591, (0 missing)
## Credit.Amount < 2249 to the left, improve=2.896591, (0 missing)
## Guarantors < 2.5 to the left, improve=2.591311, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=1.888214, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.527897, (0 missing)
## Surrogate splits:
## Credit.Amount < 1368.5 to the left, agree=0.827, adj=0.143, (0 split)
##
## Node number 22: 150 observations, complexity param=0.01629073
## predicted class=1 expected loss=0.3733333 P(node) =0.1666667
## class counts: 56 94
## probabilities: 0.373 0.627
## left son=44 (49 obs) right son=45 (101 obs)
## Primary splits:
## Credit.Amount < 1285.5 to the left, improve=4.595234, (0 missing)
## Payment.Status.of.Previous.Credit < 3.5 to the left, improve=2.389231, (0 missing)
## No.of.Credits.at.this.Bank < 1.5 to the left, improve=1.698487, (0 missing)
## Age..years. < 25.5 to the left, improve=1.366154, (0 missing)
## Duration.of.Credit..month. < 8.5 to the right, improve=1.153320, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 8.5 to the left, agree=0.707, adj=0.102, (0 split)
## Purpose < 3.5 to the right, agree=0.700, adj=0.082, (0 split)
## Age..years. < 20.5 to the left, agree=0.687, adj=0.041, (0 split)
## No.of.dependents < 1.5 to the right, agree=0.680, adj=0.020, (0 split)
##
## Node number 23: 104 observations
## predicted class=1 expected loss=0.2019231 P(node) =0.1155556
## class counts: 21 83
## probabilities: 0.202 0.798
##
## Node number 34: 28 observations
## predicted class=0 expected loss=0.25 P(node) =0.03111111
## class counts: 21 7
## probabilities: 0.750 0.250
##
## Node number 35: 111 observations, complexity param=0.01315789
## predicted class=1 expected loss=0.4954955 P(node) =0.1233333
## class counts: 55 56
## probabilities: 0.495 0.505
## left son=70 (102 obs) right son=71 (9 obs)
## Primary splits:
## Guarantors < 2.5 to the left, improve=2.894188, (0 missing)
## Purpose < 1.5 to the right, improve=2.474901, (0 missing)
## Age..years. < 52.5 to the left, improve=1.919258, (0 missing)
## Concurrent.Credits < 1.5 to the left, improve=1.632795, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.424384, (0 missing)
##
## Node number 44: 49 observations, complexity param=0.01629073
## predicted class=0 expected loss=0.4489796 P(node) =0.05444444
## class counts: 27 22
## probabilities: 0.551 0.449
## left son=88 (37 obs) right son=89 (12 obs)
## Primary splits:
## Duration.of.Credit..month. < 8.5 to the right, improve=4.695348, (0 missing)
## Telephone < 1.5 to the left, improve=3.629513, (0 missing)
## Age..years. < 36.5 to the left, improve=3.482993, (0 missing)
## Most.valuable.available.asset < 2.5 to the right, improve=3.067640, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=2.803821, (0 missing)
## Surrogate splits:
## Credit.Amount < 624 to the right, agree=0.857, adj=0.417, (0 split)
## Occupation < 3.5 to the left, agree=0.796, adj=0.167, (0 split)
##
## Node number 45: 101 observations, complexity param=0.0112782
## predicted class=1 expected loss=0.2871287 P(node) =0.1122222
## class counts: 29 72
## probabilities: 0.287 0.713
## left son=90 (9 obs) right son=91 (92 obs)
## Primary splits:
## Credit.Amount < 6414.5 to the right, improve=2.846535, (0 missing)
## Length.of.current.employment < 4.5 to the right, improve=2.549066, (0 missing)
## Age..years. < 49 to the right, improve=1.884996, (0 missing)
## Payment.Status.of.Previous.Credit < 3.5 to the left, improve=1.604687, (0 missing)
## Instalment.per.cent < 3.5 to the left, improve=1.530175, (0 missing)
## Surrogate splits:
## Age..years. < 67.5 to the right, agree=0.921, adj=0.111, (0 split)
##
## Node number 70: 102 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4705882 P(node) =0.1133333
## class counts: 54 48
## probabilities: 0.529 0.471
## left son=140 (85 obs) right son=141 (17 obs)
## Primary splits:
## Purpose < 1.5 to the right, improve=2.258824, (0 missing)
## Concurrent.Credits < 1.5 to the left, improve=2.258824, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=1.846785, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.838022, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=1.411765, (0 missing)
##
## Node number 71: 9 observations
## predicted class=1 expected loss=0.1111111 P(node) =0.01
## class counts: 1 8
## probabilities: 0.111 0.889
##
## Node number 88: 37 observations
## predicted class=0 expected loss=0.3243243 P(node) =0.04111111
## class counts: 25 12
## probabilities: 0.676 0.324
##
## Node number 89: 12 observations
## predicted class=1 expected loss=0.1666667 P(node) =0.01333333
## class counts: 2 10
## probabilities: 0.167 0.833
##
## Node number 90: 9 observations
## predicted class=0 expected loss=0.3333333 P(node) =0.01
## class counts: 6 3
## probabilities: 0.667 0.333
##
## Node number 91: 92 observations
## predicted class=1 expected loss=0.25 P(node) =0.1022222
## class counts: 23 69
## probabilities: 0.250 0.750
##
## Node number 140: 85 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4235294 P(node) =0.09444444
## class counts: 49 36
## probabilities: 0.576 0.424
## left son=280 (16 obs) right son=281 (69 obs)
## Primary splits:
## Concurrent.Credits < 1.5 to the left, improve=2.196100, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=1.820168, (0 missing)
## Telephone < 1.5 to the right, improve=1.466993, (0 missing)
## Age..years. < 28.5 to the right, improve=1.191024, (0 missing)
## Duration.in.Current.address < 1.5 to the right, improve=1.152941, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 39.5 to the right, agree=0.824, adj=0.062, (0 split)
##
## Node number 141: 17 observations
## predicted class=1 expected loss=0.2941176 P(node) =0.01888889
## class counts: 5 12
## probabilities: 0.294 0.706
##
## Node number 280: 16 observations
## predicted class=0 expected loss=0.1875 P(node) =0.01777778
## class counts: 13 3
## probabilities: 0.812 0.188
##
## Node number 281: 69 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4782609 P(node) =0.07666667
## class counts: 36 33
## probabilities: 0.522 0.478
## left son=562 (44 obs) right son=563 (25 obs)
## Primary splits:
## Age..years. < 28.5 to the right, improve=2.0511460, (0 missing)
## Telephone < 1.5 to the right, improve=1.6213370, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=1.2326770, (0 missing)
## Credit.Amount < 2440.5 to the left, improve=0.8487091, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=0.7317415, (0 missing)
## Surrogate splits:
## Value.Savings.Stocks < 2.5 to the left, agree=0.696, adj=0.16, (0 split)
## Most.valuable.available.asset < 1.5 to the right, agree=0.681, adj=0.12, (0 split)
## Sex...Marital.Status < 3.5 to the left, agree=0.667, adj=0.08, (0 split)
## Type.of.apartment < 1.5 to the right, agree=0.652, adj=0.04, (0 split)
##
## Node number 562: 44 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.3863636 P(node) =0.04888889
## class counts: 27 17
## probabilities: 0.614 0.386
## left son=1124 (30 obs) right son=1125 (14 obs)
## Primary splits:
## Length.of.current.employment < 3.5 to the left, improve=2.7017320, (0 missing)
## Payment.Status.of.Previous.Credit < 2.5 to the left, improve=2.0774290, (0 missing)
## No.of.Credits.at.this.Bank < 1.5 to the left, improve=1.2803030, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=0.9350649, (0 missing)
## Credit.Amount < 2168.5 to the left, improve=0.6096681, (0 missing)
## Surrogate splits:
## Age..years. < 52.5 to the left, agree=0.705, adj=0.071, (0 split)
##
## Node number 563: 25 observations
## predicted class=1 expected loss=0.36 P(node) =0.02777778
## class counts: 9 16
## probabilities: 0.360 0.640
##
## Node number 1124: 30 observations
## predicted class=0 expected loss=0.2666667 P(node) =0.03333333
## class counts: 22 8
## probabilities: 0.733 0.267
##
## Node number 1125: 14 observations
## predicted class=1 expected loss=0.3571429 P(node) =0.01555556
## class counts: 5 9
## probabilities: 0.357 0.643
rpart.plot(DT, type=1, extra = 102)
#Method #2. Random forest
if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
train$Creditability <- as.factor(train$Creditability)
rf_model <- randomForest(Creditability ~ . , data= train)
summary(rf_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
#Evaluation
cred_pred <- predict(rf_model, test)
(p <- table(cred_pred, test$Creditability))
##
## cred_pred 0 1
## 0 17 10
## 1 17 56
#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 73
importance(rf_model)
## MeanDecreaseGini
## Account.Balance 40.807660
## Duration.of.Credit..month. 38.138331
## Payment.Status.of.Previous.Credit 21.509260
## Purpose 23.149714
## Credit.Amount 49.940301
## Value.Savings.Stocks 17.648853
## Length.of.current.employment 18.605632
## Instalment.per.cent 15.410043
## Sex...Marital.Status 13.440547
## Guarantors 7.243751
## Duration.in.Current.address 15.275036
## Most.valuable.available.asset 16.931605
## Age..years. 39.749907
## Concurrent.Credits 8.609968
## Type.of.apartment 9.814678
## No.of.Credits.at.this.Bank 8.131338
## Occupation 11.930925
## No.of.dependents 5.113108
## Telephone 7.538531
## Foreign.Worker 1.513942
#Three most imprtant features are Account.Balance at 49.280951, Account Balance at 41.392868, and Age by years at 38.919060.
#Now, Change the random seed to 23458 and find the new accuracy of random forest.
set.seed(23458)
rf_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= train)
cred_pred1 <- predict(rf_model1, test)
(p1 <- table(cred_pred1, test$Creditability))
##
## cred_pred1 0 1
## 0 8 7
## 1 26 59
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 67
#The accuracy is drop from 74 to 67 for this model when more parameters are added.
#Loading the data
winedata <- read.csv(file="F:/homework/530/whitewines.csv", na.strings='?')
str(winedata)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
summary(winedata)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
#splite train and test dataset
wine_train <- winedata[1:3750, ]
wine_test <- winedata[3751:4898, ]
#Model on the Data on train dataset
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0002630 0.02445079
## 2 0.05098911 1 0.8449895 0.8477242 0.02335875
## 3 0.02796998 2 0.7940004 0.8062166 0.02281638
## 4 0.01970128 3 0.7660304 0.7803097 0.02159532
## 5 0.01265926 4 0.7463291 0.7661249 0.02092459
## 6 0.01007193 5 0.7336698 0.7582832 0.02077944
## 7 0.01000000 6 0.7235979 0.7486815 0.02050843
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
#Model Visualization
rpart.plot(m.rpart, digits=3, type=1)
#Model evaluation
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
Q3- What is your interpretation about this amount of RMSE?
if (!require("Metrics")) {
install.packages("Metrics")
library(Metrics)
}
## Loading required package: Metrics
## Warning: package 'Metrics' was built under R version 3.6.3
rmse(wine_test$quality, p.rpart)
## [1] 0.7448093
#the lower RMSE value can be used to see if the model fit the observed data well.The lower the better.
#The root-mean-square error is 0.745.It is still high.
Method #4. News Popularity
#Import the data
Popularity<- read.csv(file="F:/homework/530/OnlineNewsPopularity_for_R.csv", na.strings='?')
# View(news)
str(Popularity)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
#minify instances
PopularitysShort <- data.frame(Popularity$n_tokens_title, Popularity$n_tokens_content, Popularity$n_unique_tokens, Popularity$n_non_stop_words, Popularity$num_hrefs, Popularity$num_imgs, Popularity$num_videos, Popularity$average_token_length, Popularity$num_keywords, Popularity$kw_max_max, Popularity$global_sentiment_polarity, Popularity$avg_positive_polarity, Popularity$title_subjectivity, Popularity$title_sentiment_polarity, Popularity$abs_title_subjectivity, Popularity$abs_title_sentiment_polarity, Popularity$shares)
colnames(PopularitysShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(PopularitysShort)
## 'data.frame': 39644 obs. of 17 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity: num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
Processing data
PopularitysShort$popular = rep('na', nrow(PopularitysShort))
for(i in 1:39644) {
if(PopularitysShort$shares[i] >= 1400) {
PopularitysShort$popular[i] = "1"}
else {PopularitysShort$popular[i] = "0"}
}
PopularitysShort$shares = PopularitysShort$popular
PopularitysShort$shares <- as.factor(PopularitysShort$shares)
set.seed(12345)
Popularitys_rand <- PopularitysShort[order(runif(10000)), ]
#Split the data into training and test datasets
Popularitys_train <- Popularitys_rand[1:9000, ]
Popularitys_test <- Popularitys_rand[9001:10000, ]
prop.table(table(Popularitys_train$shares))
##
## 0 1
## 0.4308889 0.5691111
prop.table(table(Popularitys_test$shares))
##
## 0 1
## 0.414 0.586
Model training
library("C50", lib.loc="~/R/win-library/3.5")
Popularitys_model <- C5.0(Popularitys_train[-17], Popularitys_train$shares)
summary(Popularitys_model)
##
## Call:
## C5.0.default(x = Popularitys_train[-17], y = Popularitys_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Jun 19 18:29:22 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class 0
## 5122 (b): class 1
##
##
## Attribute usage:
##
## 100.00% popular
##
##
## Time: 0.1 secs
Evaluate the model
#Evaluate the model
Popularitys_pred <- predict(Popularitys_model, Popularitys_test)
(p <- table(Popularitys_pred, Popularitys_test$shares))
##
## Popularitys_pred 0 1
## 0 414 0
## 1 0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
Q4- Try decision tree and random forest and evaluate the model.
Popularity<- read.csv(file="F:/homework/530/OnlineNewsPopularity_for_R.csv", na.strings='?')
Popularity <- Popularity[,-(1:2)]
#Check for outliers
Popularity=Popularity[!Popularity$n_unique_tokens==701,]
#minify instances
PopularityShort <- data.frame(Popularity$n_tokens_title, Popularity$n_tokens_content, Popularity$n_unique_tokens, Popularity$n_non_stop_words, Popularity$num_hrefs, Popularity$num_imgs, Popularity$num_videos, Popularity$average_token_length, Popularity$num_keywords, Popularity$kw_max_max, Popularity$global_sentiment_polarity, Popularity$avg_positive_polarity, Popularity$title_subjectivity, Popularity$title_sentiment_polarity, Popularity$abs_title_subjectivity, Popularity$abs_title_sentiment_polarity, Popularity$shares)
colnames(PopularityShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#Standardize the dataset
for(i in ncol(Popularity)-1){
Popularity[,i]<-scale(Popularity[,i], center = TRUE, scale = TRUE)
}
#Define popular articles
PopularityShort$shares <- as.factor(ifelse(PopularityShort$shares > 1400,1,0))
set.seed(12346)
Popularity_rand <- PopularityShort[order(runif(39643)), ]
Popularity_train <- Popularity_rand[1:3964, ]
Popularity_test <- Popularity_rand[3965:39643, ]
Popularity_train$shares <- as.factor(Popularity_train$shares)
random_modelNews <- randomForest(Popularity_train$shares ~ . , data= Popularity_train)
#Model training
cred_pridRF <- predict(random_modelNews, Popularity_test)
(p2 <- table(cred_pridRF, Popularity_test$shares))
##
## cred_pridRF 0 1
## 0 10014 7264
## 1 8088 10313
#Accuracy
(Accuracy <- sum(diag(p2))/sum(p2)*100)
## [1] 56.97189
#importance
importance(random_modelNews)
## MeanDecreaseGini
## n_tokens_title 110.95451
## n_tokens_content 172.15879
## n_unique_tokens 188.13602
## n_non_stop_words 168.50493
## num_hrefs 148.12312
## num_imgs 90.89631
## num_videos 52.40012
## average_token_length 195.31143
## num_keywords 93.64084
## kw_max_max 49.10667
## global_sentiment_polarity 214.07528
## avg_positive_polarity 187.35883
## title_subjectivity 74.85371
## title_sentiment_polarity 82.44230
## abs_title_subjectivity 73.17472
## abs_title_sentiment_polarity 68.02149
Step 2: Decision Tree
#Evaluate the model
Popularity_model <- C5.0(Popularity_train[-17], Popularity_train$shares)
summary(Popularity_model)
##
## Call:
## C5.0.default(x = Popularity_train[-17], y = Popularity_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Jun 19 18:29:34 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 3964 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## global_sentiment_polarity > 0.1079453:
## :...num_hrefs > 12:
## : :...n_tokens_title <= 10: 1 (436/135)
## : : n_tokens_title > 10:
## : : :...abs_title_subjectivity > 0.3416667: 1 (137/47)
## : : abs_title_subjectivity <= 0.3416667:
## : : :...n_unique_tokens <= 0.3832077: 1 (7)
## : : n_unique_tokens > 0.3832077: 0 (123/51)
## : num_hrefs <= 12:
## : :...abs_title_subjectivity > 0.1272727: 1 (1135/509)
## : abs_title_subjectivity <= 0.1272727:
## : :...num_imgs <= 1:
## : :...n_unique_tokens > 0.4742152: 0 (255/87)
## : : n_unique_tokens <= 0.4742152:
## : : :...n_tokens_title <= 12: 1 (22/3)
## : : n_tokens_title > 12: 0 (5)
## : num_imgs > 1:
## : :...num_imgs <= 6: 1 (47/16)
## : num_imgs > 6:
## : :...kw_max_max <= 227300: 1 (6/1)
## : kw_max_max > 227300: 0 (44/15)
## global_sentiment_polarity <= 0.1079453:
## :...num_hrefs > 19: 1 (186/79)
## num_hrefs <= 19:
## :...n_non_stop_words <= 0.9999999: 1 (112/51)
## n_non_stop_words > 0.9999999:
## :...num_keywords <= 6:
## :...num_hrefs <= 1: 0 (39/4)
## : num_hrefs > 1:
## : :...title_sentiment_polarity <= 0.1083333: 0 (449/140)
## : title_sentiment_polarity > 0.1083333:
## : :...num_keywords <= 5: 1 (74/28)
## : num_keywords > 5:
## : :...average_token_length <= 4.411043: 1 (9/1)
## : average_token_length > 4.411043: 0 (48/14)
## num_keywords > 6:
## :...num_imgs > 2: 1 (226/102)
## num_imgs <= 2:
## :...num_imgs > 0:
## :...kw_max_max <= 227300: 1 (25/10)
## : kw_max_max > 227300: 0 (437/159)
## num_imgs <= 0:
## :...num_keywords <= 7: 1 (46/15)
## num_keywords > 7:
## :...num_videos > 15: 1 (5)
## num_videos <= 15:
## :...num_hrefs <= 2: 1 (10/2)
## num_hrefs > 2: 0 (81/30)
##
##
## Evaluation on training data (3964 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 25 1499(37.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 981 999 (a): class 0
## 500 1484 (b): class 1
##
##
## Attribute usage:
##
## 100.00% num_hrefs
## 100.00% global_sentiment_polarity
## 44.93% abs_title_subjectivity
## 39.38% n_non_stop_words
## 36.55% num_keywords
## 30.50% num_imgs
## 18.42% n_tokens_title
## 14.63% title_sentiment_polarity
## 12.92% kw_max_max
## 10.39% n_unique_tokens
## 2.42% num_videos
## 1.44% average_token_length
##
##
## Time: 0.1 secs
#Model training
Popularity_pred <- predict(Popularity_model, Popularity_test)
CrossTable(Popularity_test$shares, Popularity_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 35679
##
##
## | predicted shares
## actual shares | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## 0 | 7692 | 10410 | 18102 |
## | 0.216 | 0.292 | |
## --------------|-----------|-----------|-----------|
## 1 | 5596 | 11981 | 17577 |
## | 0.157 | 0.336 | |
## --------------|-----------|-----------|-----------|
## Column Total | 13288 | 22391 | 35679 |
## --------------|-----------|-----------|-----------|
##
##
(p3 <- table(Popularity_pred, Popularity_test$shares))
##
## Popularity_pred 0 1
## 0 7692 5596
## 1 10410 11981
#Accuracy
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 55.13888