credit <- read.csv("~/lab1/credit.csv")
View(credit)
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
#randomize the data
set.seed(12346)
#Divide randomized data to train and test sets
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.2955556 0.7044444
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.34 0.66
if (!require("C50")) {
install.packages("C50")
library(C50)
}
## Loading required package: C50
## Warning: package 'C50' was built under R version 3.5.3
credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Dec 13 12:59:56 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Account.Balance <= 3:
## : :...Foreign.Worker > 1: 1 (3)
## : : Foreign.Worker <= 1:
## : : :...Most.valuable.available.asset <= 1:
## : : :...Occupation <= 2: 0 (5)
## : : : Occupation > 2:
## : : : :...Instalment.per.cent <= 3: 1 (7)
## : : : Instalment.per.cent > 3: 0 (4/1)
## : : Most.valuable.available.asset > 1:
## : : :...No.of.dependents <= 1: 1 (34/4)
## : : No.of.dependents > 1:
## : : :...Credit.Amount <= 1278: 0 (2)
## : : Credit.Amount > 1278: 1 (2)
## : Account.Balance > 3:
## : :...Concurrent.Credits <= 2:
## : :...Purpose <= 0:
## : : :...No.of.Credits.at.this.Bank <= 1: 1 (3)
## : : : No.of.Credits.at.this.Bank > 1: 0 (6/1)
## : : Purpose > 0:
## : : :...Purpose <= 4: 1 (28/2)
## : : Purpose > 4:
## : : :...Instalment.per.cent <= 1: 1 (2)
## : : Instalment.per.cent > 1:
## : : :...Length.of.current.employment <= 3: 0 (6/1)
## : : Length.of.current.employment > 3: 1 (7/1)
## : Concurrent.Credits > 2:
## : :...Length.of.current.employment > 3: 1 (139/5)
## : Length.of.current.employment <= 3:
## : :...Credit.Amount <= 4455: 1 (134/11)
## : Credit.Amount > 4455:
## : :...No.of.dependents > 1: 1 (5)
## : No.of.dependents <= 1:
## : :...Purpose <= 1:
## : :...Duration.of.Credit..month. <= 42: 1 (9)
## : : Duration.of.Credit..month. > 42: 0 (2)
## : Purpose > 1:
## : :...Sex...Marital.Status <= 2: 0 (4)
## : Sex...Marital.Status > 2:
## : :...Occupation <= 3: 1 (2)
## : Occupation > 3: 0 (4)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Type.of.apartment <= 1: 0 (14/1)
## : Type.of.apartment > 1:
## : :...Type.of.apartment > 2:
## : :...Length.of.current.employment <= 2: 1 (3/1)
## : : Length.of.current.employment > 2: 0 (8)
## : Type.of.apartment <= 2:
## : :...Occupation > 3: 0 (3)
## : Occupation <= 3:
## : :...Age..years. > 35: 0 (9/1)
## : Age..years. <= 35:
## : :...Telephone > 1: 1 (8/1)
## : Telephone <= 1:
## : :...Credit.Amount <= 3556: 0 (8/2)
## : Credit.Amount > 3556: 1 (5)
## Payment.Status.of.Previous.Credit > 1:
## :...Duration.of.Credit..month. > 22:
## :...Value.Savings.Stocks > 3:
## : :...Account.Balance > 1: 1 (19/1)
## : : Account.Balance <= 1:
## : : :...No.of.Credits.at.this.Bank > 1: 1 (3)
## : : No.of.Credits.at.this.Bank <= 1:
## : : :...Occupation > 3: 0 (2)
## : : Occupation <= 3:
## : : :...Telephone <= 1: 0 (7/1)
## : : Telephone > 1: 1 (2)
## : Value.Savings.Stocks <= 3:
## : :...Duration.of.Credit..month. > 42:
## : :...Duration.in.Current.address <= 1: 1 (3/1)
## : : Duration.in.Current.address > 1: 0 (25/2)
## : Duration.of.Credit..month. <= 42:
## : :...Purpose <= 0:
## : :...Value.Savings.Stocks > 1: 1 (4/1)
## : : Value.Savings.Stocks <= 1:
## : : :...Instalment.per.cent > 2: 0 (18/1)
## : : Instalment.per.cent <= 2:
## : : :...Age..years. <= 35: 0 (2)
## : : Age..years. > 35: 1 (2)
## : Purpose > 0:
## : :...Age..years. > 52: 1 (7)
## : Age..years. <= 52:
## : :...Type.of.apartment > 2:
## : :...Age..years. <= 46: 1 (8)
## : : Age..years. > 46: 0 (2)
## : Type.of.apartment <= 2:
## : :...Concurrent.Credits <= 1: 0 (8/1)
## : Concurrent.Credits > 1:
## : :...Sex...Marital.Status <= 2: [S1]
## : Sex...Marital.Status > 2:
## : :...No.of.Credits.at.this.Bank > 1:
## : :...Age..years. <= 25: 0 (5/1)
## : : Age..years. > 25: 1 (12/1)
## : No.of.Credits.at.this.Bank <= 1:
## : :...Guarantors > 2: 1 (2)
## : Guarantors <= 2:
## : :...No.of.dependents > 1: 0 (5)
## : No.of.dependents <= 1:
## : :...Telephone <= 1: [S2]
## : Telephone > 1: [S3]
## Duration.of.Credit..month. <= 22:
## :...Guarantors > 2:
## :...Purpose <= 1: 0 (4/1)
## : Purpose > 1: 1 (24)
## Guarantors <= 2:
## :...Payment.Status.of.Previous.Credit > 3: 1 (67/13)
## Payment.Status.of.Previous.Credit <= 3:
## :...Value.Savings.Stocks > 2: 1 (44/10)
## Value.Savings.Stocks <= 2:
## :...Instalment.per.cent <= 3:
## :...Occupation <= 2: 1 (16/2)
## : Occupation > 2:
## : :...No.of.Credits.at.this.Bank > 1:
## : :...Account.Balance <= 1: 0 (2)
## : : Account.Balance > 1:
## : : :...Type.of.apartment <= 1: 0 (2)
## : : Type.of.apartment > 1: 1 (3)
## : No.of.Credits.at.this.Bank <= 1:
## : :...Telephone > 1: [S4]
## : Telephone <= 1:
## : :...Credit.Amount <= 1347: 0 (6)
## : Credit.Amount > 1347: [S5]
## Instalment.per.cent > 3:
## :...No.of.Credits.at.this.Bank > 1:
## :...Type.of.apartment <= 1: 0 (3)
## : Type.of.apartment > 1: 1 (5)
## No.of.Credits.at.this.Bank <= 1:
## :...Age..years. > 35: 0 (11/1)
## Age..years. <= 35:
## :...Length.of.current.employment <= 1: 0 (4)
## Length.of.current.employment > 1:
## :...Credit.Amount > 1755: 1 (7)
## Credit.Amount <= 1755:
## :...Value.Savings.Stocks > 1: 0 (2)
## Value.Savings.Stocks <= 1:
## :...Telephone > 1: 0 (2)
## Telephone <= 1: [S6]
##
## SubTree [S1]
##
## No.of.Credits.at.this.Bank > 1: 0 (4)
## No.of.Credits.at.this.Bank <= 1:
## :...Duration.of.Credit..month. > 33: 1 (2)
## Duration.of.Credit..month. <= 33:
## :...Purpose <= 2: 0 (8/1)
## Purpose > 2:
## :...Account.Balance <= 1: 0 (5/1)
## Account.Balance > 1: 1 (3)
##
## SubTree [S2]
##
## Age..years. <= 31: 1 (8)
## Age..years. > 31: 0 (4/1)
##
## SubTree [S3]
##
## Age..years. <= 29: 0 (5)
## Age..years. > 29: 1 (5/1)
##
## SubTree [S4]
##
## Most.valuable.available.asset <= 3: 1 (10/1)
## Most.valuable.available.asset > 3: 0 (5/1)
##
## SubTree [S5]
##
## Duration.of.Credit..month. > 16: 0 (4/1)
## Duration.of.Credit..month. <= 16:
## :...Purpose <= 2: 1 (11)
## Purpose > 2:
## :...Length.of.current.employment <= 3: 0 (2)
## Length.of.current.employment > 3: 1 (3)
##
## SubTree [S6]
##
## Length.of.current.employment > 4: 1 (3)
## Length.of.current.employment <= 4:
## :...Length.of.current.employment > 3: 0 (2)
## Length.of.current.employment <= 3:
## :...Age..years. <= 27: 1 (9/3)
## Age..years. > 27: 0 (3)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 83 79( 8.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 207 59 (a): class 0
## 20 614 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 54.67% Payment.Status.of.Previous.Credit
## 49.44% Duration.of.Credit..month.
## 47.44% Concurrent.Credits
## 40.00% Length.of.current.employment
## 37.67% Value.Savings.Stocks
## 31.44% Guarantors
## 26.22% Purpose
## 25.67% Credit.Amount
## 21.11% No.of.Credits.at.this.Bank
## 18.89% Age..years.
## 18.11% Instalment.per.cent
## 17.44% Type.of.apartment
## 14.44% Occupation
## 12.44% Telephone
## 10.11% No.of.dependents
## 8.67% Sex...Marital.Status
## 7.67% Most.valuable.available.asset
## 6.33% Foreign.Worker
## 3.11% Duration.in.Current.address
##
##
## Time: 0.0 secs
cred_pred <- predict(credit_model, credit_test)
#Model Evaluation Method 1
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}
## Loading required package: gmodels
## Warning: package 'gmodels' was built under R version 3.5.3
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 19 | 15 | 34 |
## | 0.190 | 0.150 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 9 | 57 | 66 |
## | 0.090 | 0.570 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 28 | 72 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 19 9
## 1 15 57
#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 76
Answer: Accuracy means that the data generated using training model data set match the test set, it’s no saying the modal is well designed, because it could be an overmatched prediction model or it could cause an error in the creation of the model. The combination of accuracy and other parameters will give a better support on make a decision about the model.
if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.5.3
if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 3.5.3
if (!require("ggplot2")) {
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
DT <- rpart(Creditability ~ . , data= credit_train)
summary(DT)
## Call:
## rpart(formula = Creditability ~ ., data = credit_train)
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.04323308 0 1.0000000 1.0000000 0.05146151
## 2 0.04135338 3 0.8496241 0.9022556 0.04987406
## 3 0.01629073 4 0.8082707 0.8796992 0.04947003
## 4 0.01315789 7 0.7593985 0.8872180 0.04960634
## 5 0.01127820 14 0.6654135 0.9022556 0.04987406
## 6 0.01000000 15 0.6541353 0.9022556 0.04987406
##
## Variable importance
## Account.Balance Duration.of.Credit..month.
## 30 17
## Credit.Amount Value.Savings.Stocks
## 11 10
## Payment.Status.of.Previous.Credit Purpose
## 9 5
## Most.valuable.available.asset Age..years.
## 4 3
## Length.of.current.employment Guarantors
## 3 3
## Concurrent.Credits Occupation
## 2 2
## Type.of.apartment
## 1
##
## Node number 1: 900 observations, complexity param=0.04323308
## predicted class=1 expected loss=0.2955556 P(node) =1
## class counts: 266 634
## probabilities: 0.296 0.704
## left son=2 (492 obs) right son=3 (408 obs)
## Primary splits:
## Account.Balance < 2.5 to the left, improve=40.96088, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=15.92438, (0 missing)
## Value.Savings.Stocks < 2.5 to the left, improve=12.98196, (0 missing)
## Duration.of.Credit..month. < 43.5 to the right, improve=12.49926, (0 missing)
## Credit.Amount < 11191 to the right, improve=11.19833, (0 missing)
## Surrogate splits:
## Value.Savings.Stocks < 2.5 to the left, agree=0.617, adj=0.154, (0 split)
## Payment.Status.of.Previous.Credit < 3.5 to the left, agree=0.591, adj=0.098, (0 split)
## Age..years. < 30.5 to the left, agree=0.557, adj=0.022, (0 split)
## Duration.of.Credit..month. < 6.5 to the right, agree=0.554, adj=0.017, (0 split)
## Length.of.current.employment < 4.5 to the left, agree=0.554, adj=0.017, (0 split)
##
## Node number 2: 492 observations, complexity param=0.04323308
## predicted class=1 expected loss=0.4329268 P(node) =0.5466667
## class counts: 213 279
## probabilities: 0.433 0.567
## left son=4 (213 obs) right son=5 (279 obs)
## Primary splits:
## Duration.of.Credit..month. < 22.5 to the right, improve=11.010300, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve= 8.667244, (0 missing)
## Most.valuable.available.asset < 1.5 to the right, improve= 8.534391, (0 missing)
## Value.Savings.Stocks < 2.5 to the left, improve= 7.282673, (0 missing)
## Credit.Amount < 8079 to the right, improve= 7.238941, (0 missing)
## Surrogate splits:
## Credit.Amount < 2805.5 to the right, agree=0.756, adj=0.437, (0 split)
## Most.valuable.available.asset < 2.5 to the right, agree=0.652, adj=0.197, (0 split)
## Type.of.apartment < 2.5 to the right, agree=0.610, adj=0.099, (0 split)
## Purpose < 8.5 to the right, agree=0.608, adj=0.094, (0 split)
## Occupation < 3.5 to the right, agree=0.598, adj=0.070, (0 split)
##
## Node number 3: 408 observations
## predicted class=1 expected loss=0.129902 P(node) =0.4533333
## class counts: 53 355
## probabilities: 0.130 0.870
##
## Node number 4: 213 observations, complexity param=0.04323308
## predicted class=0 expected loss=0.4460094 P(node) =0.2366667
## class counts: 118 95
## probabilities: 0.554 0.446
## left son=8 (176 obs) right son=9 (37 obs)
## Primary splits:
## Value.Savings.Stocks < 3.5 to the left, improve=7.209076, (0 missing)
## Duration.of.Credit..month. < 43.5 to the right, improve=3.897888, (0 missing)
## Purpose < 0.5 to the left, improve=3.207167, (0 missing)
## Credit.Amount < 1376.5 to the left, improve=3.155186, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=2.469776, (0 missing)
##
## Node number 5: 279 observations, complexity param=0.04135338
## predicted class=1 expected loss=0.3405018 P(node) =0.31
## class counts: 95 184
## probabilities: 0.341 0.659
## left son=10 (25 obs) right son=11 (254 obs)
## Primary splits:
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=7.909699, (0 missing)
## Most.valuable.available.asset < 1.5 to the right, improve=5.732535, (0 missing)
## Credit.Amount < 1285 to the left, improve=3.493557, (0 missing)
## Guarantors < 2.5 to the left, improve=3.389748, (0 missing)
## Duration.of.Credit..month. < 11.5 to the right, improve=3.193548, (0 missing)
##
## Node number 8: 176 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.3863636 P(node) =0.1955556
## class counts: 108 68
## probabilities: 0.614 0.386
## left son=16 (37 obs) right son=17 (139 obs)
## Primary splits:
## Duration.of.Credit..month. < 43.5 to the right, improve=5.913810, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=2.169142, (0 missing)
## Credit.Amount < 11788 to the right, improve=2.048485, (0 missing)
## Purpose < 0.5 to the left, improve=1.923477, (0 missing)
## Duration.in.Current.address < 1.5 to the right, improve=1.757062, (0 missing)
## Surrogate splits:
## Credit.Amount < 13319.5 to the right, agree=0.812, adj=0.108, (0 split)
##
## Node number 9: 37 observations
## predicted class=1 expected loss=0.2702703 P(node) =0.04111111
## class counts: 10 27
## probabilities: 0.270 0.730
##
## Node number 10: 25 observations
## predicted class=0 expected loss=0.28 P(node) =0.02777778
## class counts: 18 7
## probabilities: 0.720 0.280
##
## Node number 11: 254 observations, complexity param=0.01629073
## predicted class=1 expected loss=0.3031496 P(node) =0.2822222
## class counts: 77 177
## probabilities: 0.303 0.697
## left son=22 (150 obs) right son=23 (104 obs)
## Primary splits:
## Most.valuable.available.asset < 1.5 to the right, improve=3.609063, (0 missing)
## Duration.of.Credit..month. < 11.5 to the right, improve=3.068361, (0 missing)
## Credit.Amount < 1285 to the left, improve=3.034775, (0 missing)
## Guarantors < 2.5 to the left, improve=2.417995, (0 missing)
## Age..years. < 25.5 to the left, improve=1.818908, (0 missing)
## Surrogate splits:
## Occupation < 2.5 to the right, agree=0.661, adj=0.173, (0 split)
## Guarantors < 1.5 to the left, agree=0.646, adj=0.135, (0 split)
## Sex...Marital.Status < 3.5 to the left, agree=0.618, adj=0.067, (0 split)
## Duration.of.Credit..month. < 10.5 to the right, agree=0.614, adj=0.058, (0 split)
## Foreign.Worker < 1.5 to the left, agree=0.614, adj=0.058, (0 split)
##
## Node number 16: 37 observations
## predicted class=0 expected loss=0.1351351 P(node) =0.04111111
## class counts: 32 5
## probabilities: 0.865 0.135
##
## Node number 17: 139 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4532374 P(node) =0.1544444
## class counts: 76 63
## probabilities: 0.547 0.453
## left son=34 (28 obs) right son=35 (111 obs)
## Primary splits:
## Purpose < 0.5 to the left, improve=2.896591, (0 missing)
## Credit.Amount < 2249 to the left, improve=2.896591, (0 missing)
## Guarantors < 2.5 to the left, improve=2.591311, (0 missing)
## Instalment.per.cent < 2.5 to the right, improve=1.888214, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.527897, (0 missing)
## Surrogate splits:
## Credit.Amount < 1368.5 to the left, agree=0.827, adj=0.143, (0 split)
##
## Node number 22: 150 observations, complexity param=0.01629073
## predicted class=1 expected loss=0.3733333 P(node) =0.1666667
## class counts: 56 94
## probabilities: 0.373 0.627
## left son=44 (49 obs) right son=45 (101 obs)
## Primary splits:
## Credit.Amount < 1285.5 to the left, improve=4.595234, (0 missing)
## Payment.Status.of.Previous.Credit < 3.5 to the left, improve=2.389231, (0 missing)
## No.of.Credits.at.this.Bank < 1.5 to the left, improve=1.698487, (0 missing)
## Age..years. < 25.5 to the left, improve=1.366154, (0 missing)
## Duration.of.Credit..month. < 8.5 to the right, improve=1.153320, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 8.5 to the left, agree=0.707, adj=0.102, (0 split)
## Purpose < 3.5 to the right, agree=0.700, adj=0.082, (0 split)
## Age..years. < 20.5 to the left, agree=0.687, adj=0.041, (0 split)
## No.of.dependents < 1.5 to the right, agree=0.680, adj=0.020, (0 split)
##
## Node number 23: 104 observations
## predicted class=1 expected loss=0.2019231 P(node) =0.1155556
## class counts: 21 83
## probabilities: 0.202 0.798
##
## Node number 34: 28 observations
## predicted class=0 expected loss=0.25 P(node) =0.03111111
## class counts: 21 7
## probabilities: 0.750 0.250
##
## Node number 35: 111 observations, complexity param=0.01315789
## predicted class=1 expected loss=0.4954955 P(node) =0.1233333
## class counts: 55 56
## probabilities: 0.495 0.505
## left son=70 (102 obs) right son=71 (9 obs)
## Primary splits:
## Guarantors < 2.5 to the left, improve=2.894188, (0 missing)
## Purpose < 1.5 to the right, improve=2.474901, (0 missing)
## Age..years. < 52.5 to the left, improve=1.919258, (0 missing)
## Concurrent.Credits < 1.5 to the left, improve=1.632795, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.424384, (0 missing)
##
## Node number 44: 49 observations, complexity param=0.01629073
## predicted class=0 expected loss=0.4489796 P(node) =0.05444444
## class counts: 27 22
## probabilities: 0.551 0.449
## left son=88 (37 obs) right son=89 (12 obs)
## Primary splits:
## Duration.of.Credit..month. < 8.5 to the right, improve=4.695348, (0 missing)
## Telephone < 1.5 to the left, improve=3.629513, (0 missing)
## Age..years. < 36.5 to the left, improve=3.482993, (0 missing)
## Most.valuable.available.asset < 2.5 to the right, improve=3.067640, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=2.803821, (0 missing)
## Surrogate splits:
## Credit.Amount < 624 to the right, agree=0.857, adj=0.417, (0 split)
## Occupation < 3.5 to the left, agree=0.796, adj=0.167, (0 split)
##
## Node number 45: 101 observations, complexity param=0.0112782
## predicted class=1 expected loss=0.2871287 P(node) =0.1122222
## class counts: 29 72
## probabilities: 0.287 0.713
## left son=90 (9 obs) right son=91 (92 obs)
## Primary splits:
## Credit.Amount < 6414.5 to the right, improve=2.846535, (0 missing)
## Length.of.current.employment < 4.5 to the right, improve=2.549066, (0 missing)
## Age..years. < 49 to the right, improve=1.884996, (0 missing)
## Payment.Status.of.Previous.Credit < 3.5 to the left, improve=1.604687, (0 missing)
## Instalment.per.cent < 3.5 to the left, improve=1.530175, (0 missing)
## Surrogate splits:
## Age..years. < 67.5 to the right, agree=0.921, adj=0.111, (0 split)
##
## Node number 70: 102 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4705882 P(node) =0.1133333
## class counts: 54 48
## probabilities: 0.529 0.471
## left son=140 (85 obs) right son=141 (17 obs)
## Primary splits:
## Purpose < 1.5 to the right, improve=2.258824, (0 missing)
## Concurrent.Credits < 1.5 to the left, improve=2.258824, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=1.846785, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=1.838022, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=1.411765, (0 missing)
##
## Node number 71: 9 observations
## predicted class=1 expected loss=0.1111111 P(node) =0.01
## class counts: 1 8
## probabilities: 0.111 0.889
##
## Node number 88: 37 observations
## predicted class=0 expected loss=0.3243243 P(node) =0.04111111
## class counts: 25 12
## probabilities: 0.676 0.324
##
## Node number 89: 12 observations
## predicted class=1 expected loss=0.1666667 P(node) =0.01333333
## class counts: 2 10
## probabilities: 0.167 0.833
##
## Node number 90: 9 observations
## predicted class=0 expected loss=0.3333333 P(node) =0.01
## class counts: 6 3
## probabilities: 0.667 0.333
##
## Node number 91: 92 observations
## predicted class=1 expected loss=0.25 P(node) =0.1022222
## class counts: 23 69
## probabilities: 0.250 0.750
##
## Node number 140: 85 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4235294 P(node) =0.09444444
## class counts: 49 36
## probabilities: 0.576 0.424
## left son=280 (16 obs) right son=281 (69 obs)
## Primary splits:
## Concurrent.Credits < 1.5 to the left, improve=2.196100, (0 missing)
## Payment.Status.of.Previous.Credit < 1.5 to the left, improve=1.820168, (0 missing)
## Telephone < 1.5 to the right, improve=1.466993, (0 missing)
## Age..years. < 28.5 to the right, improve=1.191024, (0 missing)
## Duration.in.Current.address < 1.5 to the right, improve=1.152941, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 39.5 to the right, agree=0.824, adj=0.062, (0 split)
##
## Node number 141: 17 observations
## predicted class=1 expected loss=0.2941176 P(node) =0.01888889
## class counts: 5 12
## probabilities: 0.294 0.706
##
## Node number 280: 16 observations
## predicted class=0 expected loss=0.1875 P(node) =0.01777778
## class counts: 13 3
## probabilities: 0.812 0.188
##
## Node number 281: 69 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.4782609 P(node) =0.07666667
## class counts: 36 33
## probabilities: 0.522 0.478
## left son=562 (44 obs) right son=563 (25 obs)
## Primary splits:
## Age..years. < 28.5 to the right, improve=2.0511460, (0 missing)
## Telephone < 1.5 to the right, improve=1.6213370, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=1.2326770, (0 missing)
## Credit.Amount < 2440.5 to the left, improve=0.8487091, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=0.7317415, (0 missing)
## Surrogate splits:
## Value.Savings.Stocks < 2.5 to the left, agree=0.696, adj=0.16, (0 split)
## Most.valuable.available.asset < 1.5 to the right, agree=0.681, adj=0.12, (0 split)
## Sex...Marital.Status < 3.5 to the left, agree=0.667, adj=0.08, (0 split)
## Type.of.apartment < 1.5 to the right, agree=0.652, adj=0.04, (0 split)
##
## Node number 562: 44 observations, complexity param=0.01315789
## predicted class=0 expected loss=0.3863636 P(node) =0.04888889
## class counts: 27 17
## probabilities: 0.614 0.386
## left son=1124 (30 obs) right son=1125 (14 obs)
## Primary splits:
## Length.of.current.employment < 3.5 to the left, improve=2.7017320, (0 missing)
## Payment.Status.of.Previous.Credit < 2.5 to the left, improve=2.0774290, (0 missing)
## No.of.Credits.at.this.Bank < 1.5 to the left, improve=1.2803030, (0 missing)
## Sex...Marital.Status < 2.5 to the left, improve=0.9350649, (0 missing)
## Credit.Amount < 2168.5 to the left, improve=0.6096681, (0 missing)
## Surrogate splits:
## Age..years. < 52.5 to the left, agree=0.705, adj=0.071, (0 split)
##
## Node number 563: 25 observations
## predicted class=1 expected loss=0.36 P(node) =0.02777778
## class counts: 9 16
## probabilities: 0.360 0.640
##
## Node number 1124: 30 observations
## predicted class=0 expected loss=0.2666667 P(node) =0.03333333
## class counts: 22 8
## probabilities: 0.733 0.267
##
## Node number 1125: 14 observations
## predicted class=1 expected loss=0.3571429 P(node) =0.01555556
## class counts: 5 9
## probabilities: 0.357 0.643
rpart.plot(DT, type=1, extra = 102)
#Pre-processing
if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
#Model Evaluation
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 18 9
## 1 16 57
#Accuracy
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 75
Answer:
importance(random_model)
## MeanDecreaseGini
## Account.Balance 41.274824
## Duration.of.Credit..month. 38.198510
## Payment.Status.of.Previous.Credit 21.554201
## Purpose 23.730586
## Credit.Amount 49.202906
## Value.Savings.Stocks 17.713488
## Length.of.current.employment 18.957148
## Instalment.per.cent 15.251498
## Sex...Marital.Status 13.607525
## Guarantors 7.544454
## Duration.in.Current.address 15.079679
## Most.valuable.available.asset 16.454145
## Age..years. 39.686369
## Concurrent.Credits 8.231060
## Type.of.apartment 9.910866
## No.of.Credits.at.this.Bank 8.164530
## Occupation 12.122744
## No.of.dependents 5.041466
## Telephone 7.251499
## Foreign.Worker 1.676513
Three most imprtant features are Credit Amount at 49.202906, Account Balance at 41.274824, and Age by years at 37.019536.
Asnwer:
set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))
##
## cred_pred1 0 1
## 0 8 7
## 1 26 59
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 67
Now the accuracy is drop from 75 to 67 for this model after add more parameters.
#Loading the data
wine <- read.csv("~/lab1/whitewines.csv")
View(wine)
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#Exploring and Preparing the Data
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
#Training a Model on the Data
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0005815 0.02446345
## 2 0.05098911 1 0.8449895 0.8461816 0.02334172
## 3 0.02796998 2 0.7940004 0.8024666 0.02270119
## 4 0.01970128 3 0.7660304 0.7777624 0.02148003
## 5 0.01265926 4 0.7463291 0.7612306 0.02075603
## 6 0.01007193 5 0.7336698 0.7551843 0.02070614
## 7 0.01000000 6 0.7235979 0.7514380 0.02055973
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
#Model Visualization
rpart.plot(m.rpart, digits=3, type=1)
#Model evaluation
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
Asnwer:
if (!require("Metrics")) {
install.packages("Metrics")
library(Metrics)
}
## Loading required package: Metrics
## Warning: package 'Metrics' was built under R version 3.5.3
rmse(wine_test$quality, p.rpart)
## [1] 0.7448093
The root-mean-square error is 0.745, the lower RMSE value, the more model fit the observed data, and 0.745 is still high.
#Import the data
news <- read.csv("~/lab1/OnlineNewsPopularity_for_R.csv")
# View(news)
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)
## 'data.frame': 39644 obs. of 17 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity: num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
#Pre-processing
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "1"}
else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## 0 1
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## 0 1
## 0.414 0.586
#Model training
library("C50", lib.loc="~/R/win-library/3.5")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Dec 13 13:00:07 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class 0
## 5122 (b): class 1
##
##
## Attribute usage:
##
## 100.00% popular
##
##
## Time: 0.1 secs
#Evaluate the model
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 414 0
## 1 0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
Answer: #Step 1: RANDOM FOREST
news <- read.csv("~/lab1/OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]
#Check for outliers
news=news[!news$n_unique_tokens==701,]
#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#Standardize the dataset
for(i in ncol(news)-1){
news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}
#Define popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))
set.seed(12346)
news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)
#Model training
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))
##
## cred_pridRF 0 1
## 0 10014 7264
## 1 8088 10313
#Accuracy
(Accuracy <- sum(diag(p2))/sum(p2)*100)
## [1] 56.97189
#importance
importance(random_modelNews)
## MeanDecreaseGini
## n_tokens_title 110.95451
## n_tokens_content 172.15879
## n_unique_tokens 188.13602
## n_non_stop_words 168.50493
## num_hrefs 148.12312
## num_imgs 90.89631
## num_videos 52.40012
## average_token_length 195.31143
## num_keywords 93.64084
## kw_max_max 49.10667
## global_sentiment_polarity 214.07528
## avg_positive_polarity 187.35883
## title_subjectivity 74.85371
## title_sentiment_polarity 82.44230
## abs_title_subjectivity 73.17472
## abs_title_sentiment_polarity 68.02149
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Dec 13 13:00:13 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 3964 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## global_sentiment_polarity > 0.1079453:
## :...num_hrefs > 12:
## : :...n_tokens_title <= 10: 1 (436/135)
## : : n_tokens_title > 10:
## : : :...abs_title_subjectivity > 0.3416667: 1 (137/47)
## : : abs_title_subjectivity <= 0.3416667:
## : : :...n_unique_tokens <= 0.3832077: 1 (7)
## : : n_unique_tokens > 0.3832077: 0 (123/51)
## : num_hrefs <= 12:
## : :...abs_title_subjectivity > 0.1272727: 1 (1135/509)
## : abs_title_subjectivity <= 0.1272727:
## : :...num_imgs <= 1:
## : :...n_unique_tokens > 0.4742152: 0 (255/87)
## : : n_unique_tokens <= 0.4742152:
## : : :...n_tokens_title <= 12: 1 (22/3)
## : : n_tokens_title > 12: 0 (5)
## : num_imgs > 1:
## : :...num_imgs <= 6: 1 (47/16)
## : num_imgs > 6:
## : :...kw_max_max <= 227300: 1 (6/1)
## : kw_max_max > 227300: 0 (44/15)
## global_sentiment_polarity <= 0.1079453:
## :...num_hrefs > 19: 1 (186/79)
## num_hrefs <= 19:
## :...n_non_stop_words <= 0.9999999: 1 (112/51)
## n_non_stop_words > 0.9999999:
## :...num_keywords <= 6:
## :...num_hrefs <= 1: 0 (39/4)
## : num_hrefs > 1:
## : :...title_sentiment_polarity <= 0.1083333: 0 (449/140)
## : title_sentiment_polarity > 0.1083333:
## : :...num_keywords <= 5: 1 (74/28)
## : num_keywords > 5:
## : :...average_token_length <= 4.411043: 1 (9/1)
## : average_token_length > 4.411043: 0 (48/14)
## num_keywords > 6:
## :...num_imgs > 2: 1 (226/102)
## num_imgs <= 2:
## :...num_imgs > 0:
## :...kw_max_max <= 227300: 1 (25/10)
## : kw_max_max > 227300: 0 (437/159)
## num_imgs <= 0:
## :...num_keywords <= 7: 1 (46/15)
## num_keywords > 7:
## :...num_videos > 15: 1 (5)
## num_videos <= 15:
## :...num_hrefs <= 2: 1 (10/2)
## num_hrefs > 2: 0 (81/30)
##
##
## Evaluation on training data (3964 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 25 1499(37.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 981 999 (a): class 0
## 500 1484 (b): class 1
##
##
## Attribute usage:
##
## 100.00% num_hrefs
## 100.00% global_sentiment_polarity
## 44.93% abs_title_subjectivity
## 39.38% n_non_stop_words
## 36.55% num_keywords
## 30.50% num_imgs
## 18.42% n_tokens_title
## 14.63% title_sentiment_polarity
## 12.92% kw_max_max
## 10.39% n_unique_tokens
## 2.42% num_videos
## 1.44% average_token_length
##
##
## Time: 0.1 secs
#Model training
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 35679
##
##
## | predicted shares
## actual shares | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## 0 | 7692 | 10410 | 18102 |
## | 0.216 | 0.292 | |
## --------------|-----------|-----------|-----------|
## 1 | 5596 | 11981 | 17577 |
## | 0.157 | 0.336 | |
## --------------|-----------|-----------|-----------|
## Column Total | 13288 | 22391 | 35679 |
## --------------|-----------|-----------|-----------|
##
##
(p3 <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 7692 5596
## 1 10410 11981
#Accuracy
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 55.13888