LAB 1 classification
Method 1 Decision Tree
Uploading the data
credit <- read.csv("credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 17 variables:
## $ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
## $ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
## $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
## $ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
## $ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
## $ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
exploring the data
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$default)
##
## no yes
## 700 300
Randomize the data
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
divide data to train and test sets
credit_train <- credit_rand[1:900, ]
credit_test<- credit_rand[901:1000, ]
prop.table(table(credit_train$default))
##
## no yes
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
##
## no yes
## 0.68 0.32
training the model
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
credit_model <- C5.0(x = credit_train[-17], y= credit_train$default)
## Warning in as.POSIXlt.POSIXct(x, tz): unable to identify current timezone 'U':
## please set environment variable 'TZ'
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Jul 03 13:07:26 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## checking_balance = unknown: no (358/44)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
## :...dependents > 1: yes (10/1)
## : dependents <= 1:
## : :...savings_balance = < 100 DM: yes (39/11)
## : savings_balance in {> 1000 DM,500 - 1000 DM,unknown}: no (8/1)
## : savings_balance = 100 - 500 DM:
## : :...checking_balance = < 0 DM: no (1)
## : checking_balance in {> 200 DM,1 - 200 DM}: yes (5/1)
## credit_history in {critical,good,poor}:
## :...months_loan_duration <= 11: no (87/14)
## months_loan_duration > 11:
## :...savings_balance = > 1000 DM: no (13)
## savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM,unknown}:
## :...checking_balance = > 200 DM:
## :...dependents > 1: yes (3)
## : dependents <= 1:
## : :...credit_history in {good,poor}: no (23/3)
## : credit_history = critical:
## : :...amount <= 2337: yes (3)
## : amount > 2337: no (6)
## checking_balance = 1 - 200 DM:
## :...savings_balance = unknown: no (34/6)
## : savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM}:
## : :...months_loan_duration > 45: yes (11/1)
## : months_loan_duration <= 45:
## : :...other_credit = store:
## : :...age <= 35: yes (4)
## : : age > 35: no (2)
## : other_credit = bank:
## : :...years_at_residence <= 1: no (3)
## : : years_at_residence > 1:
## : : :...existing_loans_count <= 1: yes (5)
## : : existing_loans_count > 1:
## : : :...percent_of_income <= 2: no (4/1)
## : : percent_of_income > 2: yes (3)
## : other_credit = none:
## : :...job = unemployed: no (1)
## : job = management:
## : :...amount <= 7511: no (10/3)
## : : amount > 7511: yes (7)
## : job = unskilled: [S1]
## : job = skilled:
## : :...dependents <= 1: no (55/15)
## : dependents > 1:
## : :...age <= 34: no (3)
## : age > 34: yes (4)
## checking_balance = < 0 DM:
## :...job = management: no (26/6)
## job = unemployed: yes (4/1)
## job = unskilled:
## :...employment_duration in {4 - 7 years,
## : : unemployed}: no (4)
## : employment_duration = < 1 year:
## : :...other_credit = bank: no (1)
## : : other_credit in {none,store}: yes (11/2)
## : employment_duration = > 7 years:
## : :...other_credit in {bank,none}: no (5/1)
## : : other_credit = store: yes (2)
## : employment_duration = 1 - 4 years:
## : :...age <= 39: no (14/3)
## : age > 39:
## : :...credit_history in {critical,good}: yes (3)
## : credit_history = poor: no (1)
## job = skilled:
## :...credit_history = poor:
## :...savings_balance in {< 100 DM,100 - 500 DM,
## : : 500 - 1000 DM}: yes (8)
## : savings_balance = unknown: no (1)
## credit_history = critical:
## :...other_credit = store: no (0)
## : other_credit = bank: yes (4)
## : other_credit = none:
## : :...savings_balance in {100 - 500 DM,
## : : unknown}: no (1)
## : savings_balance = 500 - 1000 DM: yes (1)
## : savings_balance = < 100 DM:
## : :...months_loan_duration <= 13:
## : :...percent_of_income <= 3: yes (3)
## : : percent_of_income > 3: no (3/1)
## : months_loan_duration > 13:
## : :...amount <= 5293: no (10/1)
## : amount > 5293: yes (2)
## credit_history = good:
## :...existing_loans_count > 1: yes (5)
## existing_loans_count <= 1:
## :...other_credit = store: no (2)
## other_credit = bank:
## :...percent_of_income <= 2: yes (2)
## : percent_of_income > 2: no (6/1)
## other_credit = none: [S2]
##
## SubTree [S1]
##
## employment_duration in {< 1 year,1 - 4 years}: yes (11/3)
## employment_duration in {> 7 years,4 - 7 years,unemployed}: no (8)
##
## SubTree [S2]
##
## savings_balance = 100 - 500 DM: yes (3)
## savings_balance = 500 - 1000 DM: no (1)
## savings_balance = unknown:
## :...phone = no: yes (9/1)
## : phone = yes: no (3/1)
## savings_balance = < 100 DM:
## :...percent_of_income <= 1: no (4)
## percent_of_income > 1:
## :...phone = yes: yes (10/1)
## phone = no:
## :...purpose in {business,car0,education,renovations}: yes (3)
## purpose = car:
## :...percent_of_income <= 3: no (2)
## : percent_of_income > 3: yes (6/1)
## purpose = furniture/appliances:
## :...years_at_residence <= 1: no (4)
## years_at_residence > 1:
## :...housing = other: no (1)
## housing = rent: yes (2)
## housing = own:
## :...amount <= 1778: no (3)
## amount > 1778:
## :...years_at_residence <= 3: yes (6)
## years_at_residence > 3: no (3/1)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 66 125(13.9%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 609 23 (a): class no
## 102 166 (b): class yes
##
##
## Attribute usage:
##
## 100.00% checking_balance
## 60.22% credit_history
## 53.22% months_loan_duration
## 49.44% savings_balance
## 30.89% job
## 25.89% other_credit
## 17.78% dependents
## 9.67% existing_loans_count
## 7.22% percent_of_income
## 6.67% employment_duration
## 5.78% phone
## 5.56% amount
## 3.78% years_at_residence
## 3.44% age
## 3.33% purpose
## 1.67% housing
##
##
## Time: 0.0 secs
evaluation of the model
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.4.4
cred_pred <- predict(credit_model, credit_test)
CrossTable(credit_test$default, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual default', 'predicted default'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | predicted default
## actual default | no | yes | Row Total |
## ---------------|-----------|-----------|-----------|
## no | 57 | 11 | 68 |
## | 0.570 | 0.110 | |
## ---------------|-----------|-----------|-----------|
## yes | 16 | 16 | 32 |
## | 0.160 | 0.160 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 73 | 27 | 100 |
## ---------------|-----------|-----------|-----------|
##
##
the table shows that in 100 observations, we had 11 Type 2 errors, false negative, and 16 type 1 errors, false positive
(p <- table(cred_pred, credit_test$default))
##
## cred_pred no yes
## no 57 16
## yes 11 16
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 73
Visualize the model
library(ggplot2)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
DT <- rpart(default ~ checking_balance + credit_history + months_loan_duration, data= credit_train)
summary(DT)
## Call:
## rpart(formula = default ~ checking_balance + credit_history +
## months_loan_duration, data = credit_train)
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.04104478 0 1.0000000 1.0000000 0.05118820
## 2 0.01305970 3 0.8768657 0.9440299 0.05032182
## 3 0.01000000 5 0.8507463 0.9365672 0.05019994
##
## Variable importance
## checking_balance months_loan_duration credit_history
## 60 20 20
##
## Node number 1: 900 observations, complexity param=0.04104478
## predicted class=no expected loss=0.2977778 P(node) =1
## class counts: 632 268
## probabilities: 0.702 0.298
## left son=2 (414 obs) right son=3 (486 obs)
## Primary splits:
## checking_balance splits as RLRL, improve=39.30076, (0 missing)
## credit_history splits as LLRLR, improve=14.01920, (0 missing)
## months_loan_duration < 34.5 to the left, improve=11.31821, (0 missing)
## Surrogate splits:
## credit_history splits as LRRRR, agree=0.592, adj=0.114, (0 split)
## months_loan_duration < 5.5 to the left, agree=0.547, adj=0.014, (0 split)
##
## Node number 2: 414 observations
## predicted class=no expected loss=0.1376812 P(node) =0.46
## class counts: 357 57
## probabilities: 0.862 0.138
##
## Node number 3: 486 observations, complexity param=0.04104478
## predicted class=no expected loss=0.4341564 P(node) =0.54
## class counts: 275 211
## probabilities: 0.566 0.434
## left son=6 (274 obs) right son=7 (212 obs)
## Primary splits:
## months_loan_duration < 22.5 to the left, improve=10.423870, (0 missing)
## credit_history splits as LLRLR, improve= 8.598609, (0 missing)
## checking_balance splits as R-L-, improve= 3.119042, (0 missing)
## Surrogate splits:
## credit_history splits as LLRRR, agree=0.611, adj=0.108, (0 split)
##
## Node number 6: 274 observations, complexity param=0.04104478
## predicted class=no expected loss=0.3430657 P(node) =0.3044444
## class counts: 180 94
## probabilities: 0.657 0.343
## left son=12 (249 obs) right son=13 (25 obs)
## Primary splits:
## credit_history splits as LLRLR, improve=7.817224, (0 missing)
## months_loan_duration < 11.5 to the left, improve=3.919498, (0 missing)
## checking_balance splits as R-L-, improve=1.868613, (0 missing)
##
## Node number 7: 212 observations, complexity param=0.0130597
## predicted class=yes expected loss=0.4481132 P(node) =0.2355556
## class counts: 95 117
## probabilities: 0.448 0.552
## left son=14 (173 obs) right son=15 (39 obs)
## Primary splits:
## months_loan_duration < 47.5 to the left, improve=2.635873, (0 missing)
## credit_history splits as LRRLR, improve=1.294771, (0 missing)
## checking_balance splits as R-L-, improve=1.058491, (0 missing)
##
## Node number 12: 249 observations
## predicted class=no expected loss=0.3052209 P(node) =0.2766667
## class counts: 173 76
## probabilities: 0.695 0.305
##
## Node number 13: 25 observations
## predicted class=yes expected loss=0.28 P(node) =0.02777778
## class counts: 7 18
## probabilities: 0.280 0.720
##
## Node number 14: 173 observations, complexity param=0.0130597
## predicted class=yes expected loss=0.4855491 P(node) =0.1922222
## class counts: 84 89
## probabilities: 0.486 0.514
## left son=28 (77 obs) right son=29 (96 obs)
## Primary splits:
## checking_balance splits as R-L-, improve=0.9959275, (0 missing)
## credit_history splits as LRRRR, improve=0.8924092, (0 missing)
## months_loan_duration < 31.5 to the left, improve=0.3748116, (0 missing)
## Surrogate splits:
## credit_history splits as LRRLR, agree=0.618, adj=0.143, (0 split)
## months_loan_duration < 25 to the right, agree=0.566, adj=0.026, (0 split)
##
## Node number 15: 39 observations
## predicted class=yes expected loss=0.2820513 P(node) =0.04333333
## class counts: 11 28
## probabilities: 0.282 0.718
##
## Node number 28: 77 observations
## predicted class=no expected loss=0.4545455 P(node) =0.08555556
## class counts: 42 35
## probabilities: 0.545 0.455
##
## Node number 29: 96 observations
## predicted class=yes expected loss=0.4375 P(node) =0.1066667
## class counts: 42 54
## probabilities: 0.438 0.562
rpart.plot(DT, type = 1, extra = 102)

Method 2 Support Vector Machine
letters <- read.csv ("letterdata.csv")
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
Data preparation
letters_train <- letters[1:18000, ]
Letters_test <- letters[18001:20000, ]
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
letter_classifier <- ksvm(letter ~., data= letters_train,kernel="vanilladot")
## Setting default kernel parameters
summary(letter_classifier)
## Length Class Mode
## 1 ksvm S4
evaluate the model
letter_predictions <- predict(letter_classifier, Letters_test)
(p<- table(letter_predictions,Letters_test$letter))
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
accuracy in percentage
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 83.95
method 3 adding regression to trees
wine <- read.csv ("whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
wine_train <- wine [1:3750, ]
wine_test <- wine [3751:4898, ]
m.rpart <- rpart(quality ~., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0002914 0.02445384
## 2 0.05098911 1 0.8449895 0.8455865 0.02333186
## 3 0.02796998 2 0.7940004 0.8008829 0.02270400
## 4 0.01970128 3 0.7660304 0.7750184 0.02147562
## 5 0.01265926 4 0.7463291 0.7557558 0.02070564
## 6 0.01007193 5 0.7336698 0.7465271 0.02050370
## 7 0.01000000 6 0.7235979 0.7450390 0.02053188
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
rpart.plot(m.rpart, digits = 3, type = 1)

evaluation of the model
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
news popularity dataset
news <- read.csv("OnlineNewsPopularity.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
convert share variable from a numerical to categorical
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- subset(newsShort, select = - popular)
Randomize the data
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## no yes
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## no yes
## 0.414 0.586
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Jul 03 13:07:50 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## n_unique_tokens <= 0.4466737:
## :...kw_max_max <= 17100:
## : :...n_tokens_content <= 1215: no (29/5)
## : : n_tokens_content > 1215: yes (8/2)
## : kw_max_max > 17100:
## : :...kw_max_max <= 617900:
## : :...n_tokens_title <= 10: yes (426/99)
## : : n_tokens_title > 10:
## : : :...num_hrefs <= 27: yes (176/57)
## : : num_hrefs > 27:
## : : :...num_hrefs <= 62: no (11/1)
## : : num_hrefs > 62: yes (2)
## : kw_max_max > 617900:
## : :...n_non_stop_words > 0: yes (508/177)
## : n_non_stop_words <= 0:
## : :...title_subjectivity <= 0.4772727: yes (16/3)
## : title_subjectivity > 0.4772727:
## : :...num_keywords <= 9: no (15/3)
## : num_keywords > 9: yes (2)
## n_unique_tokens > 0.4466737:
## :...kw_max_max > 617900:
## :...num_hrefs > 13:
## : :...n_tokens_title <= 9: yes (350/112)
## : : n_tokens_title > 9:
## : : :...average_token_length <= 4.811628: yes (199/74)
## : : average_token_length > 4.811628:
## : : :...num_keywords > 8: no (26/5)
## : : num_keywords <= 8:
## : : :...num_keywords <= 6: no (63/21)
## : : num_keywords > 6:
## : : :...kw_max_max > 690400: no (9/3)
## : : kw_max_max <= 690400:
## : : :...avg_positive_polarity <= 0.21725: no (3)
## : : avg_positive_polarity > 0.21725:
## : : :...n_unique_tokens <= 0.6048387: yes (24/3)
## : : n_unique_tokens > 0.6048387: no (2)
## : num_hrefs <= 13:
## : :...num_imgs <= 0:
## : :...title_sentiment_polarity <= -0.025:
## : : :...num_keywords > 7:
## : : : :...n_tokens_content <= 83: yes (5)
## : : : : n_tokens_content > 83: no (83/17)
## : : : num_keywords <= 7:
## : : : :...avg_positive_polarity <= 0.3493939:
## : : : :...kw_max_max <= 690400: yes (32/5)
## : : : : kw_max_max > 690400: no (2)
## : : : avg_positive_polarity > 0.3493939:
## : : : :...abs_title_subjectivity <= 0.02222222: yes (3)
## : : : abs_title_subjectivity > 0.02222222: no (46/14)
## : : title_sentiment_polarity > -0.025:
## : : :...global_sentiment_polarity > 0.002449495: yes (651/253)
## : : global_sentiment_polarity <= 0.002449495:
## : : :...kw_max_max > 690400:
## : : :...title_sentiment_polarity <= 0.06818182: no (3)
## : : : title_sentiment_polarity > 0.06818182: yes (3)
## : : kw_max_max <= 690400:
## : : :...global_sentiment_polarity > -0.006586199: no (9)
## : : global_sentiment_polarity <= -0.006586199:
## : : :...abs_title_subjectivity <= 0.125: no (7/1)
## : : abs_title_subjectivity > 0.125:
## : : :...n_unique_tokens <= 0.6138614: yes (9)
## : : n_unique_tokens > 0.6138614:
## : : :...n_tokens_title <= 9: no (20/5)
## : : n_tokens_title > 9: [S1]
## : num_imgs > 0:
## : :...title_sentiment_polarity > 0.7: yes (30/6)
## : title_sentiment_polarity <= 0.7:
## : :...num_videos > 3:
## : :...kw_max_max > 690400: no (10/2)
## : : kw_max_max <= 690400:
## : : :...num_keywords <= 5: no (5/1)
## : : num_keywords > 5: yes (39/8)
## : num_videos <= 3:
## : :...n_tokens_title <= 6:
## : :...num_hrefs > 5: yes (45/12)
## : : num_hrefs <= 5:
## : : :...n_tokens_content <= 583: no (25/7)
## : : n_tokens_content > 583: yes (4)
## : n_tokens_title > 6:
## : :...average_token_length > 4.408367:
## : :...num_keywords > 7: no (567/215)
## : : num_keywords <= 7:
## : : :...num_hrefs <= 10: no (939/386)
## : : num_hrefs > 10:
## : : :...num_keywords > 4: yes (92/33)
## : : num_keywords <= 4:
## : : :...n_unique_tokens <= 0.5074425: yes (2)
## : : n_unique_tokens > 0.5074425: no (13/1)
## : average_token_length <= 4.408367:
## : :...n_tokens_title > 11:
## : :...num_hrefs <= 4: yes (30/12)
## : : num_hrefs > 4:
## : : :...title_subjectivity > 0.8: yes (3)
## : : title_subjectivity <= 0.8: [S2]
## : n_tokens_title <= 11:
## : :...global_sentiment_polarity > 0.1577778: yes (69/14)
## : global_sentiment_polarity <= 0.1577778:
## : :...num_videos <= 0: [S3]
## : num_videos > 0:
## : :...num_videos > 1: no (2)
## : num_videos <= 1: [S4]
## kw_max_max <= 617900:
## :...kw_max_max > 80400: yes (1832/633)
## kw_max_max <= 80400:
## :...kw_max_max <= 0: no (50/20)
## kw_max_max > 0:
## :...num_keywords > 6: yes (1402/599)
## num_keywords <= 6:
## :...n_tokens_title <= 7: yes (107/38)
## n_tokens_title > 7:
## :...kw_max_max <= 39400:
## :...abs_title_sentiment_polarity > 0.55: yes (8/1)
## : abs_title_sentiment_polarity <= 0.55:
## : :...n_tokens_title > 12:
## : :...title_sentiment_polarity <= -0.1190476: no (3)
## : : title_sentiment_polarity > -0.1190476: yes (11/1)
## : n_tokens_title <= 12:
## : :...title_sentiment_polarity <= 0.075: no (98/28)
## : title_sentiment_polarity > 0.075:
## : :...n_tokens_title <= 11: yes (34/14)
## : n_tokens_title > 11: no (3)
## kw_max_max > 39400:
## :...kw_max_max > 73100:
## :...n_tokens_title <= 12: yes (80/29)
## : n_tokens_title > 12: no (8/2)
## kw_max_max <= 73100:
## :...n_tokens_title > 11: yes (123/52)
## n_tokens_title <= 11:
## :...kw_max_max > 57600:
## :...title_subjectivity <= 0.8125: no (262/121)
## : title_subjectivity > 0.8125: yes (23/4)
## kw_max_max <= 57600:
## :...num_videos > 0: no (52/17)
## num_videos <= 0:
## :...n_tokens_title <= 8: yes (22/5)
## n_tokens_title > 8:
## :...n_unique_tokens > 0.6940299: yes (6)
## n_unique_tokens <= 0.6940299:
## :...num_keywords > 5: [S5]
## num_keywords <= 5: [S6]
##
## SubTree [S1]
##
## average_token_length <= 4.461285: no (4/1)
## average_token_length > 4.461285: yes (8)
##
## SubTree [S2]
##
## average_token_length <= 4.353046: no (22)
## average_token_length > 4.353046: yes (10/4)
##
## SubTree [S3]
##
## average_token_length > 4.400285: yes (6)
## average_token_length <= 4.400285:
## :...avg_positive_polarity <= 0.3125:
## :...num_keywords <= 8: yes (21/5)
## : num_keywords > 8: no (3)
## avg_positive_polarity > 0.3125:
## :...n_tokens_title <= 7: yes (5/1)
## n_tokens_title > 7: no (54/15)
##
## SubTree [S4]
##
## average_token_length > 4.31361: yes (14/2)
## average_token_length <= 4.31361:
## :...global_sentiment_polarity <= 0.09637173: no (7)
## global_sentiment_polarity > 0.09637173: yes (4)
##
## SubTree [S5]
##
## n_tokens_content <= 226: yes (4)
## n_tokens_content > 226: no (32/9)
##
## SubTree [S6]
##
## num_imgs > 6: yes (6/2)
## num_imgs <= 6:
## :...num_imgs <= 0: yes (2)
## num_imgs > 0: no (57/26)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 82 3186(35.4%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 1618 2260 (a): class no
## 926 4196 (b): class yes
##
##
## Attribute usage:
##
## 100.00% n_unique_tokens
## 100.00% kw_max_max
## 48.21% num_keywords
## 46.68% n_tokens_title
## 41.90% num_hrefs
## 33.94% title_sentiment_polarity
## 33.01% num_imgs
## 24.46% average_token_length
## 24.13% num_videos
## 9.99% global_sentiment_polarity
## 6.01% n_non_stop_words
## 3.92% title_subjectivity
## 2.17% avg_positive_polarity
## 2.11% n_tokens_content
## 1.74% abs_title_sentiment_polarity
## 1.08% abs_title_subjectivity
##
##
## Time: 0.2 secs
news_pred <- predict(news_model, news_test)
(p<- table(news_pred, news_test$shares))
##
## news_pred no yes
## no 119 124
## yes 295 462
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 58.1
method 3 support vector machine
set.seed(12345)
news_rand2 <- newsShort[order(runif(10000)), ]
news_rand_train <- news_rand2[1:9000, ]
news_rand_test <- news_rand2[9001:10000, ]
news_classifier<- ksvm(shares ~., data = news_rand_train, kernel = "vanilladot")
## Setting default kernel parameters
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
news_predict <- predict(news_classifier, news_rand_train)
(p<- table (news_predict, news_rand_train$shares))
##
## news_predict no yes
## no 0 0
## yes 3878 5122
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 56.91111
method 2 adding regression to trees
set.seed(15342)
news_artrand <- newsShort[order(runif(10000)), ]
news_arttrain <- news_artrand[1:9000, ]
news_arttest <- news_artrand[9001:10000, ]
news.rpart <- rpart(shares~., data= news_arttrain)
summary(news.rpart)
## Call:
## rpart(formula = shares ~ ., data = news_arttrain)
## n= 9000
##
## CP nsplit rel error xerror xstd
## 1 0.010891 0 1.0000000 1.0000000 0.01208953
## 2 0.010000 5 0.9351685 0.9696424 0.01204120
##
## Variable importance
## n_unique_tokens kw_max_max n_tokens_content
## 26 20 12
## n_non_stop_words num_hrefs num_imgs
## 11 10 9
## title_sentiment_polarity num_videos average_token_length
## 9 2 2
## avg_positive_polarity
## 1
##
## Node number 1: 9000 observations, complexity param=0.010891
## predicted class=yes expected loss=0.4318889 P(node) =1
## class counts: 3887 5113
## probabilities: 0.432 0.568
## left son=2 (7737 obs) right son=3 (1263 obs)
## Primary splits:
## n_unique_tokens < 0.4491332 to the right, improve=38.98318, (0 missing)
## kw_max_max < 654150 to the right, improve=33.19935, (0 missing)
## n_tokens_content < 1075.5 to the left, improve=27.57814, (0 missing)
## num_hrefs < 12.5 to the left, improve=26.64352, (0 missing)
## n_non_stop_words < 1 to the left, improve=23.17481, (0 missing)
## Surrogate splits:
## n_tokens_content < 941.5 to the left, agree=0.913, adj=0.380, (0 split)
## n_non_stop_words < 1 to the left, agree=0.910, adj=0.361, (0 split)
## num_imgs < 18.5 to the left, agree=0.869, adj=0.067, (0 split)
## average_token_length < 1.83045 to the right, agree=0.867, adj=0.055, (0 split)
## avg_positive_polarity < 0.01666667 to the right, agree=0.866, adj=0.046, (0 split)
##
## Node number 2: 7737 observations, complexity param=0.010891
## predicted class=yes expected loss=0.4506915 P(node) =0.8596667
## class counts: 3487 4250
## probabilities: 0.451 0.549
## left son=4 (3564 obs) right son=5 (4173 obs)
## Primary splits:
## kw_max_max < 654150 to the right, improve=31.767210, (0 missing)
## num_hrefs < 12.5 to the left, improve=13.280220, (0 missing)
## title_sentiment_polarity < -0.01104798 to the left, improve=11.940440, (0 missing)
## global_sentiment_polarity < 0.04930952 to the left, improve= 9.797646, (0 missing)
## num_imgs < 5.5 to the left, improve= 9.428988, (0 missing)
## Surrogate splits:
## num_hrefs < 19.5 to the right, agree=0.546, adj=0.015, (0 split)
## n_unique_tokens < 0.4522799 to the left, agree=0.541, adj=0.003, (0 split)
## avg_positive_polarity < 0.2507858 to the left, agree=0.541, adj=0.003, (0 split)
## n_tokens_content < 1494 to the right, agree=0.540, adj=0.002, (0 split)
## n_non_stop_words < 1 to the right, agree=0.540, adj=0.001, (0 split)
##
## Node number 3: 1263 observations
## predicted class=yes expected loss=0.3167063 P(node) =0.1403333
## class counts: 400 863
## probabilities: 0.317 0.683
##
## Node number 4: 3564 observations, complexity param=0.010891
## predicted class=yes expected loss=0.4997194 P(node) =0.396
## class counts: 1781 1783
## probabilities: 0.500 0.500
## left son=8 (2893 obs) right son=9 (671 obs)
## Primary splits:
## num_hrefs < 13.5 to the left, improve=15.187180, (0 missing)
## title_sentiment_polarity < -0.01041667 to the left, improve= 9.979963, (0 missing)
## global_sentiment_polarity < 0.1005325 to the left, improve= 9.831783, (0 missing)
## average_token_length < 4.825436 to the right, improve= 9.511795, (0 missing)
## num_videos < 0.5 to the left, improve= 9.082234, (0 missing)
## Surrogate splits:
## n_non_stop_words < 1 to the left, agree=0.837, adj=0.136, (0 split)
## n_tokens_content < 767.5 to the left, agree=0.832, adj=0.106, (0 split)
## num_imgs < 9.5 to the left, agree=0.820, adj=0.042, (0 split)
## average_token_length < 5.451829 to the left, agree=0.815, adj=0.018, (0 split)
## n_unique_tokens < 0.4501119 to the right, agree=0.812, adj=0.003, (0 split)
##
## Node number 5: 4173 observations
## predicted class=yes expected loss=0.4088186 P(node) =0.4636667
## class counts: 1706 2467
## probabilities: 0.409 0.591
##
## Node number 8: 2893 observations, complexity param=0.010891
## predicted class=no expected loss=0.4780505 P(node) =0.3214444
## class counts: 1510 1383
## probabilities: 0.522 0.478
## left son=16 (1973 obs) right son=17 (920 obs)
## Primary splits:
## num_imgs < 0.5 to the right, improve=11.168960, (0 missing)
## title_sentiment_polarity < -0.02310606 to the left, improve= 9.565074, (0 missing)
## num_keywords < 7.5 to the right, improve= 9.117394, (0 missing)
## average_token_length < 4.575603 to the right, improve= 8.947140, (0 missing)
## global_sentiment_polarity < 0.04639137 to the left, improve= 8.725868, (0 missing)
## Surrogate splits:
## num_videos < 0.5 to the left, agree=0.759, adj=0.241, (0 split)
## n_unique_tokens < 0.6921134 to the left, agree=0.755, adj=0.230, (0 split)
## n_non_stop_words < 1 to the right, agree=0.744, adj=0.196, (0 split)
## n_tokens_content < 161.5 to the right, agree=0.743, adj=0.192, (0 split)
## global_sentiment_polarity < 0.2891369 to the left, agree=0.688, adj=0.018, (0 split)
##
## Node number 9: 671 observations
## predicted class=yes expected loss=0.4038748 P(node) =0.07455556
## class counts: 271 400
## probabilities: 0.404 0.596
##
## Node number 16: 1973 observations
## predicted class=no expected loss=0.4480487 P(node) =0.2192222
## class counts: 1089 884
## probabilities: 0.552 0.448
##
## Node number 17: 920 observations, complexity param=0.010891
## predicted class=yes expected loss=0.4576087 P(node) =0.1022222
## class counts: 421 499
## probabilities: 0.458 0.542
## left son=34 (161 obs) right son=35 (759 obs)
## Primary splits:
## title_sentiment_polarity < -0.02393939 to the left, improve=13.846880, (0 missing)
## num_keywords < 7.5 to the right, improve=10.004550, (0 missing)
## n_non_stop_words < 1 to the left, improve= 6.713292, (0 missing)
## n_tokens_content < 118.5 to the left, improve= 5.599327, (0 missing)
## avg_positive_polarity < 0.4089286 to the right, improve= 4.966911, (0 missing)
## Surrogate splits:
## global_sentiment_polarity < -0.1812963 to the left, agree=0.829, adj=0.025, (0 split)
##
## Node number 34: 161 observations
## predicted class=no expected loss=0.3540373 P(node) =0.01788889
## class counts: 104 57
## probabilities: 0.646 0.354
##
## Node number 35: 759 observations
## predicted class=yes expected loss=0.4176548 P(node) =0.08433333
## class counts: 317 442
## probabilities: 0.418 0.582
Viz
rpart.plot(news.rpart, digits = 3, type = 1)

p.news.rpart <- predict(news.rpart, news_arttest)
summary(p.news.rpart)
## no yes
## Min. :0.3167 Min. :0.3540
## 1st Qu.:0.4088 1st Qu.:0.4480
## Median :0.4088 Median :0.5912
## Mean :0.4373 Mean :0.5627
## 3rd Qu.:0.5520 3rd Qu.:0.5912
## Max. :0.6460 Max. :0.6833
summary(news_arttest$shares)
## no yes
## 405 595
which method is better? in my opinion, it depends. we need to determine the level of accuracy, if we don’t need the most accurate result, we can chose an algorihm that give good approximate cutting processing time and overfitting issues. Then the next variable that should be taken into account is the training time, usually the more time it takes to train the model the more accurate it is. Some algorithm are more sensitive to the number of data points. It also depends on the kind of problem we are dealing with, most machine learning SVM included assume linearity, if we know it’s not the case we can avoid those and opt for other algorithms. Finally, i would say that the number of features and parameters is also important while choosing which technique to use. Sometimes we might use multiple technique to come up with the best approximation. I think that would be the best answer.