#Method 1
# Step 1: Collecting the data
credit <- read.csv('D:/HU Homework/530-90-O/credit.csv')
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
#Step 2: Exploring the data
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$ Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
summary(credit_rand$amount)
## Length Class Mode
## 0 NULL NULL
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$ Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$ Creditability))
##
## 0 1
## 0.22 0.78
#Step 3: Training a model on the data
library(C50)
## Warning: package 'C50' was built under R version 4.0.3
credit_model <- C5.0(x = credit_train[-1], y = as.factor(credit_train$Creditability))
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-1], y = as.factor(credit_train$Creditability))
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Jan 05 19:57:42 2021
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## : :...Age..years. > 33: 1 (179/11)
## : : Age..years. <= 33:
## : : :...Credit.Amount > 6681:
## : : :...Length.of.current.employment <= 2: 0 (4)
## : : : Length.of.current.employment > 2:
## : : : :...Payment.Status.of.Previous.Credit <= 3: 1 (4)
## : : : Payment.Status.of.Previous.Credit > 3: 0 (3/1)
## : : Credit.Amount <= 6681:
## : : :...Occupation > 2:
## : : :...Occupation <= 3: 1 (120/12)
## : : : Occupation > 3:
## : : : :...Duration.of.Credit..month. <= 33: 1 (9)
## : : : Duration.of.Credit..month. > 33: 0 (3)
## : : Occupation <= 2:
## : : :...No.of.Credits.at.this.Bank > 1: 1 (6)
## : : No.of.Credits.at.this.Bank <= 1:
## : : :...Most.valuable.available.asset > 1: 0 (3)
## : : Most.valuable.available.asset <= 1:
## : : :...Credit.Amount <= 1987: 1 (8/1)
## : : Credit.Amount > 1987: 0 (2)
## : Concurrent.Credits <= 2:
## : :...Guarantors > 1: 1 (4)
## : Guarantors <= 1:
## : :...Purpose <= 0:
## : :...Most.valuable.available.asset <= 2: 0 (5)
## : : Most.valuable.available.asset > 2:
## : : :...No.of.dependents <= 1: 1 (7/1)
## : : No.of.dependents > 1: 0 (2)
## : Purpose > 0:
## : :...Purpose <= 4: 1 (35/2)
## : Purpose > 4:
## : :...Length.of.current.employment <= 2: 0 (4)
## : Length.of.current.employment > 2:
## : :...No.of.dependents > 1: 0 (3/1)
## : No.of.dependents <= 1:
## : :...Length.of.current.employment > 3: 1 (4)
## : Length.of.current.employment <= 3:
## : :...Instalment.per.cent <= 2: 1 (2)
## : Instalment.per.cent > 2: 0 (2)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Value.Savings.Stocks <= 2: 0 (49/10)
## : Value.Savings.Stocks > 2:
## : :...Credit.Amount <= 2064: 0 (3)
## : Credit.Amount > 2064: 1 (9/1)
## Payment.Status.of.Previous.Credit > 1:
## :...Credit.Amount > 7980:
## :...Value.Savings.Stocks > 4:
## : :...Payment.Status.of.Previous.Credit <= 2: 0 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 1 (3)
## : Value.Savings.Stocks <= 4:
## : :...Account.Balance > 1: 0 (15)
## : Account.Balance <= 1:
## : :...Concurrent.Credits <= 2: 0 (2)
## : Concurrent.Credits > 2:
## : :...Credit.Amount <= 10297: 0 (6)
## : Credit.Amount > 10297: 1 (3)
## Credit.Amount <= 7980:
## :...Duration.of.Credit..month. <= 11:
## :...Occupation > 3:
## : :...Concurrent.Credits <= 2: 1 (3)
## : : Concurrent.Credits > 2:
## : : :...Payment.Status.of.Previous.Credit <= 2: 1 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 0 (3)
## : Occupation <= 3:
## : :...Age..years. > 32: 1 (34)
## : Age..years. <= 32:
## : :...Most.valuable.available.asset <= 1: 1 (13/1)
## : Most.valuable.available.asset > 1:
## : :...Instalment.per.cent <= 3: 1 (6/1)
## : Instalment.per.cent > 3: 0 (6/1)
## Duration.of.Credit..month. > 11:
## :...Duration.of.Credit..month. > 36:
## :...Length.of.current.employment <= 1: 1 (3)
## : Length.of.current.employment > 1:
## : :...No.of.dependents > 1: 1 (5/1)
## : No.of.dependents <= 1:
## : :...Duration.in.Current.address <= 1: 1 (4/1)
## : Duration.in.Current.address > 1: 0 (23)
## Duration.of.Credit..month. <= 36:
## :...Guarantors > 2:
## :...Foreign.Worker <= 1: 1 (23/1)
## : Foreign.Worker > 1: 0 (2)
## Guarantors <= 2:
## :...Credit.Amount <= 1381:
## :...Telephone > 1:
## : :...Sex...Marital.Status > 3: 0 (2)
## : : Sex...Marital.Status <= 3:
## : : :...Duration.of.Credit..month. <= 16: 1 (7)
## : : Duration.of.Credit..month. > 16: 0 (3/1)
## : Telephone <= 1:
## : :...Concurrent.Credits <= 2: 0 (9)
## : Concurrent.Credits > 2:
## : :...Account.Balance <= 1: 0 (29/6)
## : Account.Balance > 1: [S1]
## Credit.Amount > 1381:
## :...Guarantors > 1:
## :...Foreign.Worker > 1: 1 (2)
## : Foreign.Worker <= 1:
## : :...Instalment.per.cent > 2: 0 (5)
## : Instalment.per.cent <= 2: [S2]
## Guarantors <= 1:
## :...Payment.Status.of.Previous.Credit > 3:
## :...Age..years. > 33: 1 (22)
## : Age..years. <= 33:
## : :...Purpose > 3: 1 (7)
## : Purpose <= 3: [S3]
## Payment.Status.of.Previous.Credit <= 3:
## :...Instalment.per.cent <= 2:
## :...No.of.dependents > 1:
## : :...Purpose <= 0: 1 (2)
## : : Purpose > 0: 0 (3)
## : No.of.dependents <= 1: [S4]
## Instalment.per.cent > 2:
## :...Concurrent.Credits <= 1: 1 (8/1)
## Concurrent.Credits > 1:
## :...Sex...Marital.Status <= 1: 0 (6/1)
## Sex...Marital.Status > 1:
## :...Account.Balance > 1: [S5]
## Account.Balance <= 1: [S6]
##
## SubTree [S1]
##
## Duration.in.Current.address > 3: 1 (8/1)
## Duration.in.Current.address <= 3:
## :...Purpose > 2: 0 (5)
## Purpose <= 2:
## :...Type.of.apartment <= 1: 0 (2)
## Type.of.apartment > 1: 1 (5/1)
##
## SubTree [S2]
##
## Duration.in.Current.address <= 2: 1 (2)
## Duration.in.Current.address > 2: 0 (4/1)
##
## SubTree [S3]
##
## Duration.of.Credit..month. <= 16: 1 (4)
## Duration.of.Credit..month. > 16:
## :...Length.of.current.employment <= 3: 0 (8)
## Length.of.current.employment > 3: 1 (6/1)
##
## SubTree [S4]
##
## Duration.in.Current.address > 1: 1 (41/6)
## Duration.in.Current.address <= 1:
## :...Value.Savings.Stocks > 3: 0 (2)
## Value.Savings.Stocks <= 3:
## :...Length.of.current.employment > 2: 1 (4)
## Length.of.current.employment <= 2:
## :...Instalment.per.cent <= 1: 0 (3)
## Instalment.per.cent > 1: 1 (3/1)
##
## SubTree [S5]
##
## Sex...Marital.Status > 3: 0 (2)
## Sex...Marital.Status <= 3:
## :...Length.of.current.employment > 3: 1 (10)
## Length.of.current.employment <= 3:
## :...Duration.in.Current.address <= 1: 1 (5)
## Duration.in.Current.address > 1:
## :...Length.of.current.employment <= 2: 0 (4)
## Length.of.current.employment > 2:
## :...Value.Savings.Stocks <= 1: 0 (3)
## Value.Savings.Stocks > 1: 1 (5)
##
## SubTree [S6]
##
## Payment.Status.of.Previous.Credit > 2: 0 (3)
## Payment.Status.of.Previous.Credit <= 2:
## :...Purpose <= 0: 0 (7/1)
## Purpose > 0:
## :...Most.valuable.available.asset <= 1: 0 (5/1)
## Most.valuable.available.asset > 1:
## :...Sex...Marital.Status <= 2: 1 (6)
## Sex...Marital.Status > 2:
## :...Length.of.current.employment > 4: 0 (5)
## Length.of.current.employment <= 4:
## :...Telephone > 1: 1 (3)
## Telephone <= 1:
## :...Length.of.current.employment <= 2: 0 (2)
## Length.of.current.employment > 2:
## :...Age..years. <= 28: 1 (4)
## Age..years. > 28: 0 (2)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 85 70( 7.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 233 45 (a): class 0
## 25 597 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 67.11% Credit.Amount
## 63.11% Concurrent.Credits
## 55.33% Payment.Status.of.Previous.Credit
## 50.33% Age..years.
## 45.44% Duration.of.Credit..month.
## 40.11% Guarantors
## 24.44% Occupation
## 18.33% Instalment.per.cent
## 15.56% Purpose
## 14.22% Length.of.current.employment
## 13.67% Duration.in.Current.address
## 12.67% Value.Savings.Stocks
## 12.22% No.of.dependents
## 9.33% Sex...Marital.Status
## 9.00% Telephone
## 8.78% Most.valuable.available.asset
## 4.22% Foreign.Worker
## 2.11% No.of.Credits.at.this.Bank
## 0.78% Type.of.apartment
##
##
## Time: 0.0 secs
#Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.0.3
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 8 | 14 | 22 |
## | 0.080 | 0.140 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 61 | 78 |
## | 0.170 | 0.610 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 25 | 75 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
# Method2. Random forest
credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest::randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 11 10
## 1 11 68
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 79
set.seed(23458)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.2988889 0.7011111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.31 0.69
random_model <- randomForest::randomForest(Creditability ~ . , data= credit_train)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 900 -none- numeric
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 900 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0.185366666666667 1 0
## 0.255633333333333 1 0
## 0.308933333333333 1 0
## 0.3295 1 0
## 0.343666666666667 0 1
## 0.361233333333333 1 0
## 0.381933333333333 1 0
## 0.3912 0 1
## 0.415066666666667 1 0
## 0.4221 0 1
## 0.426866666666667 1 0
## 0.4331 0 1
## 0.4517 0 1
## 0.451933333333333 1 0
## 0.462433333333333 1 0
## 0.468066666666667 1 0
## 0.4855 1 0
## 0.4986 1 0
## 0.520466666666667 0 1
## 0.522933333333333 1 0
## 0.529466666666667 1 0
## 0.545133333333333 0 1
## 0.546666666666667 1 0
## 0.547866666666667 0 1
## 0.566333333333333 0 1
## 0.567466666666667 1 0
## 0.587866666666667 0 1
## 0.5935 1 0
## 0.5966 0 1
## 0.6023 0 1
## 0.614566666666667 1 0
## 0.6177 1 0
## 0.627271428571428 0 1
## 0.633566666666667 0 1
## 0.6358 0 1
## 0.6373 0 1
## 0.6486 0 1
## 0.650566666666667 0 1
## 0.658271428571429 1 0
## 0.6601 0 1
## 0.693733333333333 0 1
## 0.699666666666667 0 1
## 0.7018 0 1
## 0.702066666666667 1 0
## 0.702133333333333 1 0
## 0.705733333333334 0 1
## 0.705933333333333 0 1
## 0.711033333333333 0 1
## 0.713666666666667 0 1
## 0.723966666666667 0 1
## 0.724133333333333 0 1
## 0.726833333333334 0 1
## 0.728966666666667 1 0
## 0.731733333333333 1 0
## 0.739 1 0
## 0.744433333333333 0 1
## 0.744633333333333 0 1
## 0.7457 1 0
## 0.749566666666667 0 1
## 0.7509 0 1
## 0.762666666666666 0 1
## 0.762766666666667 0 1
## 0.776033333333334 0 1
## 0.7779 0 1
## 0.787666666666667 0 1
## 0.801233333333334 0 1
## 0.8117 0 1
## 0.818133333333333 0 1
## 0.825633333333334 0 1
## 0.836166666666667 0 1
## 0.839566666666667 0 1
## 0.843033333333333 0 1
## 0.8485 1 0
## 0.856004761904762 1 0
## 0.861533333333333 0 1
## 0.873466666666667 0 1
## 0.875066666666667 0 1
## 0.8826 0 1
## 0.883566666666667 1 0
## 0.883933333333333 0 1
## 0.889833333333333 0 1
## 0.901600000000001 0 1
## 0.9058 0 1
## 0.907933333333333 0 1
## 0.909933333333333 0 1
## 0.918633333333334 0 1
## 0.919533333333334 0 1
## 0.9268 1 0
## 0.9279 0 1
## 0.931533333333333 0 1
## 0.938 0 1
## 0.9431 0 1
## 0.947733333333333 0 1
## 0.9562 0 1
## 0.9598 0 1
## 0.967233333333333 0 1
## 0.9704 0 1
## 0.970566666666667 0 1
## 0.9853 0 1
## 0.9899 0 1
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 1
# Method3. Adding regression to trees
#Step1: Collecting the Data
wine <- read.csv('D:/HU Homework/530-90-O/whitewines.csv')
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)

#Step2: Exploring and Preparing the Data
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
#Step3:Training a Model on the Data
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.3
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

#Step4: Evaluating Model Performance
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
#Method4. News Popularity
#Collecting the Data
news <- read.csv("D:/HU Homework/530-90-O/OnlineNewsPopularity.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#Pre-processing
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- subset( newsShort, select = -popular )
set.seed(12345)
news_rand <- newsShort[order(runif(49000)), ]
news_train <- news_rand[1:44000, ]
news_test <- news_rand[44001:49000, ]
prop.table(table(news_train$shares))
##
## no yes
## 0.4662568 0.5337432
prop.table(table(news_test$shares))
##
## no yes
## 0.4676778 0.5323222
#Model Training
library(C50)
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Jan 05 19:57:55 2021
## -------------------------------
##
## Class specified by attribute `outcome'
## *** ignoring cases with bad or unknown class
##
## Read 35622 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## num_imgs > 3:
## :...kw_max_max <= 15000:
## : :...n_tokens_content <= 1111: no (20/2)
## : : n_tokens_content > 1111: yes (7/2)
## : kw_max_max > 15000:
## : :...num_keywords <= 5:
## : :...n_unique_tokens > 0.5483304:
## : : :...num_videos > 0:
## : : : :...kw_max_max <= 310800: no (8/3)
## : : : : kw_max_max > 310800: yes (87/21)
## : : : num_videos <= 0:
## : : : :...average_token_length > 5.22907: yes (13)
## : : : average_token_length <= 5.22907:
## : : : :...global_sentiment_polarity <= 0.04: no (27/8)
## : : : global_sentiment_polarity > 0.04: yes (125/42)
## : : n_unique_tokens <= 0.5483304:
## : : :...n_tokens_title > 9:
## : : :...kw_max_max <= 690400: yes (78/33)
## : : : kw_max_max > 690400: no (545/228)
## : : n_tokens_title <= 9:
## : : :...num_videos <= 1: yes (267/108)
## : : num_videos > 1:
## : : :...num_imgs > 7: no (7)
## : : num_imgs <= 7:
## : : :...num_videos <= 2: yes (7)
## : : num_videos > 2: no (6/1)
## : num_keywords > 5:
## : :...num_hrefs > 17:
## : :...n_tokens_content > 2691: yes (53/2)
## : : n_tokens_content <= 2691:
## : : :...n_tokens_content <= 342: yes (661/156)
## : : n_tokens_content > 342:
## : : :...n_tokens_title > 9:
## : : :...num_imgs <= 12:
## : : : :...num_videos <= 6: yes (592/189)
## : : : : num_videos > 6:
## : : : : :...n_tokens_title <= 11: yes (15/3)
## : : : : n_tokens_title > 11: no (28/8)
## : : : num_imgs > 12:
## : : : :...kw_max_max > 310800:
## : : : :...n_tokens_title <= 13: yes (716/287)
## : : : : n_tokens_title > 13: no (61/28)
## : : : kw_max_max <= 310800:
## : : : :...kw_max_max <= 17100: no (8)
## : : : kw_max_max > 17100:
## : : : :...num_hrefs <= 22: yes (8)
## : : : num_hrefs > 22: no (31/12)
## : : n_tokens_title <= 9:
## : : :...abs_title_sentiment_polarity <= 0.425:
## : : :...num_imgs <= 28: yes (645/187)
## : : : num_imgs > 28:
## : : : :...average_token_length > 4.696252: yes (34/6)
## : : : average_token_length <= 4.696252:
## : : : :...kw_max_max <= 617900: yes (7/2)
## : : : kw_max_max > 617900: no (33/6)
## : : abs_title_sentiment_polarity > 0.425:
## : : :...n_tokens_content > 457: yes (209/31)
## : : n_tokens_content <= 457:
## : : :...n_tokens_title <= 6: yes (6)
## : : n_tokens_title > 6:
## : : :...kw_max_max <= 690400: no (6)
## : : kw_max_max > 690400:
## : : :...num_keywords > 9: no (12/3)
## : : num_keywords <= 9: [S1]
## : num_hrefs <= 17:
## : :...title_sentiment_polarity > 0.4: yes (658/198)
## : title_sentiment_polarity <= 0.4:
## : :...kw_max_max <= 617900: yes (479/157)
## : kw_max_max > 617900:
## : :...n_tokens_title > 10:
## : :...num_hrefs > 6: yes (1088/444)
## : : num_hrefs <= 6:
## : : :...num_videos <= 0: no (503/235)
## : : num_videos > 0:
## : : :...n_unique_tokens <= 0.3534247: yes (37/3)
## : : n_unique_tokens > 0.3534247:
## : : :...avg_positive_polarity > 0.3814904: yes (127/35)
## : : avg_positive_polarity <= 0.3814904:
## : : :...num_hrefs <= 5: no (142/60)
## : : num_hrefs > 5: [S2]
## : n_tokens_title <= 10:
## : :...global_sentiment_polarity > 0.09483681: yes (1135/393)
## : global_sentiment_polarity <= 0.09483681:
## : :...n_tokens_content > 1575:
## : :...title_sentiment_polarity <= -0.1461111: yes (4/1)
## : : title_sentiment_polarity > -0.1461111: no (26/1)
## : n_tokens_content <= 1575:
## : :...kw_max_max <= 690400:
## : :...n_tokens_title > 7: no (32/9)
## : : n_tokens_title <= 7: [S3]
## : kw_max_max > 690400:
## : :...num_imgs > 6: yes (520/196)
## : num_imgs <= 6:
## : :...num_hrefs <= 9:
## : :...num_videos > 1: no (6)
## : : num_videos <= 1: [S4]
## : num_hrefs > 9: [S5]
## num_imgs <= 3:
## :...n_unique_tokens > 0.4304388:
## :...kw_max_max <= 617900:
## : :...n_tokens_content > 1204: yes (37/4)
## : : n_tokens_content <= 1204:
## : : :...kw_max_max > 69100: yes (1747/638)
## : : kw_max_max <= 69100:
## : : :...kw_max_max <= 41600: no (374/170)
## : : kw_max_max > 41600: yes (1411/638)
## : kw_max_max > 617900:
## : :...global_sentiment_polarity <= 0.08261218:
## : :...num_imgs <= 0:
## : : :...num_videos > 25:
## : : : :...num_hrefs <= 52: no (71/18)
## : : : : num_hrefs > 52: yes (4)
## : : : num_videos <= 25:
## : : : :...kw_max_max <= 690400: yes (250/107)
## : : : kw_max_max > 690400:
## : : : :...num_videos > 15: yes (103/37)
## : : : num_videos <= 15:
## : : : :...num_keywords <= 6: no (360/141)
## : : : num_keywords > 6: yes (596/292)
## : : num_imgs > 0:
## : : :...n_unique_tokens > 0.7682927:
## : : :...title_sentiment_polarity <= -0.3: no (3)
## : : : title_sentiment_polarity > -0.3: yes (45/13)
## : : n_unique_tokens <= 0.7682927:
## : : :...num_hrefs <= 1:
## : : :...kw_max_max > 690400: no (163/26)
## : : : kw_max_max <= 690400:
## : : : :...num_keywords > 6: yes (3)
## : : : num_keywords <= 6:
## : : : :...num_keywords <= 5: yes (5/1)
## : : : num_keywords > 5: no (7)
## : : num_hrefs > 1:
## : : :...num_hrefs > 20:
## : : :...num_keywords <= 6: no (150/64)
## : : : num_keywords > 6: yes (212/91)
## : : num_hrefs <= 20:
## : : :...average_token_length > 4.747073:
## : : :...num_hrefs > 4:
## : : : :...n_tokens_content <= 164: yes (32/7)
## : : : : n_tokens_content > 164: no (2129/730)
## : : : num_hrefs <= 4:
## : : : :...kw_max_max <= 690400:
## : : : :...n_tokens_title <= 9: no (22/7)
## : : : : n_tokens_title > 9: yes (21/5)
## : : : kw_max_max > 690400:
## : : : :...num_imgs <= 2: no (435/110)
## : : : num_imgs > 2:
## : : : :...n_unique_tokens <= 0.4491979: yes (4)
## : : : n_unique_tokens > 0.4491979: no (30/10)
## : : average_token_length <= 4.747073:
## : : :...num_imgs > 2:
## : : :...num_hrefs <= 3:
## : : : :...n_unique_tokens > 0.6126984: yes (3)
## : : : : n_unique_tokens <= 0.6126984: [S6]
## : : : num_hrefs > 3:
## : : : :...num_keywords > 6: yes (64/16)
## : : : num_keywords <= 6: [S7]
## : : num_imgs <= 2:
## : : :...num_keywords <= 5: no (750/284)
## : : num_keywords > 5:
## : : :...num_imgs <= 1: no (1317/581)
## : : num_imgs > 1: [S8]
## : global_sentiment_polarity > 0.08261218:
## : :...num_hrefs > 15:
## : :...n_tokens_title <= 9: yes (582/189)
## : : n_tokens_title > 9:
## : : :...abs_title_subjectivity > 0.09583333: yes (748/329)
## : : abs_title_subjectivity <= 0.09583333:
## : : :...num_imgs <= 0: yes (43/18)
## : : num_imgs > 0:
## : : :...num_keywords <= 5: no (36/6)
## : : num_keywords > 5:
## : : :...num_videos > 0: no (46/11)
## : : num_videos <= 0:
## : : :...num_imgs > 1: yes (4)
## : : num_imgs <= 1: [S9]
## : num_hrefs <= 15:
## : :...num_imgs > 1:
## : :...n_unique_tokens > 0.6486486:
## : : :...global_sentiment_polarity <= 0.3318182: yes (159/35)
## : : : global_sentiment_polarity > 0.3318182: no (10/2)
## : : n_unique_tokens <= 0.6486486:
## : : :...num_keywords <= 9: yes (1404/654)
## : : num_keywords > 9:
## : : :...num_videos > 1: yes (17/3)
## : : num_videos <= 1:
## : : :...num_videos <= 0:
## : : :...num_hrefs <= 8: yes (74/20)
## : : : num_hrefs > 8: [S10]
## : : num_videos > 0:
## : : :...avg_positive_polarity <= 0.3091198: yes (19/5)
## : : avg_positive_polarity > 0.3091198:
## : : :...num_hrefs > 9: yes (20/1)
## : : num_hrefs <= 9:
## : : :...num_hrefs <= 6: no (14/2)
## : : num_hrefs > 6: yes (22/10)
## : num_imgs <= 1:
## : :...num_videos > 0:
## : :...kw_max_max <= 690400:
## : : :...num_imgs <= 0:
## : : : :...num_keywords <= 7:
## : : : : :...n_tokens_title <= 7: no (19/8)
## : : : : : n_tokens_title > 7: yes (212/66)
## : : : : num_keywords > 7:
## : : : : :...n_unique_tokens <= 0.6703297: no (61/24)
## : : : : n_unique_tokens > 0.6703297: yes (22/4)
## : : : num_imgs > 0:
## : : : :...average_token_length <= 4.39404: yes (29/4)
## : : : average_token_length > 4.39404:
## : : : :...num_hrefs > 9: yes (44/13)
## : : : num_hrefs <= 9:
## : : : :...num_videos > 3: yes (12/3)
## : : : num_videos <= 3:
## : : : :...num_keywords > 6: no (69/17)
## : : : num_keywords <= 6: [S11]
## : : kw_max_max > 690400:
## : : :...num_videos > 10: yes (191/71)
## : : num_videos <= 10:
## : : :...n_tokens_title <= 9: no (771/370)
## : : n_tokens_title > 9:
## : : :...num_hrefs <= 9: yes (1684/755)
## : : num_hrefs > 9:
## : : :...num_keywords <= 8:
## : : :...num_keywords <= 5: [S12]
## : : : num_keywords > 5: [S13]
## : : num_keywords > 8:
## : : :...num_imgs <= 0: no (44/19)
## : : num_imgs > 0: [S14]
## : num_videos <= 0:
## : :...n_tokens_content > 668: yes (759/324)
## : n_tokens_content <= 668:
## : :...title_sentiment_polarity > -0.008333334:
## : :...num_imgs > 0:
## : : :...n_tokens_content <= 121: yes (101/36)
## : : : n_tokens_content > 121: no (4008/1746)
## : : num_imgs <= 0:
## : : :...n_tokens_content > 141:
## : : :...num_keywords <= 5: no (73/31)
## : : : num_keywords > 5: yes (411/158)
## : : n_tokens_content <= 141:
## : : :...n_tokens_title > 9:
## : : :...n_unique_tokens <= 0.8265306: no (104/21)
## : : : n_unique_tokens > 0.8265306: yes (8/1)
## : : n_tokens_title <= 9:
## : : :...kw_max_max <= 690400: yes (35/10)
## : : kw_max_max > 690400: [S15]
## : title_sentiment_polarity <= -0.008333334:
## : :...n_unique_tokens > 0.71875:
## : :...n_tokens_title > 12: no (89/3)
## : : n_tokens_title <= 12:
## : : :...n_unique_tokens <= 0.7894737: no (69/11)
## : : n_unique_tokens > 0.7894737: yes (14/5)
## : n_unique_tokens <= 0.71875:
## : :...n_tokens_title > 12: no (89/25)
## : n_tokens_title <= 12:
## : :...num_imgs > 0: no (303/124)
## : num_imgs <= 0: [S16]
## n_unique_tokens <= 0.4304388:
## :...num_hrefs > 19: yes (387/93)
## num_hrefs <= 19:
## :...num_videos > 8:
## :...num_keywords <= 8: no (40/10)
## : num_keywords > 8:
## : :...average_token_length <= 4.819527: yes (8)
## : average_token_length > 4.819527: no (2)
## num_videos <= 8:
## :...kw_max_max <= 663600: yes (180/46)
## kw_max_max > 663600:
## :...global_sentiment_polarity > 0.154227: yes (399/112)
## global_sentiment_polarity <= 0.154227:
## :...average_token_length > 4.729216:
## :...num_videos <= 1:
## : :...num_hrefs <= 14: no (166/50)
## : : num_hrefs > 14:
## : : :...global_sentiment_polarity <= 0.1335317: no (40/19)
## : : global_sentiment_polarity > 0.1335317: yes (9)
## : num_videos > 1:
## : :...title_sentiment_polarity <= -0.1402778: no (5)
## : title_sentiment_polarity > -0.1402778:
## : :...avg_positive_polarity <= 0.2935334: no (5/1)
## : avg_positive_polarity > 0.2935334: yes (19/3)
## average_token_length <= 4.729216:
## :...num_imgs > 2: yes (99/32)
## num_imgs <= 2:
## :...kw_max_max <= 690400: yes (85/34)
## kw_max_max > 690400:
## :...num_hrefs > 15: no (55/22)
## num_hrefs <= 15:
## :...num_hrefs > 10: yes (138/39)
## num_hrefs <= 10:
## :...n_tokens_content > 1113:
## :...n_unique_tokens > 0.4021739: no (50/12)
## : n_unique_tokens <= 0.4021739:
## : :...num_videos > 2: no (4)
## : num_videos <= 2: [S17]
## n_tokens_content <= 1113:
## :...num_imgs > 1: yes (37/10)
## num_imgs <= 1:
## :...num_imgs > 0: [S18]
## num_imgs <= 0: [S19]
##
## SubTree [S1]
##
## n_tokens_content <= 422: yes (15/1)
## n_tokens_content > 422: no (7/2)
##
## SubTree [S2]
##
## title_sentiment_polarity <= -0.325: no (4)
## title_sentiment_polarity > -0.325: yes (38/12)
##
## SubTree [S3]
##
## global_sentiment_polarity <= 0.02484504: no (4/1)
## global_sentiment_polarity > 0.02484504: yes (7)
##
## SubTree [S4]
##
## n_tokens_content <= 436: yes (27/10)
## n_tokens_content > 436: no (66/15)
##
## SubTree [S5]
##
## average_token_length > 5.011019: no (5)
## average_token_length <= 5.011019:
## :...num_videos <= 3: yes (52/14)
## num_videos > 3: no (5/1)
##
## SubTree [S6]
##
## global_sentiment_polarity <= 0.06216631: no (19/1)
## global_sentiment_polarity > 0.06216631:
## :...global_sentiment_polarity <= 0.07848307: yes (5)
## global_sentiment_polarity > 0.07848307: no (3)
##
## SubTree [S7]
##
## average_token_length <= 4.407609: yes (6)
## average_token_length > 4.407609:
## :...n_tokens_title <= 14: no (48/19)
## n_tokens_title > 14: yes (3)
##
## SubTree [S8]
##
## n_tokens_content <= 623: yes (216/101)
## n_tokens_content > 623:
## :...avg_positive_polarity <= 0.3597222: no (59/9)
## avg_positive_polarity > 0.3597222:
## :...num_keywords > 9: yes (4)
## num_keywords <= 9:
## :...global_sentiment_polarity <= 0.07012397: no (18/5)
## global_sentiment_polarity > 0.07012397: yes (5)
##
## SubTree [S9]
##
## abs_title_subjectivity > 0.01136364: no (23/9)
## abs_title_subjectivity <= 0.01136364:
## :...avg_positive_polarity <= 0.2931903: no (2)
## avg_positive_polarity > 0.2931903: yes (18/3)
##
## SubTree [S10]
##
## abs_title_subjectivity <= 0.4125: yes (28/8)
## abs_title_subjectivity > 0.4125: no (28/9)
##
## SubTree [S11]
##
## global_sentiment_polarity <= 0.2202445: no (35/16)
## global_sentiment_polarity > 0.2202445: yes (6)
##
## SubTree [S12]
##
## global_sentiment_polarity <= 0.2365167: no (43/14)
## global_sentiment_polarity > 0.2365167: yes (4)
##
## SubTree [S13]
##
## abs_title_sentiment_polarity <= 0.7: no (152/58)
## abs_title_sentiment_polarity > 0.7: yes (8/3)
##
## SubTree [S14]
##
## average_token_length > 4.955523: no (13/2)
## average_token_length <= 4.955523:
## :...num_videos <= 9: yes (71/20)
## num_videos > 9: no (5/1)
##
## SubTree [S15]
##
## n_tokens_content <= 96: no (19/4)
## n_tokens_content > 96:
## :...title_subjectivity <= 0.6625: yes (10)
## title_subjectivity > 0.6625: no (4/1)
##
## SubTree [S16]
##
## abs_title_subjectivity > 0.425: no (5)
## abs_title_subjectivity <= 0.425:
## :...n_tokens_title > 10: yes (23/4)
## n_tokens_title <= 10:
## :...n_tokens_title <= 9: yes (12/3)
## n_tokens_title > 9:
## :...n_tokens_content <= 168: yes (2)
## n_tokens_content > 168: no (8)
##
## SubTree [S17]
##
## global_sentiment_polarity <= 0.1441667: yes (66/25)
## global_sentiment_polarity > 0.1441667: no (7)
##
## SubTree [S18]
##
## n_non_stop_words <= 0: yes (63/19)
## n_non_stop_words > 0:
## :...global_sentiment_polarity <= 0.02890873: no (8/1)
## global_sentiment_polarity > 0.02890873:
## :...n_unique_tokens <= 0.3853428: yes (18/1)
## n_unique_tokens > 0.3853428:
## :...num_hrefs <= 5: no (27/8)
## num_hrefs > 5: yes (54/19)
##
## SubTree [S19]
##
## n_tokens_title > 14: no (28/10)
## n_tokens_title <= 14:
## :...num_videos > 0: yes (602/253)
## num_videos <= 0:
## :...n_tokens_title > 13: yes (5)
## n_tokens_title <= 13:
## :...num_keywords <= 7: no (15/1)
## num_keywords > 7:
## :...num_keywords <= 9: yes (14/3)
## num_keywords > 9:
## :...abs_title_subjectivity <= 0.09415584: yes (2)
## abs_title_subjectivity > 0.09415584: no (4)
##
##
## Evaluation on training data (35622 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 190 13393(37.6%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 8682 7927 (a): class no
## 5466 13547 (b): class yes
##
##
## Attribute usage:
##
## 100.00% num_imgs
## 98.77% kw_max_max
## 82.81% num_hrefs
## 78.13% n_unique_tokens
## 67.85% global_sentiment_polarity
## 48.03% n_tokens_content
## 45.46% num_keywords
## 44.31% num_videos
## 37.22% n_tokens_title
## 29.28% title_sentiment_polarity
## 20.81% average_token_length
## 3.18% abs_title_sentiment_polarity
## 2.90% abs_title_subjectivity
## 1.45% avg_positive_polarity
## 0.48% n_non_stop_words
## 0.04% title_subjectivity
##
##
## Time: 0.8 secs
#Evaluate the Model
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##
## news_pred no yes
## no 867 674
## yes 1014 1467
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 58.03083
#Technique 2
#Model Training
random_model <- randomForest::randomForest(shares ~ .,data = news_train,na.action=na.omit)
random_model
##
## Call:
## randomForest(formula = shares ~ ., data = news_train, na.action = na.omit)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 40.03%
## Confusion matrix:
## no yes class.error
## no 8246 8363 0.5035222
## yes 5897 13116 0.3101562
summary(random_model)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 35622 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 71244 matrix numeric
## oob.times 35622 -none- numeric
## classes 2 -none- character
## importance 16 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 35622 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## na.action 8378 omit numeric
cred_pred = predict(random_model, news_test)
p = table(cred_pred, news_test$shares)
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 59.59722
#Technique 3
#Model training
library(rpart)
m.rpart <- rpart(shares ~ ., data=news_train)
summary(m.rpart)
## Call:
## rpart(formula = shares ~ ., data = news_train)
## n=35622 (8378 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.03251249 0 1.0000000 1.0000000 0.005668843
## 2 0.01216208 2 0.9349750 0.9386477 0.005637448
## 3 0.01000000 3 0.9228129 0.9301583 0.005631613
##
## Variable importance
## num_imgs global_sentiment_polarity kw_max_max
## 44 22 11
## avg_positive_polarity num_hrefs n_non_stop_words
## 6 5 5
## n_tokens_content average_token_length n_unique_tokens
## 4 2 2
##
## Node number 1: 35622 observations, complexity param=0.03251249
## predicted class=yes expected loss=0.4662568 P(node) =1
## class counts: 16609 19013
## probabilities: 0.466 0.534
## left son=2 (26313 obs) right son=3 (9309 obs)
## Primary splits:
## num_imgs < 3.5 to the left, improve=204.43790, (0 missing)
## num_hrefs < 13.5 to the left, improve=163.26770, (0 missing)
## global_sentiment_polarity < 0.09269021 to the left, improve=130.72360, (0 missing)
## n_unique_tokens < 0.4396835 to the right, improve= 90.54954, (0 missing)
## num_keywords < 6.5 to the left, improve= 78.65559, (0 missing)
## Surrogate splits:
## num_hrefs < 18.5 to the left, agree=0.768, adj=0.114, (0 split)
## n_non_stop_words < 1 to the left, agree=0.757, adj=0.070, (0 split)
## n_tokens_content < 1339.5 to the left, agree=0.753, adj=0.057, (0 split)
## global_sentiment_polarity < -0.1855226 to the right, agree=0.740, adj=0.004, (0 split)
## average_token_length < 5.626769 to the left, agree=0.740, adj=0.004, (0 split)
##
## Node number 2: 26313 observations, complexity param=0.03251249
## predicted class=yes expected loss=0.4981188 P(node) =0.7386727
## class counts: 13107 13206
## probabilities: 0.498 0.502
## left son=4 (10534 obs) right son=5 (15779 obs)
## Primary splits:
## global_sentiment_polarity < 0.09269021 to the left, improve=99.22453, (0 missing)
## n_unique_tokens < 0.4304826 to the right, improve=78.06604, (0 missing)
## kw_max_max < 677000 to the right, improve=76.51894, (0 missing)
## num_hrefs < 19.5 to the left, improve=56.02928, (0 missing)
## average_token_length < 4.781043 to the right, improve=54.85230, (0 missing)
## Surrogate splits:
## avg_positive_polarity < 0.3012278 to the left, agree=0.702, adj=0.256, (0 split)
## n_unique_tokens < 0.1845133 to the left, agree=0.630, adj=0.077, (0 split)
## average_token_length < 3.659021 to the left, agree=0.630, adj=0.077, (0 split)
## n_tokens_content < 9 to the left, agree=0.630, adj=0.077, (0 split)
## n_non_stop_words < 0.5 to the left, agree=0.630, adj=0.077, (0 split)
##
## Node number 3: 9309 observations
## predicted class=yes expected loss=0.3761951 P(node) =0.2613273
## class counts: 3502 5807
## probabilities: 0.376 0.624
##
## Node number 4: 10534 observations, complexity param=0.01216208
## predicted class=no expected loss=0.4487374 P(node) =0.2957161
## class counts: 5807 4727
## probabilities: 0.551 0.449
## left son=8 (9288 obs) right son=9 (1246 obs)
## Primary splits:
## kw_max_max < 654150 to the right, improve=49.48608, (0 missing)
## n_non_stop_words < 1 to the right, improve=48.04450, (0 missing)
## n_tokens_content < 161.5 to the right, improve=47.03210, (0 missing)
## num_imgs < 0.5 to the right, improve=45.60699, (0 missing)
## average_token_length < 4.729654 to the right, improve=45.30676, (0 missing)
## Surrogate splits:
## n_unique_tokens < 0.9755102 to the left, agree=0.882, adj=0.002, (0 split)
##
## Node number 5: 15779 observations
## predicted class=yes expected loss=0.4626402 P(node) =0.4429566
## class counts: 7300 8479
## probabilities: 0.463 0.537
##
## Node number 8: 9288 observations
## predicted class=no expected loss=0.4309862 P(node) =0.2607377
## class counts: 5285 4003
## probabilities: 0.569 0.431
##
## Node number 9: 1246 observations
## predicted class=yes expected loss=0.4189406 P(node) =0.03497838
## class counts: 522 724
## probabilities: 0.419 0.581
#Visualization
library(rpart.plot)
rpart.plot(m.rpart, digits=3, type=1)

#Evaluate the Model
p.rpart <- predict(m.rpart, news_test)
summary(p.rpart)
## no yes
## Min. :0.3762 Min. :0.4310
## 1st Qu.:0.4626 1st Qu.:0.5374
## Median :0.4626 Median :0.5374
## Mean :0.4659 Mean :0.5341
## 3rd Qu.:0.4626 3rd Qu.:0.5374
## Max. :0.5690 Max. :0.6238
summary(news_test$shares)
## no yes NA's
## 1881 2141 978