Step 1: Collecting the data
credit <- read.csv("/Users/RunhaoWang/Desktop/530 90/credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
Step 2:Exploring the data
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
Step 3: Training the model
if (!require("C50")) {
install.packages("C50")
library(C50)
}
## Loading required package: C50
credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
credit_model
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 85
##
## Non-standard options: attempt to group attributes
Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}
## Loading required package: gmodels
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 8 | 14 | 22 |
## | 0.080 | 0.140 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 61 | 78 |
## | 0.170 | 0.610 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 25 | 75 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 8 17
## 1 14 61
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 69
Answer:Accuracy means that the data generated using training model data set match the test set. But it doesn’t not mean the model is well designed, because it could be overmatched or it might cause an error in the creation of the model. The combination of accuracy and other parameters will give a better support on whether the model is a good one.
Method #2. Random forest
if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 11 10
## 1 11 68
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 79
Answer:
importance(random_model)
## MeanDecreaseGini
## Account.Balance 42.599355
## Duration.of.Credit..month. 37.502785
## Payment.Status.of.Previous.Credit 22.563009
## Purpose 23.774048
## Credit.Amount 52.397155
## Value.Savings.Stocks 19.388385
## Length.of.current.employment 20.221289
## Instalment.per.cent 16.394636
## Sex...Marital.Status 13.424449
## Guarantors 7.475422
## Duration.in.Current.address 15.563685
## Most.valuable.available.asset 17.326842
## Age..years. 37.377916
## Concurrent.Credits 8.480725
## Type.of.apartment 9.595344
## No.of.Credits.at.this.Bank 8.424006
## Occupation 12.669816
## No.of.dependents 5.774473
## Telephone 7.505291
## Foreign.Worker 1.746964
Three most imprtant features are Credit Amount at 49.280407, Account Balance at 41.291567, and Age by years at 39.224293.
set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))
##
## cred_pred1 0 1
## 0 5 4
## 1 17 74
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 79
Now the accuracy is drop to 67 for this model after add more parameters.
Method #3. Adding regression to trees
wine <- read.csv("/Users/RunhaoWang/Desktop/530 90/whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#hist(wine$quality)
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}
## Loading required package: rpart
if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}
## Loading required package: rpart.plot
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0008512 0.02447624
## 2 0.05098911 1 0.8449895 0.8528051 0.02344581
## 3 0.02796998 2 0.7940004 0.8069699 0.02277538
## 4 0.01970128 3 0.7660304 0.7913725 0.02219904
## 5 0.01265926 4 0.7463291 0.7628362 0.02090075
## 6 0.01007193 5 0.7336698 0.7593753 0.02087092
## 7 0.01000000 6 0.7235979 0.7534140 0.02074481
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
rpart.plot(m.rpart, digits=3)
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
Answer: The root-mean-square error is 0.537, the lower RMSE value, the more model fit the observed data, and 0.537 is still high.
Method #4. News Popularity
Step 1: Collecting the Data
news <- read.csv("/Users/RunhaoWang/Desktop/530 90/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)
## 'data.frame': 39644 obs. of 17 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity: num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
Step 2: Pre-processing
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "1"}
else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## 0 1
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## 0 1
## 0.414 0.586
Step 3: Modeling and evaluation
library("C50")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Feb 23 23:48:16 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class 0
## 5122 (b): class 1
##
##
## Attribute usage:
##
## 100.00% popular
##
##
## Time: 0.1 secs
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 414 0
## 1 0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
Answer:
Step 1: RANDOM FOREST
news <- read.csv("/Users/RunhaoWang/Desktop/530 90/OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]
#Check for outliers
news=news[!news$n_unique_tokens==701,]
#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#Standardize the dataset
for(i in ncol(news)-1){
news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}
#Define popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))
set.seed(12345)
news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)
#Model training
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))
##
## cred_pridRF 0 1
## 0 10531 7712
## 1 7542 9894
(Accuracy <- sum(diag(p2))/sum(p2)*100)
## [1] 57.24656
importance(random_modelNews)
## MeanDecreaseGini
## n_tokens_title 111.06636
## n_tokens_content 165.82171
## n_unique_tokens 184.11362
## n_non_stop_words 167.86920
## num_hrefs 131.42586
## num_imgs 99.78308
## num_videos 56.95850
## average_token_length 197.00327
## num_keywords 96.41648
## kw_max_max 54.20264
## global_sentiment_polarity 209.21613
## avg_positive_polarity 190.79576
## title_subjectivity 77.55101
## title_sentiment_polarity 85.25876
## abs_title_subjectivity 70.34923
## abs_title_sentiment_polarity 70.29701
Step 2: Decision Tree
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Feb 23 23:48:31 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 3964 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## num_imgs > 3:
## :...n_tokens_title <= 9:
## : :...num_imgs <= 14:
## : : :...avg_positive_polarity > 0.4936364: 1 (20)
## : : : avg_positive_polarity <= 0.4936364:
## : : : :...title_sentiment_polarity <= -0.375: 0 (13/4)
## : : : title_sentiment_polarity > -0.375: 1 (207/59)
## : : num_imgs > 14:
## : : :...kw_max_max <= 690400: 0 (26/7)
## : : kw_max_max > 690400:
## : : :...num_keywords <= 4: 0 (7/1)
## : : num_keywords > 4: 1 (88/30)
## : n_tokens_title > 9:
## : :...n_tokens_title > 14: 0 (32/11)
## : n_tokens_title <= 14:
## : :...n_tokens_content <= 452: 1 (256/85)
## : n_tokens_content > 452:
## : :...kw_max_max <= 617900:
## : :...num_hrefs <= 6: 1 (11/2)
## : : num_hrefs > 6: 0 (34/8)
## : kw_max_max > 617900:
## : :...num_keywords <= 6: 0 (103/42)
## : num_keywords > 6: 1 (278/123)
## num_imgs <= 3:
## :...global_sentiment_polarity > 0.09618686:
## :...kw_max_max > 73100:
## : :...n_tokens_content <= 859:
## : : :...num_imgs <= 1: 0 (1125/525)
## : : : num_imgs > 1: 1 (152/66)
## : : n_tokens_content > 859:
## : : :...num_hrefs <= 4: 0 (13/2)
## : : num_hrefs > 4: 1 (216/75)
## : kw_max_max <= 73100:
## : :...abs_title_subjectivity <= 0.02272727: 1 (17)
## : abs_title_subjectivity > 0.02272727:
## : :...num_hrefs <= 4: 1 (36/8)
## : num_hrefs > 4:
## : :...title_subjectivity > 0.75: 1 (11)
## : title_subjectivity <= 0.75:
## : :...n_tokens_content <= 1020: 0 (73/28)
## : n_tokens_content > 1020: 1 (7)
## global_sentiment_polarity <= 0.09618686:
## :...average_token_length > 4.633452:
## :...n_tokens_title > 8: 0 (609/179)
## : n_tokens_title <= 8:
## : :...n_unique_tokens > 0.5629771:
## : :...avg_positive_polarity <= 0.1463265: 1 (4)
## : : avg_positive_polarity > 0.1463265: 0 (56/13)
## : n_unique_tokens <= 0.5629771:
## : :...abs_title_subjectivity <= 0.1388889: 1 (7)
## : abs_title_subjectivity > 0.1388889:
## : :...n_tokens_title <= 7: 1 (15/3)
## : n_tokens_title > 7: 0 (35/15)
## average_token_length <= 4.633452:
## :...kw_max_max <= 690400: 1 (114/40)
## kw_max_max > 690400:
## :...num_keywords <= 6: 0 (191/62)
## num_keywords > 6:
## :...n_non_stop_words <= 0: 1 (57/23)
## n_non_stop_words > 0:
## :...num_hrefs > 3: 1 (122/53)
## num_hrefs <= 3:
## :...n_tokens_title > 10: 0 (20/2)
## n_tokens_title <= 10:
## :...title_sentiment_polarity <= -0.05: 0 (4)
## title_sentiment_polarity > -0.05: 1 (5)
##
##
## Evaluation on training data (3964 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 34 1466(37.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 1442 567 (a): class 0
## 899 1056 (b): class 1
##
##
## Attribute usage:
##
## 100.00% num_imgs
## 72.88% global_sentiment_polarity
## 68.37% kw_max_max
## 57.21% n_tokens_content
## 46.17% n_tokens_title
## 31.26% average_token_length
## 22.07% num_keywords
## 13.93% num_hrefs
## 7.57% avg_positive_polarity
## 5.78% title_sentiment_polarity
## 5.25% n_non_stop_words
## 5.07% abs_title_subjectivity
## 2.95% n_unique_tokens
## 2.30% title_subjectivity
##
##
## Time: 0.1 secs
#Model training
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 35679
##
##
## | predicted shares
## actual shares | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## 0 | 11669 | 6404 | 18073 |
## | 0.327 | 0.179 | |
## --------------|-----------|-----------|-----------|
## 1 | 9231 | 8375 | 17606 |
## | 0.259 | 0.235 | |
## --------------|-----------|-----------|-----------|
## Column Total | 20900 | 14779 | 35679 |
## --------------|-----------|-----------|-----------|
##
##
(p3 <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 11669 9231
## 1 6404 8375
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 56.1787