credit <- read.csv("C:/Users/zeemo/Desktop/Files/Data/RStudio/credit.csv", header=TRUE)
str(credit)
‘data.frame’: 1000 obs. of 21 variables: $ Creditability : int 1 1 1 1 1 1 1 1 1 1 … $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 … $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 … $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 … $ Purpose : int 2 0 9 0 0 0 0 0 3 3 … $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 … $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 … $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 … $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 … $ Sex…Marital.Status : int 2 3 2 3 3 3 3 3 2 2 … $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 … $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 … $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 … $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 … $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 … $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 … $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 … $ Occupation : int 3 3 2 2 2 2 2 2 1 1 … $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 … $ Telephone : int 1 1 1 1 1 1 1 1 1 1 … $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 …
summary(credit$amount)
## Length Class Mode
## 0 NULL NULL
table(credit$Creditability)
##
## 0 1
## 300 700
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <-credit_rand[1:900, ]
credit_test <-credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
##Step 3: Training a model on the data
library(C50)
## Warning: package 'C50' was built under R version 3.6.3
credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
credit_model
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 85
##
## Non-standard options: attempt to group attributes
##Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}
## Loading required package: gmodels
## Warning: package 'gmodels' was built under R version 3.6.3
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 8 | 14 | 22 |
## | 0.080 | 0.140 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 61 | 78 |
## | 0.170 | 0.610 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 25 | 75 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 8 17
## 1 14 61
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 69
#Q1 ### It does not mean 100% has a perfect model, however, it does indicate that the model fit well. However, in case to avoide the situation of overmatch, we need further tests or other parameters.
if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
credit_train$Creditability <-as.factor(credit_train$Creditability)
random_model <-randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 11 10
## 1 11 68
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 79
importance(random_model)
## MeanDecreaseGini
## Account.Balance 42.599355
## Duration.of.Credit..month. 37.502785
## Payment.Status.of.Previous.Credit 22.563009
## Purpose 23.774048
## Credit.Amount 52.397155
## Value.Savings.Stocks 19.388385
## Length.of.current.employment 20.221289
## Instalment.per.cent 16.394636
## Sex...Marital.Status 13.424449
## Guarantors 7.475422
## Duration.in.Current.address 15.563685
## Most.valuable.available.asset 17.326842
## Age..years. 37.377916
## Concurrent.Credits 8.480725
## Type.of.apartment 9.595344
## No.of.Credits.at.this.Bank 8.424006
## Occupation 12.669816
## No.of.dependents 5.774473
## Telephone 7.505291
## Foreign.Worker 1.746964
set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))
##
## cred_pred1 0 1
## 0 5 4
## 1 17 74
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 79
wine <- read.csv("C:/Users/zeemo/Desktop/Files/Data/RStudio/whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
##check to see if the class variable, quality, follows a normal distribution or is nearly normal.
hist(wine$quality)
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}
## Loading required package: rpart
if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 3.6.3
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0006805 0.02446810
## 2 0.05098911 1 0.8449895 0.8513823 0.02347467
## 3 0.02796998 2 0.7940004 0.8064997 0.02277860
## 4 0.01970128 3 0.7660304 0.7885635 0.02183879
## 5 0.01265926 4 0.7463291 0.7638299 0.02077561
## 6 0.01007193 5 0.7336698 0.7507459 0.02052777
## 7 0.01000000 6 0.7235979 0.7470415 0.02049539
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
rpart.plot(m.rpart, digits=3)
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
news <- read.csv("C:/Users/zeemo/Desktop/Files/Data/RStudio/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)
## 'data.frame': 39644 obs. of 17 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity: num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "1"}
else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## 0 1
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## 0 1
## 0.414 0.586
library("C50")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Sep 06 17:32:39 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class 0
## 5122 (b): class 1
##
##
## Attribute usage:
##
## 100.00% popular
##
##
## Time: 0.1 secs
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 414 0
## 1 0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
news <- read.csv("C:/Users/zeemo/Desktop/Files/Data/RStudio/OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]
news=news[!news$n_unique_tokens==701,]
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
for(i in ncol(news)-1){
news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))
set.seed(12345)
news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))
##
## cred_pridRF 0 1
## 0 10531 7712
## 1 7542 9894
(Accuracy <- sum(diag(p2))/sum(p2)*100)
## [1] 57.24656
importance(random_modelNews)
## MeanDecreaseGini
## n_tokens_title 111.06636
## n_tokens_content 165.82171
## n_unique_tokens 184.11362
## n_non_stop_words 167.86920
## num_hrefs 131.42586
## num_imgs 99.78308
## num_videos 56.95850
## average_token_length 197.00327
## num_keywords 96.41648
## kw_max_max 54.20264
## global_sentiment_polarity 209.21613
## avg_positive_polarity 190.79576
## title_subjectivity 77.55101
## title_sentiment_polarity 85.25876
## abs_title_subjectivity 70.34923
## abs_title_sentiment_polarity 70.29701
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Sep 06 17:32:46 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 3964 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## num_imgs > 3:
## :...n_tokens_title <= 9:
## : :...num_imgs <= 14:
## : : :...avg_positive_polarity > 0.4936364: 1 (20)
## : : : avg_positive_polarity <= 0.4936364:
## : : : :...title_sentiment_polarity <= -0.375: 0 (13/4)
## : : : title_sentiment_polarity > -0.375: 1 (207/59)
## : : num_imgs > 14:
## : : :...kw_max_max <= 690400: 0 (26/7)
## : : kw_max_max > 690400:
## : : :...num_keywords <= 4: 0 (7/1)
## : : num_keywords > 4: 1 (88/30)
## : n_tokens_title > 9:
## : :...n_tokens_title > 14: 0 (32/11)
## : n_tokens_title <= 14:
## : :...n_tokens_content <= 452: 1 (256/85)
## : n_tokens_content > 452:
## : :...kw_max_max <= 617900:
## : :...num_hrefs <= 6: 1 (11/2)
## : : num_hrefs > 6: 0 (34/8)
## : kw_max_max > 617900:
## : :...num_keywords <= 6: 0 (103/42)
## : num_keywords > 6: 1 (278/123)
## num_imgs <= 3:
## :...global_sentiment_polarity > 0.09618686:
## :...kw_max_max > 73100:
## : :...n_tokens_content <= 859:
## : : :...num_imgs <= 1: 0 (1125/525)
## : : : num_imgs > 1: 1 (152/66)
## : : n_tokens_content > 859:
## : : :...num_hrefs <= 4: 0 (13/2)
## : : num_hrefs > 4: 1 (216/75)
## : kw_max_max <= 73100:
## : :...abs_title_subjectivity <= 0.02272727: 1 (17)
## : abs_title_subjectivity > 0.02272727:
## : :...num_hrefs <= 4: 1 (36/8)
## : num_hrefs > 4:
## : :...title_subjectivity > 0.75: 1 (11)
## : title_subjectivity <= 0.75:
## : :...n_tokens_content <= 1020: 0 (73/28)
## : n_tokens_content > 1020: 1 (7)
## global_sentiment_polarity <= 0.09618686:
## :...average_token_length > 4.633452:
## :...n_tokens_title > 8: 0 (609/179)
## : n_tokens_title <= 8:
## : :...n_unique_tokens > 0.5629771:
## : :...avg_positive_polarity <= 0.1463265: 1 (4)
## : : avg_positive_polarity > 0.1463265: 0 (56/13)
## : n_unique_tokens <= 0.5629771:
## : :...abs_title_subjectivity <= 0.1388889: 1 (7)
## : abs_title_subjectivity > 0.1388889:
## : :...n_tokens_title <= 7: 1 (15/3)
## : n_tokens_title > 7: 0 (35/15)
## average_token_length <= 4.633452:
## :...kw_max_max <= 690400: 1 (114/40)
## kw_max_max > 690400:
## :...num_keywords <= 6: 0 (191/62)
## num_keywords > 6:
## :...n_non_stop_words <= 0: 1 (57/23)
## n_non_stop_words > 0:
## :...num_hrefs > 3: 1 (122/53)
## num_hrefs <= 3:
## :...n_tokens_title > 10: 0 (20/2)
## n_tokens_title <= 10:
## :...title_sentiment_polarity <= -0.05: 0 (4)
## title_sentiment_polarity > -0.05: 1 (5)
##
##
## Evaluation on training data (3964 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 34 1466(37.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 1442 567 (a): class 0
## 899 1056 (b): class 1
##
##
## Attribute usage:
##
## 100.00% num_imgs
## 72.88% global_sentiment_polarity
## 68.37% kw_max_max
## 57.21% n_tokens_content
## 46.17% n_tokens_title
## 31.26% average_token_length
## 22.07% num_keywords
## 13.93% num_hrefs
## 7.57% avg_positive_polarity
## 5.78% title_sentiment_polarity
## 5.25% n_non_stop_words
## 5.07% abs_title_subjectivity
## 2.95% n_unique_tokens
## 2.30% title_subjectivity
##
##
## Time: 0.1 secs
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 35679
##
##
## | predicted shares
## actual shares | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## 0 | 11669 | 6404 | 18073 |
## | 0.327 | 0.179 | |
## --------------|-----------|-----------|-----------|
## 1 | 9231 | 8375 | 17606 |
## | 0.259 | 0.235 | |
## --------------|-----------|-----------|-----------|
## Column Total | 20900 | 14779 | 35679 |
## --------------|-----------|-----------|-----------|
##
##
(p3 <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 11669 9231
## 1 6404 8375
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 56.1787