ANLY 530 Lab 1

Method #1. Tree-based classification

Step 1: Collecting the data

credit <- read.csv("/Users/RunhaoWang/Desktop/530 90/credit.csv")
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

table(credit$Creditability)

## 
##   0   1 
## 300 700

Step 2:Exploring the data

summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

table(credit$Creditability)

## 
##   0   1 
## 300 700

set.seed(12345)

credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]

prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.3088889 0.6911111

prop.table(table(credit_test$Creditability))

## 
##    0    1 
## 0.22 0.78

Step 3: Training the model

if (!require("C50")) {
install.packages("C50")
library(C50)
}

## Loading required package: C50

credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)

credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
credit_model

## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 85 
## 
## Non-standard options: attempt to group attributes

Step 4: Evaluating Model Performance

cred_pred <- predict(credit_model, credit_test)

if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}

## Loading required package: gmodels

CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |         8 |        14 |        22 | 
##                      |     0.080 |     0.140 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        17 |        61 |        78 | 
##                      |     0.170 |     0.610 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        25 |        75 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
##

(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0  8 17
##         1 14 61

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 69

Q1- If you see the accuracy 0f 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.

Answer:Accuracy means that the data generated using training model data set match the test set. But it doesn’t not mean the model is well designed, because it could be overmatched or it might cause an error in the creation of the model. The combination of accuracy and other parameters will give a better support on whether the model is a good one.

Method #2. Random forest

if (!require("randomForest")) {
install.packages("randomForest")
library(randomForest)
}

## Loading required package: randomForest

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)

##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0 11 10
##         1 11 68

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 79

Q2- What are the three most important features in this model.Now, Change the random seed to 23458 and find the new accuracy of random forest.

Answer:

importance(random_model)

##                                   MeanDecreaseGini
## Account.Balance                          42.599355
## Duration.of.Credit..month.               37.502785
## Payment.Status.of.Previous.Credit        22.563009
## Purpose                                  23.774048
## Credit.Amount                            52.397155
## Value.Savings.Stocks                     19.388385
## Length.of.current.employment             20.221289
## Instalment.per.cent                      16.394636
## Sex...Marital.Status                     13.424449
## Guarantors                                7.475422
## Duration.in.Current.address              15.563685
## Most.valuable.available.asset            17.326842
## Age..years.                              37.377916
## Concurrent.Credits                        8.480725
## Type.of.apartment                         9.595344
## No.of.Credits.at.this.Bank                8.424006
## Occupation                               12.669816
## No.of.dependents                          5.774473
## Telephone                                 7.505291
## Foreign.Worker                            1.746964

Three most imprtant features are Credit Amount at 49.280407, Account Balance at 41.291567, and Age by years at 39.224293.

set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))

##           
## cred_pred1  0  1
##          0  5  4
##          1 17 74

(Accuracy <- sum(diag(p1))/sum(p1)*100)

## [1] 79

Now the accuracy is drop to 67 for this model after add more parameters.

Method #3. Adding regression to trees

wine <- read.csv("/Users/RunhaoWang/Desktop/530 90/whitewines.csv")
str(wine)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...

#hist(wine$quality)

wine_train <- wine[1:3750, ] 
wine_test <- wine[3751:4898, ]

if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}

## Loading required package: rpart

if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}

## Loading required package: rpart.plot

m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)

## Call:
## rpart(formula = quality ~ ., data = wine_train)
##   n= 3750 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.15501053      0 1.0000000 1.0008512 0.02447624
## 2 0.05098911      1 0.8449895 0.8528051 0.02344581
## 3 0.02796998      2 0.7940004 0.8069699 0.02277538
## 4 0.01970128      3 0.7660304 0.7913725 0.02219904
## 5 0.01265926      4 0.7463291 0.7628362 0.02090075
## 6 0.01007193      5 0.7336698 0.7593753 0.02087092
## 7 0.01000000      6 0.7235979 0.7534140 0.02074481
## 
## Variable importance
##              alcohol              density     volatile.acidity 
##                   34                   21                   15 
##            chlorides total.sulfur.dioxide  free.sulfur.dioxide 
##                   11                    7                    6 
##       residual.sugar            sulphates          citric.acid 
##                    3                    1                    1 
## 
## Node number 1: 3750 observations,    complexity param=0.1550105
##   mean=5.870933, MSE=0.7854751 
##   left son=2 (2372 obs) right son=3 (1378 obs)
##   Primary splits:
##       alcohol              < 10.85    to the left,  improve=0.15501050, (0 missing)
##       density              < 0.992035 to the right, improve=0.10915940, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.07682258, (0 missing)
##       total.sulfur.dioxide < 158.5    to the right, improve=0.04089663, (0 missing)
##       citric.acid          < 0.235    to the left,  improve=0.03636458, (0 missing)
##   Surrogate splits:
##       density              < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
##       chlorides            < 0.0375   to the right, agree=0.757, adj=0.339, (0 split)
##       total.sulfur.dioxide < 103.5    to the right, agree=0.690, adj=0.155, (0 split)
##       residual.sugar       < 5.375    to the right, agree=0.667, adj=0.094, (0 split)
##       sulphates            < 0.345    to the right, agree=0.647, adj=0.038, (0 split)
## 
## Node number 2: 2372 observations,    complexity param=0.05098911
##   mean=5.604975, MSE=0.5981709 
##   left son=4 (1611 obs) right son=5 (761 obs)
##   Primary splits:
##       volatile.acidity    < 0.2275   to the right, improve=0.10585250, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03390500, (0 missing)
##       citric.acid         < 0.235    to the left,  improve=0.03204075, (0 missing)
##       alcohol             < 10.11667 to the left,  improve=0.03136524, (0 missing)
##       chlorides           < 0.0585   to the right, improve=0.01633599, (0 missing)
##   Surrogate splits:
##       pH                   < 3.485    to the left,  agree=0.694, adj=0.047, (0 split)
##       sulphates            < 0.755    to the left,  agree=0.685, adj=0.020, (0 split)
##       total.sulfur.dioxide < 105.5    to the right, agree=0.683, adj=0.011, (0 split)
##       residual.sugar       < 0.75     to the right, agree=0.681, adj=0.007, (0 split)
##       chlorides            < 0.0285   to the right, agree=0.680, adj=0.003, (0 split)
## 
## Node number 3: 1378 observations,    complexity param=0.02796998
##   mean=6.328737, MSE=0.7765472 
##   left son=6 (84 obs) right son=7 (1294 obs)
##   Primary splits:
##       free.sulfur.dioxide  < 10.5     to the left,  improve=0.07699080, (0 missing)
##       alcohol              < 11.76667 to the left,  improve=0.06210660, (0 missing)
##       total.sulfur.dioxide < 67.5     to the left,  improve=0.04438619, (0 missing)
##       residual.sugar       < 1.375    to the left,  improve=0.02905351, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02613259, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 53.5     to the left,  agree=0.952, adj=0.214, (0 split)
##       volatile.acidity     < 0.875    to the right, agree=0.940, adj=0.024, (0 split)
## 
## Node number 4: 1611 observations,    complexity param=0.01265926
##   mean=5.43203, MSE=0.5098121 
##   left son=8 (688 obs) right son=9 (923 obs)
##   Primary splits:
##       volatile.acidity    < 0.3025   to the right, improve=0.04540111, (0 missing)
##       alcohol             < 10.05    to the left,  improve=0.03874403, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03338886, (0 missing)
##       chlorides           < 0.0495   to the right, improve=0.02574623, (0 missing)
##       citric.acid         < 0.195    to the left,  improve=0.02327981, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.215    to the left,  agree=0.633, adj=0.141, (0 split)
##       free.sulfur.dioxide  < 20.5     to the left,  agree=0.600, adj=0.063, (0 split)
##       chlorides            < 0.0595   to the right, agree=0.593, adj=0.047, (0 split)
##       residual.sugar       < 1.15     to the left,  agree=0.583, adj=0.023, (0 split)
##       total.sulfur.dioxide < 219.25   to the right, agree=0.582, adj=0.022, (0 split)
## 
## Node number 5: 761 observations
##   mean=5.971091, MSE=0.5878633 
## 
## Node number 6: 84 observations
##   mean=5.369048, MSE=1.137613 
## 
## Node number 7: 1294 observations,    complexity param=0.01970128
##   mean=6.391036, MSE=0.6894405 
##   left son=14 (629 obs) right son=15 (665 obs)
##   Primary splits:
##       alcohol              < 11.76667 to the left,  improve=0.06504696, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.02758705, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02750932, (0 missing)
##       pH                   < 3.055    to the left,  improve=0.02307356, (0 missing)
##       total.sulfur.dioxide < 191.5    to the right, improve=0.02186818, (0 missing)
##   Surrogate splits:
##       density              < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
##       volatile.acidity     < 0.2675   to the left,  agree=0.637, adj=0.253, (0 split)
##       chlorides            < 0.0365   to the right, agree=0.630, adj=0.238, (0 split)
##       residual.sugar       < 1.475    to the left,  agree=0.575, adj=0.126, (0 split)
##       total.sulfur.dioxide < 128.5    to the right, agree=0.574, adj=0.124, (0 split)
## 
## Node number 8: 688 observations
##   mean=5.255814, MSE=0.4054895 
## 
## Node number 9: 923 observations
##   mean=5.56338, MSE=0.5471747 
## 
## Node number 14: 629 observations,    complexity param=0.01007193
##   mean=6.173291, MSE=0.6838017 
##   left son=28 (11 obs) right son=29 (618 obs)
##   Primary splits:
##       volatile.acidity     < 0.465    to the right, improve=0.06897561, (0 missing)
##       total.sulfur.dioxide < 200      to the right, improve=0.04223066, (0 missing)
##       residual.sugar       < 0.975    to the left,  improve=0.03061714, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02978501, (0 missing)
##       sulphates            < 0.575    to the left,  improve=0.02165970, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.045    to the left,  agree=0.986, adj=0.182, (0 split)
##       total.sulfur.dioxide < 279.25   to the right, agree=0.986, adj=0.182, (0 split)
## 
## Node number 15: 665 observations
##   mean=6.596992, MSE=0.6075098 
## 
## Node number 28: 11 observations
##   mean=4.545455, MSE=0.9752066 
## 
## Node number 29: 618 observations
##   mean=6.202265, MSE=0.6306098

rpart.plot(m.rpart, digits=3)

p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597

summary(wine_test$quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000

cor(p.rpart, wine_test$quality)

## [1] 0.5369525

Q3- What is your interpretation about this amount of RMSE?

Answer: The root-mean-square error is 0.537, the lower RMSE value, the more model fit the observed data, and 0.537 is still high.

Method #4. News Popularity

Step 1: Collecting the Data

news <- read.csv("/Users/RunhaoWang/Desktop/530 90/OnlineNewsPopularity_for_R.csv")
str(news)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)

## 'data.frame':    39644 obs. of  17 variables:
##  $ n_tokens_title              : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content            : num  219 255 211 531 1072 ...
##  $ n_unique_tokens             : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words            : num  1 1 1 1 1 ...
##  $ num_hrefs                   : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_imgs                    : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length        : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ kw_max_max                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Step 2: Pre-processing

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "1"} 
     else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)

news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

prop.table(table(news_train$shares))

## 
##         0         1 
## 0.4308889 0.5691111

prop.table(table(news_test$shares))

## 
##     0     1 
## 0.414 0.586

Step 3: Modeling and evaluation

library("C50")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Feb 23 23:48:16 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class 0
##          5122    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.1 secs

news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))

##          
## news_pred   0   1
##         0 414   0
##         1   0 586

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 100

Q4- Try decision tree and random forest and evaluate the model.

Answer:

Step 1: RANDOM FOREST

news <- read.csv("/Users/RunhaoWang/Desktop/530 90/OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]

#Check for outliers
news=news[!news$n_unique_tokens==701,]

#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

#Standardize the dataset
for(i in ncol(news)-1){ 
  news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}

#Define popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))

set.seed(12345)

news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)

#Model training
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))

##            
## cred_pridRF     0     1
##           0 10531  7712
##           1  7542  9894

(Accuracy <- sum(diag(p2))/sum(p2)*100)

## [1] 57.24656

importance(random_modelNews)

##                              MeanDecreaseGini
## n_tokens_title                      111.06636
## n_tokens_content                    165.82171
## n_unique_tokens                     184.11362
## n_non_stop_words                    167.86920
## num_hrefs                           131.42586
## num_imgs                             99.78308
## num_videos                           56.95850
## average_token_length                197.00327
## num_keywords                         96.41648
## kw_max_max                           54.20264
## global_sentiment_polarity           209.21613
## avg_positive_polarity               190.79576
## title_subjectivity                   77.55101
## title_sentiment_polarity             85.25876
## abs_title_subjectivity               70.34923
## abs_title_sentiment_polarity         70.29701

Step 2: Decision Tree

news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Feb 23 23:48:31 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 3964 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## num_imgs > 3:
## :...n_tokens_title <= 9:
## :   :...num_imgs <= 14:
## :   :   :...avg_positive_polarity > 0.4936364: 1 (20)
## :   :   :   avg_positive_polarity <= 0.4936364:
## :   :   :   :...title_sentiment_polarity <= -0.375: 0 (13/4)
## :   :   :       title_sentiment_polarity > -0.375: 1 (207/59)
## :   :   num_imgs > 14:
## :   :   :...kw_max_max <= 690400: 0 (26/7)
## :   :       kw_max_max > 690400:
## :   :       :...num_keywords <= 4: 0 (7/1)
## :   :           num_keywords > 4: 1 (88/30)
## :   n_tokens_title > 9:
## :   :...n_tokens_title > 14: 0 (32/11)
## :       n_tokens_title <= 14:
## :       :...n_tokens_content <= 452: 1 (256/85)
## :           n_tokens_content > 452:
## :           :...kw_max_max <= 617900:
## :               :...num_hrefs <= 6: 1 (11/2)
## :               :   num_hrefs > 6: 0 (34/8)
## :               kw_max_max > 617900:
## :               :...num_keywords <= 6: 0 (103/42)
## :                   num_keywords > 6: 1 (278/123)
## num_imgs <= 3:
## :...global_sentiment_polarity > 0.09618686:
##     :...kw_max_max > 73100:
##     :   :...n_tokens_content <= 859:
##     :   :   :...num_imgs <= 1: 0 (1125/525)
##     :   :   :   num_imgs > 1: 1 (152/66)
##     :   :   n_tokens_content > 859:
##     :   :   :...num_hrefs <= 4: 0 (13/2)
##     :   :       num_hrefs > 4: 1 (216/75)
##     :   kw_max_max <= 73100:
##     :   :...abs_title_subjectivity <= 0.02272727: 1 (17)
##     :       abs_title_subjectivity > 0.02272727:
##     :       :...num_hrefs <= 4: 1 (36/8)
##     :           num_hrefs > 4:
##     :           :...title_subjectivity > 0.75: 1 (11)
##     :               title_subjectivity <= 0.75:
##     :               :...n_tokens_content <= 1020: 0 (73/28)
##     :                   n_tokens_content > 1020: 1 (7)
##     global_sentiment_polarity <= 0.09618686:
##     :...average_token_length > 4.633452:
##         :...n_tokens_title > 8: 0 (609/179)
##         :   n_tokens_title <= 8:
##         :   :...n_unique_tokens > 0.5629771:
##         :       :...avg_positive_polarity <= 0.1463265: 1 (4)
##         :       :   avg_positive_polarity > 0.1463265: 0 (56/13)
##         :       n_unique_tokens <= 0.5629771:
##         :       :...abs_title_subjectivity <= 0.1388889: 1 (7)
##         :           abs_title_subjectivity > 0.1388889:
##         :           :...n_tokens_title <= 7: 1 (15/3)
##         :               n_tokens_title > 7: 0 (35/15)
##         average_token_length <= 4.633452:
##         :...kw_max_max <= 690400: 1 (114/40)
##             kw_max_max > 690400:
##             :...num_keywords <= 6: 0 (191/62)
##                 num_keywords > 6:
##                 :...n_non_stop_words <= 0: 1 (57/23)
##                     n_non_stop_words > 0:
##                     :...num_hrefs > 3: 1 (122/53)
##                         num_hrefs <= 3:
##                         :...n_tokens_title > 10: 0 (20/2)
##                             n_tokens_title <= 10:
##                             :...title_sentiment_polarity <= -0.05: 0 (4)
##                                 title_sentiment_polarity > -0.05: 1 (5)
## 
## 
## Evaluation on training data (3964 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      34 1466(37.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    1442   567    (a): class 0
##     899  1056    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% num_imgs
##   72.88% global_sentiment_polarity
##   68.37% kw_max_max
##   57.21% n_tokens_content
##   46.17% n_tokens_title
##   31.26% average_token_length
##   22.07% num_keywords
##   13.93% num_hrefs
##    7.57% avg_positive_polarity
##    5.78% title_sentiment_polarity
##    5.25% n_non_stop_words
##    5.07% abs_title_subjectivity
##    2.95% n_unique_tokens
##    2.30% title_subjectivity
## 
## 
## Time: 0.1 secs

#Model training
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  35679 
## 
##  
##               | predicted shares 
## actual shares |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##             0 |     11669 |      6404 |     18073 | 
##               |     0.327 |     0.179 |           | 
## --------------|-----------|-----------|-----------|
##             1 |      9231 |      8375 |     17606 | 
##               |     0.259 |     0.235 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |     20900 |     14779 |     35679 | 
## --------------|-----------|-----------|-----------|
## 
##

(p3 <- table(news_pred, news_test$shares))

##          
## news_pred     0     1
##         0 11669  9231
##         1  6404  8375

(Accuracy <- sum(diag(p3))/sum(p3)*100)

## [1] 56.1787